From 9fa18a67b58d70b842a4664a7fb8d7fb180355f9 Mon Sep 17 00:00:00 2001
From: Fabian Montero <fabian@posixlycorrect.com>
Date: Mon, 9 Mar 2026 13:24:41 -0600
Subject: [PATCH] initial commit

---
 CLAUDE.md        |  40 +++++
 README.md        |  63 +++++++
 bac_extract.py   | 419 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 4 files changed, 523 insertions(+)
 create mode 100644 CLAUDE.md
 create mode 100644 README.md
 create mode 100755 bac_extract.py
 create mode 100644 requirements.txt

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..08dc084
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,40 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON.
+
+## Dependencies
+
+- pdfplumber (>=0.10.0)
+
+## Usage
+
+```bash
+python bac_extract.py <pdf_file> <card_suffix> [options]
+
+# Examples
+python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+python bac_extract.py statement.pdf 1234 -o output.json -v
+```
+
+Options:
+- `-o, --output`: Output JSON path (default: transactions.json)
+- `--pretty`: Pretty-print JSON
+- `-v, --verbose`: Enable debug logging
+
+## Architecture
+
+The extraction pipeline:
+1. Validates PDF is a BAC statement (`is_bac_statement`)
+2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
+3. Extracts text page-by-page using pdfplumber (`extract_text`) and parses it line by line
+4. Filters transactions by card suffix (last 4 digits)
+5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
+
+Key parsing functions:
+- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
+- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
+- `extract_card_holder`: Matches "************1234 NAME" pattern
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d4af911
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+# BAC Statement Extractor
+
+Extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON.
+
+## Dependencies
+
+- Python 3.10+
+- pdfplumber (>=0.10.0)
+
+## Usage
+
+```bash
+python bac_extract.py <pdf_file> <card_suffix> [options]
+```
+
+**Arguments:**
+- `pdf_file`: Path to the BAC statement PDF
+- `card_suffix`: Last 4 digits of the card to filter
+
+**Options:**
+- `-o, --output`: Output JSON path (default: transactions.json)
+- `--pretty`: Pretty-print JSON output
+- `-v, --verbose`: Enable debug logging
+
+**Examples:**
+```bash
+python bac_extract.py statement.pdf 1234 --pretty
+python bac_extract.py statement.pdf 1234 -o output.json -v
+```
+
+## Output Format
+
+```json
+{
+  "metadata": {
+    "source_file": "statement.pdf",
+    "extraction_date": "2025-01-15T12:00:00.123456Z",
+    "statement_date": "2025-01-10",
+    "card_filter": "1234",
+    "total_transactions": 5
+  },
+  "card_holder": {
+    "card_suffix": "1234",
+    "name": "CARD HOLDER NAME"
+  },
+  "transactions": [
+    {
+      "reference": "123456789012",
+      "date": "2025-01-09",
+      "description": "EXAMPLE STORE",
+      "location": null,
+      "currency": "CRC",
+      "amount_crc": 1234.56,
+      "amount_usd": null
+    }
+  ],
+  "summary": {
+    "total_crc": 50000.00,
+    "total_usd": 0.00,
+    "transaction_count": 5
+  }
+}
+```
diff --git a/bac_extract.py b/bac_extract.py
new file mode 100755
index 0000000..850bef1
--- /dev/null
+++ b/bac_extract.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+"""
+BAC Credit Card Statement Extractor
+
+Extracts transactions from BAC Costa Rica credit card statement PDFs.
+Specifically targets section "B) Detalle de compras del periodo".
+"""
+
+import argparse
+import json
+import logging
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import pdfplumber
+
+# Spanish month abbreviations (upper case) mapped to month numbers 1-12
+SPANISH_MONTHS = {
+    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
+    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
+}
+
+# Card holder line: 12 asterisks, last 4 card digits (group 1), name (group 2)
+CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
+
+# Date: D-MMM-YY or DD-MMM-YY in any letter case; MMM matches any 3 letters
+DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
+
+# Transaction line pattern (location text, if any, lands in the description):
+# Reference  Date        Description         Location (optional)  Currency  Amount
+# 123456789012 9-ENE-26 EXAMPLE STORE                              CRC      1,234.56
+TRANSACTION_PATTERN = re.compile(
+    r"^(\d{12})\s+"                          # Reference (12 digits)
+    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"           # Date
+    r"(.+?)\s+"                               # Description
+    r"(CRC|USD)\s+"                           # Currency
+    r"([\d,]+\.\d{2})(-)?$",                 # Amount (with optional trailing minus)
+    re.IGNORECASE
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_spanish_date(date_str: str) -> Optional[str]:
+    """Convert a Spanish D-MMM-YY date ("15-ENE-25") to ISO "2025-01-15".
+
+    Returns None for empty input, text that does not start with the date
+    pattern, an unknown month, or an impossible calendar date (31-FEB-25).
+    Two-digit years are assumed to be in the 2000s.
+    """
+    match = DATE_PATTERN.match(date_str.strip()) if date_str else None
+    if match is None:
+        return None
+
+    day, month_abbr, year = match.groups()
+    month = SPANISH_MONTHS.get(month_abbr.upper())
+    if month is None:
+        return None
+
+    try:
+        # A real datetime rejects impossible days such as 0 or 31-FEB.
+        return datetime(2000 + int(year), month, int(day)).strftime("%Y-%m-%d")
+    except ValueError:
+        return None
+
+
+def parse_amount(amount_str: str) -> Optional[float]:
+    """
+    Convert a statement amount such as "1,234.56" into a float.
+
+    Commas are dropped as thousands separators and a trailing "-" marks a
+    negative value ("100.00-" becomes -100.0). Empty, blank, or non-numeric
+    text gives None.
+    """
+    text = amount_str.strip() if amount_str else ""
+    if not text:
+        return None
+
+    # The sign is printed after the digits, so peel it off before converting.
+    negative = text.endswith("-")
+    digits = text[:-1].strip() if negative else text
+
+    try:
+        # "1,234,567.89" -> "1234567.89"
+        value = float(digits.replace(",", ""))
+    except ValueError:
+        return None
+
+    return -value if negative else value
+
+
+def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
+    """Tell whether page 1 mentions "BAC" (exact case) and "TARJETA" (any case)."""
+    pages = pdf.pages
+    if not pages:
+        return False
+    text = pages[0].extract_text() or ""
+    return "BAC" in text and "TARJETA" in text.upper()
+
+
+def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
+    """Return the first parseable D-MMM-YY date on page 1 as ISO YYYY-MM-DD.
+
+    The first date on the page is taken to be the statement date; the exact
+    label printed next to it is not checked. Returns None when the PDF has no
+    pages or page 1 contains no recognisable date.
+    """
+    if not pdf.pages:
+        return None
+
+    first_page_text = pdf.pages[0].extract_text() or ""
+
+    # Skip pattern hits that are not real dates (three letters that are not a
+    # Spanish month) instead of giving up on the first one.
+    for match in DATE_PATTERN.finditer(first_page_text):
+        iso_date = parse_spanish_date(match.group(0))
+        if iso_date:
+            return iso_date
+    return None
+
+
+def find_section_b_start(page_text: str) -> bool:
+    """Report whether the page text contains a section B heading."""
+    # Either the lettered prefix or the full title is enough to match.
+    headings = (
+        r"B\)\s*Detalle\s+de\s+compras",
+        r"Detalle\s+de\s+compras\s+del\s+periodo",
+    )
+    return any(
+        re.search(heading, page_text, re.IGNORECASE) for heading in headings
+    )
+
+
+def is_section_end(text: str) -> bool:
+    """Report whether the text holds a marker that closes section B."""
+    # The section total line or the heading of a later section ends section B.
+    markers = (
+        r"Total\s+de\s+compras\s+del\s+periodo",
+        r"C\)\s*Detalle\s+de\s+intereses",
+        r"Detalle\s+de\s+intereses",
+        r"D\)\s*Detalle",
+    )
+    return any(
+        re.search(marker, text, re.IGNORECASE) for marker in markers
+    )
+
+
+def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
+    """
+    Look for a masked card number followed by a name anywhere in the row.
+    Gives back (last four digits, stripped name), or None without a match.
+    """
+    found = CARD_HOLDER_PATTERN.search(row_text)
+    if found is None:
+        return None
+    return found.group(1), found.group(2).strip()
+
+
+def parse_transaction_line(line: str) -> Optional[dict]:
+    """
+    Turn one line of statement text into a transaction record.
+
+    Expected layout: Reference Date Description [Location] Currency Amount
+    e.g. "123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56"
+
+    Returns a dict with reference, date (ISO), description, location,
+    currency, amount_crc and amount_usd; only the amount field matching the
+    currency is filled. Returns None for blank lines, lines that do not fit
+    the layout, and lines whose date or amount cannot be parsed (the last
+    two cases log a warning).
+    """
+    # A blank line cannot match the pattern, so it needs no separate check.
+    match = TRANSACTION_PATTERN.match(line.strip())
+    if match is None:
+        return None
+
+    (
+        reference,
+        date_str,
+        raw_description,
+        raw_currency,
+        amount_str,
+        sign,
+    ) = match.groups()
+    currency = raw_currency.upper()
+
+    # Both helpers report failure as None; the warning names the bad field.
+    date = parse_spanish_date(date_str)
+    if not date:
+        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
+        return None
+
+    amount = parse_amount(amount_str)
+    if amount is None:
+        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
+        return None
+
+    # The pattern captures the trailing "-" in its own group, so amount_str is
+    # unsigned and the sign has to be applied here.
+    if sign == "-":
+        amount = -amount
+
+    # Description and location are not told apart yet: the whole free-text
+    # column becomes the description and location is left empty.
+    return {
+        "reference": reference,
+        "date": date,
+        "description": raw_description.strip(),
+        "location": None,
+        "currency": currency,
+        "amount_crc": amount if currency == "CRC" else None,
+        "amount_usd": amount if currency == "USD" else None,
+    }
+
+
+def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
+    """
+    Extract transactions from a BAC credit card statement PDF.
+
+    Pages are scanned from page 2 onwards (page 1 for a single-page PDF). Each
+    page from the section B heading through the page with an end marker is
+    read line by line; lines after a card holder line for ``card_suffix`` are
+    parsed as transactions.
+
+    Args:
+        pdf_path: Path to the PDF file
+        card_suffix: Last 4 digits of card to filter
+        verbose: Enable verbose (DEBUG) logging instead of INFO
+
+    Returns:
+        Dictionary with metadata, card_holder, transactions, and summary
+
+    Raises:
+        ValueError: If the PDF is not recognised as a BAC statement, section B
+            is not found, or no card holder line matches ``card_suffix``.
+    """
+    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Validate this is a BAC statement
+        if not is_bac_statement(pdf):
+            raise ValueError("PDF does not appear to be a BAC credit card statement")
+
+        statement_date = extract_statement_date(pdf)
+
+        transactions = []
+        current_card_suffix = None
+        # Name from the latest card holder line for the requested card, kept
+        # apart so a later line for a different card cannot blank it out.
+        matched_card_name = None
+        in_section_b = False
+        card_suffix_found = False
+
+        # Start from page 2 (index 1) as page 1 is summary only
+        start_page = 1 if len(pdf.pages) > 1 else 0
+
+        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
+            page_text = page.extract_text() or ""
+
+            logger.debug(f"Processing page {page_num}")
+
+            # Check for section B start
+            if not in_section_b and find_section_b_start(page_text):
+                in_section_b = True
+                logger.debug(f"Found section B on page {page_num}")
+
+            if not in_section_b:
+                continue
+
+            # The page holding the end marker is still processed in full; the
+            # loop only stops after it.
+            is_last_page = is_section_end(page_text)
+            if is_last_page:
+                logger.debug(f"Found section end on page {page_num}")
+
+            # Parse text line by line
+            for line in page_text.split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+
+                # Check for card holder line
+                card_info = extract_card_holder(line)
+                if card_info:
+                    current_card_suffix, current_card_name = card_info
+                    logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
+                    if current_card_suffix == card_suffix:
+                        card_suffix_found = True
+                        matched_card_name = current_card_name
+                    continue
+
+                # Skip if we're not tracking the right card
+                if current_card_suffix != card_suffix:
+                    continue
+
+                # Try to parse as transaction
+                transaction = parse_transaction_line(line)
+                if transaction:
+                    transactions.append(transaction)
+                    logger.debug(f"Extracted transaction: {transaction['reference']}")
+
+            if is_last_page:
+                break
+
+        # in_section_b is never reset, so it doubles as "section B was found".
+        if not in_section_b:
+            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
+
+        if not card_suffix_found:
+            raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
+
+        # Calculate summary
+        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
+        total_usd = sum(t["amount_usd"] or 0 for t in transactions)
+
+        return {
+            "metadata": {
+                "source_file": pdf_path.name,
+                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+                "statement_date": statement_date,
+                "card_filter": card_suffix,
+                "total_transactions": len(transactions)
+            },
+            "card_holder": {"card_suffix": card_suffix, "name": matched_card_name},
+            "transactions": transactions,
+            "summary": {
+                "total_crc": round(total_crc, 2),
+                "total_usd": round(total_usd, 2),
+                "transaction_count": len(transactions)
+            }
+        }
+
+
+def main():
+    """Command-line entry point: parse arguments, extract, write the JSON file.
+
+    Exits with status 1 (message on stderr) for a malformed card suffix, a
+    missing or non-PDF input path, or any failure while extracting or writing.
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+  python bac_extract.py statement.pdf 1234 -o output.json -v
+        """
+    )
+
+    parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
+    parser.add_argument(
+        "card_suffix",
+        type=str,
+        help="Last 4 digits of card to filter (e.g., 1234)"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        type=Path,
+        default=Path("transactions.json"),
+        help="Output JSON file path (default: transactions.json)"
+    )
+    parser.add_argument(
+        "--pretty",
+        action="store_true",
+        help="Pretty-print JSON output"
+    )
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+
+    args = parser.parse_args()
+
+    # Validate card suffix
+    if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
+        print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
+        sys.exit(1)
+
+    # is_file() rather than exists(): a directory named like a PDF must be
+    # rejected here instead of failing later when the file is opened.
+    if not args.pdf_file.is_file():
+        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.pdf_file.suffix.lower() != ".pdf":
+        print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
+
+        # Write output
+        indent = 2 if args.pretty else None
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(result, f, indent=indent, ensure_ascii=False)
+
+        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
+        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
+        print(f"Total USD: {result['summary']['total_usd']:,.2f}")
+
+    except ValueError as e:
+        # extract_transactions raises ValueError for statements it cannot use.
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        # Top-level boundary: report anything else; traceback only with -v.
+        print(f"Error processing PDF: {e}", file=sys.stderr)
+        if args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0e1e4ff
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pdfplumber>=0.10.0