3 changed files with 292 additions and 137 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,2 @@
 *.pdf
 *.json
-__pycache__/
-testStatements/
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -4,24 +4,20 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

 ## Project Overview

-Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON.
+Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON.

 ## Dependencies

 - pdfplumber (>=0.10.0)

-## Commands
+## Usage

 ```bash
-# Run tests
-python testStatements/run_tests.py
-
-# Run extractor
-python bac_extract.py <pdf_file> [options]
+python bac_extract.py <pdf_file> <card_suffix> [options]

 # Examples
-python bac_extract.py EstadodeCuenta.pdf --pretty
-python bac_extract.py statement.pdf -o output.json -v
+python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+python bac_extract.py statement.pdf 1234 -o output.json -v
 ```

 Options:
@ -33,15 +29,12 @@ Options:

 The extraction pipeline:
 1. Validates PDF is a BAC statement (`is_bac_statement`)
-2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns
-3. Parses transactions matching `TRANSACTION_PATTERN` regex
-4. Outputs card holders, transactions by section, and summaries
-
-Key data structures:
- `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys
- `SPANISH_MONTHS`: Spanish month abbreviations for date parsing
+2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
+3. Extracts text page-by-page using pdfplumber and parses it line by line
+4. Filters transactions by card suffix (last 4 digits)
+5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators

 Key parsing functions:
 - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
 - `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
- `matches_patterns`: Generic regex pattern matcher for section detection
+- `extract_card_holder`: Matches "************1234 NAME" pattern
--- a/bac_extract.py
+++ b/bac_extract.py
@ -3,10 +3,7 @@
 BAC Credit Card Statement Extractor

 Extracts transactions from BAC Costa Rica credit card statement PDFs.
-Targets sections:
-  B) Detalle de compras del periodo
-  D) Detalle de otros cargos
-  E) Detalle de productos y servicios de elección voluntaria
+Specifically targets section "B) Detalle de compras del periodo".
 """

 import argparse
@ -20,63 +17,76 @@ from typing import Optional

 import pdfplumber

+# Spanish month abbreviations to month numbers
 SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
 }

+# Card holder pattern: ************XXXX NAME
 CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
+
+# Date pattern: D-MMM-YY or DD-MMM-YY
 DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
+
+# Transaction line pattern:
+# Reference  Date        Description         Location (optional)  Currency  Amount
+# 123456789012 9-ENE-26 EXAMPLE STORE                              CRC      1,234.56
 TRANSACTION_PATTERN = re.compile(
-    r"^(\d{12,13})\s+"
-    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
-    r"(.+?)\s+"
-    r"(CRC|USD)\s+"
-    r"([\d,]+\.\d{2})(-)?$",
+    r"^(\d{12})\s+"                          # Reference (12 digits)
+    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"           # Date
+    r"(.+?)\s+"                               # Description
+    r"(CRC|USD)\s+"                           # Currency
+    r"([\d,]+\.\d{2})(-)?$",                 # Amount (with optional trailing minus)
    re.IGNORECASE
 )

-# Section definitions: start patterns, end patterns, output key
-SECTIONS = {
-    "B": {
-        "start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
-        "end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
-        "key": "purchases",
-    },
-    "D": {
-        "start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
-        "end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
-        "key": "other_charges",
-    },
-    "E": {
-        "start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
-        "end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
-        "key": "voluntary_services",
-    },
-}
-
 logger = logging.getLogger(__name__)


 def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD)."""
-    match = DATE_PATTERN.match(date_str.strip()) if date_str else None
+    if not date_str:
+        return None
+
+    match = DATE_PATTERN.match(date_str.strip())
    if not match:
        return None
+
    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None
-    return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
+
+    # Assume 2000s for 2-digit year
+    full_year = 2000 + int(year)
+
+    try:
+        return f"{full_year:04d}-{month:02d}-{int(day):02d}"
+    except ValueError:
+        return None


 def parse_amount(amount_str: str) -> Optional[float]:
-    """Parse amount with comma thousands separator. Handles trailing '-' for negatives."""
-    if not amount_str or not (amount_str := amount_str.strip()):
+    """
+    Parse amount string with comma thousands separator.
+    Handles trailing '-' for negative values.
+    """
+    if not amount_str or not amount_str.strip():
        return None
+
+    amount_str = amount_str.strip()
+
+    # Check for trailing negative sign
    is_negative = amount_str.endswith("-")
+    if is_negative:
+        amount_str = amount_str[:-1].strip()
+
+    # Remove thousands separators (commas) and handle decimal point
+    # Format: 1,234.56 or 1,234,567.89
    try:
-        amount = float(amount_str.rstrip("-").replace(",", ""))
+        amount_str = amount_str.replace(",", "")
+        amount = float(amount_str)
        return -amount if is_negative else amount
    except ValueError:
        return None
@ -86,6 +96,7 @@ def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Check if the PDF is a BAC credit card statement."""
    if not pdf.pages:
        return False
+
    first_page_text = pdf.pages[0].extract_text() or ""
    return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()

@ -94,114 +105,215 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Extract the statement date from the PDF."""
    if not pdf.pages:
        return None
+
    first_page_text = pdf.pages[0].extract_text() or ""
+
+    # Look for date patterns in the first page
+    # Common format: "Fecha de corte: DD-MMM-YY" or similar
    date_matches = DATE_PATTERN.findall(first_page_text)
-    if not date_matches:
-        return None
+    if date_matches:
+        # Use the first date found as statement date
        day, month_abbr, year = date_matches[0]
        month = SPANISH_MONTHS.get(month_abbr.upper())
-    if not month:
+        if month:
+            full_year = 2000 + int(year)
+            return f"{full_year:04d}-{month:02d}-{int(day):02d}"
+
    return None
-    return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"


-def matches_patterns(text: str, patterns: list[str]) -> bool:
-    """Check if text matches any of the given regex patterns."""
-    return any(re.search(p, text, re.IGNORECASE) for p in patterns)
+def find_section_b_start(page_text: str) -> bool:
+    """Check if page contains start of section B."""
+    patterns = [
+        r"B\)\s*Detalle\s+de\s+compras",
+        r"Detalle\s+de\s+compras\s+del\s+periodo",
+    ]
+    for pattern in patterns:
+        if re.search(pattern, page_text, re.IGNORECASE):
+            return True
+    return False
+
+
+def is_section_end(text: str) -> bool:
+    """Check if we've reached the end of section B."""
+    end_patterns = [
+        r"Total\s+de\s+compras\s+del\s+periodo",
+        r"C\)\s*Detalle\s+de\s+intereses",
+        r"Detalle\s+de\s+intereses",
+        r"D\)\s*Detalle",
+    ]
+    for pattern in end_patterns:
+        if re.search(pattern, text, re.IGNORECASE):
+            return True
+    return False
+
+
+def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
+    """
+    Extract card holder info from a row.
+    Returns (card_suffix, name) or None.
+    """
+    match = CARD_HOLDER_PATTERN.search(row_text)
+    if match:
+        return match.group(1), match.group(2).strip()
+    return None


 def parse_transaction_line(line: str) -> Optional[dict]:
-    """Parse a transaction line into a dict, or return None if not a transaction."""
-    match = TRANSACTION_PATTERN.match(line.strip())
+    """
+    Parse a text-based transaction line.
+
+    Format: Reference Date Description [Location] Currency Amount
+    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
+    """
+    line = line.strip()
+    if not line:
+        return None
+
+    match = TRANSACTION_PATTERN.match(line)
    if not match:
        return None

-    reference, date_str, description, currency, amount_str, neg = match.groups()
-    currency = currency.upper()
+    reference = match.group(1)
+    date_str = match.group(2)
+    desc_and_loc = match.group(3).strip()
+    currency = match.group(4).upper()
+    amount_str = match.group(5)
+    is_negative = match.group(6) == "-"

+    # Parse date
    date = parse_spanish_date(date_str)
-    amount = parse_amount(amount_str)
-    if not date or amount is None:
-        logger.warning(f"Could not parse transaction: {line}")
+    if not date:
+        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
        return None
-    if neg:
+
+    # Parse amount
+    amount = parse_amount(amount_str)
+    if amount is None:
+        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
+        return None
+    if is_negative:
        amount = -amount

+    # Split description and location
+    # Location is typically at the end, often a short suffix like "ANILL", "San Jose"
+    # For now, keep everything as description
+    description = desc_and_loc
+    location = None
+
+    # Set amount in appropriate currency field
+    amount_crc = amount if currency == "CRC" else None
+    amount_usd = amount if currency == "USD" else None
+
    return {
        "reference": reference,
        "date": date,
-        "description": description.strip(),
-        "location": None,
+        "description": description,
+        "location": location,
        "currency": currency,
-        "amount_crc": amount if currency == "CRC" else None,
-        "amount_usd": amount if currency == "USD" else None,
+        "amount_crc": amount_crc,
+        "amount_usd": amount_usd,
    }


-def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
-    """Extract transactions from a BAC credit card statement PDF."""
-    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
+def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
+    """
+    Extract transactions from a BAC credit card statement PDF.
+
+    Args:
+        pdf_path: Path to the PDF file
+        card_suffix: Last 4 digits of card to filter
+        verbose: Enable verbose logging
+
+    Returns:
+        Dictionary with metadata, card_holder, transactions, and summary
+    """
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)

    with pdfplumber.open(pdf_path) as pdf:
+        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)
-        transactions = {s["key"]: [] for s in SECTIONS.values()}
-        card_holders = []
-        seen_card_suffixes = set()
-        current_section = None
-        sections_completed = set()

+        transactions = []
+        current_card_suffix = None
+        current_card_name = None
+        in_section_b = False
+        section_b_found = False
+        card_suffix_found = False
+
+        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0
+
        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
            page_text = page.extract_text() or ""
+
            logger.debug(f"Processing page {page_num}")

+            # Check for section B start
+            if not in_section_b and find_section_b_start(page_text):
+                in_section_b = True
+                section_b_found = True
+                logger.debug(f"Found section B on page {page_num}")
+
+            # Check for section end
+            if in_section_b and is_section_end(page_text):
+                logger.debug(f"Found section end on page {page_num}")
+                # Still process this page; the loop breaks after its lines are parsed
+
+            if not in_section_b:
+                continue
+
+            # Parse text line by line
            for line in page_text.split("\n"):
                line = line.strip()
                if not line:
                    continue

-                # Check for section end
-                if current_section and matches_patterns(line, SECTIONS[current_section]["end"]):
-                    logger.debug(f"Section {current_section} ended on page {page_num}")
-                    sections_completed.add(current_section)
-                    current_section = None
+                # Check for card holder line
+                card_info = extract_card_holder(line)
+                if card_info:
+                    current_card_suffix, current_card_name = card_info
+                    logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
+                    if current_card_suffix == card_suffix:
+                        card_suffix_found = True
+                    continue

-                # Check for section start
-                if current_section is None:
-                    for sec_id, sec in SECTIONS.items():
-                        if sec_id not in sections_completed and matches_patterns(line, sec["start"]):
-                            current_section = sec_id
-                            logger.debug(f"Found section {sec_id} on page {page_num}")
+                # Skip if we're not tracking the right card
+                if current_card_suffix != card_suffix:
+                    continue
+
+                # Try to parse as transaction
+                transaction = parse_transaction_line(line)
+                if transaction:
+                    transactions.append(transaction)
+                    logger.debug(f"Extracted transaction: {transaction['reference']}")
+
+            # Check if we've passed section B
+            if in_section_b and is_section_end(page_text):
                break
-                    continue

-                # Extract card holder
-                match = CARD_HOLDER_PATTERN.search(line)
-                if match:
-                    suffix, name = match.group(1), match.group(2).strip()
-                    if suffix not in seen_card_suffixes:
-                        card_holders.append({"card_suffix": suffix, "name": name})
-                        seen_card_suffixes.add(suffix)
-                        logger.debug(f"Found card holder: {suffix} - {name}")
-                    continue
-
-                # Parse transaction
-                txn = parse_transaction_line(line)
-                if txn:
-                    transactions[SECTIONS[current_section]["key"]].append(txn)
-                    logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")
-
-        if "B" not in sections_completed and not transactions["purchases"]:
+        if not section_b_found:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

-        def summarize(txns):
-            return {
-                "total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
-                "total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
-                "count": len(txns),
+        if not card_suffix_found:
+            raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
+
+        # Calculate summary
+        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
+        total_usd = sum(t["amount_usd"] or 0 for t in transactions)
+
+        # Get card holder info
+        card_holder = None
+        if card_suffix_found:
+            card_holder = {
+                "card_suffix": card_suffix,
+                "name": current_card_name if current_card_suffix == card_suffix else None
            }

        return {
@ -209,46 +321,98 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
-                "total_transactions": sum(len(t) for t in transactions.values()),
+                "card_filter": card_suffix,
+                "total_transactions": len(transactions)
            },
-            "card_holders": card_holders,
-            **transactions,
-            "summary": {key: summarize(txns) for key, txns in transactions.items()},
+            "card_holder": card_holder,
+            "transactions": transactions,
+            "summary": {
+                "total_crc": round(total_crc, 2),
+                "total_usd": round(total_usd, 2),
+                "transaction_count": len(transactions)
+            }
        }


 def main():
-    parser = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
-    parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
-    parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
-    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
-    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
+    parser = argparse.ArgumentParser(
+        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+  python bac_extract.py statement.pdf 1234 -o output.json -v
+        """
+    )
+
+    parser.add_argument(
+        "pdf_file",
+        type=Path,
+        help="Path to the BAC statement PDF"
+    )
+
+    parser.add_argument(
+        "card_suffix",
+        type=str,
+        help="Last 4 digits of card to filter (e.g., 1234)"
+    )
+
+    parser.add_argument(
+        "-o", "--output",
+        type=Path,
+        default=Path("transactions.json"),
+        help="Output JSON file path (default: transactions.json)"
+    )
+
+    parser.add_argument(
+        "--pretty",
+        action="store_true",
+        help="Pretty-print JSON output"
+    )
+
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+
    args = parser.parse_args()

+    # Validate card suffix
+    if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
+        print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
+        sys.exit(1)
+
+    # Validate PDF file exists
    if not args.pdf_file.exists():
-        sys.exit(f"Error: File not found: {args.pdf_file}")
-    if args.pdf_file.suffix.lower() != ".pdf":
-        sys.exit(f"Error: File must be a PDF: {args.pdf_file}")
+        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
+        sys.exit(1)
+
+    if not args.pdf_file.suffix.lower() == ".pdf":
+        print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
+        sys.exit(1)

    try:
-        result = extract_transactions(args.pdf_file, args.verbose)
-        with open(args.output, "w", encoding="utf-8") as f:
-            json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)
+        result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)

-        summary = result["summary"]
-        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
-        for key, label in [("purchases", "Purchases (B)"), ("other_charges", "Other charges (D)"),
-                           ("voluntary_services", "Voluntary services (E)")]:
-            s = summary[key]
-            print(f"  {label:25} {s['count']:3d}  CRC {s['total_crc']:>12,.2f}  USD {s['total_usd']:>10,.2f}")
+        # Write output
+        indent = 2 if args.pretty else None
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(result, f, indent=indent, ensure_ascii=False)
+
+        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
+        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
+        print(f"Total USD: {result['summary']['total_usd']:,.2f}")

    except ValueError as e:
-        sys.exit(f"Error: {e}")
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
    except Exception as e:
+        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
-        sys.exit(f"Error processing PDF: {e}")
+        sys.exit(1)


 if __name__ == "__main__":