remove card suffix functionality

2026-03-09 13:59:03 -06:00 · 2026-03-09 13:59:03 -06:00 · a05f701f16
commit a05f701f16
parent 62450842c3
2 changed files with 39 additions and 94 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa
 ## Usage
 ```bash
-python bac_extract.py <pdf_file> <card_suffix> [options]
+python bac_extract.py <pdf_file> [options]
 # Examples
-python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+python bac_extract.py EstadodeCuenta.pdf --pretty
-python bac_extract.py statement.pdf 1234 -o output.json -v
+python bac_extract.py statement.pdf -o output.json -v
 ```
 Options:
@@ -31,8 +31,7 @@ The extraction pipeline:
 1. Validates PDF is a BAC statement (`is_bac_statement`)
 2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
 3. Extracts tables page-by-page using pdfplumber
-4. Filters transactions by card suffix (last 4 digits)
+4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
 5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
 Key parsing functions:
 - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
--- a/bac_extract.py
+++ b/bac_extract.py
@@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]:
    Parse amount string with comma thousands separator.
    Handles trailing '-' for negative values.
    """
-    if not amount_str or not amount_str.strip():
+    if not amount_str:
        return None
    amount_str = amount_str.strip()
    if not amount_str:
        return None
    # Check for trailing negative sign
    is_negative = amount_str.endswith("-")
    if is_negative:
-        amount_str = amount_str[:-1].strip()
+        amount_str = amount_str[:-1]
    # Remove thousands separators (commas) and handle decimal point
    # Format: 1,234.56 or 1,234,567.89
    try:
-        amount_str = amount_str.replace(",", "")
+        amount = float(amount_str.replace(",", ""))
        amount = float(amount_str)
        return -amount if is_negative else amount
    except ValueError:
        return None
@@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
        return None
    first_page_text = pdf.pages[0].extract_text() or ""
    # Look for date patterns in the first page
    # Common format: "Fecha de corte: DD-MMM-YY" or similar
    date_matches = DATE_PATTERN.findall(first_page_text)
-    if date_matches:
+    if not date_matches:
-        # Use the first date found as statement date
+        return None
        day, month_abbr, year = date_matches[0]
        month = SPANISH_MONTHS.get(month_abbr.upper())
        if month:
            full_year = 2000 + int(year)
            return f"{full_year:04d}-{month:02d}-{int(day):02d}"
-    return None
+    day, month_abbr, year = date_matches[0]
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None
    full_year = 2000 + int(year)
    return f"{full_year:04d}-{month:02d}-{int(day):02d}"
 def find_section_b_start(page_text: str) -> bool:
@@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool:
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    ]
-    for pattern in patterns:
+    return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
        if re.search(pattern, page_text, re.IGNORECASE):
            return True
    return False
 def is_section_end(text: str) -> bool:
@@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool:
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    ]
-    for pattern in end_patterns:
+    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
 def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
@@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]:
    reference = match.group(1)
    date_str = match.group(2)
-    desc_and_loc = match.group(3).strip()
+    description = match.group(3).strip()
    currency = match.group(4).upper()
    amount_str = match.group(5)
    is_negative = match.group(6) == "-"
    # Parse date
    date = parse_spanish_date(date_str)
    if not date:
        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
        return None
    # Parse amount
    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
@@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]:
    if is_negative:
        amount = -amount
    # Split description and location
    # Location is typically at the end, often a short suffix like "ANILL", "San Jose"
    # For now, keep everything as description
    description = desc_and_loc
    location = None
    # Set amount in appropriate currency field
    amount_crc = amount if currency == "CRC" else None
    amount_usd = amount if currency == "USD" else None
    return {
        "reference": reference,
        "date": date,
        "description": description,
-        "location": location,
+        "location": None,
        "currency": currency,
-        "amount_crc": amount_crc,
+        "amount_crc": amount if currency == "CRC" else None,
-        "amount_usd": amount_usd,
+        "amount_usd": amount if currency == "USD" else None,
    }
-def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
+def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """
    Extract transactions from a BAC credit card statement PDF.
    Args:
        pdf_path: Path to the PDF file
        card_suffix: Last 4 digits of card to filter
        verbose: Enable verbose logging
    Returns:
@@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
        statement_date = extract_statement_date(pdf)
        transactions = []
-        current_card_suffix = None
+        card_suffix = None
-        current_card_name = None
+        card_holder_name = None
        in_section_b = False
        section_b_found = False
        card_suffix_found = False
        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0
@@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
                section_b_found = True
                logger.debug(f"Found section B on page {page_num}")
            # Check for section end
            if in_section_b and is_section_end(page_text):
                logger.debug(f"Found section end on page {page_num}")
                # Still process this page, but mark we're ending
            if not in_section_b:
                continue
            # Check for section end (still process this page before breaking)
            reached_section_end = is_section_end(page_text)
            if reached_section_end:
                logger.debug(f"Found section end on page {page_num}")
            # Parse text line by line
            for line in page_text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                # Check for card holder line
                card_info = extract_card_holder(line)
                if card_info:
-                    current_card_suffix, current_card_name = card_info
+                    card_suffix, card_holder_name = card_info
-                    logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
+                    logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
                    if current_card_suffix == card_suffix:
                        card_suffix_found = True
                    continue
                # Skip if we're not tracking the right card
                if current_card_suffix != card_suffix:
                    continue
                # Try to parse as transaction
                transaction = parse_transaction_line(line)
                if transaction:
                    transactions.append(transaction)
                    logger.debug(f"Extracted transaction: {transaction['reference']}")
-            # Check if we've passed section B
+            if reached_section_end:
            if in_section_b and is_section_end(page_text):
                break
        if not section_b_found:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
        if not card_suffix_found:
            raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
        # Calculate summary
        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
        total_usd = sum(t["amount_usd"] or 0 for t in transactions)
-        # Get card holder info
+        card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
        card_holder = None
        if card_suffix_found:
            card_holder = {
                "card_suffix": card_suffix,
                "name": current_card_name if current_card_suffix == card_suffix else None
            }
        return {
            "metadata": {
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
                "card_filter": card_suffix,
                "total_transactions": len(transactions)
            },
            "card_holder": card_holder,
@@ -340,8 +297,8 @@ def main():
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
-  python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
+  python bac_extract.py EstadodeCuenta.pdf --pretty
-  python bac_extract.py statement.pdf 1234 -o output.json -v
+  python bac_extract.py statement.pdf -o output.json -v
        """
    )
@@ -351,12 +308,6 @@ Examples:
        help="Path to the BAC statement PDF"
    )
    parser.add_argument(
        "card_suffix",
        type=str,
        help="Last 4 digits of card to filter (e.g., 1234)"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
@@ -378,11 +329,6 @@ Examples:
    args = parser.parse_args()
    # Validate card suffix
    if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
        print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
        sys.exit(1)
    # Validate PDF file exists
    if not args.pdf_file.exists():
        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
@@ -393,7 +339,7 @@ Examples:
        sys.exit(1)
    try:
-        result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
+        result = extract_transactions(args.pdf_file, args.verbose)
        # Write output
        indent = 2 if args.pretty else None