target sections D and E

2026-03-09 14:44:47 -06:00 · 2026-03-09 14:44:47 -06:00 · 69a773e1b3
commit 69a773e1b3
parent a05f701f16
1 changed files with 134 additions and 45 deletions
--- a/bac_extract.py
+++ b/bac_extract.py
@ -3,7 +3,10 @@
 BAC Credit Card Statement Extractor
 Extracts transactions from BAC Costa Rica credit card statement PDFs.
-Specifically targets section "B) Detalle de compras del periodo".
+Targets sections:
  B) Detalle de compras del periodo
  D) Detalle de otros cargos
  E) Detalle de productos y servicios de elección voluntaria
 """
 import argparse
@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    return f"{full_year:04d}-{month:02d}-{int(day):02d}"
-def find_section_b_start(page_text: str) -> bool:
+def find_section_b_start(text: str) -> bool:
-    """Check if page contains start of section B."""
+    """Check if text contains start of section B (purchases)."""
    patterns = [
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    ]
-    return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
+    return any(re.search(p, text, re.IGNORECASE) for p in patterns)
-def is_section_end(text: str) -> bool:
+def find_section_d_start(text: str) -> bool:
-    """Check if we've reached the end of section B."""
+    """Check if text contains start of section D (other charges)."""
    return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE))
 def find_section_e_start(text: str) -> bool:
    """Tell whether *text* includes the section E heading.

    Section E lists voluntary products/services. The heading is matched
    case-insensitively and tolerates variable whitespace between words.
    """
    heading = re.search(
        r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios",
        text,
        re.IGNORECASE,
    )
    return heading is not None
 def is_section_b_end(text: str) -> bool:
    """Check if text indicates the end of section B."""
    end_patterns = [
        r"Total\s+de\s+compras\s+del\s+periodo",
        r"C\)\s*Detalle\s+de\s+intereses",
@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool:
    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
 def is_section_d_end(text: str) -> bool:
    """Report whether *text* carries a marker that closes section D.

    Two markers are recognised, case-insensitively: the section's totals
    line and the heading of a following "E) Detalle ..." section.
    """
    for marker in (
        r"Total\s+por\s+concepto\s+otros\s+cargos",
        r"E\)\s*Detalle",
    ):
        if re.search(marker, text, re.IGNORECASE) is not None:
            return True
    return False
 def is_section_e_end(text: str) -> bool:
    """Report whether *text* carries a marker that closes section E.

    Matches, ignoring case, either the section's totals line or the
    heading of a following "F) Cargos ..." section.
    """
    markers = (
        r"Total\s+por\s+concepto\s+de\s+productos",
        r"F\)\s*Cargos",
    )
    # Lazy generator: later patterns are only tried if earlier ones miss.
    hits = (re.search(marker, text, flags=re.IGNORECASE) for marker in markers)
    return any(hit is not None for hit in hits)
 def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Extract card holder info from a row.
@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
        verbose: Enable verbose logging
    Returns:
-        Dictionary with metadata, card_holder, transactions, and summary
+        Dictionary with metadata, card_holders, purchases, other_charges,
        voluntary_services, and summary
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
        statement_date = extract_statement_date(pdf)
-        transactions = []
+        # Transactions by section
-        card_suffix = None
+        purchases = []           # Section B
-        card_holder_name = None
+        other_charges = []       # Section D
-        in_section_b = False
+        voluntary_services = []  # Section E
-        section_b_found = False
+
        # Track card holders (may have multiple)
        card_holders = []
        seen_card_suffixes = set()
        # Section tracking: None, "B", "D", "E"
        current_section = None
        sections_found = set()
        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0
@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
            logger.debug(f"Processing page {page_num}")
-            # Check for section B start
+            # Check for section transitions (order matters: check ends before starts)
-            if not in_section_b and find_section_b_start(page_text):
+            # Section B end
-                in_section_b = True
+            if current_section == "B" and is_section_b_end(page_text):
-                section_b_found = True
+                logger.debug(f"Section B ended on page {page_num}")
                current_section = None
            # Section D end
            if current_section == "D" and is_section_d_end(page_text):
                logger.debug(f"Section D ended on page {page_num}")
                current_section = None
            # Section E end
            if current_section == "E" and is_section_e_end(page_text):
                logger.debug(f"Section E ended on page {page_num}")
                current_section = None
            # Check for section starts
            if current_section is None and find_section_b_start(page_text):
                current_section = "B"
                sections_found.add("B")
                logger.debug(f"Found section B on page {page_num}")
-            if not in_section_b:
+            if current_section is None and find_section_d_start(page_text):
                current_section = "D"
                sections_found.add("D")
                logger.debug(f"Found section D on page {page_num}")
            if current_section is None and find_section_e_start(page_text):
                current_section = "E"
                sections_found.add("E")
                logger.debug(f"Found section E on page {page_num}")
            if current_section is None:
                continue
-            # Check for section end (still process this page before breaking)
+            # Select the appropriate transaction list
-            reached_section_end = is_section_end(page_text)
+            if current_section == "B":
-            if reached_section_end:
+                target_list = purchases
-                logger.debug(f"Found section end on page {page_num}")
+            elif current_section == "D":
                target_list = other_charges
            else:  # "E"
                target_list = voluntary_services
            # Parse text line by line
            for line in page_text.split("\n"):
@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
                if not line:
                    continue
-                card_info = extract_card_holder(line)
+                # Extract card holder info (only in section B)
-                if card_info:
+                if current_section == "B":
-                    card_suffix, card_holder_name = card_info
+                    card_info = extract_card_holder(line)
-                    logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
+                    if card_info:
-                    continue
+                        card_suffix, card_holder_name = card_info
                        if card_suffix not in seen_card_suffixes:
                            card_holders.append({
                                "card_suffix": card_suffix,
                                "name": card_holder_name
                            })
                            seen_card_suffixes.add(card_suffix)
                            logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
                        continue
                transaction = parse_transaction_line(line)
                if transaction:
-                    transactions.append(transaction)
+                    target_list.append(transaction)
-                    logger.debug(f"Extracted transaction: {transaction['reference']}")
+                    logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")
-            if reached_section_end:
+        if "B" not in sections_found:
                break
        if not section_b_found:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
-        # Calculate summary
+        # Calculate summaries
-        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
+        def calculate_summary(txns):
-        total_usd = sum(t["amount_usd"] or 0 for t in transactions)
+            total_crc = sum(t["amount_crc"] or 0 for t in txns)
            total_usd = sum(t["amount_usd"] or 0 for t in txns)
            return {
                "total_crc": round(total_crc, 2),
                "total_usd": round(total_usd, 2),
                "count": len(txns)
            }
-        card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
+        total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)
        return {
            "metadata": {
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
-                "total_transactions": len(transactions)
+                "total_transactions": total_transactions
            },
-            "card_holder": card_holder,
+            "card_holders": card_holders,
-            "transactions": transactions,
+            "purchases": purchases,
            "other_charges": other_charges,
            "voluntary_services": voluntary_services,
            "summary": {
-                "total_crc": round(total_crc, 2),
+                "purchases": calculate_summary(purchases),
-                "total_usd": round(total_usd, 2),
+                "other_charges": calculate_summary(other_charges),
-                "transaction_count": len(transactions)
+                "voluntary_services": calculate_summary(voluntary_services)
            }
        }
@ -346,9 +427,17 @@ Examples:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=indent, ensure_ascii=False)
-        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
+        summary = result['summary']
-        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
+        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
-        print(f"Total USD: {result['summary']['total_usd']:,.2f}")
+        print(f"  Purchases (B):          {summary['purchases']['count']:3d}  "
              f"CRC {summary['purchases']['total_crc']:>12,.2f}  "
              f"USD {summary['purchases']['total_usd']:>10,.2f}")
        print(f"  Other charges (D):      {summary['other_charges']['count']:3d}  "
              f"CRC {summary['other_charges']['total_crc']:>12,.2f}  "
              f"USD {summary['other_charges']['total_usd']:>10,.2f}")
        print(f"  Voluntary services (E): {summary['voluntary_services']['count']:3d}  "
              f"CRC {summary['voluntary_services']['total_crc']:>12,.2f}  "
              f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)