target sections D and E

2026-03-09 14:44:47 -06:00 · 2026-03-09 14:44:47 -06:00 · 69a773e1b3
commit 69a773e1b3
parent a05f701f16
1 changed files with 134 additions and 45 deletions
--- a/bac_extract.py
+++ b/bac_extract.py
@ -3,7 +3,10 @@
 BAC Credit Card Statement Extractor

 Extracts transactions from BAC Costa Rica credit card statement PDFs.
-Specifically targets section "B) Detalle de compras del periodo".
+Targets sections:
+  B) Detalle de compras del periodo
+  D) Detalle de otros cargos
+  E) Detalle de productos y servicios de elección voluntaria
 """

 import argparse
@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    return f"{full_year:04d}-{month:02d}-{int(day):02d}"


-def find_section_b_start(page_text: str) -> bool:
-    """Check if page contains start of section B."""
+def find_section_b_start(text: str) -> bool:
+    """Return True if text matches a section B (purchases) heading pattern, ignoring case."""
    patterns = [
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    ]
-    return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
+    return any(re.search(p, text, re.IGNORECASE) for p in patterns)


-def is_section_end(text: str) -> bool:
-    """Check if we've reached the end of section B."""
+def find_section_d_start(text: str) -> bool:
+    """Return True if text contains the 'D) Detalle de otros cargos' heading, ignoring case."""
+    return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE))
+
+
+def find_section_e_start(text: str) -> bool:
+    """Return True if text contains the 'E) Detalle de productos y servicios' heading, ignoring case."""
+    return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE))
+
+
+def is_section_b_end(text: str) -> bool:
+    """Check if text indicates the end of section B."""
    end_patterns = [
        r"Total\s+de\s+compras\s+del\s+periodo",
        r"C\)\s*Detalle\s+de\s+intereses",
@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool:
    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)


+def is_section_d_end(text: str) -> bool:
+    """Return True if text matches any section D end marker, ignoring case."""
+    end_patterns = [
+        r"Total\s+por\s+concepto\s+otros\s+cargos",  # "total for other charges" line
+        r"E\)\s*Detalle",  # any "E) Detalle..." heading, i.e. the start of section E
+    ]
+    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
+
+
+def is_section_e_end(text: str) -> bool:
+    """Return True if text matches any section E end marker, ignoring case."""
+    end_patterns = [
+        r"Total\s+por\s+concepto\s+de\s+productos",  # "total for products" line
+        r"F\)\s*Cargos",  # presumably the heading of the following section F - confirm
+    ]
+    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
+
+
 def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Extract card holder info from a row.
@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
        verbose: Enable verbose logging

    Returns:
-        Dictionary with metadata, card_holder, transactions, and summary
+        Dictionary with metadata, card_holders, purchases, other_charges,
+        voluntary_services, and summary
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:

        statement_date = extract_statement_date(pdf)

-        transactions = []
-        card_suffix = None
-        card_holder_name = None
-        in_section_b = False
-        section_b_found = False
+        # Transactions by section
+        purchases = []           # Section B
+        other_charges = []       # Section D
+        voluntary_services = []  # Section E
+
+        # Track card holders (may have multiple)
+        card_holders = []
+        seen_card_suffixes = set()
+
+        # Section tracking: None, "B", "D", "E"
+        current_section = None
+        sections_found = set()

        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0
@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:

            logger.debug(f"Processing page {page_num}")

-            # Check for section B start
-            if not in_section_b and find_section_b_start(page_text):
-                in_section_b = True
-                section_b_found = True
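+            # Check for section transitions (ends before starts). NOTE(review): checks are per page, so a page where a section ends is parsed under whichever section starts on it (B, D, E in that order), or skipped if none does - confirm intended.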
+            # Section B end
+            if current_section == "B" and is_section_b_end(page_text):
+                logger.debug(f"Section B ended on page {page_num}")
+                current_section = None
+
+            # Section D end
+            if current_section == "D" and is_section_d_end(page_text):
+                logger.debug(f"Section D ended on page {page_num}")
+                current_section = None
+
+            # Section E end
+            if current_section == "E" and is_section_e_end(page_text):
+                logger.debug(f"Section E ended on page {page_num}")
+                current_section = None
+
+            # Check for section starts
+            if current_section is None and find_section_b_start(page_text):
+                current_section = "B"
+                sections_found.add("B")
                logger.debug(f"Found section B on page {page_num}")

-            if not in_section_b:
+            if current_section is None and find_section_d_start(page_text):
+                current_section = "D"
+                sections_found.add("D")
+                logger.debug(f"Found section D on page {page_num}")
+
+            if current_section is None and find_section_e_start(page_text):
+                current_section = "E"
+                sections_found.add("E")
+                logger.debug(f"Found section E on page {page_num}")
+
+            if current_section is None:
                continue

-            # Check for section end (still process this page before breaking)
-            reached_section_end = is_section_end(page_text)
-            if reached_section_end:
-                logger.debug(f"Found section end on page {page_num}")
+            # Select the appropriate transaction list
+            if current_section == "B":
+                target_list = purchases
+            elif current_section == "D":
+                target_list = other_charges
+            else:  # "E"
+                target_list = voluntary_services

            # Parse text line by line
            for line in page_text.split("\n"):
@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
                if not line:
                    continue

-                card_info = extract_card_holder(line)
-                if card_info:
-                    card_suffix, card_holder_name = card_info
-                    logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
-                    continue
+                # Extract card holder info (only in section B)
+                if current_section == "B":
+                    card_info = extract_card_holder(line)
+                    if card_info:
+                        card_suffix, card_holder_name = card_info
+                        if card_suffix not in seen_card_suffixes:
+                            card_holders.append({
+                                "card_suffix": card_suffix,
+                                "name": card_holder_name
+                            })
+                            seen_card_suffixes.add(card_suffix)
+                            logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
+                        continue

                transaction = parse_transaction_line(line)
                if transaction:
-                    transactions.append(transaction)
-                    logger.debug(f"Extracted transaction: {transaction['reference']}")
+                    target_list.append(transaction)
+                    logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")

-            if reached_section_end:
-                break
-
-        if not section_b_found:
+        if "B" not in sections_found:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

-        # Calculate summary
-        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
-        total_usd = sum(t["amount_usd"] or 0 for t in transactions)
+        # Calculate summaries
+        def calculate_summary(txns):
+            total_crc = sum(t["amount_crc"] or 0 for t in txns)
+            total_usd = sum(t["amount_usd"] or 0 for t in txns)
+            return {
+                "total_crc": round(total_crc, 2),
+                "total_usd": round(total_usd, 2),
+                "count": len(txns)
+            }

-        card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
+        total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)

        return {
            "metadata": {
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
-                "total_transactions": len(transactions)
+                "total_transactions": total_transactions
            },
-            "card_holder": card_holder,
-            "transactions": transactions,
+            "card_holders": card_holders,
+            "purchases": purchases,
+            "other_charges": other_charges,
+            "voluntary_services": voluntary_services,
            "summary": {
-                "total_crc": round(total_crc, 2),
-                "total_usd": round(total_usd, 2),
-                "transaction_count": len(transactions)
+                "purchases": calculate_summary(purchases),
+                "other_charges": calculate_summary(other_charges),
+                "voluntary_services": calculate_summary(voluntary_services)
            }
        }

@ -346,9 +427,17 @@ Examples:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=indent, ensure_ascii=False)

-        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
-        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
-        print(f"Total USD: {result['summary']['total_usd']:,.2f}")
+        summary = result['summary']
+        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
+        print(f"  Purchases (B):          {summary['purchases']['count']:3d}  "
+              f"CRC {summary['purchases']['total_crc']:>12,.2f}  "
+              f"USD {summary['purchases']['total_usd']:>10,.2f}")
+        print(f"  Other charges (D):      {summary['other_charges']['count']:3d}  "
+              f"CRC {summary['other_charges']['total_crc']:>12,.2f}  "
+              f"USD {summary['other_charges']['total_usd']:>10,.2f}")
+        print(f"  Voluntary services (E): {summary['voluntary_services']['count']:3d}  "
+              f"CRC {summary['voluntary_services']['total_crc']:>12,.2f}  "
+              f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")

    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)