diff --git a/bac_extract.py b/bac_extract.py index adff408..6e19108 100755 --- a/bac_extract.py +++ b/bac_extract.py @@ -3,7 +3,10 @@ BAC Credit Card Statement Extractor Extracts transactions from BAC Costa Rica credit card statement PDFs. -Specifically targets section "B) Detalle de compras del periodo". +Targets sections: + B) Detalle de compras del periodo + D) Detalle de otros cargos + E) Detalle de productos y servicios de elección voluntaria """ import argparse @@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]: return f"{full_year:04d}-{month:02d}-{int(day):02d}" -def find_section_b_start(page_text: str) -> bool: - """Check if page contains start of section B.""" +def find_section_b_start(text: str) -> bool: + """Check if text contains start of section B (purchases).""" patterns = [ r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo", ] - return any(re.search(p, page_text, re.IGNORECASE) for p in patterns) + return any(re.search(p, text, re.IGNORECASE) for p in patterns) -def is_section_end(text: str) -> bool: - """Check if we've reached the end of section B.""" +def find_section_d_start(text: str) -> bool: + """Check if text contains start of section D (other charges).""" + return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE)) + + +def find_section_e_start(text: str) -> bool: + """Check if text contains start of section E (voluntary products/services).""" + return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE)) + + +def is_section_b_end(text: str) -> bool: + """Check if text indicates the end of section B.""" end_patterns = [ r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle\s+de\s+intereses", @@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool: return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) +def is_section_d_end(text: str) -> bool: + """Check if text indicates the end of section D.""" + end_patterns = [ + r"Total\s+por\s+concepto\s+otros\s+cargos", + r"E\)\s*Detalle", + ] + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) + + +def is_section_e_end(text: str) -> bool: + """Check if text indicates the end of section E.""" + end_patterns = [ + r"Total\s+por\s+concepto\s+de\s+productos", + r"F\)\s*Cargos", + ] + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) + + def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: """ Extract card holder info from a row. @@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: verbose: Enable verbose logging Returns: - Dictionary with metadata, card_holder, transactions, and summary + Dictionary with metadata, card_holders, purchases, other_charges, + voluntary_services, and summary """ if verbose: logging.basicConfig(level=logging.DEBUG) @@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: statement_date = extract_statement_date(pdf) - transactions = [] - card_suffix = None - card_holder_name = None - in_section_b = False - section_b_found = False + # Transactions by section + purchases = [] # Section B + other_charges = [] # Section D + voluntary_services = [] # Section E + + # Track card holders (may have multiple) + card_holders = [] + seen_card_suffixes = set() + + # Section tracking: None, "B", "D", "E" + current_section = None + sections_found = set() # Start from page 2 (index 1) as page 1 is summary only start_page = 1 if len(pdf.pages) > 1 else 0 @@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: logger.debug(f"Processing page {page_num}") - # Check for section B start - if not in_section_b and find_section_b_start(page_text): - in_section_b = True - section_b_found = True + # Check for section transitions (order matters: check ends before starts) + # Section B end + if current_section == "B" and is_section_b_end(page_text): + logger.debug(f"Section B ended on page {page_num}") + current_section = None + + # Section D end + if current_section == "D" and is_section_d_end(page_text): + logger.debug(f"Section D ended on page {page_num}") + current_section = None + + # Section E end + if current_section == "E" and is_section_e_end(page_text): + logger.debug(f"Section E ended on page {page_num}") + current_section = None + + # Check for section starts + if current_section is None and find_section_b_start(page_text): + current_section = "B" + sections_found.add("B") logger.debug(f"Found section B on page {page_num}") - if not in_section_b: + if current_section is None and find_section_d_start(page_text): + current_section = "D" + sections_found.add("D") + logger.debug(f"Found section D on page {page_num}") + + if current_section is None and find_section_e_start(page_text): + current_section = "E" + sections_found.add("E") + logger.debug(f"Found section E on page {page_num}") + + if current_section is None: continue - # Check for section end (still process this page before breaking) - reached_section_end = is_section_end(page_text) - if reached_section_end: - logger.debug(f"Found section end on page {page_num}") + # Select the appropriate transaction list + if current_section == "B": + target_list = purchases + elif current_section == "D": + target_list = other_charges + else: # "E" + target_list = voluntary_services # Parse text line by line for line in page_text.split("\n"): @@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: if not line: continue - card_info = extract_card_holder(line) - if card_info: - card_suffix, card_holder_name = card_info - logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") - continue + # Extract card holder info (only in section B) + if current_section == "B": + card_info = extract_card_holder(line) + if card_info: + card_suffix, card_holder_name = card_info + if card_suffix not in seen_card_suffixes: + card_holders.append({ + "card_suffix": card_suffix, + "name": card_holder_name + }) + seen_card_suffixes.add(card_suffix) + logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") + continue transaction = parse_transaction_line(line) if transaction: - transactions.append(transaction) - logger.debug(f"Extracted transaction: {transaction['reference']}") + target_list.append(transaction) + logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}") - if reached_section_end: - break - - if not section_b_found: + if "B" not in sections_found: raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") - # Calculate summary - total_crc = sum(t["amount_crc"] or 0 for t in transactions) - total_usd = sum(t["amount_usd"] or 0 for t in transactions) + # Calculate summaries + def calculate_summary(txns): + total_crc = sum(t["amount_crc"] or 0 for t in txns) + total_usd = sum(t["amount_usd"] or 0 for t in txns) + return { + "total_crc": round(total_crc, 2), + "total_usd": round(total_usd, 2), + "count": len(txns) + } - card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None + total_transactions = len(purchases) + len(other_charges) + len(voluntary_services) return { "metadata": { "source_file": pdf_path.name, "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "statement_date": statement_date, - "total_transactions": len(transactions) + "total_transactions": total_transactions }, - "card_holder": card_holder, - "transactions": transactions, + "card_holders": card_holders, + "purchases": purchases, + "other_charges": other_charges, + "voluntary_services": voluntary_services, "summary": { - "total_crc": round(total_crc, 2), - "total_usd": round(total_usd, 2), - "transaction_count": len(transactions) + "purchases": calculate_summary(purchases), + "other_charges": calculate_summary(other_charges), + "voluntary_services": calculate_summary(voluntary_services) } } @@ -346,9 +427,17 @@ Examples: with open(args.output, "w", encoding="utf-8") as f: json.dump(result, f, indent=indent, ensure_ascii=False) - print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}") - print(f"Total CRC: {result['summary']['total_crc']:,.2f}") - print(f"Total USD: {result['summary']['total_usd']:,.2f}") + summary = result['summary'] + print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}") + print(f" Purchases (B): {summary['purchases']['count']:3d} " + f"CRC {summary['purchases']['total_crc']:>12,.2f} " + f"USD {summary['purchases']['total_usd']:>10,.2f}") + print(f" Other charges (D): {summary['other_charges']['count']:3d} " + f"CRC {summary['other_charges']['total_crc']:>12,.2f} " + f"USD {summary['other_charges']['total_usd']:>10,.2f}") + print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} " + f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} " + f"USD {summary['voluntary_services']['total_usd']:>10,.2f}") except ValueError as e: print(f"Error: {e}", file=sys.stderr)