#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.

Targets sections:
    B) Detalle de compras del periodo
    D) Detalle de otros cargos
    E) Detalle de productos y servicios de elección voluntaria
"""

import argparse
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, Optional

try:
    import pdfplumber
except ImportError:
    # Keep the pure-text helpers importable without the PDF backend;
    # extract_transactions() raises a clear error if it is actually needed.
    pdfplumber = None

# Spanish month abbreviations to month numbers
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
}

# Card holder pattern: ************XXXX NAME
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date pattern: D-MMM-YY or DD-MMM-YY
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)

# Transaction line pattern:
# Reference     Date      Description    Location (optional)  Currency  Amount
# 123456789012  9-ENE-26  EXAMPLE STORE                       CRC       1,234.56
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12})\s+"                  # Reference (12 digits)
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"   # Date
    r"(.+?)\s+"                      # Description
    r"(CRC|USD)\s+"                  # Currency
    r"([\d,]+\.\d{2})(-)?$",         # Amount (with optional trailing minus)
    re.IGNORECASE
)

# Section marker patterns, compiled once instead of on every call.
_SECTION_B_START_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r"B\)\s*Detalle\s+de\s+compras",
    r"Detalle\s+de\s+compras\s+del\s+periodo",
))
_SECTION_D_START_PATTERN = re.compile(
    r"D\)\s*Detalle\s+de\s+otros\s+cargos", re.IGNORECASE
)
_SECTION_E_START_PATTERN = re.compile(
    r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", re.IGNORECASE
)
_SECTION_B_END_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r"Total\s+de\s+compras\s+del\s+periodo",
    r"C\)\s*Detalle\s+de\s+intereses",
    r"Detalle\s+de\s+intereses",
    r"D\)\s*Detalle",
))
_SECTION_D_END_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r"Total\s+por\s+concepto\s+otros\s+cargos",
    r"E\)\s*Detalle",
))
_SECTION_E_END_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r"Total\s+por\s+concepto\s+de\s+productos",
    r"F\)\s*Cargos",
))

logger = logging.getLogger(__name__)


def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD).

    Returns None for empty input, text that does not start with a
    D-MMM-YY date, an unknown month abbreviation, or a day that does not
    exist in the given month (e.g. ``31-FEB-26``).
    """
    if not date_str:
        return None

    match = DATE_PATTERN.match(date_str.strip())
    if not match:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None

    # Assume 2000s for 2-digit year
    full_year = 2000 + int(year)

    try:
        # Building a real datetime rejects impossible calendar dates.
        return datetime(full_year, month, int(day)).date().isoformat()
    except ValueError:
        return None


def parse_amount(amount_str: str) -> Optional[float]:
    """
    Parse amount string with comma thousands separator.

    Handles trailing '-' for negative values. Returns None when the
    input is empty, blank, or not a number.
    """
    if not amount_str:
        return None

    cleaned = amount_str.strip()
    if not cleaned:
        return None

    is_negative = cleaned.endswith("-")
    if is_negative:
        cleaned = cleaned[:-1]

    try:
        amount = float(cleaned.replace(",", ""))
    except ValueError:
        return None
    return -amount if is_negative else amount


def is_bac_statement(pdf: "pdfplumber.PDF") -> bool:
    """Check if the PDF is a BAC credit card statement.

    Looks at the first page only: it must contain "BAC" (case-sensitive)
    and "TARJETA" (case-insensitive).
    """
    if not pdf.pages:
        return False

    first_page_text = pdf.pages[0].extract_text() or ""
    return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()


def extract_statement_date(pdf: "pdfplumber.PDF") -> Optional[str]:
    """Extract the statement date from the PDF.

    Returns the first valid D-MMM-YY date found in the first page's text
    as YYYY-MM-DD, or None when the PDF has no pages or no valid date.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""

    # Skip date-shaped text that is not a real date (unknown month,
    # impossible day) instead of giving up on the first such hit.
    for match in DATE_PATTERN.finditer(first_page_text):
        parsed = parse_spanish_date(match.group(0))
        if parsed:
            return parsed
    return None


def find_section_b_start(text: str) -> bool:
    """Check if text contains start of section B (purchases)."""
    return any(p.search(text) for p in _SECTION_B_START_PATTERNS)


def find_section_d_start(text: str) -> bool:
    """Check if text contains start of section D (other charges)."""
    return bool(_SECTION_D_START_PATTERN.search(text))


def find_section_e_start(text: str) -> bool:
    """Check if text contains start of section E (voluntary products/services)."""
    return bool(_SECTION_E_START_PATTERN.search(text))


def is_section_b_end(text: str) -> bool:
    """Check if text indicates the end of section B."""
    return any(p.search(text) for p in _SECTION_B_END_PATTERNS)


def is_section_d_end(text: str) -> bool:
    """Check if text indicates the end of section D."""
    return any(p.search(text) for p in _SECTION_D_END_PATTERNS)


def is_section_e_end(text: str) -> bool:
    """Check if text indicates the end of section E."""
    return any(p.search(text) for p in _SECTION_E_END_PATTERNS)


# Start checks are tried in this order; end checks are keyed by section.
_SECTION_START_CHECKS = (
    ("B", find_section_b_start),
    ("D", find_section_d_start),
    ("E", find_section_e_start),
)
_SECTION_END_CHECKS = {
    "B": is_section_b_end,
    "D": is_section_d_end,
    "E": is_section_e_end,
}


def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Extract card holder info from a row.

    Returns (card_suffix, name) or None.
    """
    match = CARD_HOLDER_PATTERN.search(row_text)
    if match:
        return match.group(1), match.group(2).strip()
    return None


def parse_transaction_line(line: str) -> Optional[dict]:
    """
    Parse a text-based transaction line.

    Format: Reference Date Description [Location] Currency Amount
    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56

    Returns a dict with keys reference, date, description, location,
    currency, amount_crc and amount_usd, or None when the line is not a
    transaction or its date/amount cannot be parsed. Any location text
    stays inside ``description``; ``location`` is always None.
    """
    line = line.strip()
    if not line:
        return None

    match = TRANSACTION_PATTERN.match(line)
    if not match:
        return None

    reference, date_str, description, currency, amount_str, minus = match.groups()
    description = description.strip()
    currency = currency.upper()

    iso_date = parse_spanish_date(date_str)
    if not iso_date:
        logger.warning(
            "Could not parse date '%s' for reference %s", date_str, reference
        )
        return None

    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning(
            "Could not parse amount '%s' for reference %s", amount_str, reference
        )
        return None

    # The regex captures the trailing minus separately from the digits.
    if minus == "-":
        amount = -amount

    return {
        "reference": reference,
        "date": iso_date,
        "description": description,
        "location": None,
        "currency": currency,
        "amount_crc": amount if currency == "CRC" else None,
        "amount_usd": amount if currency == "USD" else None,
    }


def _section_started_by(line: str) -> Optional[str]:
    """Return the section letter whose header appears in *line*, if any."""
    for section, is_start in _SECTION_START_CHECKS:
        if is_start(line):
            return section
    return None


def parse_statement_pages(
    page_texts: Iterable[str], first_page_number: int = 1
) -> dict:
    """
    Split statement text into section B/D/E transactions.

    Sections are tracked line by line, so several sections may share a
    page and a section may continue across a page break. A section
    header always switches to that section; an end marker for the
    current section closes it.

    Args:
        page_texts: Extracted text of each page, in order
        first_page_number: Page number of the first text (debug logs only)

    Returns:
        Dictionary with card_holders, purchases, other_charges,
        voluntary_services, and sections_found (set of section letters)
    """
    transactions = {"B": [], "D": [], "E": []}
    card_holders = []
    seen_card_suffixes = set()
    sections_found = set()
    current_section = None

    for page_num, page_text in enumerate(page_texts, start=first_page_number):
        logger.debug("Processing page %d", page_num)

        for raw_line in (page_text or "").splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # Headers are checked before end markers: the section B and D
            # end patterns also match the next section's header line.
            started = _section_started_by(line)
            if started is not None:
                if started != current_section:
                    logger.debug("Found section %s on page %d", started, page_num)
                current_section = started
                sections_found.add(started)
                continue

            if current_section is None:
                continue

            if _SECTION_END_CHECKS[current_section](line):
                logger.debug(
                    "Section %s ended on page %d", current_section, page_num
                )
                current_section = None
                continue

            # Extract card holder info (only in section B)
            if current_section == "B":
                card_info = extract_card_holder(line)
                if card_info:
                    card_suffix, card_holder_name = card_info
                    if card_suffix not in seen_card_suffixes:
                        card_holders.append({
                            "card_suffix": card_suffix,
                            "name": card_holder_name
                        })
                        seen_card_suffixes.add(card_suffix)
                        logger.debug(
                            "Found card holder: %s - %s",
                            card_suffix, card_holder_name
                        )
                    continue

            transaction = parse_transaction_line(line)
            if transaction:
                transactions[current_section].append(transaction)
                logger.debug(
                    "Extracted %s transaction: %s",
                    current_section, transaction["reference"]
                )

    return {
        "card_holders": card_holders,
        "purchases": transactions["B"],
        "other_charges": transactions["D"],
        "voluntary_services": transactions["E"],
        "sections_found": sections_found,
    }


def _calculate_summary(txns: list) -> dict:
    """Return per-currency totals (rounded to 2 decimals) and the count."""
    total_crc = sum(t["amount_crc"] or 0 for t in txns)
    total_usd = sum(t["amount_usd"] or 0 for t in txns)
    return {
        "total_crc": round(total_crc, 2),
        "total_usd": round(total_usd, 2),
        "count": len(txns)
    }


def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """
    Extract transactions from a BAC credit card statement PDF.

    Args:
        pdf_path: Path to the PDF file
        verbose: Enable verbose logging

    Returns:
        Dictionary with metadata, card_holders, purchases, other_charges,
        voluntary_services, and summary

    Raises:
        ImportError: If pdfplumber is not installed
        ValueError: If the PDF is not a BAC statement or section B is missing
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    if pdfplumber is None:
        raise ImportError("pdfplumber is required to read PDF statements")

    with pdfplumber.open(pdf_path) as pdf:
        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0
        page_texts = [
            page.extract_text() or "" for page in pdf.pages[start_page:]
        ]

    parsed = parse_statement_pages(page_texts, first_page_number=start_page + 1)

    if "B" not in parsed["sections_found"]:
        raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

    purchases = parsed["purchases"]
    other_charges = parsed["other_charges"]
    voluntary_services = parsed["voluntary_services"]
    total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)

    return {
        "metadata": {
            "source_file": pdf_path.name,
            "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "statement_date": statement_date,
            "total_transactions": total_transactions
        },
        "card_holders": parsed["card_holders"],
        "purchases": purchases,
        "other_charges": other_charges,
        "voluntary_services": voluntary_services,
        "summary": {
            "purchases": _calculate_summary(purchases),
            "other_charges": _calculate_summary(other_charges),
            "voluntary_services": _calculate_summary(voluntary_services)
        }
    }


def main():
    """Command-line entry point: parse arguments, extract, write JSON.

    Exits with status 1 when the input file is missing, is not a .pdf,
    or extraction fails.
    """
    parser = argparse.ArgumentParser(
        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python bac_extract.py EstadodeCuenta.pdf --pretty
  python bac_extract.py statement.pdf -o output.json -v
        """
    )
    parser.add_argument(
        "pdf_file",
        type=Path,
        help="Path to the BAC statement PDF"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=Path("transactions.json"),
        help="Output JSON file path (default: transactions.json)"
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Validate PDF file exists
    if not args.pdf_file.exists():
        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)

    if args.pdf_file.suffix.lower() != ".pdf":
        print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)

    try:
        result = extract_transactions(args.pdf_file, args.verbose)

        # Write output
        indent = 2 if args.pretty else None
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=indent, ensure_ascii=False)

        summary = result["summary"]
        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
        for label, key in (
            ("Purchases (B):", "purchases"),
            ("Other charges (D):", "other_charges"),
            ("Voluntary services (E):", "voluntary_services"),
        ):
            section = summary[key]
            print(f"  {label:<24}{section['count']:3d}  "
                  f"CRC {section['total_crc']:>12,.2f}  "
                  f"USD {section['total_usd']:>10,.2f}")

    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()