From 62450842c31299f9f1644927d7b00cc46f05c6c2 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 13:58:34 -0600 Subject: [PATCH 1/5] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 76de699..f04df3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.pdf *.json +__pycache__/ From a05f701f1662b83228279fc8f35f2040bc31b728 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 13:59:03 -0600 Subject: [PATCH 2/5] remove card suffix functionality --- CLAUDE.md | 9 ++-- bac_extract.py | 124 ++++++++++++++----------------------------------- 2 files changed, 39 insertions(+), 94 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 08dc084..ee95b39 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa ## Usage ```bash -python bac_extract.py [options] +python bac_extract.py [options] # Examples -python bac_extract.py EstadodeCuenta.pdf 1234 --pretty -python bac_extract.py statement.pdf 1234 -o output.json -v +python bac_extract.py EstadodeCuenta.pdf --pretty +python bac_extract.py statement.pdf -o output.json -v ``` Options: @@ -31,8 +31,7 @@ The extraction pipeline: 1. Validates PDF is a BAC statement (`is_bac_statement`) 2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) 3. Extracts tables page-by-page using pdfplumber -4. Filters transactions by card suffix (last 4 digits) -5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators +4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators Key parsing functions: - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" diff --git a/bac_extract.py b/bac_extract.py index 850bef1..adff408 100755 --- a/bac_extract.py +++ b/bac_extract.py @@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]: Parse amount string with comma thousands separator. Handles trailing '-' for negative values. """ - if not amount_str or not amount_str.strip(): + if not amount_str: return None amount_str = amount_str.strip() + if not amount_str: + return None - # Check for trailing negative sign is_negative = amount_str.endswith("-") if is_negative: - amount_str = amount_str[:-1].strip() + amount_str = amount_str[:-1] - # Remove thousands separators (commas) and handle decimal point - # Format: 1,234.56 or 1,234,567.89 try: - amount_str = amount_str.replace(",", "") - amount = float(amount_str) + amount = float(amount_str.replace(",", "")) return -amount if is_negative else amount except ValueError: return None @@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]: return None first_page_text = pdf.pages[0].extract_text() or "" - - # Look for date patterns in the first page - # Common format: "Fecha de corte: DD-MMM-YY" or similar date_matches = DATE_PATTERN.findall(first_page_text) - if date_matches: - # Use the first date found as statement date - day, month_abbr, year = date_matches[0] - month = SPANISH_MONTHS.get(month_abbr.upper()) - if month: - full_year = 2000 + int(year) - return f"{full_year:04d}-{month:02d}-{int(day):02d}" + if not date_matches: + return None - return None + day, month_abbr, year = date_matches[0] + month = SPANISH_MONTHS.get(month_abbr.upper()) + if not month: + return None + + full_year = 2000 + int(year) + return f"{full_year:04d}-{month:02d}-{int(day):02d}" def find_section_b_start(page_text: str) -> bool: @@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool: r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo", ] - for pattern in patterns: - if re.search(pattern, page_text, re.IGNORECASE): - return True - return False + return any(re.search(p, page_text, re.IGNORECASE) for p in patterns) def is_section_end(text: str) -> bool: @@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool: r"Detalle\s+de\s+intereses", r"D\)\s*Detalle", ] - for pattern in end_patterns: - if re.search(pattern, text, re.IGNORECASE): - return True - return False + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: @@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]: reference = match.group(1) date_str = match.group(2) - desc_and_loc = match.group(3).strip() + description = match.group(3).strip() currency = match.group(4).upper() amount_str = match.group(5) is_negative = match.group(6) == "-" - # Parse date date = parse_spanish_date(date_str) if not date: logger.warning(f"Could not parse date '{date_str}' for reference {reference}") return None - # Parse amount amount = parse_amount(amount_str) if amount is None: logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}") @@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]: if is_negative: amount = -amount - # Split description and location - # Location is typically at the end, often a short suffix like "ANILL", "San Jose" - # For now, keep everything as description - description = desc_and_loc - location = None - - # Set amount in appropriate currency field - amount_crc = amount if currency == "CRC" else None - amount_usd = amount if currency == "USD" else None - return { "reference": reference, "date": date, "description": description, - "location": location, + "location": None, "currency": currency, - "amount_crc": amount_crc, - "amount_usd": amount_usd, + "amount_crc": amount if currency == "CRC" else None, + "amount_usd": amount if currency == "USD" else None, } -def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict: +def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: """ Extract transactions from a BAC credit card statement PDF. Args: pdf_path: Path to the PDF file - card_suffix: Last 4 digits of card to filter verbose: Enable verbose logging Returns: @@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False statement_date = extract_statement_date(pdf) transactions = [] - current_card_suffix = None - current_card_name = None + card_suffix = None + card_holder_name = None in_section_b = False section_b_found = False - card_suffix_found = False # Start from page 2 (index 1) as page 1 is summary only start_page = 1 if len(pdf.pages) > 1 else 0 @@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False section_b_found = True logger.debug(f"Found section B on page {page_num}") - # Check for section end - if in_section_b and is_section_end(page_text): - logger.debug(f"Found section end on page {page_num}") - # Still process this page, but mark we're ending - if not in_section_b: continue + # Check for section end (still process this page before breaking) + reached_section_end = is_section_end(page_text) + if reached_section_end: + logger.debug(f"Found section end on page {page_num}") + # Parse text line by line for line in page_text.split("\n"): line = line.strip() if not line: continue - # Check for card holder line card_info = extract_card_holder(line) if card_info: - current_card_suffix, current_card_name = card_info - logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}") - if current_card_suffix == card_suffix: - card_suffix_found = True + card_suffix, card_holder_name = card_info + logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") continue - # Skip if we're not tracking the right card - if current_card_suffix != card_suffix: - continue - - # Try to parse as transaction transaction = parse_transaction_line(line) if transaction: transactions.append(transaction) logger.debug(f"Extracted transaction: {transaction['reference']}") - # Check if we've passed section B - if in_section_b and is_section_end(page_text): + if reached_section_end: break if not section_b_found: raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") - if not card_suffix_found: - raise ValueError(f"Card suffix '{card_suffix}' not found in statement") - # Calculate summary total_crc = sum(t["amount_crc"] or 0 for t in transactions) total_usd = sum(t["amount_usd"] or 0 for t in transactions) - # Get card holder info - card_holder = None - if card_suffix_found: - card_holder = { - "card_suffix": card_suffix, - "name": current_card_name if current_card_suffix == card_suffix else None - } + card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None return { "metadata": { "source_file": pdf_path.name, "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "statement_date": statement_date, - "card_filter": card_suffix, "total_transactions": len(transactions) }, "card_holder": card_holder, @@ -340,8 +297,8 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - python bac_extract.py EstadodeCuenta.pdf 1234 --pretty - python bac_extract.py statement.pdf 1234 -o output.json -v + python bac_extract.py EstadodeCuenta.pdf --pretty + python bac_extract.py statement.pdf -o output.json -v """ ) @@ -351,12 +308,6 @@ Examples: help="Path to the BAC statement PDF" ) - parser.add_argument( - "card_suffix", - type=str, - help="Last 4 digits of card to filter (e.g., 1234)" - ) - parser.add_argument( "-o", "--output", type=Path, @@ -378,11 +329,6 @@ Examples: args = parser.parse_args() - # Validate card suffix - if not args.card_suffix.isdigit() or len(args.card_suffix) != 4: - print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr) - sys.exit(1) - # Validate PDF file exists if not args.pdf_file.exists(): print(f"Error: File not found: {args.pdf_file}", file=sys.stderr) @@ -393,7 +339,7 @@ Examples: sys.exit(1) try: - result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose) + result = extract_transactions(args.pdf_file, args.verbose) # Write output indent = 2 if args.pretty else None From 69a773e1b31464e492d1fcde2f5d2178629eec12 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 14:44:47 -0600 Subject: [PATCH 3/5] target sections D and E --- bac_extract.py | 179 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 134 insertions(+), 45 deletions(-) diff --git a/bac_extract.py b/bac_extract.py index adff408..6e19108 100755 --- a/bac_extract.py +++ b/bac_extract.py @@ -3,7 +3,10 @@ BAC Credit Card Statement Extractor Extracts transactions from BAC Costa Rica credit card statement PDFs. -Specifically targets section "B) Detalle de compras del periodo". +Targets sections: + B) Detalle de compras del periodo + D) Detalle de otros cargos + E) Detalle de productos y servicios de elección voluntaria """ import argparse @@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]: return f"{full_year:04d}-{month:02d}-{int(day):02d}" -def find_section_b_start(page_text: str) -> bool: - """Check if page contains start of section B.""" +def find_section_b_start(text: str) -> bool: + """Check if text contains start of section B (purchases).""" patterns = [ r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo", ] - return any(re.search(p, page_text, re.IGNORECASE) for p in patterns) + return any(re.search(p, text, re.IGNORECASE) for p in patterns) -def is_section_end(text: str) -> bool: - """Check if we've reached the end of section B.""" +def find_section_d_start(text: str) -> bool: + """Check if text contains start of section D (other charges).""" + return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE)) + + +def find_section_e_start(text: str) -> bool: + """Check if text contains start of section E (voluntary products/services).""" + return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE)) + + +def is_section_b_end(text: str) -> bool: + """Check if text indicates the end of section B.""" end_patterns = [ r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle\s+de\s+intereses", @@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool: return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) +def is_section_d_end(text: str) -> bool: + """Check if text indicates the end of section D.""" + end_patterns = [ + r"Total\s+por\s+concepto\s+otros\s+cargos", + r"E\)\s*Detalle", + ] + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) + + +def is_section_e_end(text: str) -> bool: + """Check if text indicates the end of section E.""" + end_patterns = [ + r"Total\s+por\s+concepto\s+de\s+productos", + r"F\)\s*Cargos", + ] + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) + + def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: """ Extract card holder info from a row. @@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: verbose: Enable verbose logging Returns: - Dictionary with metadata, card_holder, transactions, and summary + Dictionary with metadata, card_holders, purchases, other_charges, + voluntary_services, and summary """ if verbose: logging.basicConfig(level=logging.DEBUG) @@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: statement_date = extract_statement_date(pdf) - transactions = [] - card_suffix = None - card_holder_name = None - in_section_b = False - section_b_found = False + # Transactions by section + purchases = [] # Section B + other_charges = [] # Section D + voluntary_services = [] # Section E + + # Track card holders (may have multiple) + card_holders = [] + seen_card_suffixes = set() + + # Section tracking: None, "B", "D", "E" + current_section = None + sections_found = set() # Start from page 2 (index 1) as page 1 is summary only start_page = 1 if len(pdf.pages) > 1 else 0 @@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: logger.debug(f"Processing page {page_num}") - # Check for section B start - if not in_section_b and find_section_b_start(page_text): - in_section_b = True - section_b_found = True + # Check for section transitions (order matters: check ends before starts) + # Section B end + if current_section == "B" and is_section_b_end(page_text): + logger.debug(f"Section B ended on page {page_num}") + current_section = None + + # Section D end + if current_section == "D" and is_section_d_end(page_text): + logger.debug(f"Section D ended on page {page_num}") + current_section = None + + # Section E end + if current_section == "E" and is_section_e_end(page_text): + logger.debug(f"Section E ended on page {page_num}") + current_section = None + + # Check for section starts + if current_section is None and find_section_b_start(page_text): + current_section = "B" + sections_found.add("B") logger.debug(f"Found section B on page {page_num}") - if not in_section_b: + if current_section is None and find_section_d_start(page_text): + current_section = "D" + sections_found.add("D") + logger.debug(f"Found section D on page {page_num}") + + if current_section is None and find_section_e_start(page_text): + current_section = "E" + sections_found.add("E") + logger.debug(f"Found section E on page {page_num}") + + if current_section is None: continue - # Check for section end (still process this page before breaking) - reached_section_end = is_section_end(page_text) - if reached_section_end: - logger.debug(f"Found section end on page {page_num}") + # Select the appropriate transaction list + if current_section == "B": + target_list = purchases + elif current_section == "D": + target_list = other_charges + else: # "E" + target_list = voluntary_services # Parse text line by line for line in page_text.split("\n"): @@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: if not line: continue - card_info = extract_card_holder(line) - if card_info: - card_suffix, card_holder_name = card_info - logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") - continue + # Extract card holder info (only in section B) + if current_section == "B": + card_info = extract_card_holder(line) + if card_info: + card_suffix, card_holder_name = card_info + if card_suffix not in seen_card_suffixes: + card_holders.append({ + "card_suffix": card_suffix, + "name": card_holder_name + }) + seen_card_suffixes.add(card_suffix) + logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") + continue transaction = parse_transaction_line(line) if transaction: - transactions.append(transaction) - logger.debug(f"Extracted transaction: {transaction['reference']}") + target_list.append(transaction) + logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}") - if reached_section_end: - break - - if not section_b_found: + if "B" not in sections_found: raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") - # Calculate summary - total_crc = sum(t["amount_crc"] or 0 for t in transactions) - total_usd = sum(t["amount_usd"] or 0 for t in transactions) + # Calculate summaries + def calculate_summary(txns): + total_crc = sum(t["amount_crc"] or 0 for t in txns) + total_usd = sum(t["amount_usd"] or 0 for t in txns) + return { + "total_crc": round(total_crc, 2), + "total_usd": round(total_usd, 2), + "count": len(txns) + } - card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None + total_transactions = len(purchases) + len(other_charges) + len(voluntary_services) return { "metadata": { "source_file": pdf_path.name, "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "statement_date": statement_date, - "total_transactions": len(transactions) + "total_transactions": total_transactions }, - "card_holder": card_holder, - "transactions": transactions, + "card_holders": card_holders, + "purchases": purchases, + "other_charges": other_charges, + "voluntary_services": voluntary_services, "summary": { - "total_crc": round(total_crc, 2), - "total_usd": round(total_usd, 2), - "transaction_count": len(transactions) + "purchases": calculate_summary(purchases), + "other_charges": calculate_summary(other_charges), + "voluntary_services": calculate_summary(voluntary_services) } } @@ -346,9 +427,17 @@ Examples: with open(args.output, "w", encoding="utf-8") as f: json.dump(result, f, indent=indent, ensure_ascii=False) - print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}") - print(f"Total CRC: {result['summary']['total_crc']:,.2f}") - print(f"Total USD: {result['summary']['total_usd']:,.2f}") + summary = result['summary'] + print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}") + print(f" Purchases (B): {summary['purchases']['count']:3d} " + f"CRC {summary['purchases']['total_crc']:>12,.2f} " + f"USD {summary['purchases']['total_usd']:>10,.2f}") + print(f" Other charges (D): {summary['other_charges']['count']:3d} " + f"CRC {summary['other_charges']['total_crc']:>12,.2f} " + f"USD {summary['other_charges']['total_usd']:>10,.2f}") + print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} " + f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} " + f"USD {summary['voluntary_services']['total_usd']:>10,.2f}") except ValueError as e: print(f"Error: {e}", file=sys.stderr) From 6fc7da8899ad69a2e358a6ffbd40ee6f2e343ab5 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 15:39:02 -0600 Subject: [PATCH 4/5] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f04df3a..fcd11ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pdf *.json __pycache__/ +testStatements/ From 14734d3125dd9da48bda8c55731ffd7abf68912c Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 15:39:16 -0600 Subject: [PATCH 5/5] fix bugs and simplify --- CLAUDE.md | 20 ++- bac_extract.py | 393 ++++++++++++------------------------------------- 2 files changed, 111 insertions(+), 302 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ee95b39..5b058a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,15 +4,19 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON. +Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON. ## Dependencies - pdfplumber (>=0.10.0) -## Usage +## Commands ```bash +# Run tests +python testStatements/run_tests.py + +# Run extractor python bac_extract.py [options] # Examples @@ -29,11 +33,15 @@ Options: The extraction pipeline: 1. Validates PDF is a BAC statement (`is_bac_statement`) -2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) -3. Extracts tables page-by-page using pdfplumber -4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators +2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns +3. Parses transactions matching `TRANSACTION_PATTERN` regex +4. Outputs card holders, transactions by section, and summaries + +Key data structures: +- `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys +- `SPANISH_MONTHS`: Spanish month abbreviations for date parsing Key parsing functions: - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" - `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-" -- `extract_card_holder`: Matches "************1234 NAME" pattern +- `matches_patterns`: Generic regex pattern matcher for section detection diff --git a/bac_extract.py b/bac_extract.py index 6e19108..4757911 100755 --- a/bac_extract.py +++ b/bac_extract.py @@ -20,74 +20,63 @@ from typing import Optional import pdfplumber -# Spanish month abbreviations to month numbers SPANISH_MONTHS = { "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6, "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12 } -# Card holder pattern: ************XXXX NAME CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)") - -# Date pattern: D-MMM-YY or DD-MMM-YY DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE) - -# Transaction line pattern: -# Reference Date Description Location (optional) Currency Amount -# 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56 TRANSACTION_PATTERN = re.compile( - r"^(\d{12})\s+" # Reference (12 digits) - r"(\d{1,2}-[A-Z]{3}-\d{2})\s+" # Date - r"(.+?)\s+" # Description - r"(CRC|USD)\s+" # Currency - r"([\d,]+\.\d{2})(-)?$", # Amount (with optional trailing minus) + r"^(\d{12,13})\s+" + r"(\d{1,2}-[A-Z]{3}-\d{2})\s+" + r"(.+?)\s+" + r"(CRC|USD)\s+" + r"([\d,]+\.\d{2})(-)?$", re.IGNORECASE ) +# Section definitions: start patterns, end patterns, output key +SECTIONS = { + "B": { + "start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"], + "end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"], + "key": "purchases", + }, + "D": { + "start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"], + "end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"], + "key": "other_charges", + }, + "E": { + "start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"], + "end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"], + "key": "voluntary_services", + }, +} + logger = logging.getLogger(__name__) def parse_spanish_date(date_str: str) -> Optional[str]: """Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD).""" - if not date_str: - return None - - match = DATE_PATTERN.match(date_str.strip()) + match = DATE_PATTERN.match(date_str.strip()) if date_str else None if not match: return None - day, month_abbr, year = match.groups() month = SPANISH_MONTHS.get(month_abbr.upper()) if not month: return None - - # Assume 2000s for 2-digit year - full_year = 2000 + int(year) - - try: - return f"{full_year:04d}-{month:02d}-{int(day):02d}" - except ValueError: - return None + return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}" def parse_amount(amount_str: str) -> Optional[float]: - """ - Parse amount string with comma thousands separator. - Handles trailing '-' for negative values. - """ - if not amount_str: + """Parse amount with comma thousands separator. Handles trailing '-' for negatives.""" + if not amount_str or not (amount_str := amount_str.strip()): return None - - amount_str = amount_str.strip() - if not amount_str: - return None - is_negative = amount_str.endswith("-") - if is_negative: - amount_str = amount_str[:-1] - try: - amount = float(amount_str.replace(",", "")) + amount = float(amount_str.rstrip("-").replace(",", "")) return -amount if is_negative else amount except ValueError: return None @@ -97,7 +86,6 @@ def is_bac_statement(pdf: pdfplumber.PDF) -> bool: """Check if the PDF is a BAC credit card statement.""" if not pdf.pages: return False - first_page_text = pdf.pages[0].extract_text() or "" return "BAC" in first_page_text and "TARJETA" in first_page_text.upper() @@ -106,118 +94,43 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]: """Extract the statement date from the PDF.""" if not pdf.pages: return None - first_page_text = pdf.pages[0].extract_text() or "" date_matches = DATE_PATTERN.findall(first_page_text) if not date_matches: return None - day, month_abbr, year = date_matches[0] month = SPANISH_MONTHS.get(month_abbr.upper()) if not month: return None - - full_year = 2000 + int(year) - return f"{full_year:04d}-{month:02d}-{int(day):02d}" + return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}" -def find_section_b_start(text: str) -> bool: - """Check if text contains start of section B (purchases).""" - patterns = [ - r"B\)\s*Detalle\s+de\s+compras", - r"Detalle\s+de\s+compras\s+del\s+periodo", - ] +def matches_patterns(text: str, patterns: list[str]) -> bool: + """Check if text matches any of the given regex patterns.""" return any(re.search(p, text, re.IGNORECASE) for p in patterns) -def find_section_d_start(text: str) -> bool: - """Check if text contains start of section D (other charges).""" - return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE)) - - -def find_section_e_start(text: str) -> bool: - """Check if text contains start of section E (voluntary products/services).""" - return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE)) - - -def is_section_b_end(text: str) -> bool: - """Check if text indicates the end of section B.""" - end_patterns = [ - r"Total\s+de\s+compras\s+del\s+periodo", - r"C\)\s*Detalle\s+de\s+intereses", - r"Detalle\s+de\s+intereses", - r"D\)\s*Detalle", - ] - return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) - - -def is_section_d_end(text: str) -> bool: - """Check if text indicates the end of section D.""" - end_patterns = [ - r"Total\s+por\s+concepto\s+otros\s+cargos", - r"E\)\s*Detalle", - ] - return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) - - -def is_section_e_end(text: str) -> bool: - """Check if text indicates the end of section E.""" - end_patterns = [ - r"Total\s+por\s+concepto\s+de\s+productos", - r"F\)\s*Cargos", - ] - return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) - - -def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: - """ - Extract card holder info from a row. - Returns (card_suffix, name) or None. - """ - match = CARD_HOLDER_PATTERN.search(row_text) - if match: - return match.group(1), match.group(2).strip() - return None - - def parse_transaction_line(line: str) -> Optional[dict]: - """ - Parse a text-based transaction line. - - Format: Reference Date Description [Location] Currency Amount - Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56 - """ - line = line.strip() - if not line: - return None - - match = TRANSACTION_PATTERN.match(line) + """Parse a transaction line into a dict, or return None if not a transaction.""" + match = TRANSACTION_PATTERN.match(line.strip()) if not match: return None - reference = match.group(1) - date_str = match.group(2) - description = match.group(3).strip() - currency = match.group(4).upper() - amount_str = match.group(5) - is_negative = match.group(6) == "-" + reference, date_str, description, currency, amount_str, neg = match.groups() + currency = currency.upper() date = parse_spanish_date(date_str) - if not date: - logger.warning(f"Could not parse date '{date_str}' for reference {reference}") - return None - amount = parse_amount(amount_str) - if amount is None: - logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}") + if not date or amount is None: + logger.warning(f"Could not parse transaction: {line}") return None - if is_negative: + if neg: amount = -amount return { "reference": reference, "date": date, - "description": description, + "description": description.strip(), "location": None, "currency": currency, "amount_crc": amount if currency == "CRC" else None, @@ -226,228 +139,116 @@ def parse_transaction_line(line: str) -> Optional[dict]: def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: - """ - Extract transactions from a BAC credit card statement PDF. - - Args: - pdf_path: Path to the PDF file - verbose: Enable verbose logging - - Returns: - Dictionary with metadata, card_holders, purchases, other_charges, - voluntary_services, and summary - """ - if verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) + """Extract transactions from a BAC credit card statement PDF.""" + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) with pdfplumber.open(pdf_path) as pdf: - # Validate this is a BAC statement if not is_bac_statement(pdf): raise ValueError("PDF does not appear to be a BAC credit card statement") statement_date = extract_statement_date(pdf) - - # Transactions by section - purchases = [] # Section B - other_charges = [] # Section D - voluntary_services = [] # Section E - - # Track card holders (may have multiple) + transactions = {s["key"]: [] for s in SECTIONS.values()} card_holders = [] seen_card_suffixes = set() - - # Section tracking: None, "B", "D", "E" current_section = None - sections_found = set() + sections_completed = set() - # Start from page 2 (index 1) as page 1 is summary only start_page = 1 if len(pdf.pages) > 1 else 0 - for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1): page_text = page.extract_text() or "" - logger.debug(f"Processing page {page_num}") - # Check for section transitions (order matters: check ends before starts) - # Section B end - if current_section == "B" and is_section_b_end(page_text): - logger.debug(f"Section B ended on page {page_num}") - current_section = None - - # Section D end - if current_section == "D" and is_section_d_end(page_text): - logger.debug(f"Section D ended on page {page_num}") - current_section = None - - # Section E end - if current_section == "E" and is_section_e_end(page_text): - logger.debug(f"Section E ended on page {page_num}") - current_section = None - - # Check for section starts - if current_section is None and find_section_b_start(page_text): - current_section = "B" - sections_found.add("B") - logger.debug(f"Found section B on page {page_num}") - - if current_section is None and find_section_d_start(page_text): - current_section = "D" - sections_found.add("D") - logger.debug(f"Found section D on page {page_num}") - - if current_section is None and find_section_e_start(page_text): - current_section = "E" - sections_found.add("E") - logger.debug(f"Found section E on page {page_num}") - - if current_section is None: - continue - - # Select the appropriate transaction list - if current_section == "B": - target_list = purchases - elif current_section == "D": - target_list = other_charges - else: # "E" - target_list = voluntary_services - - # Parse text line by line for line in page_text.split("\n"): line = line.strip() if not line: continue - # Extract card holder info (only in section B) - if current_section == "B": - card_info = extract_card_holder(line) - if card_info: - card_suffix, card_holder_name = card_info - if card_suffix not in seen_card_suffixes: - card_holders.append({ - "card_suffix": card_suffix, - "name": card_holder_name - }) - seen_card_suffixes.add(card_suffix) - logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") - continue + # Check for section end + if current_section and matches_patterns(line, SECTIONS[current_section]["end"]): + logger.debug(f"Section {current_section} ended on page {page_num}") + sections_completed.add(current_section) + current_section = None - transaction = parse_transaction_line(line) - if transaction: - target_list.append(transaction) - logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}") + # Check for section start + if current_section is None: + for sec_id, sec in SECTIONS.items(): + if sec_id not in sections_completed and matches_patterns(line, sec["start"]): + current_section = sec_id + logger.debug(f"Found section {sec_id} on page {page_num}") + break + continue - if "B" not in sections_found: + # Extract card holder + match = CARD_HOLDER_PATTERN.search(line) + if match: + suffix, name = match.group(1), match.group(2).strip() + if suffix not in seen_card_suffixes: + card_holders.append({"card_suffix": suffix, "name": name}) + seen_card_suffixes.add(suffix) + logger.debug(f"Found card holder: {suffix} - {name}") + continue + + # Parse transaction + txn = parse_transaction_line(line) + if txn: + transactions[SECTIONS[current_section]["key"]].append(txn) + logger.debug(f"Extracted {current_section} transaction: {txn['reference']}") + + if "B" not in sections_completed and not transactions["purchases"]: raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") - # Calculate summaries - def calculate_summary(txns): - total_crc = sum(t["amount_crc"] or 0 for t in txns) - total_usd = sum(t["amount_usd"] or 0 for t in txns) + def summarize(txns): return { - "total_crc": round(total_crc, 2), - "total_usd": round(total_usd, 2), - "count": len(txns) + "total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2), + "total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2), + "count": len(txns), } - total_transactions = len(purchases) + len(other_charges) + len(voluntary_services) - return { "metadata": { "source_file": pdf_path.name, "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "statement_date": statement_date, - "total_transactions": total_transactions + "total_transactions": sum(len(t) for t in transactions.values()), }, "card_holders": card_holders, - "purchases": purchases, - "other_charges": other_charges, - "voluntary_services": voluntary_services, - "summary": { - "purchases": calculate_summary(purchases), - "other_charges": calculate_summary(other_charges), - "voluntary_services": calculate_summary(voluntary_services) - } + **transactions, + "summary": {key: summarize(txns) for key, txns in transactions.items()}, } def main(): - parser = argparse.ArgumentParser( - description="Extract transactions from BAC Costa Rica credit card statement PDFs", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python bac_extract.py EstadodeCuenta.pdf --pretty - python bac_extract.py statement.pdf -o output.json -v - """ - ) - - parser.add_argument( - "pdf_file", - type=Path, - help="Path to the BAC statement PDF" - ) - - parser.add_argument( - "-o", "--output", - type=Path, - default=Path("transactions.json"), - help="Output JSON file path (default: transactions.json)" - ) - - parser.add_argument( - "--pretty", - action="store_true", - help="Pretty-print JSON output" - ) - - parser.add_argument( - "-v", "--verbose", - action="store_true", - help="Enable verbose logging" - ) - + parser = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs") + parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF") + parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json")) + parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output") + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging") args = parser.parse_args() - # Validate PDF file exists if not args.pdf_file.exists(): - print(f"Error: File not found: {args.pdf_file}", file=sys.stderr) - sys.exit(1) - - if not args.pdf_file.suffix.lower() == ".pdf": - print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr) - sys.exit(1) + sys.exit(f"Error: File not found: {args.pdf_file}") + if args.pdf_file.suffix.lower() != ".pdf": + sys.exit(f"Error: File must be a PDF: {args.pdf_file}") try: result = extract_transactions(args.pdf_file, args.verbose) - - # Write output - indent = 2 if args.pretty else None with open(args.output, "w", encoding="utf-8") as f: - json.dump(result, f, indent=indent, ensure_ascii=False) + json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False) - summary = result['summary'] + summary = result["summary"] print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}") - print(f" Purchases (B): {summary['purchases']['count']:3d} " - f"CRC {summary['purchases']['total_crc']:>12,.2f} " - f"USD {summary['purchases']['total_usd']:>10,.2f}") - print(f" Other charges (D): {summary['other_charges']['count']:3d} " - f"CRC {summary['other_charges']['total_crc']:>12,.2f} " - f"USD {summary['other_charges']['total_usd']:>10,.2f}") - print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} " - f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} " - f"USD {summary['voluntary_services']['total_usd']:>10,.2f}") + for key, label in [("purchases", "Purchases (B)"), ("other_charges", "Other charges (D)"), + ("voluntary_services", "Voluntary services (E)")]: + s = summary[key] + print(f" {label:25} {s['count']:3d} CRC {s['total_crc']:>12,.2f} USD {s['total_usd']:>10,.2f}") except ValueError as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) + sys.exit(f"Error: {e}") except Exception as e: - print(f"Error processing PDF: {e}", file=sys.stderr) if args.verbose: import traceback traceback.print_exc() - sys.exit(1) + sys.exit(f"Error processing PDF: {e}") if __name__ == "__main__":