From 9fa18a67b58d70b842a4664a7fb8d7fb180355f9 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 13:24:41 -0600 Subject: [PATCH] initial commit --- CLAUDE.md | 40 +++++ README.md | 63 +++++++ bac_extract.py | 419 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 523 insertions(+) create mode 100644 CLAUDE.md create mode 100644 README.md create mode 100755 bac_extract.py create mode 100644 requirements.txt diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..08dc084 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,40 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON. + +## Dependencies + +- pdfplumber (>=0.10.0) + +## Usage + +```bash +python bac_extract.py <pdf_file> <card_suffix> [options] + +# Examples +python bac_extract.py EstadodeCuenta.pdf 1234 --pretty +python bac_extract.py statement.pdf 1234 -o output.json -v +``` + +Options: +- `-o, --output`: Output JSON path (default: transactions.json) +- `--pretty`: Pretty-print JSON +- `-v, --verbose`: Enable debug logging + +## Architecture + +The extraction pipeline: +1. Validates PDF is a BAC statement (`is_bac_statement`) +2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) +3. Extracts text page-by-page using pdfplumber and parses it line by line +4. Filters transactions by card suffix (last 4 digits) +5. 
Parses Spanish dates (D-MMM-YY format) and amounts with comma separators + +Key parsing functions: +- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" +- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-" +- `extract_card_holder`: Matches "************1234 NAME" pattern diff --git a/README.md b/README.md new file mode 100644 index 0000000..d4af911 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +# BAC Statement Extractor + +Extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON. + +## Dependencies + +- Python 3.10+ +- pdfplumber (>=0.10.0) + +## Usage + +```bash +python bac_extract.py <pdf_file> <card_suffix> [options] +``` + +**Arguments:** +- `pdf_file`: Path to the BAC statement PDF +- `card_suffix`: Last 4 digits of the card to filter + +**Options:** +- `-o, --output`: Output JSON path (default: transactions.json) +- `--pretty`: Pretty-print JSON output +- `-v, --verbose`: Enable debug logging + +**Examples:** +```bash +python bac_extract.py statement.pdf 1234 --pretty +python bac_extract.py statement.pdf 1234 -o output.json -v +``` + +## Output Format + +```json +{ + "metadata": { + "source_file": "statement.pdf", + "extraction_date": "2025-01-15T12:00:00Z", + "statement_date": "2025-01-10", + "card_filter": "1234", + "total_transactions": 5 + }, + "card_holder": { + "card_suffix": "1234", + "name": "CARD HOLDER NAME" + }, + "transactions": [ + { + "reference": "123456789012", + "date": "2025-01-09", + "description": "EXAMPLE STORE", + "location": null, + "currency": "CRC", + "amount_crc": 1234.56, + "amount_usd": null + } + ], + "summary": { + "total_crc": 50000.00, + "total_usd": 0.00, + "transaction_count": 5 + } +} +``` diff --git a/bac_extract.py b/bac_extract.py new file mode 100755 index 0000000..850bef1 --- /dev/null +++ b/bac_extract.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +""" +BAC Credit Card Statement Extractor + +Extracts transactions from BAC Costa Rica 
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo".

Third-party dependency: pdfplumber>=0.10.0 (see requirements.txt).
"""

import argparse
import json
import logging
import math
import re
import sys
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Iterable, Optional

try:
    import pdfplumber
except ImportError:
    # Keep --help, argument validation and the pure parsing helpers usable
    # when the PDF backend is not installed; extract_transactions() reports
    # the missing dependency with an actionable message instead.
    pdfplumber = None

# Spanish month abbreviations to month numbers
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
}

# Card holder pattern: ************XXXX NAME
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date pattern: D-MMM-YY or DD-MMM-YY
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)

# Transaction line pattern:
# Reference     Date      Description    Location (optional)  Currency  Amount
# 123456789012  9-ENE-26  EXAMPLE STORE                       CRC       1,234.56
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12})\s+"                  # Reference (12 digits)
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"   # Date
    r"(.+?)\s+"                      # Description
    r"(CRC|USD)\s+"                  # Currency
    r"([\d,]+\.\d{2})(-)?$",         # Amount (with optional trailing minus)
    re.IGNORECASE
)

# Headings that open section B. Compiled once; "periodo" is accepted with or
# without the accent because PDF text extraction may yield either spelling.
SECTION_B_START_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+per[ií]odo",
    )
)

# Text that marks the end of section B (its total line or a later section).
SECTION_END_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"Total\s+de\s+compras\s+del\s+per[ií]odo",
        r"C\)\s*Detalle\s+de\s+intereses",
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    )
)

logger = logging.getLogger(__name__)


def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD).

    Returns None for empty input, text that does not start with a
    D-MMM-YY date, unknown month abbreviations, and impossible calendar
    dates such as "31-FEB-25".
    """
    if not date_str:
        return None

    match = DATE_PATTERN.match(date_str.strip())
    if not match:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if month is None:
        return None

    try:
        # Assume 2000s for the 2-digit year. Building a real date object is
        # what validates the day against the month (plain string formatting
        # would happily emit "2025-02-31").
        return date(2000 + int(year), month, int(day)).isoformat()
    except ValueError:
        return None


def parse_amount(amount_str: str) -> Optional[float]:
    """
    Parse amount string with comma thousands separator.
    Handles trailing '-' for negative values.

    Returns None for empty/blank input, non-numeric text, and non-finite
    values ("inf", "nan"), which float() would otherwise accept.
    """
    if not amount_str or not amount_str.strip():
        return None

    cleaned = amount_str.strip()

    # Statements print negatives with a trailing sign, e.g. "100.00-"
    is_negative = cleaned.endswith("-")
    if is_negative:
        cleaned = cleaned[:-1].strip()

    # Format: 1,234.56 or 1,234,567.89 -> drop the thousands separators
    try:
        amount = float(cleaned.replace(",", ""))
    except ValueError:
        return None

    if not math.isfinite(amount):
        return None
    return -amount if is_negative else amount


def is_bac_statement(pdf: "pdfplumber.PDF") -> bool:
    """Check if the PDF is a BAC credit card statement.

    The first page must contain "BAC" (case-sensitive) and "TARJETA"
    (case-insensitive).
    """
    if not pdf.pages:
        return False

    first_page_text = pdf.pages[0].extract_text() or ""
    return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()


def extract_statement_date(pdf: "pdfplumber.PDF") -> Optional[str]:
    """Extract the statement date from the PDF.

    Returns the first valid D-MMM-YY date found on the first page as an
    ISO string, or None when the PDF has no pages or no valid date.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""

    # Reuse parse_spanish_date so validation lives in one place; skip
    # look-alikes (unknown month, impossible day) instead of giving up.
    for match in DATE_PATTERN.finditer(first_page_text):
        iso_date = parse_spanish_date(match.group(0))
        if iso_date:
            return iso_date

    return None


def find_section_b_start(page_text: str) -> bool:
    """Check if page contains start of section B."""
    return any(pattern.search(page_text) for pattern in SECTION_B_START_PATTERNS)


def is_section_end(text: str) -> bool:
    """Check if we've reached the end of section B."""
    return any(pattern.search(text) for pattern in SECTION_END_PATTERNS)


def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Extract card holder info from a row.
    Returns (card_suffix, name) or None.
    """
    match = CARD_HOLDER_PATTERN.search(row_text)
    if match:
        return match.group(1), match.group(2).strip()
    return None


def parse_transaction_line(line: str) -> Optional[dict]:
    """
    Parse a text-based transaction line.

    Format: Reference Date Description [Location] Currency Amount
    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56

    Returns a transaction dict, or None when the line is not a transaction
    or its date/amount cannot be parsed (a warning is logged in that case).
    """
    line = line.strip()
    if not line:
        return None

    match = TRANSACTION_PATTERN.match(line)
    if not match:
        return None

    reference, date_str, desc_and_loc, currency, amount_str, sign = match.groups()
    currency = currency.upper()

    parsed_date = parse_spanish_date(date_str)
    if not parsed_date:
        logger.warning("Could not parse date '%s' for reference %s", date_str, reference)
        return None

    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning("Could not parse amount '%s' for reference %s", amount_str, reference)
        return None
    # The regex captures the trailing minus separately from the digits.
    if sign == "-":
        amount = -amount

    # Location is typically a short suffix of the description (e.g. "ANILL",
    # "San Jose"); it is not split out yet, so everything stays in description.
    return {
        "reference": reference,
        "date": parsed_date,
        "description": desc_and_loc.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": amount if currency == "CRC" else None,
        "amount_usd": amount if currency == "USD" else None,
    }


def scan_statement_pages(
    page_texts: Iterable[str],
    card_suffix: str,
    first_page_number: int = 1,
) -> tuple[list[dict], Optional[str], bool, bool]:
    """
    Collect section-B transactions for one card from already-extracted text.

    Args:
        page_texts: Text of each page, in document order (may be a lazy
            iterable; pages after the section end are never consumed)
        card_suffix: Last 4 digits of card to filter
        first_page_number: Page number of the first text, for log messages

    Returns:
        (transactions, holder_name, section_b_found, card_suffix_found)
    """
    transactions = []
    holder_name = None
    active_suffix = None
    in_section_b = False
    card_suffix_found = False

    for page_num, page_text in enumerate(page_texts, start=first_page_number):
        logger.debug("Processing page %d", page_num)

        if not in_section_b:
            if not find_section_b_start(page_text):
                continue
            in_section_b = True
            logger.debug("Found section B on page %d", page_num)

        # The page holding the end marker is still processed in full; the
        # marker only prevents moving on to the following pages.
        ends_on_this_page = is_section_end(page_text)
        if ends_on_this_page:
            logger.debug("Found section end on page %d", page_num)

        for raw_line in page_text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            card_info = extract_card_holder(line)
            if card_info:
                active_suffix, name = card_info
                logger.debug("Found card holder: %s - %s", active_suffix, name)
                if active_suffix == card_suffix:
                    card_suffix_found = True
                    # Remember the name now: another card's header may
                    # follow before the scan finishes.
                    holder_name = name
                continue

            # Lines belong to the most recent card header
            if active_suffix != card_suffix:
                continue

            transaction = parse_transaction_line(line)
            if transaction:
                transactions.append(transaction)
                logger.debug("Extracted transaction: %s", transaction["reference"])

        if ends_on_this_page:
            break

    return transactions, holder_name, in_section_b, card_suffix_found


def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
    """
    Extract transactions from a BAC credit card statement PDF.

    Args:
        pdf_path: Path to the PDF file
        card_suffix: Last 4 digits of card to filter
        verbose: Enable verbose logging

    Returns:
        Dictionary with metadata, card_holder, transactions, and summary

    Raises:
        ImportError: pdfplumber is not installed
        ValueError: the PDF is not a BAC statement, section B is missing,
            or the card suffix does not appear in section B
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    if pdfplumber is None:
        raise ImportError(
            "pdfplumber is required to read PDFs (pip install 'pdfplumber>=0.10.0')"
        )

    pdf_path = Path(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0

        # Lazy generator: text is only extracted for pages actually scanned.
        page_texts = (page.extract_text() or "" for page in pdf.pages[start_page:])
        transactions, holder_name, section_b_found, card_suffix_found = scan_statement_pages(
            page_texts, card_suffix, first_page_number=start_page + 1
        )

    if not section_b_found:
        raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

    if not card_suffix_found:
        raise ValueError(f"Card suffix '{card_suffix}' not found in statement")

    # fsum avoids accumulating float error and always yields a float
    total_crc = math.fsum(
        t["amount_crc"] for t in transactions if t["amount_crc"] is not None
    )
    total_usd = math.fsum(
        t["amount_usd"] for t in transactions if t["amount_usd"] is not None
    )

    extraction_date = (
        datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
    )

    return {
        "metadata": {
            "source_file": pdf_path.name,
            "extraction_date": extraction_date,
            "statement_date": statement_date,
            "card_filter": card_suffix,
            "total_transactions": len(transactions)
        },
        "card_holder": {
            "card_suffix": card_suffix,
            "name": holder_name
        },
        "transactions": transactions,
        "summary": {
            "total_crc": round(total_crc, 2),
            "total_usd": round(total_usd, 2),
            "transaction_count": len(transactions)
        }
    }


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:]

    Exits with status 1 on invalid arguments, extraction errors or output
    write errors.
    """
    parser = argparse.ArgumentParser(
        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
  python bac_extract.py statement.pdf 1234 -o output.json -v
        """
    )

    parser.add_argument(
        "pdf_file",
        type=Path,
        help="Path to the BAC statement PDF"
    )

    parser.add_argument(
        "card_suffix",
        type=str,
        help="Last 4 digits of card to filter (e.g., 1234)"
    )

    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=Path("transactions.json"),
        help="Output JSON file path (default: transactions.json)"
    )

    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args(argv)

    # Validate card suffix (isascii: str.isdigit alone accepts non-ASCII digits)
    suffix = args.card_suffix
    if not (suffix.isascii() and suffix.isdigit() and len(suffix) == 4):
        print(f"Error: Card suffix must be exactly 4 digits, got '{suffix}'", file=sys.stderr)
        sys.exit(1)

    # Validate PDF file exists (is_file also rejects directories)
    if not args.pdf_file.is_file():
        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)

    if args.pdf_file.suffix.lower() != ".pdf":
        print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)

    try:
        result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

    # Write output; reported separately so a bad output path is not blamed
    # on the PDF.
    indent = 2 if args.pretty else None
    try:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=indent, ensure_ascii=False)
    except OSError as e:
        print(f"Error writing output file {args.output}: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
    print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
    print(f"Total USD: {result['summary']['total_usd']:,.2f}")


if __name__ == "__main__":
    main()