# bac_tools/bac_extract.py

#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.
Targets sections:
  B) Detalle de compras del periodo
  D) Detalle de otros cargos
  E) Detalle de productos y servicios de elección voluntaria
"""

import argparse
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pdfplumber

# Three-letter Spanish month abbreviations mapped to their month number (1-12).
# Code in this file upper-cases the abbreviation before looking it up here.
# NOTE(review): September is keyed only as "SEP"; confirm the statements never
# print "SET" (from "setiembre"), which would not be recognised.
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
}

# Twelve asterisks, four digits, whitespace, then the rest of the line.
# Group 1 is used as the card suffix and group 2 as the card holder name.
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
# Date in D-MMM-YY form (case-insensitive).
# Groups: day, month abbreviation, two-digit year.
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
# A complete transaction line, anchored at both ends (case-insensitive).
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12,13})\s+"  # group 1: 12- or 13-digit reference
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"  # group 2: date in D-MMM-YY form
    r"(.+?)\s+"  # group 3: description (non-greedy, stops before the currency)
    r"(CRC|USD)\s+"  # group 4: currency code
    r"([\d,]+\.\d{2})(-)?$",  # group 5: amount; group 6: optional trailing minus
    re.IGNORECASE
)

# Section definitions, keyed by the section letter.
#   "start": regexes; a line matching any of them opens the section.
#   "end":   regexes; a line matching any of them closes the section.
#   "key":   name of the output list that receives the section's transactions.
# The patterns are applied with re.search and re.IGNORECASE by matches_patterns().
SECTIONS = {
    "B": {
        "start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
        "end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
        "key": "purchases",
    },
    "D": {
        "start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
        "end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
        "key": "other_charges",
    },
    "E": {
        "start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
        "end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
        "key": "voluntary_services",
    },
}

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse a Spanish ``D-MMM-YY`` date into ISO ``YYYY-MM-DD`` form.

    The month abbreviation is looked up case-insensitively in
    ``SPANISH_MONTHS`` and the two-digit year is interpreted as 20YY.
    Only the start of the (stripped) string has to match the pattern;
    any trailing text is ignored.

    Args:
        date_str: Text such as ``"5-ENE-24"``.

    Returns:
        The ISO date string, or ``None`` when the input is empty, does not
        start with the expected pattern, uses an unknown month abbreviation,
        or names a day that does not exist in that month (e.g. ``31-FEB-24``
        or ``0-ENE-24``).
    """
    if not date_str:
        return None
    match = DATE_PATTERN.match(date_str.strip())
    if match is None:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if month is None:
        return None

    try:
        # Let datetime reject impossible calendar dates instead of emitting
        # a string like "2024-02-31" that is not a real ISO date.
        parsed = datetime(2000 + int(year), month, int(day))
    except ValueError:
        return None
    return parsed.date().isoformat()


def parse_amount(amount_str: str) -> Optional[float]:
    """Convert an amount such as ``"1,234.56"`` or ``"1,234.56-"`` to a float.

    Commas are treated as thousands separators and removed. A trailing
    ``-`` marks the value as negative.

    Args:
        amount_str: Raw amount text; surrounding whitespace is ignored.

    Returns:
        The parsed value, or ``None`` when the input is empty, blank, or not
        a number once the separators and trailing minus are removed.
    """
    text = amount_str.strip() if amount_str else ""
    if not text:
        return None

    negative = text.endswith("-")
    digits = text.rstrip("-").replace(",", "")
    try:
        magnitude = float(digits)
    except ValueError:
        return None

    if negative:
        return -magnitude
    return magnitude


def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Tell whether the first page of *pdf* looks like a BAC card statement.

    The page text must contain ``BAC`` (case-sensitive) and ``TARJETA``
    (case-insensitive). A document without pages, or whose first page yields
    no text, is not considered a statement.
    """
    pages = pdf.pages
    if not pages:
        return False

    text = pages[0].extract_text()
    if not text:
        return False
    if "BAC" not in text:
        return False
    return "TARJETA" in text.upper()


def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Return the first parseable ``D-MMM-YY`` date found on the first page.

    Every occurrence of ``DATE_PATTERN`` in the first page's text is tried in
    order and handed to ``parse_spanish_date``; the first one that parses is
    returned. A hit whose month abbreviation is not a known Spanish month is
    skipped rather than ending the search.

    NOTE(review): this assumes the first date printed on page one is the
    statement date — confirm against the actual statement layout.

    Args:
        pdf: An open pdfplumber document.

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` when the document has no
        pages or no parseable date appears on the first page.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""
    for match in DATE_PATTERN.finditer(first_page_text):
        # Reuse the shared parser so both code paths agree on what a valid date is.
        iso_date = parse_spanish_date(match.group(0))
        if iso_date is not None:
            return iso_date
    return None


def matches_patterns(text: str, patterns: list[str]) -> bool:
    """Return True when at least one regex in *patterns* is found in *text*.

    Each pattern is searched anywhere in the text, ignoring case. An empty
    pattern list never matches.
    """
    for pattern in patterns:
        if re.search(pattern, text, flags=re.IGNORECASE) is not None:
            return True
    return False


def parse_transaction_line(line: str) -> Optional[dict]:
    """Turn one statement line into a transaction record.

    Args:
        line: A single line of page text; surrounding whitespace is ignored.

    Returns:
        A dict with ``reference``, ``date`` (ISO), ``description``,
        ``location`` (always ``None``), ``currency`` and the amount stored
        under ``amount_crc`` or ``amount_usd`` according to the currency
        (the other one is ``None``). ``None`` is returned when the line does
        not match ``TRANSACTION_PATTERN``, or when its date or amount cannot
        be parsed (a warning is logged in that case).
    """
    parsed = TRANSACTION_PATTERN.match(line.strip())
    if parsed is None:
        return None

    ref, raw_date, raw_description, raw_currency, raw_amount, minus = parsed.groups()

    iso_date = parse_spanish_date(raw_date)
    value = parse_amount(raw_amount)
    if value is None or not iso_date:
        logger.warning(f"Could not parse transaction: {line}")
        return None

    # The regex captures the trailing '-' separately from the digits.
    if minus:
        value = -value

    code = raw_currency.upper()
    record = {
        "reference": ref,
        "date": iso_date,
        "description": raw_description.strip(),
        "location": None,
        "currency": code,
        "amount_crc": value if code == "CRC" else None,
        "amount_usd": value if code == "USD" else None,
    }
    return record


def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """Read a BAC statement PDF and collect the transactions of sections B, D and E.

    Page text is scanned line by line; the first page is skipped whenever the
    document has more than one page. A small state machine tracks which
    section is open. Inside an open section each line is either a card-holder
    header or a candidate transaction line.

    Args:
        pdf_path: Location of the statement PDF.
        verbose: When true, logging is configured at DEBUG instead of INFO.

    Returns:
        A dict with ``metadata``, ``card_holders``, one transaction list per
        section key (``purchases``, ``other_charges``, ``voluntary_services``)
        and a ``summary`` holding per-section totals and counts.

    Raises:
        ValueError: If the first page does not look like a BAC statement, or
            if section B was never completed and produced no purchases.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    def _totals(entries):
        # The amount for the other currency is stored as None and counts as 0.
        crc_total = sum(entry["amount_crc"] or 0 for entry in entries)
        usd_total = sum(entry["amount_usd"] or 0 for entry in entries)
        return {
            "total_crc": round(crc_total, 2),
            "total_usd": round(usd_total, 2),
            "count": len(entries),
        }

    with pdfplumber.open(pdf_path) as pdf:
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        transactions = {}
        for spec in SECTIONS.values():
            transactions[spec["key"]] = []
        card_holders = []
        known_suffixes = set()
        finished = set()
        current_section = None

        # Skip the first page unless it is the only one.
        first_index = 0 if len(pdf.pages) <= 1 else 1
        page_num = first_index
        for page in pdf.pages[first_index:]:
            page_num += 1  # page numbers in log messages are 1-based
            page_text = page.extract_text() or ""
            logger.debug(f"Processing page {page_num}")

            for raw_line in page_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                # An end marker closes the open section. The same line may
                # also be the start marker of the next section, so fall
                # through to the start check below instead of skipping it.
                if current_section is not None and matches_patterns(
                    line, SECTIONS[current_section]["end"]
                ):
                    logger.debug(f"Section {current_section} ended on page {page_num}")
                    finished.add(current_section)
                    current_section = None

                if current_section is None:
                    # Outside a section only start markers matter; sections
                    # that already finished are never reopened.
                    sec_id = next(
                        (
                            candidate
                            for candidate, spec in SECTIONS.items()
                            if candidate not in finished
                            and matches_patterns(line, spec["start"])
                        ),
                        None,
                    )
                    if sec_id is not None:
                        current_section = sec_id
                        logger.debug(f"Found section {sec_id} on page {page_num}")
                    continue

                # Card-holder header: record each card suffix once.
                holder = CARD_HOLDER_PATTERN.search(line)
                if holder is not None:
                    suffix = holder.group(1)
                    name = holder.group(2).strip()
                    if suffix not in known_suffixes:
                        card_holders.append({"card_suffix": suffix, "name": name})
                        known_suffixes.add(suffix)
                        logger.debug(f"Found card holder: {suffix} - {name}")
                    continue

                # Anything else inside a section is a candidate transaction.
                txn = parse_transaction_line(line)
                if not txn:
                    continue
                transactions[SECTIONS[current_section]["key"]].append(txn)
                logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")

        if not ("B" in finished or transactions["purchases"]):
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

        metadata = {
            "source_file": pdf_path.name,
            "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "statement_date": statement_date,
            "total_transactions": sum(map(len, transactions.values())),
        }
        result = {"metadata": metadata, "card_holders": card_holders}
        result.update(transactions)
        result["summary"] = {key: _totals(txns) for key, txns in transactions.items()}
        return result


def main():
    """Command-line entry point: parse arguments, extract, write JSON, report.

    Exits through ``sys.exit`` with an error message when the input path does
    not exist, does not carry a ``.pdf`` suffix, or when extraction or writing
    raises. On success a per-section summary is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Extract transactions from BAC CR statement PDFs"
    )
    parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
    args = parser.parse_args()

    source = args.pdf_file
    if not source.exists():
        sys.exit(f"Error: File not found: {args.pdf_file}")
    if source.suffix.lower() != ".pdf":
        sys.exit(f"Error: File must be a PDF: {args.pdf_file}")

    # Output key -> human-readable label, in the order the rows are printed.
    section_labels = {
        "purchases": "Purchases (B)",
        "other_charges": "Other charges (D)",
        "voluntary_services": "Voluntary services (E)",
    }
    json_indent = 2 if args.pretty else None

    try:
        result = extract_transactions(source, args.verbose)
        with args.output.open("w", encoding="utf-8") as f:
            json.dump(result, f, indent=json_indent, ensure_ascii=False)

        summary = result["summary"]
        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
        for key, label in section_labels.items():
            s = summary[key]
            print(f"  {label:25} {s['count']:3d}  CRC {s['total_crc']:>12,.2f}  USD {s['total_usd']:>10,.2f}")

    except ValueError as e:
        # Validation problems raised by the extractor get a plain message.
        sys.exit(f"Error: {e}")
    except Exception as e:
        # Top-level boundary: show the traceback only when asked for.
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(f"Error processing PDF: {e}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()