# bac_tools/bac_extract.py

#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.
Targets sections:
  B) Detalle de compras del periodo
  D) Detalle de otros cargos
  E) Detalle de productos y servicios de elección voluntaria
"""

import argparse
import json
import logging
import math
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pdfplumber

# Spanish month abbreviations to month numbers.
# September is listed under both "SEP" (septiembre) and "SET" (setiembre, the
# spelling commonly used in Costa Rica) so that either abbreviation parses.
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "SET": 9, "OCT": 10, "NOV": 11, "DIC": 12,
}

# Card holder pattern: ************XXXX NAME
# Group 1 is the four visible card digits, group 2 is the rest of the line.
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date pattern: D-MMM-YY or DD-MMM-YY (month letters are case-insensitive).
# Deliberately unanchored so it works with both match() and findall().
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)

# Transaction line pattern (anchored to the whole line):
# Reference  Date        Description         Location (optional)  Currency  Amount
# 123456789012 9-ENE-26 EXAMPLE STORE                              CRC      1,234.56
# The description group is lazy, so it stops at the last " CRC "/" USD " that
# is followed by a well-formed amount.
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12})\s+"                          # Reference (12 digits)
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"           # Date
    r"(.+?)\s+"                               # Description
    r"(CRC|USD)\s+"                           # Currency
    r"([\d,]+\.\d{2})(-)?$",                 # Amount (with optional trailing minus)
    re.IGNORECASE
)

logger = logging.getLogger(__name__)


def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse a Spanish statement date (D-MMM-YY) into ISO format (YYYY-MM-DD).

    Args:
        date_str: Text that starts (after stripping whitespace) with a date
            such as ``9-ENE-26``; the month abbreviation is case-insensitive.

    Returns:
        The ISO date string, or None when the input is empty, does not start
        with a date, uses an unknown month abbreviation, or names a day that
        does not exist in that month (e.g. ``31-FEB-26`` or ``0-ENE-26``).
    """
    if not date_str:
        return None

    match = DATE_PATTERN.match(date_str.strip())
    if match is None:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if month is None:
        return None

    # Assume 2000s for 2-digit year
    full_year = 2000 + int(year)

    # Build a real datetime so impossible calendar dates are rejected instead
    # of being formatted verbatim into the output.
    try:
        parsed = datetime(full_year, month, int(day))
    except ValueError:
        return None
    return parsed.date().isoformat()


def parse_amount(amount_str: str) -> Optional[float]:
    """Parse an amount string that uses commas as thousands separators.

    A trailing ``-`` marks a negative value (``"1,234.56-"`` -> ``-1234.56``).

    Args:
        amount_str: Raw amount text; surrounding whitespace is ignored.

    Returns:
        The amount as a float, or None when the input is empty, is not a
        number, or is a non-finite value such as ``nan`` or ``inf``.
    """
    if not amount_str:
        return None

    text = amount_str.strip()
    if not text:
        return None

    is_negative = text.endswith("-")
    if is_negative:
        text = text[:-1]

    try:
        amount = float(text.replace(",", ""))
    except ValueError:
        return None

    # float() happily accepts "nan" / "inf" / "Infinity"; those are never valid
    # money amounts and would poison totals and the JSON output.
    if not math.isfinite(amount):
        return None

    return -amount if is_negative else amount


def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Tell whether the opened PDF looks like a BAC credit card statement.

    Only the first page is inspected: its text must contain "BAC"
    (case-sensitive) and "TARJETA" (in any letter case). A PDF without pages
    is never accepted.
    """
    pages = pdf.pages
    if not pages:
        return False

    text = pages[0].extract_text() or ""
    if "BAC" not in text:
        return False
    return "TARJETA" in text.upper()


def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Extract the statement date from the first page of the PDF.

    The first page's text is scanned for D-MMM-YY tokens and the first one
    that parses as a date is returned. Tokens that merely look like dates
    (unknown month abbreviation) are skipped rather than aborting the search.

    NOTE(review): this presumes the first date printed on page 1 is the
    statement date - confirm against a real statement layout.

    Args:
        pdf: An opened pdfplumber PDF.

    Returns:
        The date in ISO format (YYYY-MM-DD), or None when the PDF has no
        pages or no parseable date appears on the first page.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""
    for candidate in DATE_PATTERN.finditer(first_page_text):
        # Reuse the shared parser so both code paths agree on what a valid
        # statement date is.
        iso_date = parse_spanish_date(candidate.group(0))
        if iso_date:
            return iso_date
    return None


def find_section_b_start(text: str) -> bool:
    """Report whether *text* contains the heading of section B (purchases).

    Both the lettered heading ("B) Detalle de compras") and the unlettered
    full title ("Detalle de compras del periodo") count, in any letter case.
    """
    for heading_regex in (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    ):
        if re.search(heading_regex, text, re.IGNORECASE):
            return True
    return False


def find_section_d_start(text: str) -> bool:
    """Report whether *text* contains the heading of section D (other charges)."""
    heading = re.search(
        r"D\)\s*Detalle\s+de\s+otros\s+cargos",
        text,
        flags=re.IGNORECASE,
    )
    return heading is not None


def find_section_e_start(text: str) -> bool:
    """Report whether *text* contains the heading of section E (voluntary products/services)."""
    heading = re.search(
        r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios",
        text,
        flags=re.IGNORECASE,
    )
    return heading is not None


def is_section_b_end(text: str) -> bool:
    """Report whether *text* contains a marker that closes section B.

    Markers are the purchases total line, the heading of the interest detail
    section (with or without its "C)" letter), or any "D) Detalle" heading.
    Matching ignores letter case.
    """
    closing_markers = (
        r"Total\s+de\s+compras\s+del\s+periodo",
        r"C\)\s*Detalle\s+de\s+intereses",
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    )
    for marker in closing_markers:
        if re.search(marker, text, re.IGNORECASE) is not None:
            return True
    return False


def is_section_d_end(text: str) -> bool:
    """Report whether *text* contains a marker that closes section D.

    Either the "other charges" total line or any "E) Detalle" heading counts,
    in any letter case.
    """
    if re.search(r"Total\s+por\s+concepto\s+otros\s+cargos", text, re.IGNORECASE):
        return True
    if re.search(r"E\)\s*Detalle", text, re.IGNORECASE):
        return True
    return False


def is_section_e_end(text: str) -> bool:
    """Report whether *text* contains a marker that closes section E.

    Either the products total line or an "F) Cargos" heading counts, in any
    letter case.
    """
    closing_markers = (
        r"Total\s+por\s+concepto\s+de\s+productos",
        r"F\)\s*Cargos",
    )
    hits = (re.search(marker, text, re.IGNORECASE) for marker in closing_markers)
    return any(hit is not None for hit in hits)


def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Pull the card holder details out of a statement row.

    A row qualifies when it contains twelve asterisks followed by four digits
    and a name. Returns (card_suffix, name) with the name stripped of
    surrounding whitespace, or None when the row does not qualify.
    """
    found = CARD_HOLDER_PATTERN.search(row_text)
    if found is None:
        return None

    card_suffix, holder_name = found.groups()
    return card_suffix, holder_name.strip()


def parse_transaction_line(line: str) -> Optional[dict]:
    """
    Turn one line of statement text into a transaction record.

    Expected layout: Reference Date Description Currency Amount
    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56

    Returns None for blank lines, lines that do not follow the layout, and
    lines whose date or amount cannot be parsed (the latter two also log a
    warning). In the returned dict "location" is always None and the amount
    is stored under "amount_crc" or "amount_usd" according to the currency,
    with the other key set to None.
    """
    stripped = line.strip()
    parsed = TRANSACTION_PATTERN.match(stripped) if stripped else None
    if parsed is None:
        return None

    reference, date_str, raw_description, raw_currency, amount_str, sign = parsed.groups()

    iso_date = parse_spanish_date(date_str)
    if not iso_date:
        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
        return None

    value = parse_amount(amount_str)
    if value is None:
        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
        return None
    # The pattern captures the trailing minus separately from the digits.
    if sign == "-":
        value = -value

    currency = raw_currency.upper()
    record = {
        "reference": reference,
        "date": iso_date,
        "description": raw_description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": None,
        "amount_usd": None,
    }
    if currency == "CRC":
        record["amount_crc"] = value
    elif currency == "USD":
        record["amount_usd"] = value
    return record


def _detect_section_start(line: str) -> Optional[str]:
    """Return "B", "D" or "E" when *line* holds that section's heading, else None."""
    if find_section_b_start(line):
        return "B"
    if find_section_d_start(line):
        return "D"
    if find_section_e_start(line):
        return "E"
    return None


def _is_section_end(section: str, line: str) -> bool:
    """Return True when *line* holds an end marker for the given section letter."""
    if section == "B":
        return is_section_b_end(line)
    if section == "D":
        return is_section_d_end(line)
    if section == "E":
        return is_section_e_end(line)
    return False


def _summarize_transactions(txns: list) -> dict:
    """Return per-currency totals (rounded to 2 decimals) and the row count."""
    total_crc = sum(t["amount_crc"] or 0 for t in txns)
    total_usd = sum(t["amount_usd"] or 0 for t in txns)
    return {
        "total_crc": round(total_crc, 2),
        "total_usd": round(total_usd, 2),
        "count": len(txns)
    }


def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """
    Extract transactions from a BAC credit card statement PDF.

    The statement text is walked line by line. A section heading switches the
    active section, an end marker closes it, and every transaction line met
    while a section is active is filed under that section. Tracking the
    section per line (rather than per page) keeps rows correctly attributed
    when several sections share a page or a section ends mid-page.

    Calls ``logging.basicConfig`` (DEBUG when *verbose*, INFO otherwise).

    Args:
        pdf_path: Path to the PDF file
        verbose: Enable verbose logging

    Returns:
        Dictionary with metadata, card_holders, purchases, other_charges,
        voluntary_services, and summary

    Raises:
        ValueError: If the PDF does not look like a BAC statement, or if
            section B (purchases) is never found.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    with pdfplumber.open(pdf_path) as pdf:
        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        # Transactions keyed by section letter
        purchases = []           # Section B
        other_charges = []       # Section D
        voluntary_services = []  # Section E
        section_lists = {
            "B": purchases,
            "D": other_charges,
            "E": voluntary_services,
        }

        # Track card holders (may have multiple)
        card_holders = []
        seen_card_suffixes = set()

        # Section tracking: None, "B", "D", "E". The state deliberately
        # survives page breaks so a section can continue on the next page.
        current_section = None
        sections_found = set()

        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0

        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
            page_text = page.extract_text() or ""
            logger.debug("Processing page %d", page_num)

            for raw_line in page_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                # Headings are checked before end markers because the heading
                # of the next section is also an end marker of the previous
                # one (e.g. "D) Detalle ..." closes B and opens D).
                started = _detect_section_start(line)
                if started is not None:
                    if started != current_section:
                        logger.debug("Found section %s on page %d", started, page_num)
                    current_section = started
                    sections_found.add(started)
                    continue

                if current_section is None:
                    continue

                if _is_section_end(current_section, line):
                    logger.debug("Section %s ended on page %d", current_section, page_num)
                    current_section = None
                    continue

                # Extract card holder info (only in section B)
                if current_section == "B":
                    card_info = extract_card_holder(line)
                    if card_info:
                        card_suffix, card_holder_name = card_info
                        if card_suffix not in seen_card_suffixes:
                            card_holders.append({
                                "card_suffix": card_suffix,
                                "name": card_holder_name
                            })
                            seen_card_suffixes.add(card_suffix)
                            logger.debug(
                                "Found card holder: %s - %s", card_suffix, card_holder_name
                            )
                        continue

                transaction = parse_transaction_line(line)
                if transaction:
                    section_lists[current_section].append(transaction)
                    logger.debug(
                        "Extracted %s transaction: %s",
                        current_section, transaction["reference"],
                    )

        if "B" not in sections_found:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

        total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)

        return {
            "metadata": {
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
                "total_transactions": total_transactions
            },
            "card_holders": card_holders,
            "purchases": purchases,
            "other_charges": other_charges,
            "voluntary_services": voluntary_services,
            "summary": {
                "purchases": _summarize_transactions(purchases),
                "other_charges": _summarize_transactions(other_charges),
                "voluntary_services": _summarize_transactions(voluntary_services)
            }
        }


def main():
    """Command-line entry point.

    Parses the arguments, checks that the input exists and has a .pdf
    suffix, runs the extraction, writes the result as JSON and prints a
    per-section summary. Any failure is reported on stderr and the process
    exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python bac_extract.py EstadodeCuenta.pdf --pretty
  python bac_extract.py statement.pdf -o output.json -v
        """
    )
    cli.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    cli.add_argument(
        "-o", "--output",
        type=Path,
        default=Path("transactions.json"),
        help="Output JSON file path (default: transactions.json)",
    )
    cli.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    cli.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")

    args = cli.parse_args()
    pdf_file = args.pdf_file

    # Reject bad input before touching the PDF library
    if not pdf_file.exists():
        print(f"Error: File not found: {pdf_file}", file=sys.stderr)
        sys.exit(1)

    if pdf_file.suffix.lower() != ".pdf":
        print(f"Error: File must be a PDF: {pdf_file}", file=sys.stderr)
        sys.exit(1)

    try:
        result = extract_transactions(pdf_file, args.verbose)

        # Write output
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)

        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")

        summary = result['summary']
        report_rows = (
            ("  Purchases (B):          ", "purchases"),
            ("  Other charges (D):      ", "other_charges"),
            ("  Voluntary services (E): ", "voluntary_services"),
        )
        for prefix, section_key in report_rows:
            totals = summary[section_key]
            print(f"{prefix}{totals['count']:3d}  "
                  f"CRC {totals['total_crc']:>12,.2f}  "
                  f"USD {totals['total_usd']:>10,.2f}")

    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()