bac_tools/bac_extract.py

#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo".
"""

import argparse
import json
import logging
import math
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pdfplumber

# Maps upper-case three-letter Spanish month abbreviations to month numbers.
# Lookups elsewhere in this file upper-case the abbreviation before using it.
# NOTE(review): Costa Rican Spanish also spells September "setiembre" ("SET");
# confirm whether BAC statements ever print that abbreviation.
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
}

# Card holder pattern: ************XXXX NAME
# Twelve literal asterisks, then the last four card digits (group 1),
# whitespace, and the rest of the line as the holder name (group 2).
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date pattern: D-MMM-YY or DD-MMM-YY (case-insensitive).
# Groups: day, month abbreviation, two-digit year. Any three letters match
# here; the abbreviation still has to be looked up in SPANISH_MONTHS.
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)

# Transaction line pattern (anchored to the whole line, case-insensitive):
# Reference  Date        Description         Location (optional)  Currency  Amount
# 123456789012 9-ENE-26 EXAMPLE STORE                              CRC      1,234.56
# Groups: 1 reference, 2 date, 3 description (incl. any location text),
# 4 currency, 5 unsigned amount, 6 optional trailing "-" marking a negative.
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12})\s+"                          # Reference (12 digits)
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"           # Date
    r"(.+?)\s+"                               # Description
    r"(CRC|USD)\s+"                           # Currency
    r"([\d,]+\.\d{2})(-)?$",                 # Amount (with optional trailing minus)
    re.IGNORECASE
)

# Module-level logger; handlers/levels are configured by the caller.
logger = logging.getLogger(__name__)


def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse a Spanish statement date (D-MMM-YY) into ISO format (YYYY-MM-DD).

    Args:
        date_str: Text that begins (after optional whitespace) with a date
            such as ``9-ENE-26``. The month abbreviation is case-insensitive.

    Returns:
        The ISO date string, or None when the input is empty, does not start
        with a date-shaped token, uses an unknown month abbreviation, or
        names a day that does not exist in the calendar (e.g. ``31-FEB-26``).
    """
    if not date_str:
        return None

    match = DATE_PATTERN.match(date_str.strip())
    if match is None:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if month is None:
        return None

    # Two-digit years are assumed to fall in the 2000s.
    full_year = 2000 + int(year)

    try:
        # Building a real datetime is what validates the day against the
        # month and year; plain string formatting would accept "31-FEB".
        return datetime(full_year, month, int(day)).date().isoformat()
    except ValueError:
        return None


def parse_amount(amount_str: str) -> Optional[float]:
    """
    Parse an amount string that uses commas as thousands separators.

    A trailing '-' marks the value as negative (e.g. "1,234.56-").

    Args:
        amount_str: Text such as "1,234.56" or "1,234,567.89-".

    Returns:
        The amount as a float, or None when the input is empty, blank,
        not numeric, or not a finite number.
    """
    if not amount_str:
        return None

    text = amount_str.strip()
    if not text:
        return None

    # Statements print credits with the sign after the digits.
    is_negative = text.endswith("-")
    if is_negative:
        text = text[:-1].strip()

    try:
        amount = float(text.replace(",", ""))
    except ValueError:
        return None

    # float() happily parses "nan", "inf" and "infinity" (and overflows to
    # inf); none of those is a real amount, and they would also serialise
    # to invalid JSON.
    if not math.isfinite(amount):
        return None

    return -amount if is_negative else amount


def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Tell whether the PDF looks like a BAC credit card statement.

    Only the first page is inspected: its text must contain "BAC" (exact
    case) and "TARJETA" (any case). A PDF without pages is rejected.
    """
    if not pdf.pages:
        return False

    cover_text = pdf.pages[0].extract_text() or ""
    if "BAC" not in cover_text:
        return False
    return "TARJETA" in cover_text.upper()


def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Extract the statement date from the first page of the PDF.

    The first page text is scanned for date-shaped tokens (D-MMM-YY) and the
    first one that parse_spanish_date accepts is returned. Tokens that merely
    look like a date but use an unknown month abbreviation are skipped
    instead of ending the search.

    Args:
        pdf: An open pdfplumber document.

    Returns:
        The date in ISO format (YYYY-MM-DD), or None when the PDF has no
        pages or no parseable date appears on the first page.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""

    # Common format: "Fecha de corte: DD-MMM-YY" or similar. DATE_PATTERN
    # matches any three letters as the month, so keep looking until a token
    # actually parses rather than trusting the first hit.
    for match in DATE_PATTERN.finditer(first_page_text):
        iso_date = parse_spanish_date(match.group(0))
        if iso_date is not None:
            return iso_date

    return None


def find_section_b_start(page_text: str) -> bool:
    """Report whether the text contains the heading that opens section B.

    Either the lettered heading ("B) Detalle de compras") or the full title
    ("Detalle de compras del periodo") counts; matching ignores case.
    """
    heading_patterns = (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    )
    return any(
        re.search(heading, page_text, re.IGNORECASE) is not None
        for heading in heading_patterns
    )


def is_section_end(text: str) -> bool:
    """Report whether the text contains a marker that closes section B.

    Markers are the purchases total line and the headings of the sections
    that follow (interest detail, section D); matching ignores case.
    """
    closing_markers = (
        r"Total\s+de\s+compras\s+del\s+periodo",
        r"C\)\s*Detalle\s+de\s+intereses",
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    )
    return any(
        re.search(marker, text, re.IGNORECASE) is not None
        for marker in closing_markers
    )


def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Pull the card holder details out of a statement row.

    Returns a (card_suffix, name) pair when the row contains a masked card
    number followed by a name, otherwise None. The name is stripped of
    surrounding whitespace.
    """
    found = CARD_HOLDER_PATTERN.search(row_text)
    if found is None:
        return None

    suffix, holder_name = found.groups()
    return suffix, holder_name.strip()


def parse_transaction_line(line: str) -> Optional[dict]:
    """
    Turn one line of statement text into a transaction record.

    Expected layout: Reference Date Description [Location] Currency Amount
    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56

    Returns a dict with the keys reference, date, description, location,
    currency, amount_crc and amount_usd, or None when the line is blank,
    does not match the layout, or its date or amount cannot be parsed
    (the last two cases also log a warning).
    """
    stripped = line.strip()
    if not stripped:
        return None

    parsed = TRANSACTION_PATTERN.match(stripped)
    if parsed is None:
        return None

    reference, date_str, raw_description, raw_currency, amount_str, sign = parsed.groups()

    date = parse_spanish_date(date_str)
    if not date:
        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
        return None

    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
        return None
    # The pattern captures the trailing minus separately from the digits.
    if sign == "-":
        amount = -amount

    currency = raw_currency.upper()

    # Any location text is left inside the description for now; the location
    # field is always None.
    record = {
        "reference": reference,
        "date": date,
        "description": raw_description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": None,
        "amount_usd": None,
    }

    # Only the field matching the transaction currency carries the amount.
    if currency == "CRC":
        record["amount_crc"] = amount
    elif currency == "USD":
        record["amount_usd"] = amount

    return record


def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
    """
    Extract transactions from a BAC credit card statement PDF.

    Pages are scanned from the second page onward (or the only page of a
    single-page PDF). Scanning starts at the page where the section B
    heading appears and stops after the page where an end marker appears.
    Within those pages, transaction lines are collected only while the most
    recent card holder line belongs to the requested card.

    Args:
        pdf_path: Path to the PDF file
        card_suffix: Last 4 digits of card to filter
        verbose: Enable verbose logging

    Returns:
        Dictionary with metadata, card_holder, transactions, and summary

    Raises:
        ValueError: If the PDF is not recognised as a BAC statement, section
            B is not found, or no card holder line matches card_suffix.
    """
    # Configures the root logger on every call (same as before this rewrite).
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    with pdfplumber.open(pdf_path) as pdf:
        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        transactions = []
        current_card_suffix = None
        # Name of the requested card's holder, remembered separately so a
        # later card holder line for a different card cannot erase it.
        matched_card_name = None
        card_suffix_found = False
        in_section_b = False

        # Start from page 2 (index 1) as page 1 is summary only
        start_page = 1 if len(pdf.pages) > 1 else 0

        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
            page_text = page.extract_text() or ""

            logger.debug(f"Processing page {page_num}")

            if not in_section_b:
                if not find_section_b_start(page_text):
                    continue
                in_section_b = True
                logger.debug(f"Found section B on page {page_num}")

            # Evaluate once; the page containing the end marker is still
            # processed before the loop stops.
            # NOTE(review): section boundaries are detected per page, so
            # lines outside section B on the first/last page are parsed too.
            section_ends_here = is_section_end(page_text)
            if section_ends_here:
                logger.debug(f"Found section end on page {page_num}")

            # Parse text line by line
            for line in page_text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                # Card holder lines switch which card the following
                # transactions belong to.
                card_info = extract_card_holder(line)
                if card_info:
                    current_card_suffix, holder_name = card_info
                    logger.debug(f"Found card holder: {current_card_suffix} - {holder_name}")
                    if current_card_suffix == card_suffix:
                        card_suffix_found = True
                        matched_card_name = holder_name
                    continue

                # Skip if we're not tracking the right card
                if current_card_suffix != card_suffix:
                    continue

                # Try to parse as transaction
                transaction = parse_transaction_line(line)
                if transaction:
                    transactions.append(transaction)
                    logger.debug(f"Extracted transaction: {transaction['reference']}")

            if section_ends_here:
                break

        # in_section_b is never reset, so it doubles as "section B was found".
        if not in_section_b:
            raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

        if not card_suffix_found:
            raise ValueError(f"Card suffix '{card_suffix}' not found in statement")

        # Calculate summary
        total_crc = sum(t["amount_crc"] or 0 for t in transactions)
        total_usd = sum(t["amount_usd"] or 0 for t in transactions)

        return {
            "metadata": {
                "source_file": pdf_path.name,
                "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                "statement_date": statement_date,
                "card_filter": card_suffix,
                "total_transactions": len(transactions)
            },
            "card_holder": {
                "card_suffix": card_suffix,
                "name": matched_card_name
            },
            "transactions": transactions,
            "summary": {
                "total_crc": round(total_crc, 2),
                "total_usd": round(total_usd, 2),
                "transaction_count": len(transactions)
            }
        }


def main():
    """Command-line entry point.

    Parses the arguments, validates the card suffix and the PDF path, runs
    the extraction, writes the result as JSON and prints a short summary.
    Every failure is reported on stderr and ends the process with status 1.
    """
    usage_examples = """
Examples:
  python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
  python bac_extract.py statement.pdf 1234 -o output.json -v
        """

    parser = argparse.ArgumentParser(
        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    parser.add_argument(
        "card_suffix", type=str, help="Last 4 digits of card to filter (e.g., 1234)"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=Path("transactions.json"),
        help="Output JSON file path (default: transactions.json)",
    )
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    def fail(message):
        # Report a problem on stderr and stop with a non-zero exit status.
        print(message, file=sys.stderr)
        sys.exit(1)

    # The suffix must be exactly four digits.
    suffix = args.card_suffix
    if not (suffix.isdigit() and len(suffix) == 4):
        fail(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'")

    # The input must exist and carry a .pdf extension.
    if not args.pdf_file.exists():
        fail(f"Error: File not found: {args.pdf_file}")

    if args.pdf_file.suffix.lower() != ".pdf":
        fail(f"Error: File must be a PDF: {args.pdf_file}")

    try:
        result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)

        # Compact JSON unless --pretty was requested.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)

        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
        print(f"Total USD: {result['summary']['total_usd']:,.2f}")

    except ValueError as e:
        fail(f"Error: {e}")
    except Exception as e:
        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


# Run the command-line interface only when this file is executed as a
# script; importing it as a module has no side effects beyond definitions.
if __name__ == "__main__":
    main()