bac_tools/bac_extract.py

255 lines
9.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs.
Targets sections:
B) Detalle de compras del periodo
D) Detalle de otros cargos
E) Detalle de productos y servicios de elección voluntaria
"""
import argparse
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import pdfplumber
# Spanish three-letter month abbreviations (upper case) mapped to their
# calendar month number, 1 through 12.
SPANISH_MONTHS = {
    abbreviation: number
    for number, abbreviation in enumerate(
        (
            "ENE", "FEB", "MAR", "ABR", "MAY", "JUN",
            "JUL", "AGO", "SEP", "OCT", "NOV", "DIC",
        ),
        start=1,
    )
}
# Card-holder line: twelve literal asterisks followed by four digits, then
# whitespace and the rest of the line. Group 1 is stored as the card suffix
# and group 2 as the holder name by extract_transactions.
# NOTE(review): the asterisks are presumably the masked card number - confirm.
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
# Date token shaped like D-MMM-YY: 1-2 digit day, three letters, 2-digit year.
# Groups: day, month abbreviation, year. Matching is case-insensitive.
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
# A full transaction row, anchored at both ends of the line. Groups, in order:
# reference, date, description, currency, amount, optional trailing minus sign.
TRANSACTION_PATTERN = re.compile(
# Group 1: reference made of 12 or 13 digits.
r"^(\d{12,13})\s+"
# Group 2: date token in the same D-MMM-YY shape as DATE_PATTERN.
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
# Group 3: free-text description, matched lazily up to the currency code.
r"(.+?)\s+"
# Group 4: currency code, either CRC or USD.
r"(CRC|USD)\s+"
# Group 5: amount with comma separators and exactly two decimals.
# Group 6: optional trailing "-" that marks the amount as negative.
r"([\d,]+\.\d{2})(-)?$",
re.IGNORECASE
)
# Section definitions, keyed by the section letter used in the statement.
# For each section:
#   "start": regexes; a line matching any of them opens the section.
#   "end":   regexes; a line matching any of them closes the section.
#   "key":   name of the list in the extraction result that receives the
#            section's transactions.
# All patterns are applied case-insensitively via matches_patterns().
SECTIONS = {
"B": {
"start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
"end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
"key": "purchases",
},
"D": {
"start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
"end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
"key": "other_charges",
},
"E": {
"start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
"end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
"key": "voluntary_services",
},
}
# Module-level logger. Its output level is set indirectly: extract_transactions
# calls logging.basicConfig() with DEBUG or INFO depending on its verbose flag.
logger = logging.getLogger(__name__)
def parse_spanish_date(date_str: str) -> Optional[str]:
    """Convert a Spanish ``D-MMM-YY`` date into ISO ``YYYY-MM-DD`` form.

    The two-digit year is interpreted as 2000 + YY. Surrounding whitespace is
    ignored and the month abbreviation is matched case-insensitively.

    Args:
        date_str: Text such as ``"5-ENE-24"``. May be empty or ``None``.

    Returns:
        The ISO date string, or ``None`` when the input is empty, does not
        start with a ``D-MMM-YY`` token, uses an unknown month abbreviation,
        or names a day that does not exist in that month (e.g. ``31-FEB-24``).
    """
    if not date_str:
        return None
    match = DATE_PATTERN.match(date_str.strip())
    if not match:
        return None
    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None
    try:
        # Building a real datetime rejects impossible days such as 0 or
        # 31-FEB, which plain string formatting would pass through as a
        # syntactically ISO-looking but non-existent date.
        parsed = datetime(2000 + int(year), month, int(day))
    except ValueError:
        return None
    return parsed.date().isoformat()
def parse_amount(amount_str: str) -> Optional[float]:
    """Parse a monetary amount that uses commas as thousands separators.

    A trailing ``-`` marks the amount as negative (``"1,234.56-"`` becomes
    ``-1234.56``). Surrounding whitespace is ignored.

    Args:
        amount_str: Text such as ``"1,234.56"`` or ``"75.00-"``. May be empty
            or ``None``.

    Returns:
        The amount as a float, or ``None`` when the input is empty or is not a
        plain decimal number.
    """
    if not amount_str:
        return None
    text = amount_str.strip()
    if not text:
        return None
    is_negative = text.endswith("-")
    digits = text.rstrip("-").strip().replace(",", "")
    # float() on its own also accepts "nan", "inf", exponent notation and
    # underscore separators. None of those are valid statement amounts, and a
    # non-finite value would later be serialized as invalid JSON, so only
    # plain decimal numbers are let through.
    if not re.fullmatch(r"[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)", digits):
        return None
    amount = float(digits)
    return -amount if is_negative else amount
def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Tell whether the first page of *pdf* looks like a BAC card statement.

    The check passes when the first page's text contains ``"BAC"`` (matched
    case-sensitively) and the word ``"TARJETA"`` (matched in any letter case).
    A document without pages never passes.
    """
    if not pdf.pages:
        return False
    cover_text = pdf.pages[0].extract_text() or ""
    if "BAC" not in cover_text:
        return False
    return "TARJETA" in cover_text.upper()
def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Return the first valid ``D-MMM-YY`` date found on the first page.

    NOTE(review): this assumes the first date printed on page 1 is the
    statement date - confirm against real statements.

    Args:
        pdf: An open pdfplumber document.

    Returns:
        The date in ISO ``YYYY-MM-DD`` form, or ``None`` when the document has
        no pages or the first page holds no parseable date.
    """
    if not pdf.pages:
        return None
    first_page_text = pdf.pages[0].extract_text() or ""
    # Delegate conversion to parse_spanish_date so both code paths share one
    # set of rules, and keep scanning when a date-shaped token turns out not
    # to be a real date (e.g. three letters that are not a month) instead of
    # giving up on the first candidate.
    for candidate in DATE_PATTERN.finditer(first_page_text):
        iso_date = parse_spanish_date(candidate.group(0))
        if iso_date:
            return iso_date
    return None
def matches_patterns(text: str, patterns: list[str]) -> bool:
    """Report whether at least one regex in *patterns* is found in *text*.

    Each pattern is searched anywhere in the text, ignoring letter case. An
    empty pattern list never matches.
    """
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
def parse_transaction_line(line: str) -> Optional[dict]:
    """Turn one statement line into a transaction record.

    Args:
        line: A single line of text extracted from the statement.

    Returns:
        A dict with the keys ``reference``, ``date`` (ISO string),
        ``description``, ``location`` (always ``None``), ``currency``,
        ``amount_crc`` and ``amount_usd``. Only the amount field matching the
        line's currency is filled; the other is ``None``. Returns ``None`` when
        the line is not shaped like a transaction, or when its date or amount
        cannot be parsed (a warning is logged in that case).
    """
    parsed = TRANSACTION_PATTERN.match(line.strip())
    if parsed is None:
        return None

    reference, raw_date, description, currency, raw_amount, minus_sign = parsed.groups()
    currency = currency.upper()
    iso_date = parse_spanish_date(raw_date)
    value = parse_amount(raw_amount)
    if value is None or not iso_date:
        logger.warning(f"Could not parse transaction: {line}")
        return None

    # The pattern captures the trailing "-" separately from the digits, so the
    # sign has to be applied here.
    signed_value = -value if minus_sign else value

    record = {
        "reference": reference,
        "date": iso_date,
        "description": description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": None,
        "amount_usd": None,
    }
    if currency == "CRC":
        record["amount_crc"] = signed_value
    if currency == "USD":
        record["amount_usd"] = signed_value
    return record
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """Extract transactions from a BAC credit card statement PDF.

    Pages are scanned line by line. A line matching a section's start pattern
    opens that section; while a section is open, lines are read as card-holder
    headers or transaction rows until one of the section's end patterns
    matches. When the document has more than one page, the first page is used
    only for validation and the statement date, not for transactions.

    Args:
        pdf_path: Location of the statement PDF.
        verbose: When true, logging is configured at DEBUG level instead of
            INFO.

    Returns:
        A dict with:
          * ``metadata``: source file name, UTC extraction timestamp,
            statement date and total transaction count;
          * ``card_holders``: one ``{"card_suffix", "name"}`` entry per
            distinct card suffix, in order of first appearance;
          * one list of transaction dicts per section key defined in
            ``SECTIONS``;
          * ``summary``: per-section CRC total, USD total and count.

    Raises:
        ValueError: If the PDF does not look like a BAC statement, or if the
            start of section B is never found.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    with pdfplumber.open(pdf_path) as pdf:
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)
        transactions = {s["key"]: [] for s in SECTIONS.values()}
        card_holders = []
        seen_card_suffixes = set()
        current_section = None
        # Sections whose start marker has been seen. Tracked separately from
        # sections_completed so that a section which was opened but is empty
        # and has no end marker is not mistaken for a missing one.
        sections_found = set()
        sections_completed = set()

        start_page = 1 if len(pdf.pages) > 1 else 0
        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
            page_text = page.extract_text() or ""
            logger.debug(f"Processing page {page_num}")

            for line in page_text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                # Check for section end
                if current_section and matches_patterns(line, SECTIONS[current_section]["end"]):
                    logger.debug(f"Section {current_section} ended on page {page_num}")
                    sections_completed.add(current_section)
                    current_section = None

                # Check for section start. The same line that closed one
                # section may open the next, so this runs right after the end
                # check. Lines outside any section, and the start line itself,
                # carry no transactions and are skipped.
                if current_section is None:
                    for sec_id, sec in SECTIONS.items():
                        if sec_id not in sections_completed and matches_patterns(line, sec["start"]):
                            current_section = sec_id
                            sections_found.add(sec_id)
                            logger.debug(f"Found section {sec_id} on page {page_num}")
                            break
                    continue

                # Extract card holder
                match = CARD_HOLDER_PATTERN.search(line)
                if match:
                    suffix, name = match.group(1), match.group(2).strip()
                    if suffix not in seen_card_suffixes:
                        card_holders.append({"card_suffix": suffix, "name": name})
                        seen_card_suffixes.add(suffix)
                        logger.debug(f"Found card holder: {suffix} - {name}")
                    continue

                # Parse transaction
                txn = parse_transaction_line(line)
                if txn:
                    transactions[SECTIONS[current_section]["key"]].append(txn)
                    logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")

    if "B" not in sections_found:
        raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

    def summarize(txns):
        """Return CRC total, USD total and count for a list of transactions."""
        return {
            "total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
            "total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
            "count": len(txns),
        }

    return {
        "metadata": {
            "source_file": pdf_path.name,
            "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "statement_date": statement_date,
            "total_transactions": sum(len(t) for t in transactions.values()),
        },
        "card_holders": card_holders,
        **transactions,
        "summary": {key: summarize(txns) for key, txns in transactions.items()},
    }
def main():
    """Command-line entry point: parse arguments, extract, write JSON, report.

    Exits with an error message when the input file is missing, does not have
    a ``.pdf`` suffix, or cannot be processed. On success the result is written
    to the output path and a per-section summary is printed.
    """
    cli = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
    cli.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    cli.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
    cli.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    cli.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
    args = cli.parse_args()

    # Validate the input path before doing any work.
    if not args.pdf_file.exists():
        sys.exit(f"Error: File not found: {args.pdf_file}")
    if args.pdf_file.suffix.lower() != ".pdf":
        sys.exit(f"Error: File must be a PDF: {args.pdf_file}")

    # Section keys paired with the labels shown in the console report.
    report_rows = (
        ("purchases", "Purchases (B)"),
        ("other_charges", "Other charges (D)"),
        ("voluntary_services", "Voluntary services (E)"),
    )

    try:
        result = extract_transactions(args.pdf_file, args.verbose)

        json_indent = 2 if args.pretty else None
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=json_indent, ensure_ascii=False)

        summary = result["summary"]
        print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
        for key, label in report_rows:
            s = summary[key]
            print(f" {label:25} {s['count']:3d} CRC {s['total_crc']:>12,.2f} USD {s['total_usd']:>10,.2f}")
    except ValueError as e:
        sys.exit(f"Error: {e}")
    except Exception as e:
        # Top-level boundary: show the traceback only when asked for, then
        # exit with a short message.
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(f"Error processing PDF: {e}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()