bac_tools/bac_extract.py

256 lines
9.4 KiB
Python
Raw Permalink Normal View History

2026-03-09 13:24:41 -06:00
#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor

Extracts transactions from BAC Costa Rica credit card statement PDFs.

Targets sections:
    B) Detalle de compras del periodo
    D) Detalle de otros cargos
    E) Detalle de productos y servicios de elección voluntaria
"""
import argparse
import json
import logging
import math
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pdfplumber
# Upper-case three-letter Spanish month abbreviations mapped to month numbers.
# Both "SEP" (septiembre) and "SET" (setiembre) map to 9: "setiembre" is an
# accepted spelling of September, so a date may be abbreviated either way.
SPANISH_MONTHS = {
    "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AGO": 8, "SEP": 9, "SET": 9, "OCT": 10, "NOV": 11, "DIC": 12,
}
# Card-holder line: twelve literal "*" characters, four digits (captured),
# whitespace, then the rest of the line (captured).
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date token of the form D-MMM-YY; groups are (day, month abbreviation, year).
# The digit lookarounds keep the token from matching inside a longer run of
# digits: without them "09-MAR-2026" would match as "09-MAR-20" and
# "109-MAR-26" as "09-MAR-26", silently producing a wrong date.
DATE_PATTERN = re.compile(r"(?<!\d)(\d{1,2})-([A-Z]{3})-(\d{2})(?!\d)", re.IGNORECASE)

# A full transaction line. Groups, in order: reference, date, description,
# currency, amount, optional trailing "-" sign.
TRANSACTION_PATTERN = re.compile(
    r"^(\d{12,13})\s+"
    r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
    r"(.+?)\s+"
    r"(CRC|USD)\s+"
    r"([\d,]+\.\d{2})(-)?$",
    re.IGNORECASE,
)
2026-03-09 15:39:16 -06:00
# Statement sections to extract, keyed by section letter. For each section:
#   "start" - regexes; a line matching any of them opens the section
#   "end"   - regexes; a line matching any of them closes the section
#   "key"   - name under which the section's transactions are reported
SECTIONS = {
    "B": {
        "start": [
            r"B\)\s*Detalle\s+de\s+compras",
            r"Detalle\s+de\s+compras\s+del\s+periodo",
        ],
        "end": [
            r"Total\s+de\s+compras\s+del\s+periodo",
            r"C\)\s*Detalle",
            r"D\)\s*Detalle",
            r"E\)\s*Detalle",
        ],
        "key": "purchases",
    },
    "D": {
        "start": [
            r"D\)\s*Detalle\s+de\s+otros\s+cargos",
        ],
        "end": [
            r"Total\s+por\s+concepto\s+otros\s+cargos",
            r"E\)\s*Detalle",
        ],
        "key": "other_charges",
    },
    "E": {
        "start": [
            r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios",
        ],
        "end": [
            r"Total\s+por\s+concepto\s+de\s+productos",
            r"F\)\s*Cargos",
        ],
        "key": "voluntary_services",
    },
}

# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)
def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse a Spanish date (D-MMM-YY) into ISO format (YYYY-MM-DD).

    The two-digit year is interpreted as 2000 + YY. Surrounding whitespace
    is ignored and the month abbreviation is matched case-insensitively.

    Args:
        date_str: Text that starts with a date such as "5-ENE-26".

    Returns:
        The ISO date string, or None when the input is empty, does not start
        with a D-MMM-YY token, uses an unknown month abbreviation, or names a
        day that does not exist in that month (e.g. "31-FEB-26").
    """
    match = DATE_PATTERN.match(date_str.strip()) if date_str else None
    if not match:
        return None
    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None
    try:
        # Building a real datetime rejects impossible dates (day 0, 31-FEB, ...)
        # instead of formatting them into an invalid ISO string.
        parsed = datetime(2000 + int(year), month, int(day))
    except ValueError:
        return None
    return parsed.date().isoformat()
2026-03-09 13:24:41 -06:00
def parse_amount(amount_str: str) -> Optional[float]:
    """Parse an amount that uses commas as thousands separators.

    A single trailing "-" marks the amount as negative (e.g. "1,234.56-").

    Args:
        amount_str: Text such as "1,234.56" or "1,234.56-".

    Returns:
        The amount as a float, or None when the input is empty, blank, not
        numeric, or not a finite number ("nan", "inf").
    """
    if not amount_str:
        return None
    text = amount_str.strip()
    if not text:
        return None

    is_negative = text.endswith("-")
    if is_negative:
        # Drop exactly one sign character; input such as "5--" is malformed
        # and must fail the float() conversion below.
        text = text[:-1]

    try:
        amount = float(text.replace(",", ""))
    except ValueError:
        return None

    # float() also accepts "nan" and "inf"; neither is a valid money amount.
    if not math.isfinite(amount):
        return None
    return -amount if is_negative else amount
def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Tell whether the PDF looks like a BAC credit card statement.

    Only the text of the first page is inspected: it must contain "BAC"
    (exact case) and "TARJETA" (any case). A PDF without pages is rejected.
    """
    pages = pdf.pages
    if not pages:
        return False

    text = pages[0].extract_text() or ""
    if "BAC" not in text:
        return False
    return "TARJETA" in text.upper()
def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Extract the statement date from the first page of the PDF.

    The first page is scanned for D-MMM-YY tokens and the first one that
    parses is returned (presumably the statement date; the code takes no
    other cue from the page layout).

    Args:
        pdf: An open pdfplumber PDF.

    Returns:
        The date in ISO format (YYYY-MM-DD), or None when the PDF has no
        pages or the first page holds no parseable date.
    """
    if not pdf.pages:
        return None
    first_page_text = pdf.pages[0].extract_text() or ""
    for candidate in DATE_PATTERN.finditer(first_page_text):
        # Reuse the shared parser so both code paths agree on what a valid
        # date is, and keep scanning when a date-shaped token does not parse
        # (e.g. three letters that are not a month abbreviation).
        iso_date = parse_spanish_date(candidate.group(0))
        if iso_date:
            return iso_date
    return None
2026-03-09 13:24:41 -06:00
2026-03-09 15:39:16 -06:00
def matches_patterns(text: str, patterns: list[str]) -> bool:
    """Return True when at least one regex in *patterns* is found in *text*.

    Patterns are searched anywhere in the text, ignoring case. An empty
    pattern list never matches.
    """
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
2026-03-09 13:24:41 -06:00
def parse_transaction_line(line: str) -> Optional[dict]:
    """Turn one statement line into a transaction record.

    Returns None when the line does not have the transaction layout, or when
    its date or amount cannot be parsed (the latter case is logged as a
    warning). Otherwise returns a dict with the keys "reference", "date",
    "description", "location" (always None), "currency", "amount_crc" and
    "amount_usd"; only the amount field matching the currency is filled in.
    """
    hit = TRANSACTION_PATTERN.match(line.strip())
    if hit is None:
        return None

    reference, raw_date, description, currency, raw_amount, minus = hit.groups()
    iso_date = parse_spanish_date(raw_date)
    value = parse_amount(raw_amount)
    if value is None or not iso_date:
        logger.warning(f"Could not parse transaction: {line}")
        return None

    # The pattern captures a trailing "-" separately from the digits.
    if minus:
        value = -value
    currency = currency.upper()

    record = {
        "reference": reference,
        "date": iso_date,
        "description": description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": value if currency == "CRC" else None,
        "amount_usd": value if currency == "USD" else None,
    }
    return record
2026-03-09 13:59:03 -06:00
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """Extract the transactions of a BAC credit card statement PDF.

    Lines are read page by page and fed through a small state machine: a line
    matching a section's "start" patterns opens that section, a line matching
    its "end" patterns closes it, and while a section is open each line is
    read either as a card-holder line or as a transaction. A closed section
    is never reopened. When the PDF has more than one page, the first page is
    not scanned for transactions.

    Args:
        pdf_path: Path of the statement PDF.
        verbose: Configure logging at DEBUG level instead of INFO.

    Returns:
        A dict with "metadata", "card_holders", one list of transactions per
        section key, and a "summary" of totals and counts per section key.

    Raises:
        ValueError: If the PDF does not look like a BAC statement, or if
            section B was never closed and yielded no purchases.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    buckets = {spec["key"]: [] for spec in SECTIONS.values()}
    holders = []
    known_suffixes = set()
    finished = set()
    active = None  # id of the section currently being read, if any

    with pdfplumber.open(pdf_path) as pdf:
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")
        statement_date = extract_statement_date(pdf)

        first_index = 1 if len(pdf.pages) > 1 else 0
        numbered_pages = enumerate(pdf.pages[first_index:], start=first_index + 1)
        for page_num, page in numbered_pages:
            text = page.extract_text() or ""
            logger.debug(f"Processing page {page_num}")

            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                # Close the open section first; the same line may then open
                # the next one (e.g. "D) Detalle ..." both ends B and starts D).
                if active and matches_patterns(line, SECTIONS[active]["end"]):
                    logger.debug(f"Section {active} ended on page {page_num}")
                    finished.add(active)
                    active = None

                if active is None:
                    opened = next(
                        (
                            sec_id
                            for sec_id, spec in SECTIONS.items()
                            if sec_id not in finished
                            and matches_patterns(line, spec["start"])
                        ),
                        None,
                    )
                    if opened is not None:
                        active = opened
                        logger.debug(f"Found section {opened} on page {page_num}")
                    # Header lines and lines outside any section carry no data.
                    continue

                holder = CARD_HOLDER_PATTERN.search(line)
                if holder is not None:
                    suffix = holder.group(1)
                    name = holder.group(2).strip()
                    if suffix not in known_suffixes:
                        holders.append({"card_suffix": suffix, "name": name})
                        known_suffixes.add(suffix)
                        logger.debug(f"Found card holder: {suffix} - {name}")
                    continue

                txn = parse_transaction_line(line)
                if not txn:
                    continue
                buckets[SECTIONS[active]["key"]].append(txn)
                logger.debug(f"Extracted {active} transaction: {txn['reference']}")

    if "B" not in finished and not buckets["purchases"]:
        raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

    # Per-section totals; a missing amount (None) counts as zero.
    summary = {}
    for key, txns in buckets.items():
        summary[key] = {
            "total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
            "total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
            "count": len(txns),
        }

    metadata = {
        "source_file": pdf_path.name,
        "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "statement_date": statement_date,
        "total_transactions": sum(len(txns) for txns in buckets.values()),
    }
    return {
        "metadata": metadata,
        "card_holders": holders,
        **buckets,
        "summary": summary,
    }
def main():
    """Command-line entry point.

    Parses the arguments, extracts the transactions from the given PDF,
    writes them as JSON to the output path and prints a per-section summary.
    Exits with an error message when the input file is missing, is not a
    ".pdf" file, or cannot be processed.
    """
    cli = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
    cli.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    cli.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
    cli.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    cli.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
    args = cli.parse_args()

    source = args.pdf_file
    if not source.exists():
        sys.exit(f"Error: File not found: {source}")
    if source.suffix.lower() != ".pdf":
        sys.exit(f"Error: File must be a PDF: {source}")

    # Section keys paired with the labels shown in the printed summary.
    labels = {
        "purchases": "Purchases (B)",
        "other_charges": "Other charges (D)",
        "voluntary_services": "Voluntary services (E)",
    }

    try:
        result = extract_transactions(source, args.verbose)

        indent = 2 if args.pretty else None
        with open(args.output, "w", encoding="utf-8") as out:
            json.dump(result, out, indent=indent, ensure_ascii=False)

        summary = result["summary"]
        total = result["metadata"]["total_transactions"]
        print(f"Extracted {total} transactions to {args.output}")
        for key, label in labels.items():
            stats = summary[key]
            print(f" {label:25} {stats['count']:3d} CRC {stats['total_crc']:>12,.2f} USD {stats['total_usd']:>10,.2f}")
    except ValueError as err:
        sys.exit(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: report any other failure, with a traceback
        # only when verbose output was requested.
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(f"Error processing PDF: {err}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()