bac_tools/bac_extract.py

365 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo".
"""
import argparse
import json
import logging
import math
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pdfplumber
# Month number (1-12) keyed by the three-letter Spanish month abbreviation.
SPANISH_MONTHS = {
    abbreviation: number
    for number, abbreviation in enumerate(
        (
            "ENE", "FEB", "MAR", "ABR", "MAY", "JUN",
            "JUL", "AGO", "SEP", "OCT", "NOV", "DIC",
        ),
        start=1,
    )
}

# Card holder row: twelve asterisks, four digits, whitespace, then the name.
# Group 1 is the four-digit suffix, group 2 the rest of the line.
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")

# Date token of the form D-MMM-YY or DD-MMM-YY (month letters in any case).
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)

# One transaction row, columns separated by whitespace:
#   Reference    Date     Description    Currency Amount
#   123456789012 9-ENE-26 EXAMPLE STORE  CRC      1,234.56
# The column sub-patterns are joined with the "\s+" separator.
TRANSACTION_PATTERN = re.compile(
    r"\s+".join(
        (
            r"^(\d{12})",                    # Reference (12 digits)
            r"(\d{1,2}-[A-Z]{3}-\d{2})",     # Date
            r"(.+?)",                        # Description (non-greedy)
            r"(CRC|USD)",                    # Currency
            r"([\d,]+\.\d{2})(-)?$",         # Amount, optional trailing minus
        )
    ),
    re.IGNORECASE,
)

logger = logging.getLogger(__name__)
def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse a Spanish short date (D-MMM-YY) into ISO format (YYYY-MM-DD).

    Args:
        date_str: Date text such as "9-ENE-26". Surrounding whitespace is
            ignored and the month abbreviation is matched case-insensitively.

    Returns:
        The ISO date string, or None when the input is empty, is not exactly
        a D-MMM-YY token, uses an unknown month abbreviation, or names a day
        that does not exist in that month (for example 31-FEB-26).
    """
    if not date_str:
        return None

    # fullmatch rather than match: trailing characters must be rejected,
    # otherwise "9-ENE-2026" would silently be read as the year 2020.
    match = DATE_PATTERN.fullmatch(date_str.strip())
    if not match:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if not month:
        return None

    # Two-digit years are assumed to fall in the 2000s.
    full_year = 2000 + int(year)
    try:
        # Constructing a real datetime validates the day against the
        # calendar; it raises ValueError for day 0, day 32, 30-FEB, etc.
        return datetime(full_year, month, int(day)).date().isoformat()
    except ValueError:
        return None
def parse_amount(amount_str: str) -> Optional[float]:
    """
    Parse an amount string that uses commas as thousands separators.

    A trailing '-' marks the value as negative (e.g. "1,234.56-").

    Args:
        amount_str: Raw amount text; surrounding whitespace is ignored.

    Returns:
        The amount as a float, or None when the input is empty, is not a
        number, or is not a finite number ("nan", "inf", ...).
    """
    if not amount_str:
        return None

    text = amount_str.strip()
    if not text:
        return None

    is_negative = text.endswith("-")
    if is_negative:
        text = text[:-1]

    try:
        amount = float(text.replace(",", ""))
    except ValueError:
        return None

    # float() also accepts "nan" / "inf" / "Infinity". Those are never valid
    # amounts: they would poison any summed total and json.dump would write
    # them as the non-standard tokens NaN / Infinity.
    if not math.isfinite(amount):
        return None

    return -amount if is_negative else amount
def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
    """Decide whether the PDF looks like a BAC credit card statement.

    Only the text of the first page is inspected: it must contain the
    literal "BAC" (case-sensitive) and the word "TARJETA" in any case.
    A PDF without pages is never accepted.
    """
    pages = pdf.pages
    if not pages:
        return False

    text = pages[0].extract_text() or ""
    if "BAC" not in text:
        return False
    return "TARJETA" in text.upper()
def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
    """Extract the statement date from the first page of the PDF.

    The first page's text is scanned for D-MMM-YY tokens and the first one
    that converts to a date is returned.
    NOTE(review): this presumes the statement date is the first date printed
    on page 1 - confirm against real statements.

    Args:
        pdf: An opened pdfplumber PDF.

    Returns:
        The date in ISO format (YYYY-MM-DD), or None when the PDF has no
        pages or no date-like token on the first page can be parsed.
    """
    if not pdf.pages:
        return None

    first_page_text = pdf.pages[0].extract_text() or ""

    # Walk every date-like token instead of only the first one: a token with
    # an unknown month abbreviation must not hide a valid date further down.
    # Conversion is delegated to parse_spanish_date so both code paths share
    # one set of parsing rules.
    for match in DATE_PATTERN.finditer(first_page_text):
        parsed = parse_spanish_date(match.group(0))
        if parsed:
            return parsed
    return None
def find_section_b_start(page_text: str) -> bool:
    """Report whether the text contains the heading that opens section B.

    Either the lettered form ("B) Detalle de compras") or the full title
    ("Detalle de compras del periodo") counts; matching ignores case.
    """
    for heading in (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    ):
        if re.search(heading, page_text, re.IGNORECASE):
            return True
    return False
def is_section_end(text: str) -> bool:
    """Report whether the text contains a marker that closes section B.

    Markers are the purchases total line, the interest-detail heading (with
    or without its "C)" prefix) and any "D) Detalle" heading; matching
    ignores case.
    """
    for marker in (
        r"Total\s+de\s+compras\s+del\s+periodo",
        r"C\)\s*Detalle\s+de\s+intereses",
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    ):
        if re.search(marker, text, re.IGNORECASE):
            return True
    return False
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """
    Pull the card holder details out of a row of text.

    Returns a (card_suffix, name) pair when the row contains the masked
    card number pattern, with the name stripped of surrounding whitespace;
    returns None for any other row.
    """
    found = CARD_HOLDER_PATTERN.search(row_text)
    if found is None:
        return None

    suffix, holder = found.groups()
    return suffix, holder.strip()
def parse_transaction_line(line: str) -> Optional[dict]:
    """
    Turn one line of statement text into a transaction record.

    Expected layout: Reference Date Description Currency Amount
    Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56

    Returns a dict with the keys reference, date, description, location,
    currency, amount_crc and amount_usd. Only the amount matching the
    currency is filled in; the other amount and "location" are always None.
    Returns None when the line is blank, does not match the transaction
    layout, or its date or amount cannot be parsed (a warning is logged in
    the last two cases).
    """
    stripped = line.strip()
    if not stripped:
        return None

    found = TRANSACTION_PATTERN.match(stripped)
    if found is None:
        return None

    reference, date_str, raw_description, raw_currency, amount_str, sign = found.groups()

    date = parse_spanish_date(date_str)
    if not date:
        logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
        return None

    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
        return None

    # The trailing minus is captured separately from the digits, so the sign
    # is applied here rather than inside parse_amount.
    if sign == "-":
        amount = -amount

    currency = raw_currency.upper()
    record = {
        "reference": reference,
        "date": date,
        "description": raw_description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": None,
        "amount_usd": None,
    }
    if currency == "CRC":
        record["amount_crc"] = amount
    elif currency == "USD":
        record["amount_usd"] = amount
    return record
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
    """
    Read a BAC credit card statement PDF and collect its section B purchases.

    Args:
        pdf_path: Path to the PDF file
        verbose: Configure logging at DEBUG level instead of INFO

    Returns:
        Dictionary with the keys "metadata", "card_holder", "transactions"
        and "summary". "card_holder" is None when no card holder row was
        seen; otherwise it reflects the last such row encountered.

    Raises:
        ValueError: If the PDF does not look like a BAC statement, or the
            section B heading is not found on any scanned page.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    transactions = []
    card_suffix = None
    card_holder_name = None
    # Once section B has been entered it is never left except by stopping the
    # scan, so a single flag covers both "found" and "currently inside".
    section_b_found = False

    with pdfplumber.open(pdf_path) as pdf:
        # Validate this is a BAC statement
        if not is_bac_statement(pdf):
            raise ValueError("PDF does not appear to be a BAC credit card statement")

        statement_date = extract_statement_date(pdf)

        # Page 1 is summary only, so skip it unless it is the only page.
        start_page = 0 if len(pdf.pages) <= 1 else 1

        for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
            page_text = page.extract_text() or ""
            logger.debug(f"Processing page {page_num}")

            if not section_b_found:
                if not find_section_b_start(page_text):
                    continue
                section_b_found = True
                logger.debug(f"Found section B on page {page_num}")

            # The end marker is detected per page; the whole page is still
            # parsed before the scan stops.
            reached_section_end = is_section_end(page_text)
            if reached_section_end:
                logger.debug(f"Found section end on page {page_num}")

            for raw_line in page_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                card_info = extract_card_holder(line)
                if card_info:
                    card_suffix, card_holder_name = card_info
                    logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
                else:
                    transaction = parse_transaction_line(line)
                    if transaction:
                        transactions.append(transaction)
                        logger.debug(f"Extracted transaction: {transaction['reference']}")

            if reached_section_end:
                break

    if not section_b_found:
        raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")

    # Per-currency totals; a transaction contributes 0 to the other currency.
    total_crc = sum(t["amount_crc"] or 0 for t in transactions)
    total_usd = sum(t["amount_usd"] or 0 for t in transactions)
    transaction_count = len(transactions)

    if card_suffix:
        card_holder = {"card_suffix": card_suffix, "name": card_holder_name}
    else:
        card_holder = None

    metadata = {
        "source_file": pdf_path.name,
        "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "statement_date": statement_date,
        "total_transactions": transaction_count,
    }
    summary = {
        "total_crc": round(total_crc, 2),
        "total_usd": round(total_usd, 2),
        "transaction_count": transaction_count,
    }
    return {
        "metadata": metadata,
        "card_holder": card_holder,
        "transactions": transactions,
        "summary": summary,
    }
def _build_arg_parser():
    """Create the command-line parser for the extractor."""
    parser = argparse.ArgumentParser(
        description="Extract transactions from BAC Costa Rica credit card statement PDFs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf -o output.json -v
""",
    )
    parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=Path("transactions.json"),
        help="Output JSON file path (default: transactions.json)",
    )
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
    return parser


def main():
    """Command-line entry point.

    Parses the arguments, checks that the input exists and has a .pdf
    suffix, runs the extraction, writes the result as JSON to the output
    path and prints the transaction count and per-currency totals.
    Every failure prints a message to stderr and exits with status 1.
    """
    args = _build_arg_parser().parse_args()

    # Reject bad input paths before attempting to open anything.
    if not args.pdf_file.exists():
        print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)
    if args.pdf_file.suffix.lower() != ".pdf":
        print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
        sys.exit(1)

    try:
        result = extract_transactions(args.pdf_file, args.verbose)

        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(
                result,
                f,
                indent=2 if args.pretty else None,
                ensure_ascii=False,
            )

        print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
        print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
        print(f"Total USD: {result['summary']['total_usd']:,.2f}")
    except ValueError as e:
        # Validation problems raised by the extractor: message only.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Anything else: include a traceback only in verbose mode.
        print(f"Error processing PDF: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()