Compare commits

...

5 commits

3 changed files with 135 additions and 290 deletions

2
.gitignore vendored
View file

@@ -1,2 +1,4 @@
*.pdf *.pdf
*.json *.json
__pycache__/
testStatements/

View file

@@ -4,20 +4,24 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
## Project Overview ## Project Overview
Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON. Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON.
## Dependencies ## Dependencies
- pdfplumber (>=0.10.0) - pdfplumber (>=0.10.0)
## Usage ## Commands
```bash ```bash
python bac_extract.py <pdf_file> <card_suffix> [options] # Run tests
python testStatements/run_tests.py
# Run extractor
python bac_extract.py <pdf_file> [options]
# Examples # Examples
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v python bac_extract.py statement.pdf -o output.json -v
``` ```
Options: Options:
@@ -29,12 +33,15 @@ Options:
The extraction pipeline: The extraction pipeline:
1. Validates PDF is a BAC statement (`is_bac_statement`) 1. Validates PDF is a BAC statement (`is_bac_statement`)
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) 2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns
3. Extracts tables page-by-page using pdfplumber 3. Parses transactions matching `TRANSACTION_PATTERN` regex
4. Filters transactions by card suffix (last 4 digits) 4. Outputs card holders, transactions by section, and summaries
5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
Key data structures:
- `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys
- `SPANISH_MONTHS`: Spanish month abbreviations for date parsing
Key parsing functions: Key parsing functions:
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-" - `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
- `extract_card_holder`: Matches "************1234 NAME" pattern - `matches_patterns`: Generic regex pattern matcher for section detection

View file

@@ -3,7 +3,10 @@
BAC Credit Card Statement Extractor BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs. Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo". Targets sections:
B) Detalle de compras del periodo
D) Detalle de otros cargos
E) Detalle de productos y servicios de elección voluntaria
""" """
import argparse import argparse
@@ -17,76 +20,63 @@ from typing import Optional
import pdfplumber import pdfplumber
# Spanish month abbreviations to month numbers
SPANISH_MONTHS = { SPANISH_MONTHS = {
"ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6, "ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
"JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12 "JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
} }
# Card holder pattern: ************XXXX NAME
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)") CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
# Date pattern: D-MMM-YY or DD-MMM-YY
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE) DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
# Transaction line pattern:
# Reference Date Description Location (optional) Currency Amount
# 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
TRANSACTION_PATTERN = re.compile( TRANSACTION_PATTERN = re.compile(
r"^(\d{12})\s+" # Reference (12 digits) r"^(\d{12,13})\s+"
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+" # Date r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
r"(.+?)\s+" # Description r"(.+?)\s+"
r"(CRC|USD)\s+" # Currency r"(CRC|USD)\s+"
r"([\d,]+\.\d{2})(-)?$", # Amount (with optional trailing minus) r"([\d,]+\.\d{2})(-)?$",
re.IGNORECASE re.IGNORECASE
) )
# Section definitions: start patterns, end patterns, output key
SECTIONS = {
"B": {
"start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
"end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
"key": "purchases",
},
"D": {
"start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
"end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
"key": "other_charges",
},
"E": {
"start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
"end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
"key": "voluntary_services",
},
}
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def parse_spanish_date(date_str: str) -> Optional[str]: def parse_spanish_date(date_str: str) -> Optional[str]:
"""Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD).""" """Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD)."""
if not date_str: match = DATE_PATTERN.match(date_str.strip()) if date_str else None
return None
match = DATE_PATTERN.match(date_str.strip())
if not match: if not match:
return None return None
day, month_abbr, year = match.groups() day, month_abbr, year = match.groups()
month = SPANISH_MONTHS.get(month_abbr.upper()) month = SPANISH_MONTHS.get(month_abbr.upper())
if not month: if not month:
return None return None
return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
# Assume 2000s for 2-digit year
full_year = 2000 + int(year)
try:
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
except ValueError:
return None
def parse_amount(amount_str: str) -> Optional[float]: def parse_amount(amount_str: str) -> Optional[float]:
""" """Parse amount with comma thousands separator. Handles trailing '-' for negatives."""
Parse amount string with comma thousands separator. if not amount_str or not (amount_str := amount_str.strip()):
Handles trailing '-' for negative values.
"""
if not amount_str or not amount_str.strip():
return None return None
amount_str = amount_str.strip()
# Check for trailing negative sign
is_negative = amount_str.endswith("-") is_negative = amount_str.endswith("-")
if is_negative:
amount_str = amount_str[:-1].strip()
# Remove thousands separators (commas) and handle decimal point
# Format: 1,234.56 or 1,234,567.89
try: try:
amount_str = amount_str.replace(",", "") amount = float(amount_str.rstrip("-").replace(",", ""))
amount = float(amount_str)
return -amount if is_negative else amount return -amount if is_negative else amount
except ValueError: except ValueError:
return None return None
@@ -96,7 +86,6 @@ def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
"""Check if the PDF is a BAC credit card statement.""" """Check if the PDF is a BAC credit card statement."""
if not pdf.pages: if not pdf.pages:
return False return False
first_page_text = pdf.pages[0].extract_text() or "" first_page_text = pdf.pages[0].extract_text() or ""
return "BAC" in first_page_text and "TARJETA" in first_page_text.upper() return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()
@@ -105,215 +94,114 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
"""Extract the statement date from the PDF.""" """Extract the statement date from the PDF."""
if not pdf.pages: if not pdf.pages:
return None return None
first_page_text = pdf.pages[0].extract_text() or "" first_page_text = pdf.pages[0].extract_text() or ""
# Look for date patterns in the first page
# Common format: "Fecha de corte: DD-MMM-YY" or similar
date_matches = DATE_PATTERN.findall(first_page_text) date_matches = DATE_PATTERN.findall(first_page_text)
if date_matches: if not date_matches:
# Use the first date found as statement date return None
day, month_abbr, year = date_matches[0] day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper()) month = SPANISH_MONTHS.get(month_abbr.upper())
if month: if not month:
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
return None return None
return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
def find_section_b_start(page_text: str) -> bool: def matches_patterns(text: str, patterns: list[str]) -> bool:
"""Check if page contains start of section B.""" """Check if text matches any of the given regex patterns."""
patterns = [ return any(re.search(p, text, re.IGNORECASE) for p in patterns)
r"B\)\s*Detalle\s+de\s+compras",
r"Detalle\s+de\s+compras\s+del\s+periodo",
]
for pattern in patterns:
if re.search(pattern, page_text, re.IGNORECASE):
return True
return False
def is_section_end(text: str) -> bool:
"""Check if we've reached the end of section B."""
end_patterns = [
r"Total\s+de\s+compras\s+del\s+periodo",
r"C\)\s*Detalle\s+de\s+intereses",
r"Detalle\s+de\s+intereses",
r"D\)\s*Detalle",
]
for pattern in end_patterns:
if re.search(pattern, text, re.IGNORECASE):
return True
return False
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
"""
Extract card holder info from a row.
Returns (card_suffix, name) or None.
"""
match = CARD_HOLDER_PATTERN.search(row_text)
if match:
return match.group(1), match.group(2).strip()
return None
def parse_transaction_line(line: str) -> Optional[dict]: def parse_transaction_line(line: str) -> Optional[dict]:
""" """Parse a transaction line into a dict, or return None if not a transaction."""
Parse a text-based transaction line. match = TRANSACTION_PATTERN.match(line.strip())
Format: Reference Date Description [Location] Currency Amount
Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
"""
line = line.strip()
if not line:
return None
match = TRANSACTION_PATTERN.match(line)
if not match: if not match:
return None return None
reference = match.group(1) reference, date_str, description, currency, amount_str, neg = match.groups()
date_str = match.group(2) currency = currency.upper()
desc_and_loc = match.group(3).strip()
currency = match.group(4).upper()
amount_str = match.group(5)
is_negative = match.group(6) == "-"
# Parse date
date = parse_spanish_date(date_str) date = parse_spanish_date(date_str)
if not date:
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
return None
# Parse amount
amount = parse_amount(amount_str) amount = parse_amount(amount_str)
if amount is None: if not date or amount is None:
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}") logger.warning(f"Could not parse transaction: {line}")
return None return None
if is_negative: if neg:
amount = -amount amount = -amount
# Split description and location
# Location is typically at the end, often a short suffix like "ANILL", "San Jose"
# For now, keep everything as description
description = desc_and_loc
location = None
# Set amount in appropriate currency field
amount_crc = amount if currency == "CRC" else None
amount_usd = amount if currency == "USD" else None
return { return {
"reference": reference, "reference": reference,
"date": date, "date": date,
"description": description, "description": description.strip(),
"location": location, "location": None,
"currency": currency, "currency": currency,
"amount_crc": amount_crc, "amount_crc": amount if currency == "CRC" else None,
"amount_usd": amount_usd, "amount_usd": amount if currency == "USD" else None,
} }
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict: def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
""" """Extract transactions from a BAC credit card statement PDF."""
Extract transactions from a BAC credit card statement PDF. logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
Args:
pdf_path: Path to the PDF file
card_suffix: Last 4 digits of card to filter
verbose: Enable verbose logging
Returns:
Dictionary with metadata, card_holder, transactions, and summary
"""
if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
with pdfplumber.open(pdf_path) as pdf: with pdfplumber.open(pdf_path) as pdf:
# Validate this is a BAC statement
if not is_bac_statement(pdf): if not is_bac_statement(pdf):
raise ValueError("PDF does not appear to be a BAC credit card statement") raise ValueError("PDF does not appear to be a BAC credit card statement")
statement_date = extract_statement_date(pdf) statement_date = extract_statement_date(pdf)
transactions = {s["key"]: [] for s in SECTIONS.values()}
card_holders = []
seen_card_suffixes = set()
current_section = None
sections_completed = set()
transactions = []
current_card_suffix = None
current_card_name = None
in_section_b = False
section_b_found = False
card_suffix_found = False
# Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0 start_page = 1 if len(pdf.pages) > 1 else 0
for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1): for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
page_text = page.extract_text() or "" page_text = page.extract_text() or ""
logger.debug(f"Processing page {page_num}") logger.debug(f"Processing page {page_num}")
# Check for section B start
if not in_section_b and find_section_b_start(page_text):
in_section_b = True
section_b_found = True
logger.debug(f"Found section B on page {page_num}")
# Check for section end
if in_section_b and is_section_end(page_text):
logger.debug(f"Found section end on page {page_num}")
# Still process this page, but mark we're ending
if not in_section_b:
continue
# Parse text line by line
for line in page_text.split("\n"): for line in page_text.split("\n"):
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
# Check for card holder line # Check for section end
card_info = extract_card_holder(line) if current_section and matches_patterns(line, SECTIONS[current_section]["end"]):
if card_info: logger.debug(f"Section {current_section} ended on page {page_num}")
current_card_suffix, current_card_name = card_info sections_completed.add(current_section)
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}") current_section = None
if current_card_suffix == card_suffix:
card_suffix_found = True
continue
# Skip if we're not tracking the right card # Check for section start
if current_card_suffix != card_suffix: if current_section is None:
continue for sec_id, sec in SECTIONS.items():
if sec_id not in sections_completed and matches_patterns(line, sec["start"]):
# Try to parse as transaction current_section = sec_id
transaction = parse_transaction_line(line) logger.debug(f"Found section {sec_id} on page {page_num}")
if transaction:
transactions.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}")
# Check if we've passed section B
if in_section_b and is_section_end(page_text):
break break
continue
if not section_b_found: # Extract card holder
match = CARD_HOLDER_PATTERN.search(line)
if match:
suffix, name = match.group(1), match.group(2).strip()
if suffix not in seen_card_suffixes:
card_holders.append({"card_suffix": suffix, "name": name})
seen_card_suffixes.add(suffix)
logger.debug(f"Found card holder: {suffix} - {name}")
continue
# Parse transaction
txn = parse_transaction_line(line)
if txn:
transactions[SECTIONS[current_section]["key"]].append(txn)
logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")
if "B" not in sections_completed and not transactions["purchases"]:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
if not card_suffix_found: def summarize(txns):
raise ValueError(f"Card suffix '{card_suffix}' not found in statement") return {
"total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
# Calculate summary "total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
total_crc = sum(t["amount_crc"] or 0 for t in transactions) "count": len(txns),
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
# Get card holder info
card_holder = None
if card_suffix_found:
card_holder = {
"card_suffix": card_suffix,
"name": current_card_name if current_card_suffix == card_suffix else None
} }
return { return {
@@ -321,98 +209,46 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
"source_file": pdf_path.name, "source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date, "statement_date": statement_date,
"card_filter": card_suffix, "total_transactions": sum(len(t) for t in transactions.values()),
"total_transactions": len(transactions)
}, },
"card_holder": card_holder, "card_holders": card_holders,
"transactions": transactions, **transactions,
"summary": { "summary": {key: summarize(txns) for key, txns in transactions.items()},
"total_crc": round(total_crc, 2),
"total_usd": round(total_usd, 2),
"transaction_count": len(transactions)
}
} }
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
description="Extract transactions from BAC Costa Rica credit card statement PDFs", parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
formatter_class=argparse.RawDescriptionHelpFormatter, parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
epilog=""" parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
Examples: parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v
"""
)
parser.add_argument(
"pdf_file",
type=Path,
help="Path to the BAC statement PDF"
)
parser.add_argument(
"card_suffix",
type=str,
help="Last 4 digits of card to filter (e.g., 1234)"
)
parser.add_argument(
"-o", "--output",
type=Path,
default=Path("transactions.json"),
help="Output JSON file path (default: transactions.json)"
)
parser.add_argument(
"--pretty",
action="store_true",
help="Pretty-print JSON output"
)
parser.add_argument(
"-v", "--verbose",
action="store_true",
help="Enable verbose logging"
)
args = parser.parse_args() args = parser.parse_args()
# Validate card suffix
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
sys.exit(1)
# Validate PDF file exists
if not args.pdf_file.exists(): if not args.pdf_file.exists():
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr) sys.exit(f"Error: File not found: {args.pdf_file}")
sys.exit(1) if args.pdf_file.suffix.lower() != ".pdf":
sys.exit(f"Error: File must be a PDF: {args.pdf_file}")
if not args.pdf_file.suffix.lower() == ".pdf":
print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
sys.exit(1)
try: try:
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose) result = extract_transactions(args.pdf_file, args.verbose)
# Write output
indent = 2 if args.pretty else None
with open(args.output, "w", encoding="utf-8") as f: with open(args.output, "w", encoding="utf-8") as f:
json.dump(result, f, indent=indent, ensure_ascii=False) json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)
print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}") summary = result["summary"]
print(f"Total CRC: {result['summary']['total_crc']:,.2f}") print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
print(f"Total USD: {result['summary']['total_usd']:,.2f}") for key, label in [("purchases", "Purchases (B)"), ("other_charges", "Other charges (D)"),
("voluntary_services", "Voluntary services (E)")]:
s = summary[key]
print(f" {label:25} {s['count']:3d} CRC {s['total_crc']:>12,.2f} USD {s['total_usd']:>10,.2f}")
except ValueError as e: except ValueError as e:
print(f"Error: {e}", file=sys.stderr) sys.exit(f"Error: {e}")
sys.exit(1)
except Exception as e: except Exception as e:
print(f"Error processing PDF: {e}", file=sys.stderr)
if args.verbose: if args.verbose:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
sys.exit(1) sys.exit(f"Error processing PDF: {e}")
if __name__ == "__main__": if __name__ == "__main__":