Compare commits

..

No commits in common. "14734d3125dd9da48bda8c55731ffd7abf68912c" and "a51f672316cad1e4e21376dd55369a795986aff4" have entirely different histories.

3 changed files with 292 additions and 137 deletions

2
.gitignore vendored
View file

@ -1,4 +1,2 @@
*.pdf
*.json
__pycache__/
testStatements/

View file

@ -4,24 +4,20 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
## Project Overview
Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON.
Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON.
## Dependencies
- pdfplumber (>=0.10.0)
## Commands
## Usage
```bash
# Run tests
python testStatements/run_tests.py
# Run extractor
python bac_extract.py <pdf_file> [options]
python bac_extract.py <pdf_file> <card_suffix> [options]
# Examples
python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf -o output.json -v
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v
```
Options:
@ -33,15 +29,12 @@ Options:
The extraction pipeline:
1. Validates PDF is a BAC statement (`is_bac_statement`)
2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns
3. Parses transactions matching `TRANSACTION_PATTERN` regex
4. Outputs card holders, transactions by section, and summaries
Key data structures:
- `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys
- `SPANISH_MONTHS`: Spanish month abbreviations for date parsing
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
3. Extracts tables page-by-page using pdfplumber
4. Filters transactions by card suffix (last 4 digits)
5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
Key parsing functions:
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
- `matches_patterns`: Generic regex pattern matcher for section detection
- `extract_card_holder`: Matches "************1234 NAME" pattern

View file

@ -3,10 +3,7 @@
BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs.
Targets sections:
B) Detalle de compras del periodo
D) Detalle de otros cargos
E) Detalle de productos y servicios de elección voluntaria
Specifically targets section "B) Detalle de compras del periodo".
"""
import argparse
@ -20,63 +17,76 @@ from typing import Optional
import pdfplumber
# Spanish month abbreviations to month numbers
SPANISH_MONTHS = {
"ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
"JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
}
# Card holder pattern: ************XXXX NAME
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
# Date pattern: D-MMM-YY or DD-MMM-YY
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
# Transaction line pattern:
# Reference Date Description Location (optional) Currency Amount
# 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
TRANSACTION_PATTERN = re.compile(
r"^(\d{12,13})\s+"
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
r"(.+?)\s+"
r"(CRC|USD)\s+"
r"([\d,]+\.\d{2})(-)?$",
r"^(\d{12})\s+" # Reference (12 digits)
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+" # Date
r"(.+?)\s+" # Description
r"(CRC|USD)\s+" # Currency
r"([\d,]+\.\d{2})(-)?$", # Amount (with optional trailing minus)
re.IGNORECASE
)
# Section definitions: start patterns, end patterns, output key
SECTIONS = {
"B": {
"start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
"end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
"key": "purchases",
},
"D": {
"start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
"end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
"key": "other_charges",
},
"E": {
"start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
"end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
"key": "voluntary_services",
},
}
logger = logging.getLogger(__name__)
def parse_spanish_date(date_str: str) -> Optional[str]:
    """Convert a Spanish-abbreviated date such as ``15-ENE-25`` to ISO ``YYYY-MM-DD``.

    Args:
        date_str: Text that, once surrounding whitespace is stripped, begins
            with a ``D-MMM-YY`` or ``DD-MMM-YY`` date. The month abbreviation
            is upper-cased before it is looked up in ``SPANISH_MONTHS``.

    Returns:
        The ISO-formatted date, or ``None`` when the input is empty, does not
        match ``DATE_PATTERN``, uses an unknown month abbreviation, or names a
        day that does not exist in that month (for example ``31-FEB-25``).
    """
    if not date_str:
        return None

    match = DATE_PATTERN.match(date_str.strip())
    if match is None:
        return None

    day, month_abbr, year = match.groups()
    month = SPANISH_MONTHS.get(month_abbr.upper())
    if month is None:
        return None

    # Imported locally so this function does not depend on which names the
    # module-level ``datetime`` import happens to expose.
    from datetime import date

    try:
        # Two-digit years are assumed to belong to the 2000s. Building a real
        # ``date`` (instead of formatting the raw numbers) is what rejects
        # impossible days such as 0 or 31-FEB.
        return date(2000 + int(year), month, int(day)).isoformat()
    except ValueError:
        return None
def parse_amount(amount_str: str) -> Optional[float]:
    """Parse a statement amount such as ``1,234.56`` or ``100.00-``.

    Commas are treated as thousands separators and removed. A single trailing
    ``-`` marks the value as negative.

    Args:
        amount_str: Raw amount text; surrounding whitespace is ignored.

    Returns:
        The amount as a float (negated when a trailing ``-`` is present), or
        ``None`` when the input is empty, blank, not numeric, or not a finite
        number.
    """
    if not amount_str:
        return None

    text = amount_str.strip()
    if not text:
        return None

    # The statement prints negatives with the sign after the digits.
    is_negative = text.endswith("-")
    if is_negative:
        text = text[:-1].strip()

    try:
        amount = float(text.replace(",", ""))
    except ValueError:
        return None

    # float() also accepts "nan", "inf" and "infinity"; none of those is a
    # monetary amount, so treat them as unparseable.
    import math

    if not math.isfinite(amount):
        return None

    return -amount if is_negative else amount
@ -86,6 +96,7 @@ def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
"""Check if the PDF is a BAC credit card statement."""
if not pdf.pages:
return False
first_page_text = pdf.pages[0].extract_text() or ""
return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()
@ -94,114 +105,215 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
"""Extract the statement date from the PDF."""
if not pdf.pages:
return None
first_page_text = pdf.pages[0].extract_text() or ""
# Look for date patterns in the first page
# Common format: "Fecha de corte: DD-MMM-YY" or similar
date_matches = DATE_PATTERN.findall(first_page_text)
if not date_matches:
return None
if date_matches:
# Use the first date found as statement date
day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper())
if not month:
if month:
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
return None
return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
def matches_patterns(text: str, patterns: list[str]) -> bool:
    """Report whether any regex in *patterns* is found anywhere in *text*.

    Matching uses ``re.search`` with ``re.IGNORECASE``.

    Args:
        text: The text to scan. ``None`` is treated as "nothing matches".
        patterns: Regular-expression source strings to try in order.

    Returns:
        ``True`` as soon as one pattern matches, otherwise ``False`` (including
        when *patterns* is empty).
    """
    if text is None:
        return False
    return any(
        re.search(pattern, text, re.IGNORECASE) is not None
        for pattern in patterns
    )
def find_section_b_start(page_text: str) -> bool:
    """Report whether *page_text* contains the heading that opens section B.

    Two case-insensitive spellings are accepted: the lettered heading
    (``B) Detalle de compras``) and the unlettered full title
    (``Detalle de compras del periodo``).

    Args:
        page_text: Text of one page. Empty or ``None`` yields ``False``.

    Returns:
        ``True`` if either heading form occurs anywhere in the text.
    """
    if not page_text:
        return False

    start_patterns = (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    )
    return any(
        re.search(pattern, page_text, re.IGNORECASE) is not None
        for pattern in start_patterns
    )
def is_section_end(text: str) -> bool:
    """Report whether *text* contains a marker that closes section B.

    The markers, matched case-insensitively, are the section B total line
    (``Total de compras del periodo``), the interest-detail heading
    (``Detalle de intereses``, with or without a leading ``C)``), and any
    ``D) Detalle`` heading.

    Args:
        text: A line or a whole page of text. Empty or ``None`` yields ``False``.

    Returns:
        ``True`` if any end marker occurs anywhere in the text.
    """
    if not text:
        return False

    end_patterns = (
        r"Total\s+de\s+compras\s+del\s+periodo",
        # Also covers the lettered form "C) Detalle de intereses", so no
        # separate pattern is needed for it.
        r"Detalle\s+de\s+intereses",
        r"D\)\s*Detalle",
    )
    return any(
        re.search(pattern, text, re.IGNORECASE) is not None
        for pattern in end_patterns
    )
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
    """Find a masked-card holder entry (``************1234 NAME``) in a row.

    Args:
        row_text: One line of statement text, searched with
            ``CARD_HOLDER_PATTERN``.

    Returns:
        ``(card_suffix, name)`` with the name stripped of surrounding
        whitespace, or ``None`` when the row holds no such entry.
    """
    found = CARD_HOLDER_PATTERN.search(row_text)
    if found is None:
        return None

    suffix, holder_name = found.groups()
    return suffix, holder_name.strip()
def parse_transaction_line(line: str) -> Optional[dict]:
    """Parse one statement text line into a transaction record.

    Expected layout: ``Reference Date Description [Location] Currency Amount``,
    for example ``123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56``.

    Args:
        line: A single line of page text; surrounding whitespace is ignored.

    Returns:
        ``None`` when the line is blank, does not match
        ``TRANSACTION_PATTERN``, or its date or amount cannot be parsed (the
        last two cases log a warning). Otherwise a dict with the keys
        ``reference``, ``date`` (ISO string), ``description``, ``location``,
        ``currency``, ``amount_crc`` and ``amount_usd``. Only the amount field
        matching the line's currency is populated; the other is ``None``.
    """
    line = line.strip()
    if not line:
        return None

    match = TRANSACTION_PATTERN.match(line)
    if match is None:
        return None

    reference, date_str, description, currency, amount_str, minus = match.groups()
    currency = currency.upper()

    date = parse_spanish_date(date_str)
    if not date:
        logger.warning("Could not parse date '%s' for reference %s", date_str, reference)
        return None

    amount = parse_amount(amount_str)
    if amount is None:
        logger.warning("Could not parse amount '%s' for reference %s", amount_str, reference)
        return None

    # The trailing minus is captured by its own regex group, so the amount
    # text itself is unsigned and the sign is applied here.
    if minus:
        amount = -amount

    return {
        "reference": reference,
        "date": date,
        # Description and location are not separated yet: everything between
        # the date and the currency is kept as the description.
        "description": description.strip(),
        "location": None,
        "currency": currency,
        "amount_crc": amount if currency == "CRC" else None,
        "amount_usd": amount if currency == "USD" else None,
    }
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
"""Extract transactions from a BAC credit card statement PDF."""
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
"""
Extract transactions from a BAC credit card statement PDF.
Args:
pdf_path: Path to the PDF file
card_suffix: Last 4 digits of card to filter
verbose: Enable verbose logging
Returns:
Dictionary with metadata, card_holder, transactions, and summary
"""
if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
with pdfplumber.open(pdf_path) as pdf:
# Validate this is a BAC statement
if not is_bac_statement(pdf):
raise ValueError("PDF does not appear to be a BAC credit card statement")
statement_date = extract_statement_date(pdf)
transactions = {s["key"]: [] for s in SECTIONS.values()}
card_holders = []
seen_card_suffixes = set()
current_section = None
sections_completed = set()
transactions = []
current_card_suffix = None
current_card_name = None
in_section_b = False
section_b_found = False
card_suffix_found = False
# Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0
for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
page_text = page.extract_text() or ""
logger.debug(f"Processing page {page_num}")
# Check for section B start
if not in_section_b and find_section_b_start(page_text):
in_section_b = True
section_b_found = True
logger.debug(f"Found section B on page {page_num}")
# Check for section end
if in_section_b and is_section_end(page_text):
logger.debug(f"Found section end on page {page_num}")
# Still process this page, but mark we're ending
if not in_section_b:
continue
# Parse text line by line
for line in page_text.split("\n"):
line = line.strip()
if not line:
continue
# Check for section end
if current_section and matches_patterns(line, SECTIONS[current_section]["end"]):
logger.debug(f"Section {current_section} ended on page {page_num}")
sections_completed.add(current_section)
current_section = None
# Check for card holder line
card_info = extract_card_holder(line)
if card_info:
current_card_suffix, current_card_name = card_info
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
if current_card_suffix == card_suffix:
card_suffix_found = True
continue
# Check for section start
if current_section is None:
for sec_id, sec in SECTIONS.items():
if sec_id not in sections_completed and matches_patterns(line, sec["start"]):
current_section = sec_id
logger.debug(f"Found section {sec_id} on page {page_num}")
# Skip if we're not tracking the right card
if current_card_suffix != card_suffix:
continue
# Try to parse as transaction
transaction = parse_transaction_line(line)
if transaction:
transactions.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}")
# Check if we've passed section B
if in_section_b and is_section_end(page_text):
break
continue
# Extract card holder
match = CARD_HOLDER_PATTERN.search(line)
if match:
suffix, name = match.group(1), match.group(2).strip()
if suffix not in seen_card_suffixes:
card_holders.append({"card_suffix": suffix, "name": name})
seen_card_suffixes.add(suffix)
logger.debug(f"Found card holder: {suffix} - {name}")
continue
# Parse transaction
txn = parse_transaction_line(line)
if txn:
transactions[SECTIONS[current_section]["key"]].append(txn)
logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")
if "B" not in sections_completed and not transactions["purchases"]:
if not section_b_found:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
def summarize(txns):
return {
"total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
"total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
"count": len(txns),
if not card_suffix_found:
raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
# Calculate summary
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
# Get card holder info
card_holder = None
if card_suffix_found:
card_holder = {
"card_suffix": card_suffix,
"name": current_card_name if current_card_suffix == card_suffix else None
}
return {
@ -209,46 +321,98 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
"source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date,
"total_transactions": sum(len(t) for t in transactions.values()),
"card_filter": card_suffix,
"total_transactions": len(transactions)
},
"card_holders": card_holders,
**transactions,
"summary": {key: summarize(txns) for key, txns in transactions.items()},
"card_holder": card_holder,
"transactions": transactions,
"summary": {
"total_crc": round(total_crc, 2),
"total_usd": round(total_usd, 2),
"transaction_count": len(transactions)
}
}
def main():
parser = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
parser = argparse.ArgumentParser(
description="Extract transactions from BAC Costa Rica credit card statement PDFs",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v
"""
)
parser.add_argument(
"pdf_file",
type=Path,
help="Path to the BAC statement PDF"
)
parser.add_argument(
"card_suffix",
type=str,
help="Last 4 digits of card to filter (e.g., 1234)"
)
parser.add_argument(
"-o", "--output",
type=Path,
default=Path("transactions.json"),
help="Output JSON file path (default: transactions.json)"
)
parser.add_argument(
"--pretty",
action="store_true",
help="Pretty-print JSON output"
)
parser.add_argument(
"-v", "--verbose",
action="store_true",
help="Enable verbose logging"
)
args = parser.parse_args()
# Validate card suffix
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
sys.exit(1)
# Validate PDF file exists
if not args.pdf_file.exists():
sys.exit(f"Error: File not found: {args.pdf_file}")
if args.pdf_file.suffix.lower() != ".pdf":
sys.exit(f"Error: File must be a PDF: {args.pdf_file}")
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
sys.exit(1)
if not args.pdf_file.suffix.lower() == ".pdf":
print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
sys.exit(1)
try:
result = extract_transactions(args.pdf_file, args.verbose)
with open(args.output, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
summary = result["summary"]
print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
for key, label in [("purchases", "Purchases (B)"), ("other_charges", "Other charges (D)"),
("voluntary_services", "Voluntary services (E)")]:
s = summary[key]
print(f" {label:25} {s['count']:3d} CRC {s['total_crc']:>12,.2f} USD {s['total_usd']:>10,.2f}")
# Write output
indent = 2 if args.pretty else None
with open(args.output, "w", encoding="utf-8") as f:
json.dump(result, f, indent=indent, ensure_ascii=False)
print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
print(f"Total USD: {result['summary']['total_usd']:,.2f}")
except ValueError as e:
sys.exit(f"Error: {e}")
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error processing PDF: {e}", file=sys.stderr)
if args.verbose:
import traceback
traceback.print_exc()
sys.exit(f"Error processing PDF: {e}")
sys.exit(1)
if __name__ == "__main__":