fix bugs and simplify
This commit is contained in:
parent
6fc7da8899
commit
14734d3125
2 changed files with 111 additions and 302 deletions
20
CLAUDE.md
20
CLAUDE.md
|
|
@ -4,15 +4,19 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
||||||
|
|
||||||
## Project Overview
|
## Project Overview
|
||||||
|
|
||||||
Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses section "B) Detalle de compras del periodo" and outputs JSON.
|
Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON.
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
- pdfplumber (>=0.10.0)
|
- pdfplumber (>=0.10.0)
|
||||||
|
|
||||||
## Usage
|
## Commands
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# Run tests
|
||||||
|
python testStatements/run_tests.py
|
||||||
|
|
||||||
|
# Run extractor
|
||||||
python bac_extract.py <pdf_file> [options]
|
python bac_extract.py <pdf_file> [options]
|
||||||
|
|
||||||
# Examples
|
# Examples
|
||||||
|
|
@ -29,11 +33,15 @@ Options:
|
||||||
|
|
||||||
The extraction pipeline:
|
The extraction pipeline:
|
||||||
1. Validates PDF is a BAC statement (`is_bac_statement`)
|
1. Validates PDF is a BAC statement (`is_bac_statement`)
|
||||||
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
|
2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns
|
||||||
3. Extracts tables page-by-page using pdfplumber
|
3. Parses transactions matching `TRANSACTION_PATTERN` regex
|
||||||
4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
|
4. Outputs card holders, transactions by section, and summaries
|
||||||
|
|
||||||
|
Key data structures:
|
||||||
|
- `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys
|
||||||
|
- `SPANISH_MONTHS`: Spanish month abbreviations for date parsing
|
||||||
|
|
||||||
Key parsing functions:
|
Key parsing functions:
|
||||||
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
|
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
|
||||||
- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
|
- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-"
|
||||||
- `extract_card_holder`: Matches "************1234 NAME" pattern
|
- `matches_patterns`: Generic regex pattern matcher for section detection
|
||||||
|
|
|
||||||
393
bac_extract.py
393
bac_extract.py
|
|
@ -20,74 +20,63 @@ from typing import Optional
|
||||||
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
|
||||||
# Spanish month abbreviations to month numbers
|
|
||||||
SPANISH_MONTHS = {
|
SPANISH_MONTHS = {
|
||||||
"ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
|
"ENE": 1, "FEB": 2, "MAR": 3, "ABR": 4, "MAY": 5, "JUN": 6,
|
||||||
"JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
|
"JUL": 7, "AGO": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DIC": 12
|
||||||
}
|
}
|
||||||
|
|
||||||
# Card holder pattern: ************XXXX NAME
|
|
||||||
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
|
CARD_HOLDER_PATTERN = re.compile(r"\*{12}(\d{4})\s+(.+)")
|
||||||
|
|
||||||
# Date pattern: D-MMM-YY or DD-MMM-YY
|
|
||||||
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
|
DATE_PATTERN = re.compile(r"(\d{1,2})-([A-Z]{3})-(\d{2})", re.IGNORECASE)
|
||||||
|
|
||||||
# Transaction line pattern:
|
|
||||||
# Reference Date Description Location (optional) Currency Amount
|
|
||||||
# 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
|
|
||||||
TRANSACTION_PATTERN = re.compile(
|
TRANSACTION_PATTERN = re.compile(
|
||||||
r"^(\d{12})\s+" # Reference (12 digits)
|
r"^(\d{12,13})\s+"
|
||||||
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+" # Date
|
r"(\d{1,2}-[A-Z]{3}-\d{2})\s+"
|
||||||
r"(.+?)\s+" # Description
|
r"(.+?)\s+"
|
||||||
r"(CRC|USD)\s+" # Currency
|
r"(CRC|USD)\s+"
|
||||||
r"([\d,]+\.\d{2})(-)?$", # Amount (with optional trailing minus)
|
r"([\d,]+\.\d{2})(-)?$",
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Section definitions: start patterns, end patterns, output key
|
||||||
|
SECTIONS = {
|
||||||
|
"B": {
|
||||||
|
"start": [r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo"],
|
||||||
|
"end": [r"Total\s+de\s+compras\s+del\s+periodo", r"C\)\s*Detalle", r"D\)\s*Detalle", r"E\)\s*Detalle"],
|
||||||
|
"key": "purchases",
|
||||||
|
},
|
||||||
|
"D": {
|
||||||
|
"start": [r"D\)\s*Detalle\s+de\s+otros\s+cargos"],
|
||||||
|
"end": [r"Total\s+por\s+concepto\s+otros\s+cargos", r"E\)\s*Detalle"],
|
||||||
|
"key": "other_charges",
|
||||||
|
},
|
||||||
|
"E": {
|
||||||
|
"start": [r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"],
|
||||||
|
"end": [r"Total\s+por\s+concepto\s+de\s+productos", r"F\)\s*Cargos"],
|
||||||
|
"key": "voluntary_services",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def parse_spanish_date(date_str: str) -> Optional[str]:
|
def parse_spanish_date(date_str: str) -> Optional[str]:
|
||||||
"""Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD)."""
|
"""Parse Spanish date format (D-MMM-YY) to ISO format (YYYY-MM-DD)."""
|
||||||
if not date_str:
|
match = DATE_PATTERN.match(date_str.strip()) if date_str else None
|
||||||
return None
|
|
||||||
|
|
||||||
match = DATE_PATTERN.match(date_str.strip())
|
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
day, month_abbr, year = match.groups()
|
day, month_abbr, year = match.groups()
|
||||||
month = SPANISH_MONTHS.get(month_abbr.upper())
|
month = SPANISH_MONTHS.get(month_abbr.upper())
|
||||||
if not month:
|
if not month:
|
||||||
return None
|
return None
|
||||||
|
return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
|
||||||
# Assume 2000s for 2-digit year
|
|
||||||
full_year = 2000 + int(year)
|
|
||||||
|
|
||||||
try:
|
|
||||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def parse_amount(amount_str: str) -> Optional[float]:
|
def parse_amount(amount_str: str) -> Optional[float]:
|
||||||
"""
|
"""Parse amount with comma thousands separator. Handles trailing '-' for negatives."""
|
||||||
Parse amount string with comma thousands separator.
|
if not amount_str or not (amount_str := amount_str.strip()):
|
||||||
Handles trailing '-' for negative values.
|
|
||||||
"""
|
|
||||||
if not amount_str:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
amount_str = amount_str.strip()
|
|
||||||
if not amount_str:
|
|
||||||
return None
|
|
||||||
|
|
||||||
is_negative = amount_str.endswith("-")
|
is_negative = amount_str.endswith("-")
|
||||||
if is_negative:
|
|
||||||
amount_str = amount_str[:-1]
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
amount = float(amount_str.replace(",", ""))
|
amount = float(amount_str.rstrip("-").replace(",", ""))
|
||||||
return -amount if is_negative else amount
|
return -amount if is_negative else amount
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
@ -97,7 +86,6 @@ def is_bac_statement(pdf: pdfplumber.PDF) -> bool:
|
||||||
"""Check if the PDF is a BAC credit card statement."""
|
"""Check if the PDF is a BAC credit card statement."""
|
||||||
if not pdf.pages:
|
if not pdf.pages:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
first_page_text = pdf.pages[0].extract_text() or ""
|
first_page_text = pdf.pages[0].extract_text() or ""
|
||||||
return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()
|
return "BAC" in first_page_text and "TARJETA" in first_page_text.upper()
|
||||||
|
|
||||||
|
|
@ -106,118 +94,43 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
|
||||||
"""Extract the statement date from the PDF."""
|
"""Extract the statement date from the PDF."""
|
||||||
if not pdf.pages:
|
if not pdf.pages:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
first_page_text = pdf.pages[0].extract_text() or ""
|
first_page_text = pdf.pages[0].extract_text() or ""
|
||||||
date_matches = DATE_PATTERN.findall(first_page_text)
|
date_matches = DATE_PATTERN.findall(first_page_text)
|
||||||
if not date_matches:
|
if not date_matches:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
day, month_abbr, year = date_matches[0]
|
day, month_abbr, year = date_matches[0]
|
||||||
month = SPANISH_MONTHS.get(month_abbr.upper())
|
month = SPANISH_MONTHS.get(month_abbr.upper())
|
||||||
if not month:
|
if not month:
|
||||||
return None
|
return None
|
||||||
|
return f"{2000 + int(year):04d}-{month:02d}-{int(day):02d}"
|
||||||
full_year = 2000 + int(year)
|
|
||||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
|
||||||
|
|
||||||
|
|
||||||
def find_section_b_start(text: str) -> bool:
|
def matches_patterns(text: str, patterns: list[str]) -> bool:
|
||||||
"""Check if text contains start of section B (purchases)."""
|
"""Check if text matches any of the given regex patterns."""
|
||||||
patterns = [
|
|
||||||
r"B\)\s*Detalle\s+de\s+compras",
|
|
||||||
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
|
||||||
]
|
|
||||||
return any(re.search(p, text, re.IGNORECASE) for p in patterns)
|
return any(re.search(p, text, re.IGNORECASE) for p in patterns)
|
||||||
|
|
||||||
|
|
||||||
def find_section_d_start(text: str) -> bool:
|
|
||||||
"""Check if text contains start of section D (other charges)."""
|
|
||||||
return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE))
|
|
||||||
|
|
||||||
|
|
||||||
def find_section_e_start(text: str) -> bool:
|
|
||||||
"""Check if text contains start of section E (voluntary products/services)."""
|
|
||||||
return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE))
|
|
||||||
|
|
||||||
|
|
||||||
def is_section_b_end(text: str) -> bool:
|
|
||||||
"""Check if text indicates the end of section B."""
|
|
||||||
end_patterns = [
|
|
||||||
r"Total\s+de\s+compras\s+del\s+periodo",
|
|
||||||
r"C\)\s*Detalle\s+de\s+intereses",
|
|
||||||
r"Detalle\s+de\s+intereses",
|
|
||||||
r"D\)\s*Detalle",
|
|
||||||
]
|
|
||||||
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
|
||||||
|
|
||||||
|
|
||||||
def is_section_d_end(text: str) -> bool:
|
|
||||||
"""Check if text indicates the end of section D."""
|
|
||||||
end_patterns = [
|
|
||||||
r"Total\s+por\s+concepto\s+otros\s+cargos",
|
|
||||||
r"E\)\s*Detalle",
|
|
||||||
]
|
|
||||||
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
|
||||||
|
|
||||||
|
|
||||||
def is_section_e_end(text: str) -> bool:
|
|
||||||
"""Check if text indicates the end of section E."""
|
|
||||||
end_patterns = [
|
|
||||||
r"Total\s+por\s+concepto\s+de\s+productos",
|
|
||||||
r"F\)\s*Cargos",
|
|
||||||
]
|
|
||||||
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
|
||||||
"""
|
|
||||||
Extract card holder info from a row.
|
|
||||||
Returns (card_suffix, name) or None.
|
|
||||||
"""
|
|
||||||
match = CARD_HOLDER_PATTERN.search(row_text)
|
|
||||||
if match:
|
|
||||||
return match.group(1), match.group(2).strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def parse_transaction_line(line: str) -> Optional[dict]:
|
def parse_transaction_line(line: str) -> Optional[dict]:
|
||||||
"""
|
"""Parse a transaction line into a dict, or return None if not a transaction."""
|
||||||
Parse a text-based transaction line.
|
match = TRANSACTION_PATTERN.match(line.strip())
|
||||||
|
|
||||||
Format: Reference Date Description [Location] Currency Amount
|
|
||||||
Example: 123456789012 9-ENE-26 EXAMPLE STORE CRC 1,234.56
|
|
||||||
"""
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
return None
|
|
||||||
|
|
||||||
match = TRANSACTION_PATTERN.match(line)
|
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
reference = match.group(1)
|
reference, date_str, description, currency, amount_str, neg = match.groups()
|
||||||
date_str = match.group(2)
|
currency = currency.upper()
|
||||||
description = match.group(3).strip()
|
|
||||||
currency = match.group(4).upper()
|
|
||||||
amount_str = match.group(5)
|
|
||||||
is_negative = match.group(6) == "-"
|
|
||||||
|
|
||||||
date = parse_spanish_date(date_str)
|
date = parse_spanish_date(date_str)
|
||||||
if not date:
|
|
||||||
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
amount = parse_amount(amount_str)
|
amount = parse_amount(amount_str)
|
||||||
if amount is None:
|
if not date or amount is None:
|
||||||
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
|
logger.warning(f"Could not parse transaction: {line}")
|
||||||
return None
|
return None
|
||||||
if is_negative:
|
if neg:
|
||||||
amount = -amount
|
amount = -amount
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reference": reference,
|
"reference": reference,
|
||||||
"date": date,
|
"date": date,
|
||||||
"description": description,
|
"description": description.strip(),
|
||||||
"location": None,
|
"location": None,
|
||||||
"currency": currency,
|
"currency": currency,
|
||||||
"amount_crc": amount if currency == "CRC" else None,
|
"amount_crc": amount if currency == "CRC" else None,
|
||||||
|
|
@ -226,228 +139,116 @@ def parse_transaction_line(line: str) -> Optional[dict]:
|
||||||
|
|
||||||
|
|
||||||
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
"""
|
"""Extract transactions from a BAC credit card statement PDF."""
|
||||||
Extract transactions from a BAC credit card statement PDF.
|
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
|
||||||
|
|
||||||
Args:
|
|
||||||
pdf_path: Path to the PDF file
|
|
||||||
verbose: Enable verbose logging
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with metadata, card_holders, purchases, other_charges,
|
|
||||||
voluntary_services, and summary
|
|
||||||
"""
|
|
||||||
if verbose:
|
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
|
||||||
else:
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
|
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
# Validate this is a BAC statement
|
|
||||||
if not is_bac_statement(pdf):
|
if not is_bac_statement(pdf):
|
||||||
raise ValueError("PDF does not appear to be a BAC credit card statement")
|
raise ValueError("PDF does not appear to be a BAC credit card statement")
|
||||||
|
|
||||||
statement_date = extract_statement_date(pdf)
|
statement_date = extract_statement_date(pdf)
|
||||||
|
transactions = {s["key"]: [] for s in SECTIONS.values()}
|
||||||
# Transactions by section
|
|
||||||
purchases = [] # Section B
|
|
||||||
other_charges = [] # Section D
|
|
||||||
voluntary_services = [] # Section E
|
|
||||||
|
|
||||||
# Track card holders (may have multiple)
|
|
||||||
card_holders = []
|
card_holders = []
|
||||||
seen_card_suffixes = set()
|
seen_card_suffixes = set()
|
||||||
|
|
||||||
# Section tracking: None, "B", "D", "E"
|
|
||||||
current_section = None
|
current_section = None
|
||||||
sections_found = set()
|
sections_completed = set()
|
||||||
|
|
||||||
# Start from page 2 (index 1) as page 1 is summary only
|
|
||||||
start_page = 1 if len(pdf.pages) > 1 else 0
|
start_page = 1 if len(pdf.pages) > 1 else 0
|
||||||
|
|
||||||
for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
|
for page_num, page in enumerate(pdf.pages[start_page:], start=start_page + 1):
|
||||||
page_text = page.extract_text() or ""
|
page_text = page.extract_text() or ""
|
||||||
|
|
||||||
logger.debug(f"Processing page {page_num}")
|
logger.debug(f"Processing page {page_num}")
|
||||||
|
|
||||||
# Check for section transitions (order matters: check ends before starts)
|
|
||||||
# Section B end
|
|
||||||
if current_section == "B" and is_section_b_end(page_text):
|
|
||||||
logger.debug(f"Section B ended on page {page_num}")
|
|
||||||
current_section = None
|
|
||||||
|
|
||||||
# Section D end
|
|
||||||
if current_section == "D" and is_section_d_end(page_text):
|
|
||||||
logger.debug(f"Section D ended on page {page_num}")
|
|
||||||
current_section = None
|
|
||||||
|
|
||||||
# Section E end
|
|
||||||
if current_section == "E" and is_section_e_end(page_text):
|
|
||||||
logger.debug(f"Section E ended on page {page_num}")
|
|
||||||
current_section = None
|
|
||||||
|
|
||||||
# Check for section starts
|
|
||||||
if current_section is None and find_section_b_start(page_text):
|
|
||||||
current_section = "B"
|
|
||||||
sections_found.add("B")
|
|
||||||
logger.debug(f"Found section B on page {page_num}")
|
|
||||||
|
|
||||||
if current_section is None and find_section_d_start(page_text):
|
|
||||||
current_section = "D"
|
|
||||||
sections_found.add("D")
|
|
||||||
logger.debug(f"Found section D on page {page_num}")
|
|
||||||
|
|
||||||
if current_section is None and find_section_e_start(page_text):
|
|
||||||
current_section = "E"
|
|
||||||
sections_found.add("E")
|
|
||||||
logger.debug(f"Found section E on page {page_num}")
|
|
||||||
|
|
||||||
if current_section is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Select the appropriate transaction list
|
|
||||||
if current_section == "B":
|
|
||||||
target_list = purchases
|
|
||||||
elif current_section == "D":
|
|
||||||
target_list = other_charges
|
|
||||||
else: # "E"
|
|
||||||
target_list = voluntary_services
|
|
||||||
|
|
||||||
# Parse text line by line
|
|
||||||
for line in page_text.split("\n"):
|
for line in page_text.split("\n"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract card holder info (only in section B)
|
# Check for section end
|
||||||
if current_section == "B":
|
if current_section and matches_patterns(line, SECTIONS[current_section]["end"]):
|
||||||
card_info = extract_card_holder(line)
|
logger.debug(f"Section {current_section} ended on page {page_num}")
|
||||||
if card_info:
|
sections_completed.add(current_section)
|
||||||
card_suffix, card_holder_name = card_info
|
current_section = None
|
||||||
if card_suffix not in seen_card_suffixes:
|
|
||||||
card_holders.append({
|
|
||||||
"card_suffix": card_suffix,
|
|
||||||
"name": card_holder_name
|
|
||||||
})
|
|
||||||
seen_card_suffixes.add(card_suffix)
|
|
||||||
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
transaction = parse_transaction_line(line)
|
# Check for section start
|
||||||
if transaction:
|
if current_section is None:
|
||||||
target_list.append(transaction)
|
for sec_id, sec in SECTIONS.items():
|
||||||
logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")
|
if sec_id not in sections_completed and matches_patterns(line, sec["start"]):
|
||||||
|
current_section = sec_id
|
||||||
|
logger.debug(f"Found section {sec_id} on page {page_num}")
|
||||||
|
break
|
||||||
|
continue
|
||||||
|
|
||||||
if "B" not in sections_found:
|
# Extract card holder
|
||||||
|
match = CARD_HOLDER_PATTERN.search(line)
|
||||||
|
if match:
|
||||||
|
suffix, name = match.group(1), match.group(2).strip()
|
||||||
|
if suffix not in seen_card_suffixes:
|
||||||
|
card_holders.append({"card_suffix": suffix, "name": name})
|
||||||
|
seen_card_suffixes.add(suffix)
|
||||||
|
logger.debug(f"Found card holder: {suffix} - {name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse transaction
|
||||||
|
txn = parse_transaction_line(line)
|
||||||
|
if txn:
|
||||||
|
transactions[SECTIONS[current_section]["key"]].append(txn)
|
||||||
|
logger.debug(f"Extracted {current_section} transaction: {txn['reference']}")
|
||||||
|
|
||||||
|
if "B" not in sections_completed and not transactions["purchases"]:
|
||||||
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
||||||
|
|
||||||
# Calculate summaries
|
def summarize(txns):
|
||||||
def calculate_summary(txns):
|
|
||||||
total_crc = sum(t["amount_crc"] or 0 for t in txns)
|
|
||||||
total_usd = sum(t["amount_usd"] or 0 for t in txns)
|
|
||||||
return {
|
return {
|
||||||
"total_crc": round(total_crc, 2),
|
"total_crc": round(sum(t["amount_crc"] or 0 for t in txns), 2),
|
||||||
"total_usd": round(total_usd, 2),
|
"total_usd": round(sum(t["amount_usd"] or 0 for t in txns), 2),
|
||||||
"count": len(txns)
|
"count": len(txns),
|
||||||
}
|
}
|
||||||
|
|
||||||
total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"source_file": pdf_path.name,
|
"source_file": pdf_path.name,
|
||||||
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||||
"statement_date": statement_date,
|
"statement_date": statement_date,
|
||||||
"total_transactions": total_transactions
|
"total_transactions": sum(len(t) for t in transactions.values()),
|
||||||
},
|
},
|
||||||
"card_holders": card_holders,
|
"card_holders": card_holders,
|
||||||
"purchases": purchases,
|
**transactions,
|
||||||
"other_charges": other_charges,
|
"summary": {key: summarize(txns) for key, txns in transactions.items()},
|
||||||
"voluntary_services": voluntary_services,
|
|
||||||
"summary": {
|
|
||||||
"purchases": calculate_summary(purchases),
|
|
||||||
"other_charges": calculate_summary(other_charges),
|
|
||||||
"voluntary_services": calculate_summary(voluntary_services)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(description="Extract transactions from BAC CR statement PDFs")
|
||||||
description="Extract transactions from BAC Costa Rica credit card statement PDFs",
|
parser.add_argument("pdf_file", type=Path, help="Path to the BAC statement PDF")
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
parser.add_argument("-o", "--output", type=Path, default=Path("transactions.json"))
|
||||||
epilog="""
|
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
|
||||||
Examples:
|
parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
|
||||||
python bac_extract.py EstadodeCuenta.pdf --pretty
|
|
||||||
python bac_extract.py statement.pdf -o output.json -v
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"pdf_file",
|
|
||||||
type=Path,
|
|
||||||
help="Path to the BAC statement PDF"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"-o", "--output",
|
|
||||||
type=Path,
|
|
||||||
default=Path("transactions.json"),
|
|
||||||
help="Output JSON file path (default: transactions.json)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--pretty",
|
|
||||||
action="store_true",
|
|
||||||
help="Pretty-print JSON output"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"-v", "--verbose",
|
|
||||||
action="store_true",
|
|
||||||
help="Enable verbose logging"
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Validate PDF file exists
|
|
||||||
if not args.pdf_file.exists():
|
if not args.pdf_file.exists():
|
||||||
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
|
sys.exit(f"Error: File not found: {args.pdf_file}")
|
||||||
sys.exit(1)
|
if args.pdf_file.suffix.lower() != ".pdf":
|
||||||
|
sys.exit(f"Error: File must be a PDF: {args.pdf_file}")
|
||||||
if not args.pdf_file.suffix.lower() == ".pdf":
|
|
||||||
print(f"Error: File must be a PDF: {args.pdf_file}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = extract_transactions(args.pdf_file, args.verbose)
|
result = extract_transactions(args.pdf_file, args.verbose)
|
||||||
|
|
||||||
# Write output
|
|
||||||
indent = 2 if args.pretty else None
|
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
with open(args.output, "w", encoding="utf-8") as f:
|
||||||
json.dump(result, f, indent=indent, ensure_ascii=False)
|
json.dump(result, f, indent=2 if args.pretty else None, ensure_ascii=False)
|
||||||
|
|
||||||
summary = result['summary']
|
summary = result["summary"]
|
||||||
print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
|
print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
|
||||||
print(f" Purchases (B): {summary['purchases']['count']:3d} "
|
for key, label in [("purchases", "Purchases (B)"), ("other_charges", "Other charges (D)"),
|
||||||
f"CRC {summary['purchases']['total_crc']:>12,.2f} "
|
("voluntary_services", "Voluntary services (E)")]:
|
||||||
f"USD {summary['purchases']['total_usd']:>10,.2f}")
|
s = summary[key]
|
||||||
print(f" Other charges (D): {summary['other_charges']['count']:3d} "
|
print(f" {label:25} {s['count']:3d} CRC {s['total_crc']:>12,.2f} USD {s['total_usd']:>10,.2f}")
|
||||||
f"CRC {summary['other_charges']['total_crc']:>12,.2f} "
|
|
||||||
f"USD {summary['other_charges']['total_usd']:>10,.2f}")
|
|
||||||
print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} "
|
|
||||||
f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} "
|
|
||||||
f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")
|
|
||||||
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
print(f"Error: {e}", file=sys.stderr)
|
sys.exit(f"Error: {e}")
|
||||||
sys.exit(1)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing PDF: {e}", file=sys.stderr)
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
sys.exit(1)
|
sys.exit(f"Error processing PDF: {e}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue