target sections D and E

This commit is contained in:
Fabian Montero 2026-03-09 14:44:47 -06:00
parent a05f701f16
commit 69a773e1b3
Signed by: fabian
GPG key ID: 8036F30EDBAC8447

View file

@ -3,7 +3,10 @@
BAC Credit Card Statement Extractor BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs. Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo". Targets sections:
B) Detalle de compras del periodo
D) Detalle de otros cargos
E) Detalle de productos y servicios de elección voluntaria
""" """
import argparse import argparse
@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
return f"{full_year:04d}-{month:02d}-{int(day):02d}" return f"{full_year:04d}-{month:02d}-{int(day):02d}"
def find_section_b_start(page_text: str) -> bool: def find_section_b_start(text: str) -> bool:
"""Check if page contains start of section B.""" """Check if text contains start of section B (purchases)."""
patterns = [ patterns = [
r"B\)\s*Detalle\s+de\s+compras", r"B\)\s*Detalle\s+de\s+compras",
r"Detalle\s+de\s+compras\s+del\s+periodo", r"Detalle\s+de\s+compras\s+del\s+periodo",
] ]
return any(re.search(p, page_text, re.IGNORECASE) for p in patterns) return any(re.search(p, text, re.IGNORECASE) for p in patterns)
def is_section_end(text: str) -> bool: def find_section_d_start(text: str) -> bool:
"""Check if we've reached the end of section B.""" """Check if text contains start of section D (other charges)."""
return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE))
def find_section_e_start(text: str) -> bool:
    """Check if text contains the start of section E (voluntary products/services).

    Args:
        text: Text to scan (for example, the extracted text of one page).

    Returns:
        True when the heading "E) Detalle de productos y servicios" occurs
        anywhere in ``text``; False otherwise.
    """
    # Case-insensitive; \s* / \s+ tolerate missing, repeated, or line-broken
    # whitespace between the words of the heading.
    heading = r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"
    return re.search(heading, text, re.IGNORECASE) is not None
def is_section_b_end(text: str) -> bool:
"""Check if text indicates the end of section B."""
end_patterns = [ end_patterns = [
r"Total\s+de\s+compras\s+del\s+periodo", r"Total\s+de\s+compras\s+del\s+periodo",
r"C\)\s*Detalle\s+de\s+intereses", r"C\)\s*Detalle\s+de\s+intereses",
@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool:
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
def is_section_d_end(text: str) -> bool:
    """Check if text indicates the end of section D.

    Args:
        text: Text to scan (for example, the extracted text of one page).

    Returns:
        True when ``text`` contains either the section D totals line or the
        beginning of an "E) Detalle" heading; False otherwise.
    """
    end_patterns = (
        # Totals line for section D.
        # NOTE(review): unlike the section E totals pattern, there is no "de"
        # after "concepto" here -- confirm against a real statement.
        r"Total\s+por\s+concepto\s+otros\s+cargos",
        # Heading of the following section.
        r"E\)\s*Detalle",
    )
    for pattern in end_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
def is_section_e_end(text: str) -> bool:
    """Check if text indicates the end of section E.

    Args:
        text: Text to scan (for example, the extracted text of one page).

    Returns:
        True when ``text`` contains either the section E totals line or the
        beginning of an "F) Cargos" heading; False otherwise.
    """
    end_patterns = (
        # Totals line for section E.
        r"Total\s+por\s+concepto\s+de\s+productos",
        # Heading of the following section.
        r"F\)\s*Cargos",
    )
    for pattern in end_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
""" """
Extract card holder info from a row. Extract card holder info from a row.
@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
verbose: Enable verbose logging verbose: Enable verbose logging
Returns: Returns:
Dictionary with metadata, card_holder, transactions, and summary Dictionary with metadata, card_holders, purchases, other_charges,
voluntary_services, and summary
""" """
if verbose: if verbose:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
statement_date = extract_statement_date(pdf) statement_date = extract_statement_date(pdf)
transactions = [] # Transactions by section
card_suffix = None purchases = [] # Section B
card_holder_name = None other_charges = [] # Section D
in_section_b = False voluntary_services = [] # Section E
section_b_found = False
# Track card holders (may have multiple)
card_holders = []
seen_card_suffixes = set()
# Section tracking: None, "B", "D", "E"
current_section = None
sections_found = set()
# Start from page 2 (index 1) as page 1 is summary only # Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0 start_page = 1 if len(pdf.pages) > 1 else 0
@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
logger.debug(f"Processing page {page_num}") logger.debug(f"Processing page {page_num}")
# Check for section B start # Check for section transitions (order matters: check ends before starts)
if not in_section_b and find_section_b_start(page_text): # Section B end
in_section_b = True if current_section == "B" and is_section_b_end(page_text):
section_b_found = True logger.debug(f"Section B ended on page {page_num}")
current_section = None
# Section D end
if current_section == "D" and is_section_d_end(page_text):
logger.debug(f"Section D ended on page {page_num}")
current_section = None
# Section E end
if current_section == "E" and is_section_e_end(page_text):
logger.debug(f"Section E ended on page {page_num}")
current_section = None
# Check for section starts
if current_section is None and find_section_b_start(page_text):
current_section = "B"
sections_found.add("B")
logger.debug(f"Found section B on page {page_num}") logger.debug(f"Found section B on page {page_num}")
if not in_section_b: if current_section is None and find_section_d_start(page_text):
current_section = "D"
sections_found.add("D")
logger.debug(f"Found section D on page {page_num}")
if current_section is None and find_section_e_start(page_text):
current_section = "E"
sections_found.add("E")
logger.debug(f"Found section E on page {page_num}")
if current_section is None:
continue continue
# Check for section end (still process this page before breaking) # Select the appropriate transaction list
reached_section_end = is_section_end(page_text) if current_section == "B":
if reached_section_end: target_list = purchases
logger.debug(f"Found section end on page {page_num}") elif current_section == "D":
target_list = other_charges
else: # "E"
target_list = voluntary_services
# Parse text line by line # Parse text line by line
for line in page_text.split("\n"): for line in page_text.split("\n"):
@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
if not line: if not line:
continue continue
card_info = extract_card_holder(line) # Extract card holder info (only in section B)
if card_info: if current_section == "B":
card_suffix, card_holder_name = card_info card_info = extract_card_holder(line)
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") if card_info:
continue card_suffix, card_holder_name = card_info
if card_suffix not in seen_card_suffixes:
card_holders.append({
"card_suffix": card_suffix,
"name": card_holder_name
})
seen_card_suffixes.add(card_suffix)
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
continue
transaction = parse_transaction_line(line) transaction = parse_transaction_line(line)
if transaction: if transaction:
transactions.append(transaction) target_list.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}") logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")
if reached_section_end: if "B" not in sections_found:
break
if not section_b_found:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
# Calculate summary # Calculate summaries
total_crc = sum(t["amount_crc"] or 0 for t in transactions) def calculate_summary(txns):
total_usd = sum(t["amount_usd"] or 0 for t in transactions) total_crc = sum(t["amount_crc"] or 0 for t in txns)
total_usd = sum(t["amount_usd"] or 0 for t in txns)
return {
"total_crc": round(total_crc, 2),
"total_usd": round(total_usd, 2),
"count": len(txns)
}
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)
return { return {
"metadata": { "metadata": {
"source_file": pdf_path.name, "source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date, "statement_date": statement_date,
"total_transactions": len(transactions) "total_transactions": total_transactions
}, },
"card_holder": card_holder, "card_holders": card_holders,
"transactions": transactions, "purchases": purchases,
"other_charges": other_charges,
"voluntary_services": voluntary_services,
"summary": { "summary": {
"total_crc": round(total_crc, 2), "purchases": calculate_summary(purchases),
"total_usd": round(total_usd, 2), "other_charges": calculate_summary(other_charges),
"transaction_count": len(transactions) "voluntary_services": calculate_summary(voluntary_services)
} }
} }
@ -346,9 +427,17 @@ Examples:
with open(args.output, "w", encoding="utf-8") as f: with open(args.output, "w", encoding="utf-8") as f:
json.dump(result, f, indent=indent, ensure_ascii=False) json.dump(result, f, indent=indent, ensure_ascii=False)
print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}") summary = result['summary']
print(f"Total CRC: {result['summary']['total_crc']:,.2f}") print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
print(f"Total USD: {result['summary']['total_usd']:,.2f}") print(f" Purchases (B): {summary['purchases']['count']:3d} "
f"CRC {summary['purchases']['total_crc']:>12,.2f} "
f"USD {summary['purchases']['total_usd']:>10,.2f}")
print(f" Other charges (D): {summary['other_charges']['count']:3d} "
f"CRC {summary['other_charges']['total_crc']:>12,.2f} "
f"USD {summary['other_charges']['total_usd']:>10,.2f}")
print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} "
f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} "
f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")
except ValueError as e: except ValueError as e:
print(f"Error: {e}", file=sys.stderr) print(f"Error: {e}", file=sys.stderr)