target sections D and E

This commit is contained in:
Fabian Montero 2026-03-09 14:44:47 -06:00
parent a05f701f16
commit 69a773e1b3
Signed by: fabian
GPG key ID: 8036F30EDBAC8447

View file

@ -3,7 +3,10 @@
BAC Credit Card Statement Extractor
Extracts transactions from BAC Costa Rica credit card statement PDFs.
Specifically targets section "B) Detalle de compras del periodo".
Targets sections:
B) Detalle de compras del periodo
D) Detalle de otros cargos
E) Detalle de productos y servicios de elección voluntaria
"""
import argparse
@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
def find_section_b_start(text: str) -> bool:
    """Check if text contains start of section B (purchases).

    Two spellings of the heading are accepted, both case-insensitively and
    with any run of whitespace between words: the lettered form
    "B) Detalle de compras" and the unlettered form
    "Detalle de compras del periodo".

    Args:
        text: Text to search.

    Returns:
        True if either heading pattern occurs anywhere in ``text``.
    """
    patterns = (
        r"B\)\s*Detalle\s+de\s+compras",
        r"Detalle\s+de\s+compras\s+del\s+periodo",
    )
    return any(re.search(p, text, re.IGNORECASE) for p in patterns)
def is_section_end(text: str) -> bool:
"""Check if we've reached the end of section B."""
def find_section_d_start(text: str) -> bool:
    """Check if text contains start of section D (other charges).

    Looks for the heading "D) Detalle de otros cargos", ignoring case and
    allowing any run of whitespace between words (and none after "D)").

    Args:
        text: Text to search.

    Returns:
        True if the heading occurs anywhere in ``text``, otherwise False.
    """
    heading = r"D\)\s*Detalle\s+de\s+otros\s+cargos"
    return bool(re.search(heading, text, re.IGNORECASE))
def find_section_e_start(text: str) -> bool:
    """Check if text contains start of section E (voluntary products/services).

    Looks for the heading prefix "E) Detalle de productos y servicios",
    ignoring case and allowing any run of whitespace between words (and
    none after "E)"). Only this prefix is matched; whatever follows it in
    the heading is not checked.

    Args:
        text: Text to search.

    Returns:
        True if the heading prefix occurs anywhere in ``text``, otherwise
        False.
    """
    heading = r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios"
    return bool(re.search(heading, text, re.IGNORECASE))
def is_section_b_end(text: str) -> bool:
"""Check if text indicates the end of section B."""
end_patterns = [
r"Total\s+de\s+compras\s+del\s+periodo",
r"C\)\s*Detalle\s+de\s+intereses",
@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool:
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
def is_section_d_end(text: str) -> bool:
    """Check if text indicates the end of section D.

    The section is treated as finished when ``text`` contains either the
    section's total line ("Total por concepto otros cargos") or the
    beginning of the next lettered heading ("E) Detalle"). Matching is
    case-insensitive and tolerant of variable whitespace.

    Args:
        text: Text to search.

    Returns:
        True if any end marker occurs anywhere in ``text``, otherwise False.
    """
    end_patterns = (
        r"Total\s+por\s+concepto\s+otros\s+cargos",
        r"E\)\s*Detalle",
    )
    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
def is_section_e_end(text: str) -> bool:
    """Check if text indicates the end of section E.

    The section is treated as finished when ``text`` contains either the
    section's total line prefix ("Total por concepto de productos") or the
    beginning of the next lettered heading ("F) Cargos"). Matching is
    case-insensitive and tolerant of variable whitespace.

    Args:
        text: Text to search.

    Returns:
        True if any end marker occurs anywhere in ``text``, otherwise False.
    """
    end_patterns = (
        r"Total\s+por\s+concepto\s+de\s+productos",
        r"F\)\s*Cargos",
    )
    return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
"""
Extract card holder info from a row.
@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
verbose: Enable verbose logging
Returns:
Dictionary with metadata, card_holder, transactions, and summary
Dictionary with metadata, card_holders, purchases, other_charges,
voluntary_services, and summary
"""
if verbose:
logging.basicConfig(level=logging.DEBUG)
@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
statement_date = extract_statement_date(pdf)
transactions = []
card_suffix = None
card_holder_name = None
in_section_b = False
section_b_found = False
# Transactions by section
purchases = [] # Section B
other_charges = [] # Section D
voluntary_services = [] # Section E
# Track card holders (may have multiple)
card_holders = []
seen_card_suffixes = set()
# Section tracking: None, "B", "D", "E"
current_section = None
sections_found = set()
# Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0
@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
logger.debug(f"Processing page {page_num}")
# Check for section B start
if not in_section_b and find_section_b_start(page_text):
in_section_b = True
section_b_found = True
# Check for section transitions (order matters: check ends before starts)
# Section B end
if current_section == "B" and is_section_b_end(page_text):
logger.debug(f"Section B ended on page {page_num}")
current_section = None
# Section D end
if current_section == "D" and is_section_d_end(page_text):
logger.debug(f"Section D ended on page {page_num}")
current_section = None
# Section E end
if current_section == "E" and is_section_e_end(page_text):
logger.debug(f"Section E ended on page {page_num}")
current_section = None
# Check for section starts
if current_section is None and find_section_b_start(page_text):
current_section = "B"
sections_found.add("B")
logger.debug(f"Found section B on page {page_num}")
if not in_section_b:
if current_section is None and find_section_d_start(page_text):
current_section = "D"
sections_found.add("D")
logger.debug(f"Found section D on page {page_num}")
if current_section is None and find_section_e_start(page_text):
current_section = "E"
sections_found.add("E")
logger.debug(f"Found section E on page {page_num}")
if current_section is None:
continue
# Check for section end (still process this page before breaking)
reached_section_end = is_section_end(page_text)
if reached_section_end:
logger.debug(f"Found section end on page {page_num}")
# Select the appropriate transaction list
if current_section == "B":
target_list = purchases
elif current_section == "D":
target_list = other_charges
else: # "E"
target_list = voluntary_services
# Parse text line by line
for line in page_text.split("\n"):
@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
if not line:
continue
card_info = extract_card_holder(line)
if card_info:
card_suffix, card_holder_name = card_info
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
continue
# Extract card holder info (only in section B)
if current_section == "B":
card_info = extract_card_holder(line)
if card_info:
card_suffix, card_holder_name = card_info
if card_suffix not in seen_card_suffixes:
card_holders.append({
"card_suffix": card_suffix,
"name": card_holder_name
})
seen_card_suffixes.add(card_suffix)
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
continue
transaction = parse_transaction_line(line)
if transaction:
transactions.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}")
target_list.append(transaction)
logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")
if reached_section_end:
break
if not section_b_found:
if "B" not in sections_found:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
# Calculate summary
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
# Calculate summaries
def calculate_summary(txns):
    """Build the per-section totals for a list of transactions.

    Args:
        txns: Sized collection of transaction dicts. Each dict must have
            "amount_crc" and "amount_usd" keys; a falsy amount (such as
            None) contributes 0 to the corresponding total.

    Returns:
        Dict with "total_crc" and "total_usd" (each rounded to 2 decimal
        places) and "count" (the number of transactions).
    """
    total_crc = sum(t["amount_crc"] or 0 for t in txns)
    total_usd = sum(t["amount_usd"] or 0 for t in txns)
    return {
        "total_crc": round(total_crc, 2),
        "total_usd": round(total_usd, 2),
        "count": len(txns),
    }
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)
return {
"metadata": {
"source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date,
"total_transactions": len(transactions)
"total_transactions": total_transactions
},
"card_holder": card_holder,
"transactions": transactions,
"card_holders": card_holders,
"purchases": purchases,
"other_charges": other_charges,
"voluntary_services": voluntary_services,
"summary": {
"total_crc": round(total_crc, 2),
"total_usd": round(total_usd, 2),
"transaction_count": len(transactions)
"purchases": calculate_summary(purchases),
"other_charges": calculate_summary(other_charges),
"voluntary_services": calculate_summary(voluntary_services)
}
}
@ -346,9 +427,17 @@ Examples:
with open(args.output, "w", encoding="utf-8") as f:
json.dump(result, f, indent=indent, ensure_ascii=False)
print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
print(f"Total USD: {result['summary']['total_usd']:,.2f}")
summary = result['summary']
print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
print(f" Purchases (B): {summary['purchases']['count']:3d} "
f"CRC {summary['purchases']['total_crc']:>12,.2f} "
f"USD {summary['purchases']['total_usd']:>10,.2f}")
print(f" Other charges (D): {summary['other_charges']['count']:3d} "
f"CRC {summary['other_charges']['total_crc']:>12,.2f} "
f"USD {summary['other_charges']['total_usd']:>10,.2f}")
print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} "
f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} "
f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")
except ValueError as e:
print(f"Error: {e}", file=sys.stderr)