target sections D and E
This commit is contained in:
parent
a05f701f16
commit
69a773e1b3
1 changed files with 134 additions and 45 deletions
179
bac_extract.py
179
bac_extract.py
|
|
@ -3,7 +3,10 @@
|
||||||
BAC Credit Card Statement Extractor
|
BAC Credit Card Statement Extractor
|
||||||
|
|
||||||
Extracts transactions from BAC Costa Rica credit card statement PDFs.
|
Extracts transactions from BAC Costa Rica credit card statement PDFs.
|
||||||
Specifically targets section "B) Detalle de compras del periodo".
|
Targets sections:
|
||||||
|
B) Detalle de compras del periodo
|
||||||
|
D) Detalle de otros cargos
|
||||||
|
E) Detalle de productos y servicios de elección voluntaria
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@ -118,17 +121,27 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
|
||||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
||||||
|
|
||||||
|
|
||||||
def find_section_b_start(page_text: str) -> bool:
|
def find_section_b_start(text: str) -> bool:
|
||||||
"""Check if page contains start of section B."""
|
"""Check if text contains start of section B (purchases)."""
|
||||||
patterns = [
|
patterns = [
|
||||||
r"B\)\s*Detalle\s+de\s+compras",
|
r"B\)\s*Detalle\s+de\s+compras",
|
||||||
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
||||||
]
|
]
|
||||||
return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
|
return any(re.search(p, text, re.IGNORECASE) for p in patterns)
|
||||||
|
|
||||||
|
|
||||||
def is_section_end(text: str) -> bool:
|
def find_section_d_start(text: str) -> bool:
|
||||||
"""Check if we've reached the end of section B."""
|
"""Check if text contains start of section D (other charges)."""
|
||||||
|
return bool(re.search(r"D\)\s*Detalle\s+de\s+otros\s+cargos", text, re.IGNORECASE))
|
||||||
|
|
||||||
|
|
||||||
|
def find_section_e_start(text: str) -> bool:
|
||||||
|
"""Check if text contains start of section E (voluntary products/services)."""
|
||||||
|
return bool(re.search(r"E\)\s*Detalle\s+de\s+productos\s+y\s+servicios", text, re.IGNORECASE))
|
||||||
|
|
||||||
|
|
||||||
|
def is_section_b_end(text: str) -> bool:
|
||||||
|
"""Check if text indicates the end of section B."""
|
||||||
end_patterns = [
|
end_patterns = [
|
||||||
r"Total\s+de\s+compras\s+del\s+periodo",
|
r"Total\s+de\s+compras\s+del\s+periodo",
|
||||||
r"C\)\s*Detalle\s+de\s+intereses",
|
r"C\)\s*Detalle\s+de\s+intereses",
|
||||||
|
|
@ -138,6 +151,24 @@ def is_section_end(text: str) -> bool:
|
||||||
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def is_section_d_end(text: str) -> bool:
|
||||||
|
"""Check if text indicates the end of section D."""
|
||||||
|
end_patterns = [
|
||||||
|
r"Total\s+por\s+concepto\s+otros\s+cargos",
|
||||||
|
r"E\)\s*Detalle",
|
||||||
|
]
|
||||||
|
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def is_section_e_end(text: str) -> bool:
|
||||||
|
"""Check if text indicates the end of section E."""
|
||||||
|
end_patterns = [
|
||||||
|
r"Total\s+por\s+concepto\s+de\s+productos",
|
||||||
|
r"F\)\s*Cargos",
|
||||||
|
]
|
||||||
|
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
||||||
|
|
||||||
|
|
||||||
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
||||||
"""
|
"""
|
||||||
Extract card holder info from a row.
|
Extract card holder info from a row.
|
||||||
|
|
@ -203,7 +234,8 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
verbose: Enable verbose logging
|
verbose: Enable verbose logging
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with metadata, card_holder, transactions, and summary
|
Dictionary with metadata, card_holders, purchases, other_charges,
|
||||||
|
voluntary_services, and summary
|
||||||
"""
|
"""
|
||||||
if verbose:
|
if verbose:
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
@ -217,11 +249,18 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
|
|
||||||
statement_date = extract_statement_date(pdf)
|
statement_date = extract_statement_date(pdf)
|
||||||
|
|
||||||
transactions = []
|
# Transactions by section
|
||||||
card_suffix = None
|
purchases = [] # Section B
|
||||||
card_holder_name = None
|
other_charges = [] # Section D
|
||||||
in_section_b = False
|
voluntary_services = [] # Section E
|
||||||
section_b_found = False
|
|
||||||
|
# Track card holders (may have multiple)
|
||||||
|
card_holders = []
|
||||||
|
seen_card_suffixes = set()
|
||||||
|
|
||||||
|
# Section tracking: None, "B", "D", "E"
|
||||||
|
current_section = None
|
||||||
|
sections_found = set()
|
||||||
|
|
||||||
# Start from page 2 (index 1) as page 1 is summary only
|
# Start from page 2 (index 1) as page 1 is summary only
|
||||||
start_page = 1 if len(pdf.pages) > 1 else 0
|
start_page = 1 if len(pdf.pages) > 1 else 0
|
||||||
|
|
@ -231,19 +270,48 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
|
|
||||||
logger.debug(f"Processing page {page_num}")
|
logger.debug(f"Processing page {page_num}")
|
||||||
|
|
||||||
# Check for section B start
|
# Check for section transitions (order matters: check ends before starts)
|
||||||
if not in_section_b and find_section_b_start(page_text):
|
# Section B end
|
||||||
in_section_b = True
|
if current_section == "B" and is_section_b_end(page_text):
|
||||||
section_b_found = True
|
logger.debug(f"Section B ended on page {page_num}")
|
||||||
|
current_section = None
|
||||||
|
|
||||||
|
# Section D end
|
||||||
|
if current_section == "D" and is_section_d_end(page_text):
|
||||||
|
logger.debug(f"Section D ended on page {page_num}")
|
||||||
|
current_section = None
|
||||||
|
|
||||||
|
# Section E end
|
||||||
|
if current_section == "E" and is_section_e_end(page_text):
|
||||||
|
logger.debug(f"Section E ended on page {page_num}")
|
||||||
|
current_section = None
|
||||||
|
|
||||||
|
# Check for section starts
|
||||||
|
if current_section is None and find_section_b_start(page_text):
|
||||||
|
current_section = "B"
|
||||||
|
sections_found.add("B")
|
||||||
logger.debug(f"Found section B on page {page_num}")
|
logger.debug(f"Found section B on page {page_num}")
|
||||||
|
|
||||||
if not in_section_b:
|
if current_section is None and find_section_d_start(page_text):
|
||||||
|
current_section = "D"
|
||||||
|
sections_found.add("D")
|
||||||
|
logger.debug(f"Found section D on page {page_num}")
|
||||||
|
|
||||||
|
if current_section is None and find_section_e_start(page_text):
|
||||||
|
current_section = "E"
|
||||||
|
sections_found.add("E")
|
||||||
|
logger.debug(f"Found section E on page {page_num}")
|
||||||
|
|
||||||
|
if current_section is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for section end (still process this page before breaking)
|
# Select the appropriate transaction list
|
||||||
reached_section_end = is_section_end(page_text)
|
if current_section == "B":
|
||||||
if reached_section_end:
|
target_list = purchases
|
||||||
logger.debug(f"Found section end on page {page_num}")
|
elif current_section == "D":
|
||||||
|
target_list = other_charges
|
||||||
|
else: # "E"
|
||||||
|
target_list = voluntary_services
|
||||||
|
|
||||||
# Parse text line by line
|
# Parse text line by line
|
||||||
for line in page_text.split("\n"):
|
for line in page_text.split("\n"):
|
||||||
|
|
@ -251,42 +319,55 @@ def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
card_info = extract_card_holder(line)
|
# Extract card holder info (only in section B)
|
||||||
if card_info:
|
if current_section == "B":
|
||||||
card_suffix, card_holder_name = card_info
|
card_info = extract_card_holder(line)
|
||||||
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
|
if card_info:
|
||||||
continue
|
card_suffix, card_holder_name = card_info
|
||||||
|
if card_suffix not in seen_card_suffixes:
|
||||||
|
card_holders.append({
|
||||||
|
"card_suffix": card_suffix,
|
||||||
|
"name": card_holder_name
|
||||||
|
})
|
||||||
|
seen_card_suffixes.add(card_suffix)
|
||||||
|
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
transaction = parse_transaction_line(line)
|
transaction = parse_transaction_line(line)
|
||||||
if transaction:
|
if transaction:
|
||||||
transactions.append(transaction)
|
target_list.append(transaction)
|
||||||
logger.debug(f"Extracted transaction: {transaction['reference']}")
|
logger.debug(f"Extracted {current_section} transaction: {transaction['reference']}")
|
||||||
|
|
||||||
if reached_section_end:
|
if "B" not in sections_found:
|
||||||
break
|
|
||||||
|
|
||||||
if not section_b_found:
|
|
||||||
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
||||||
|
|
||||||
# Calculate summary
|
# Calculate summaries
|
||||||
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
|
def calculate_summary(txns):
|
||||||
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
|
total_crc = sum(t["amount_crc"] or 0 for t in txns)
|
||||||
|
total_usd = sum(t["amount_usd"] or 0 for t in txns)
|
||||||
|
return {
|
||||||
|
"total_crc": round(total_crc, 2),
|
||||||
|
"total_usd": round(total_usd, 2),
|
||||||
|
"count": len(txns)
|
||||||
|
}
|
||||||
|
|
||||||
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
|
total_transactions = len(purchases) + len(other_charges) + len(voluntary_services)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"source_file": pdf_path.name,
|
"source_file": pdf_path.name,
|
||||||
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||||
"statement_date": statement_date,
|
"statement_date": statement_date,
|
||||||
"total_transactions": len(transactions)
|
"total_transactions": total_transactions
|
||||||
},
|
},
|
||||||
"card_holder": card_holder,
|
"card_holders": card_holders,
|
||||||
"transactions": transactions,
|
"purchases": purchases,
|
||||||
|
"other_charges": other_charges,
|
||||||
|
"voluntary_services": voluntary_services,
|
||||||
"summary": {
|
"summary": {
|
||||||
"total_crc": round(total_crc, 2),
|
"purchases": calculate_summary(purchases),
|
||||||
"total_usd": round(total_usd, 2),
|
"other_charges": calculate_summary(other_charges),
|
||||||
"transaction_count": len(transactions)
|
"voluntary_services": calculate_summary(voluntary_services)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -346,9 +427,17 @@ Examples:
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
with open(args.output, "w", encoding="utf-8") as f:
|
||||||
json.dump(result, f, indent=indent, ensure_ascii=False)
|
json.dump(result, f, indent=indent, ensure_ascii=False)
|
||||||
|
|
||||||
print(f"Extracted {result['summary']['transaction_count']} transactions to {args.output}")
|
summary = result['summary']
|
||||||
print(f"Total CRC: {result['summary']['total_crc']:,.2f}")
|
print(f"Extracted {result['metadata']['total_transactions']} transactions to {args.output}")
|
||||||
print(f"Total USD: {result['summary']['total_usd']:,.2f}")
|
print(f" Purchases (B): {summary['purchases']['count']:3d} "
|
||||||
|
f"CRC {summary['purchases']['total_crc']:>12,.2f} "
|
||||||
|
f"USD {summary['purchases']['total_usd']:>10,.2f}")
|
||||||
|
print(f" Other charges (D): {summary['other_charges']['count']:3d} "
|
||||||
|
f"CRC {summary['other_charges']['total_crc']:>12,.2f} "
|
||||||
|
f"USD {summary['other_charges']['total_usd']:>10,.2f}")
|
||||||
|
print(f" Voluntary services (E): {summary['voluntary_services']['count']:3d} "
|
||||||
|
f"CRC {summary['voluntary_services']['total_crc']:>12,.2f} "
|
||||||
|
f"USD {summary['voluntary_services']['total_usd']:>10,.2f}")
|
||||||
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
print(f"Error: {e}", file=sys.stderr)
|
print(f"Error: {e}", file=sys.stderr)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue