remove card suffix functionality

This commit is contained in:
Fabian Montero 2026-03-09 13:59:03 -06:00
parent 62450842c3
commit a05f701f16
Signed by: fabian
GPG key ID: 8036F30EDBAC8447
2 changed files with 39 additions and 94 deletions

View file

@@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa
## Usage ## Usage
```bash ```bash
python bac_extract.py <pdf_file> <card_suffix> [options] python bac_extract.py <pdf_file> [options]
# Examples # Examples
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v python bac_extract.py statement.pdf -o output.json -v
``` ```
Options: Options:
@@ -31,8 +31,7 @@ The extraction pipeline:
1. Validates PDF is a BAC statement (`is_bac_statement`) 1. Validates PDF is a BAC statement (`is_bac_statement`)
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) 2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
3. Extracts tables page-by-page using pdfplumber 3. Extracts tables page-by-page using pdfplumber
4. Filters transactions by card suffix (last 4 digits) 4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
Key parsing functions: Key parsing functions:
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"

View file

@@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]:
Parse amount string with comma thousands separator. Parse amount string with comma thousands separator.
Handles trailing '-' for negative values. Handles trailing '-' for negative values.
""" """
if not amount_str or not amount_str.strip(): if not amount_str:
return None return None
amount_str = amount_str.strip() amount_str = amount_str.strip()
if not amount_str:
return None
# Check for trailing negative sign
is_negative = amount_str.endswith("-") is_negative = amount_str.endswith("-")
if is_negative: if is_negative:
amount_str = amount_str[:-1].strip() amount_str = amount_str[:-1]
# Remove thousands separators (commas) and handle decimal point
# Format: 1,234.56 or 1,234,567.89
try: try:
amount_str = amount_str.replace(",", "") amount = float(amount_str.replace(",", ""))
amount = float(amount_str)
return -amount if is_negative else amount return -amount if is_negative else amount
except ValueError: except ValueError:
return None return None
@@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
return None return None
first_page_text = pdf.pages[0].extract_text() or "" first_page_text = pdf.pages[0].extract_text() or ""
# Look for date patterns in the first page
# Common format: "Fecha de corte: DD-MMM-YY" or similar
date_matches = DATE_PATTERN.findall(first_page_text) date_matches = DATE_PATTERN.findall(first_page_text)
if date_matches: if not date_matches:
# Use the first date found as statement date return None
day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper())
if month:
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
return None day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper())
if not month:
return None
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
def find_section_b_start(page_text: str) -> bool: def find_section_b_start(page_text: str) -> bool:
@@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool:
r"B\)\s*Detalle\s+de\s+compras", r"B\)\s*Detalle\s+de\s+compras",
r"Detalle\s+de\s+compras\s+del\s+periodo", r"Detalle\s+de\s+compras\s+del\s+periodo",
] ]
for pattern in patterns: return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
if re.search(pattern, page_text, re.IGNORECASE):
return True
return False
def is_section_end(text: str) -> bool: def is_section_end(text: str) -> bool:
@@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool:
r"Detalle\s+de\s+intereses", r"Detalle\s+de\s+intereses",
r"D\)\s*Detalle", r"D\)\s*Detalle",
] ]
for pattern in end_patterns: return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
if re.search(pattern, text, re.IGNORECASE):
return True
return False
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
@@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]:
reference = match.group(1) reference = match.group(1)
date_str = match.group(2) date_str = match.group(2)
desc_and_loc = match.group(3).strip() description = match.group(3).strip()
currency = match.group(4).upper() currency = match.group(4).upper()
amount_str = match.group(5) amount_str = match.group(5)
is_negative = match.group(6) == "-" is_negative = match.group(6) == "-"
# Parse date
date = parse_spanish_date(date_str) date = parse_spanish_date(date_str)
if not date: if not date:
logger.warning(f"Could not parse date '{date_str}' for reference {reference}") logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
return None return None
# Parse amount
amount = parse_amount(amount_str) amount = parse_amount(amount_str)
if amount is None: if amount is None:
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}") logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
@@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]:
if is_negative: if is_negative:
amount = -amount amount = -amount
# Split description and location
# Location is typically at the end, often a short suffix like "ANILL", "San Jose"
# For now, keep everything as description
description = desc_and_loc
location = None
# Set amount in appropriate currency field
amount_crc = amount if currency == "CRC" else None
amount_usd = amount if currency == "USD" else None
return { return {
"reference": reference, "reference": reference,
"date": date, "date": date,
"description": description, "description": description,
"location": location, "location": None,
"currency": currency, "currency": currency,
"amount_crc": amount_crc, "amount_crc": amount if currency == "CRC" else None,
"amount_usd": amount_usd, "amount_usd": amount if currency == "USD" else None,
} }
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict: def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
""" """
Extract transactions from a BAC credit card statement PDF. Extract transactions from a BAC credit card statement PDF.
Args: Args:
pdf_path: Path to the PDF file pdf_path: Path to the PDF file
card_suffix: Last 4 digits of card to filter
verbose: Enable verbose logging verbose: Enable verbose logging
Returns: Returns:
@@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
statement_date = extract_statement_date(pdf) statement_date = extract_statement_date(pdf)
transactions = [] transactions = []
current_card_suffix = None card_suffix = None
current_card_name = None card_holder_name = None
in_section_b = False in_section_b = False
section_b_found = False section_b_found = False
card_suffix_found = False
# Start from page 2 (index 1) as page 1 is summary only # Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0 start_page = 1 if len(pdf.pages) > 1 else 0
@@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
section_b_found = True section_b_found = True
logger.debug(f"Found section B on page {page_num}") logger.debug(f"Found section B on page {page_num}")
# Check for section end
if in_section_b and is_section_end(page_text):
logger.debug(f"Found section end on page {page_num}")
# Still process this page, but mark we're ending
if not in_section_b: if not in_section_b:
continue continue
# Check for section end (still process this page before breaking)
reached_section_end = is_section_end(page_text)
if reached_section_end:
logger.debug(f"Found section end on page {page_num}")
# Parse text line by line # Parse text line by line
for line in page_text.split("\n"): for line in page_text.split("\n"):
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
# Check for card holder line
card_info = extract_card_holder(line) card_info = extract_card_holder(line)
if card_info: if card_info:
current_card_suffix, current_card_name = card_info card_suffix, card_holder_name = card_info
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}") logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
if current_card_suffix == card_suffix:
card_suffix_found = True
continue continue
# Skip if we're not tracking the right card
if current_card_suffix != card_suffix:
continue
# Try to parse as transaction
transaction = parse_transaction_line(line) transaction = parse_transaction_line(line)
if transaction: if transaction:
transactions.append(transaction) transactions.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}") logger.debug(f"Extracted transaction: {transaction['reference']}")
# Check if we've passed section B if reached_section_end:
if in_section_b and is_section_end(page_text):
break break
if not section_b_found: if not section_b_found:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
if not card_suffix_found:
raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
# Calculate summary # Calculate summary
total_crc = sum(t["amount_crc"] or 0 for t in transactions) total_crc = sum(t["amount_crc"] or 0 for t in transactions)
total_usd = sum(t["amount_usd"] or 0 for t in transactions) total_usd = sum(t["amount_usd"] or 0 for t in transactions)
# Get card holder info card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
card_holder = None
if card_suffix_found:
card_holder = {
"card_suffix": card_suffix,
"name": current_card_name if current_card_suffix == card_suffix else None
}
return { return {
"metadata": { "metadata": {
"source_file": pdf_path.name, "source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date, "statement_date": statement_date,
"card_filter": card_suffix,
"total_transactions": len(transactions) "total_transactions": len(transactions)
}, },
"card_holder": card_holder, "card_holder": card_holder,
@@ -340,8 +297,8 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v python bac_extract.py statement.pdf -o output.json -v
""" """
) )
@@ -351,12 +308,6 @@ Examples:
help="Path to the BAC statement PDF" help="Path to the BAC statement PDF"
) )
parser.add_argument(
"card_suffix",
type=str,
help="Last 4 digits of card to filter (e.g., 1234)"
)
parser.add_argument( parser.add_argument(
"-o", "--output", "-o", "--output",
type=Path, type=Path,
@@ -378,11 +329,6 @@ Examples:
args = parser.parse_args() args = parser.parse_args()
# Validate card suffix
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
sys.exit(1)
# Validate PDF file exists # Validate PDF file exists
if not args.pdf_file.exists(): if not args.pdf_file.exists():
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr) print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
@@ -393,7 +339,7 @@ Examples:
sys.exit(1) sys.exit(1)
try: try:
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose) result = extract_transactions(args.pdf_file, args.verbose)
# Write output # Write output
indent = 2 if args.pretty else None indent = 2 if args.pretty else None