From a05f701f1662b83228279fc8f35f2040bc31b728 Mon Sep 17 00:00:00 2001 From: Fabian Montero Date: Mon, 9 Mar 2026 13:59:03 -0600 Subject: [PATCH] remove card suffix functionality --- CLAUDE.md | 9 ++-- bac_extract.py | 124 ++++++++++++++----------------------------------- 2 files changed, 39 insertions(+), 94 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 08dc084..ee95b39 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa ## Usage ```bash -python bac_extract.py <pdf_file> <card_suffix> [options] +python bac_extract.py <pdf_file> [options] # Examples -python bac_extract.py EstadodeCuenta.pdf 1234 --pretty -python bac_extract.py statement.pdf 1234 -o output.json -v +python bac_extract.py EstadodeCuenta.pdf --pretty +python bac_extract.py statement.pdf -o output.json -v ``` Options: @@ -31,8 +31,7 @@ The extraction pipeline: 1. Validates PDF is a BAC statement (`is_bac_statement`) 2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`) 3. Extracts tables page-by-page using pdfplumber -4. Filters transactions by card suffix (last 4 digits) -5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators +4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators Key parsing functions: - `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" diff --git a/bac_extract.py b/bac_extract.py index 850bef1..adff408 100755 --- a/bac_extract.py +++ b/bac_extract.py @@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]: Parse amount string with comma thousands separator. Handles trailing '-' for negative values. 
""" - if not amount_str or not amount_str.strip(): + if not amount_str: return None amount_str = amount_str.strip() + if not amount_str: + return None - # Check for trailing negative sign is_negative = amount_str.endswith("-") if is_negative: - amount_str = amount_str[:-1].strip() + amount_str = amount_str[:-1] - # Remove thousands separators (commas) and handle decimal point - # Format: 1,234.56 or 1,234,567.89 try: - amount_str = amount_str.replace(",", "") - amount = float(amount_str) + amount = float(amount_str.replace(",", "")) return -amount if is_negative else amount except ValueError: return None @@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]: return None first_page_text = pdf.pages[0].extract_text() or "" - - # Look for date patterns in the first page - # Common format: "Fecha de corte: DD-MMM-YY" or similar date_matches = DATE_PATTERN.findall(first_page_text) - if date_matches: - # Use the first date found as statement date - day, month_abbr, year = date_matches[0] - month = SPANISH_MONTHS.get(month_abbr.upper()) - if month: - full_year = 2000 + int(year) - return f"{full_year:04d}-{month:02d}-{int(day):02d}" + if not date_matches: + return None - return None + day, month_abbr, year = date_matches[0] + month = SPANISH_MONTHS.get(month_abbr.upper()) + if not month: + return None + + full_year = 2000 + int(year) + return f"{full_year:04d}-{month:02d}-{int(day):02d}" def find_section_b_start(page_text: str) -> bool: @@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool: r"B\)\s*Detalle\s+de\s+compras", r"Detalle\s+de\s+compras\s+del\s+periodo", ] - for pattern in patterns: - if re.search(pattern, page_text, re.IGNORECASE): - return True - return False + return any(re.search(p, page_text, re.IGNORECASE) for p in patterns) def is_section_end(text: str) -> bool: @@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool: r"Detalle\s+de\s+intereses", r"D\)\s*Detalle", ] - for pattern in end_patterns: - if 
re.search(pattern, text, re.IGNORECASE): - return True - return False + return any(re.search(p, text, re.IGNORECASE) for p in end_patterns) def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]: @@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]: reference = match.group(1) date_str = match.group(2) - desc_and_loc = match.group(3).strip() + description = match.group(3).strip() currency = match.group(4).upper() amount_str = match.group(5) is_negative = match.group(6) == "-" - # Parse date date = parse_spanish_date(date_str) if not date: logger.warning(f"Could not parse date '{date_str}' for reference {reference}") return None - # Parse amount amount = parse_amount(amount_str) if amount is None: logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}") @@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]: if is_negative: amount = -amount - # Split description and location - # Location is typically at the end, often a short suffix like "ANILL", "San Jose" - # For now, keep everything as description - description = desc_and_loc - location = None - - # Set amount in appropriate currency field - amount_crc = amount if currency == "CRC" else None - amount_usd = amount if currency == "USD" else None - return { "reference": reference, "date": date, "description": description, - "location": location, + "location": None, "currency": currency, - "amount_crc": amount_crc, - "amount_usd": amount_usd, + "amount_crc": amount if currency == "CRC" else None, + "amount_usd": amount if currency == "USD" else None, } -def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict: +def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict: """ Extract transactions from a BAC credit card statement PDF. 
Args: pdf_path: Path to the PDF file - card_suffix: Last 4 digits of card to filter verbose: Enable verbose logging Returns: @@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False statement_date = extract_statement_date(pdf) transactions = [] - current_card_suffix = None - current_card_name = None + card_suffix = None + card_holder_name = None in_section_b = False section_b_found = False - card_suffix_found = False # Start from page 2 (index 1) as page 1 is summary only start_page = 1 if len(pdf.pages) > 1 else 0 @@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False section_b_found = True logger.debug(f"Found section B on page {page_num}") - # Check for section end - if in_section_b and is_section_end(page_text): - logger.debug(f"Found section end on page {page_num}") - # Still process this page, but mark we're ending - if not in_section_b: continue + # Check for section end (still process this page before breaking) + reached_section_end = is_section_end(page_text) + if reached_section_end: + logger.debug(f"Found section end on page {page_num}") + # Parse text line by line for line in page_text.split("\n"): line = line.strip() if not line: continue - # Check for card holder line card_info = extract_card_holder(line) if card_info: - current_card_suffix, current_card_name = card_info - logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}") - if current_card_suffix == card_suffix: - card_suffix_found = True + card_suffix, card_holder_name = card_info + logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}") continue - # Skip if we're not tracking the right card - if current_card_suffix != card_suffix: - continue - - # Try to parse as transaction transaction = parse_transaction_line(line) if transaction: transactions.append(transaction) logger.debug(f"Extracted transaction: {transaction['reference']}") - # Check if we've passed 
section B - if in_section_b and is_section_end(page_text): + if reached_section_end: break if not section_b_found: raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF") - if not card_suffix_found: - raise ValueError(f"Card suffix '{card_suffix}' not found in statement") - # Calculate summary total_crc = sum(t["amount_crc"] or 0 for t in transactions) total_usd = sum(t["amount_usd"] or 0 for t in transactions) - # Get card holder info - card_holder = None - if card_suffix_found: - card_holder = { - "card_suffix": card_suffix, - "name": current_card_name if current_card_suffix == card_suffix else None - } + card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None return { "metadata": { "source_file": pdf_path.name, "extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), "statement_date": statement_date, - "card_filter": card_suffix, "total_transactions": len(transactions) }, "card_holder": card_holder, @@ -340,8 +297,8 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - python bac_extract.py EstadodeCuenta.pdf 1234 --pretty - python bac_extract.py statement.pdf 1234 -o output.json -v + python bac_extract.py EstadodeCuenta.pdf --pretty + python bac_extract.py statement.pdf -o output.json -v """ ) @@ -351,12 +308,6 @@ Examples: help="Path to the BAC statement PDF" ) - parser.add_argument( - "card_suffix", - type=str, - help="Last 4 digits of card to filter (e.g., 1234)" - ) - parser.add_argument( "-o", "--output", type=Path, @@ -378,11 +329,6 @@ Examples: args = parser.parse_args() - # Validate card suffix - if not args.card_suffix.isdigit() or len(args.card_suffix) != 4: - print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr) - sys.exit(1) - # Validate PDF file exists if not args.pdf_file.exists(): print(f"Error: File not found: {args.pdf_file}", file=sys.stderr) @@ -393,7 +339,7 @@ Examples: 
sys.exit(1) try: - result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose) + result = extract_transactions(args.pdf_file, args.verbose) # Write output indent = 2 if args.pretty else None