Remove card suffix filtering (extract transactions for all cards; the detected suffix is still reported in card_holder)

This commit is contained in:
Fabian Montero 2026-03-09 13:59:03 -06:00
parent 62450842c3
commit a05f701f16
Signed by: fabian
GPG key ID: 8036F30EDBAC8447
2 changed files with 39 additions and 94 deletions

View file

@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]:
Parse amount string with comma thousands separator.
Handles trailing '-' for negative values.
"""
if not amount_str or not amount_str.strip():
if not amount_str:
return None
amount_str = amount_str.strip()
if not amount_str:
return None
# Check for trailing negative sign
is_negative = amount_str.endswith("-")
if is_negative:
amount_str = amount_str[:-1].strip()
amount_str = amount_str[:-1]
# Remove thousands separators (commas) and handle decimal point
# Format: 1,234.56 or 1,234,567.89
try:
amount_str = amount_str.replace(",", "")
amount = float(amount_str)
amount = float(amount_str.replace(",", ""))
return -amount if is_negative else amount
except ValueError:
return None
@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
return None
first_page_text = pdf.pages[0].extract_text() or ""
# Look for date patterns in the first page
# Common format: "Fecha de corte: DD-MMM-YY" or similar
date_matches = DATE_PATTERN.findall(first_page_text)
if date_matches:
# Use the first date found as statement date
day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper())
if month:
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
if not date_matches:
return None
return None
day, month_abbr, year = date_matches[0]
month = SPANISH_MONTHS.get(month_abbr.upper())
if not month:
return None
full_year = 2000 + int(year)
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
def find_section_b_start(page_text: str) -> bool:
@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool:
r"B\)\s*Detalle\s+de\s+compras",
r"Detalle\s+de\s+compras\s+del\s+periodo",
]
for pattern in patterns:
if re.search(pattern, page_text, re.IGNORECASE):
return True
return False
return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
def is_section_end(text: str) -> bool:
@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool:
r"Detalle\s+de\s+intereses",
r"D\)\s*Detalle",
]
for pattern in end_patterns:
if re.search(pattern, text, re.IGNORECASE):
return True
return False
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]:
reference = match.group(1)
date_str = match.group(2)
desc_and_loc = match.group(3).strip()
description = match.group(3).strip()
currency = match.group(4).upper()
amount_str = match.group(5)
is_negative = match.group(6) == "-"
# Parse date
date = parse_spanish_date(date_str)
if not date:
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
return None
# Parse amount
amount = parse_amount(amount_str)
if amount is None:
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]:
if is_negative:
amount = -amount
# Split description and location
# Location is typically at the end, often a short suffix like "ANILL", "San Jose"
# For now, keep everything as description
description = desc_and_loc
location = None
# Set amount in appropriate currency field
amount_crc = amount if currency == "CRC" else None
amount_usd = amount if currency == "USD" else None
return {
"reference": reference,
"date": date,
"description": description,
"location": location,
"location": None,
"currency": currency,
"amount_crc": amount_crc,
"amount_usd": amount_usd,
"amount_crc": amount if currency == "CRC" else None,
"amount_usd": amount if currency == "USD" else None,
}
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
"""
Extract transactions from a BAC credit card statement PDF.
Args:
pdf_path: Path to the PDF file
card_suffix: Last 4 digits of card to filter
verbose: Enable verbose logging
Returns:
@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
statement_date = extract_statement_date(pdf)
transactions = []
current_card_suffix = None
current_card_name = None
card_suffix = None
card_holder_name = None
in_section_b = False
section_b_found = False
card_suffix_found = False
# Start from page 2 (index 1) as page 1 is summary only
start_page = 1 if len(pdf.pages) > 1 else 0
@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
section_b_found = True
logger.debug(f"Found section B on page {page_num}")
# Check for section end
if in_section_b and is_section_end(page_text):
logger.debug(f"Found section end on page {page_num}")
# Still process this page, but mark we're ending
if not in_section_b:
continue
# Check for section end (still process this page before breaking)
reached_section_end = is_section_end(page_text)
if reached_section_end:
logger.debug(f"Found section end on page {page_num}")
# Parse text line by line
for line in page_text.split("\n"):
line = line.strip()
if not line:
continue
# Check for card holder line
card_info = extract_card_holder(line)
if card_info:
current_card_suffix, current_card_name = card_info
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
if current_card_suffix == card_suffix:
card_suffix_found = True
card_suffix, card_holder_name = card_info
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
continue
# Skip if we're not tracking the right card
if current_card_suffix != card_suffix:
continue
# Try to parse as transaction
transaction = parse_transaction_line(line)
if transaction:
transactions.append(transaction)
logger.debug(f"Extracted transaction: {transaction['reference']}")
# Check if we've passed section B
if in_section_b and is_section_end(page_text):
if reached_section_end:
break
if not section_b_found:
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
if not card_suffix_found:
raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
# Calculate summary
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
# Get card holder info
card_holder = None
if card_suffix_found:
card_holder = {
"card_suffix": card_suffix,
"name": current_card_name if current_card_suffix == card_suffix else None
}
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
return {
"metadata": {
"source_file": pdf_path.name,
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
"statement_date": statement_date,
"card_filter": card_suffix,
"total_transactions": len(transactions)
},
"card_holder": card_holder,
@ -340,8 +297,8 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
python bac_extract.py statement.pdf 1234 -o output.json -v
python bac_extract.py EstadodeCuenta.pdf --pretty
python bac_extract.py statement.pdf -o output.json -v
"""
)
@ -351,12 +308,6 @@ Examples:
help="Path to the BAC statement PDF"
)
parser.add_argument(
"card_suffix",
type=str,
help="Last 4 digits of card to filter (e.g., 1234)"
)
parser.add_argument(
"-o", "--output",
type=Path,
@ -378,11 +329,6 @@ Examples:
args = parser.parse_args()
# Validate card suffix
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
sys.exit(1)
# Validate PDF file exists
if not args.pdf_file.exists():
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
@ -393,7 +339,7 @@ Examples:
sys.exit(1)
try:
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
result = extract_transactions(args.pdf_file, args.verbose)
# Write output
indent = 2 if args.pretty else None