Remove card suffix argument and per-card transaction filtering
This commit is contained in:
parent
62450842c3
commit
a05f701f16
2 changed files with 39 additions and 94 deletions
|
|
@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa
|
|||
## Usage
|
||||
|
||||
```bash
|
||||
python bac_extract.py <pdf_file> <card_suffix> [options]
|
||||
python bac_extract.py <pdf_file> [options]
|
||||
|
||||
# Examples
|
||||
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
|
||||
python bac_extract.py statement.pdf 1234 -o output.json -v
|
||||
python bac_extract.py EstadodeCuenta.pdf --pretty
|
||||
python bac_extract.py statement.pdf -o output.json -v
|
||||
```
|
||||
|
||||
Options:
|
||||
|
|
@ -31,8 +31,7 @@ The extraction pipeline:
|
|||
1. Validates PDF is a BAC statement (`is_bac_statement`)
|
||||
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
|
||||
3. Extracts tables page-by-page using pdfplumber
|
||||
4. Filters transactions by card suffix (last 4 digits)
|
||||
5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
|
||||
4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
|
||||
|
||||
Key parsing functions:
|
||||
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
|
||||
|
|
|
|||
124
bac_extract.py
124
bac_extract.py
|
|
@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]:
|
|||
Parse amount string with comma thousands separator.
|
||||
Handles trailing '-' for negative values.
|
||||
"""
|
||||
if not amount_str or not amount_str.strip():
|
||||
if not amount_str:
|
||||
return None
|
||||
|
||||
amount_str = amount_str.strip()
|
||||
if not amount_str:
|
||||
return None
|
||||
|
||||
# Check for trailing negative sign
|
||||
is_negative = amount_str.endswith("-")
|
||||
if is_negative:
|
||||
amount_str = amount_str[:-1].strip()
|
||||
amount_str = amount_str[:-1]
|
||||
|
||||
# Remove thousands separators (commas) and handle decimal point
|
||||
# Format: 1,234.56 or 1,234,567.89
|
||||
try:
|
||||
amount_str = amount_str.replace(",", "")
|
||||
amount = float(amount_str)
|
||||
amount = float(amount_str.replace(",", ""))
|
||||
return -amount if is_negative else amount
|
||||
except ValueError:
|
||||
return None
|
||||
|
|
@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
|
|||
return None
|
||||
|
||||
first_page_text = pdf.pages[0].extract_text() or ""
|
||||
|
||||
# Look for date patterns in the first page
|
||||
# Common format: "Fecha de corte: DD-MMM-YY" or similar
|
||||
date_matches = DATE_PATTERN.findall(first_page_text)
|
||||
if date_matches:
|
||||
# Use the first date found as statement date
|
||||
day, month_abbr, year = date_matches[0]
|
||||
month = SPANISH_MONTHS.get(month_abbr.upper())
|
||||
if month:
|
||||
full_year = 2000 + int(year)
|
||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
||||
if not date_matches:
|
||||
return None
|
||||
|
||||
return None
|
||||
day, month_abbr, year = date_matches[0]
|
||||
month = SPANISH_MONTHS.get(month_abbr.upper())
|
||||
if not month:
|
||||
return None
|
||||
|
||||
full_year = 2000 + int(year)
|
||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
||||
|
||||
|
||||
def find_section_b_start(page_text: str) -> bool:
|
||||
|
|
@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool:
|
|||
r"B\)\s*Detalle\s+de\s+compras",
|
||||
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
||||
]
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, page_text, re.IGNORECASE):
|
||||
return True
|
||||
return False
|
||||
return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
|
||||
|
||||
|
||||
def is_section_end(text: str) -> bool:
|
||||
|
|
@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool:
|
|||
r"Detalle\s+de\s+intereses",
|
||||
r"D\)\s*Detalle",
|
||||
]
|
||||
for pattern in end_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
return True
|
||||
return False
|
||||
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
||||
|
||||
|
||||
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
||||
|
|
@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]:
|
|||
|
||||
reference = match.group(1)
|
||||
date_str = match.group(2)
|
||||
desc_and_loc = match.group(3).strip()
|
||||
description = match.group(3).strip()
|
||||
currency = match.group(4).upper()
|
||||
amount_str = match.group(5)
|
||||
is_negative = match.group(6) == "-"
|
||||
|
||||
# Parse date
|
||||
date = parse_spanish_date(date_str)
|
||||
if not date:
|
||||
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
|
||||
return None
|
||||
|
||||
# Parse amount
|
||||
amount = parse_amount(amount_str)
|
||||
if amount is None:
|
||||
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
|
||||
|
|
@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]:
|
|||
if is_negative:
|
||||
amount = -amount
|
||||
|
||||
# Split description and location
|
||||
# Location is typically at the end, often a short suffix like "ANILL", "San Jose"
|
||||
# For now, keep everything as description
|
||||
description = desc_and_loc
|
||||
location = None
|
||||
|
||||
# Set amount in appropriate currency field
|
||||
amount_crc = amount if currency == "CRC" else None
|
||||
amount_usd = amount if currency == "USD" else None
|
||||
|
||||
return {
|
||||
"reference": reference,
|
||||
"date": date,
|
||||
"description": description,
|
||||
"location": location,
|
||||
"location": None,
|
||||
"currency": currency,
|
||||
"amount_crc": amount_crc,
|
||||
"amount_usd": amount_usd,
|
||||
"amount_crc": amount if currency == "CRC" else None,
|
||||
"amount_usd": amount if currency == "USD" else None,
|
||||
}
|
||||
|
||||
|
||||
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
|
||||
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||
"""
|
||||
Extract transactions from a BAC credit card statement PDF.
|
||||
|
||||
Args:
|
||||
pdf_path: Path to the PDF file
|
||||
card_suffix: Last 4 digits of card to filter
|
||||
verbose: Enable verbose logging
|
||||
|
||||
Returns:
|
||||
|
|
@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
|
|||
statement_date = extract_statement_date(pdf)
|
||||
|
||||
transactions = []
|
||||
current_card_suffix = None
|
||||
current_card_name = None
|
||||
card_suffix = None
|
||||
card_holder_name = None
|
||||
in_section_b = False
|
||||
section_b_found = False
|
||||
card_suffix_found = False
|
||||
|
||||
# Start from page 2 (index 1) as page 1 is summary only
|
||||
start_page = 1 if len(pdf.pages) > 1 else 0
|
||||
|
|
@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
|
|||
section_b_found = True
|
||||
logger.debug(f"Found section B on page {page_num}")
|
||||
|
||||
# Check for section end
|
||||
if in_section_b and is_section_end(page_text):
|
||||
logger.debug(f"Found section end on page {page_num}")
|
||||
# Still process this page, but mark we're ending
|
||||
|
||||
if not in_section_b:
|
||||
continue
|
||||
|
||||
# Check for section end (still process this page before breaking)
|
||||
reached_section_end = is_section_end(page_text)
|
||||
if reached_section_end:
|
||||
logger.debug(f"Found section end on page {page_num}")
|
||||
|
||||
# Parse text line by line
|
||||
for line in page_text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Check for card holder line
|
||||
card_info = extract_card_holder(line)
|
||||
if card_info:
|
||||
current_card_suffix, current_card_name = card_info
|
||||
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
|
||||
if current_card_suffix == card_suffix:
|
||||
card_suffix_found = True
|
||||
card_suffix, card_holder_name = card_info
|
||||
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
|
||||
continue
|
||||
|
||||
# Skip if we're not tracking the right card
|
||||
if current_card_suffix != card_suffix:
|
||||
continue
|
||||
|
||||
# Try to parse as transaction
|
||||
transaction = parse_transaction_line(line)
|
||||
if transaction:
|
||||
transactions.append(transaction)
|
||||
logger.debug(f"Extracted transaction: {transaction['reference']}")
|
||||
|
||||
# Check if we've passed section B
|
||||
if in_section_b and is_section_end(page_text):
|
||||
if reached_section_end:
|
||||
break
|
||||
|
||||
if not section_b_found:
|
||||
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
||||
|
||||
if not card_suffix_found:
|
||||
raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
|
||||
|
||||
# Calculate summary
|
||||
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
|
||||
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
|
||||
|
||||
# Get card holder info
|
||||
card_holder = None
|
||||
if card_suffix_found:
|
||||
card_holder = {
|
||||
"card_suffix": card_suffix,
|
||||
"name": current_card_name if current_card_suffix == card_suffix else None
|
||||
}
|
||||
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
|
||||
|
||||
return {
|
||||
"metadata": {
|
||||
"source_file": pdf_path.name,
|
||||
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||
"statement_date": statement_date,
|
||||
"card_filter": card_suffix,
|
||||
"total_transactions": len(transactions)
|
||||
},
|
||||
"card_holder": card_holder,
|
||||
|
|
@ -340,8 +297,8 @@ def main():
|
|||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
|
||||
python bac_extract.py statement.pdf 1234 -o output.json -v
|
||||
python bac_extract.py EstadodeCuenta.pdf --pretty
|
||||
python bac_extract.py statement.pdf -o output.json -v
|
||||
"""
|
||||
)
|
||||
|
||||
|
|
@ -351,12 +308,6 @@ Examples:
|
|||
help="Path to the BAC statement PDF"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"card_suffix",
|
||||
type=str,
|
||||
help="Last 4 digits of card to filter (e.g., 1234)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-o", "--output",
|
||||
type=Path,
|
||||
|
|
@ -378,11 +329,6 @@ Examples:
|
|||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate card suffix
|
||||
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
|
||||
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Validate PDF file exists
|
||||
if not args.pdf_file.exists():
|
||||
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
|
||||
|
|
@ -393,7 +339,7 @@ Examples:
|
|||
sys.exit(1)
|
||||
|
||||
try:
|
||||
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
|
||||
result = extract_transactions(args.pdf_file, args.verbose)
|
||||
|
||||
# Write output
|
||||
indent = 2 if args.pretty else None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue