remove card suffix functionality
This commit is contained in:
parent
62450842c3
commit
a05f701f16
2 changed files with 39 additions and 94 deletions
|
|
@ -13,11 +13,11 @@ Single-script Python tool that extracts credit card transactions from BAC Costa
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python bac_extract.py <pdf_file> <card_suffix> [options]
|
python bac_extract.py <pdf_file> [options]
|
||||||
|
|
||||||
# Examples
|
# Examples
|
||||||
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
|
python bac_extract.py EstadodeCuenta.pdf --pretty
|
||||||
python bac_extract.py statement.pdf 1234 -o output.json -v
|
python bac_extract.py statement.pdf -o output.json -v
|
||||||
```
|
```
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
|
|
@ -31,8 +31,7 @@ The extraction pipeline:
|
||||||
1. Validates PDF is a BAC statement (`is_bac_statement`)
|
1. Validates PDF is a BAC statement (`is_bac_statement`)
|
||||||
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
|
2. Locates section B via regex patterns (`find_section_b_start`, `is_section_end`)
|
||||||
3. Extracts tables page-by-page using pdfplumber
|
3. Extracts tables page-by-page using pdfplumber
|
||||||
4. Filters transactions by card suffix (last 4 digits)
|
4. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
|
||||||
5. Parses Spanish dates (D-MMM-YY format) and amounts with comma separators
|
|
||||||
|
|
||||||
Key parsing functions:
|
Key parsing functions:
|
||||||
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
|
- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15"
|
||||||
|
|
|
||||||
124
bac_extract.py
124
bac_extract.py
|
|
@ -72,21 +72,19 @@ def parse_amount(amount_str: str) -> Optional[float]:
|
||||||
Parse amount string with comma thousands separator.
|
Parse amount string with comma thousands separator.
|
||||||
Handles trailing '-' for negative values.
|
Handles trailing '-' for negative values.
|
||||||
"""
|
"""
|
||||||
if not amount_str or not amount_str.strip():
|
if not amount_str:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
amount_str = amount_str.strip()
|
amount_str = amount_str.strip()
|
||||||
|
if not amount_str:
|
||||||
|
return None
|
||||||
|
|
||||||
# Check for trailing negative sign
|
|
||||||
is_negative = amount_str.endswith("-")
|
is_negative = amount_str.endswith("-")
|
||||||
if is_negative:
|
if is_negative:
|
||||||
amount_str = amount_str[:-1].strip()
|
amount_str = amount_str[:-1]
|
||||||
|
|
||||||
# Remove thousands separators (commas) and handle decimal point
|
|
||||||
# Format: 1,234.56 or 1,234,567.89
|
|
||||||
try:
|
try:
|
||||||
amount_str = amount_str.replace(",", "")
|
amount = float(amount_str.replace(",", ""))
|
||||||
amount = float(amount_str)
|
|
||||||
return -amount if is_negative else amount
|
return -amount if is_negative else amount
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
@ -107,19 +105,17 @@ def extract_statement_date(pdf: pdfplumber.PDF) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
first_page_text = pdf.pages[0].extract_text() or ""
|
first_page_text = pdf.pages[0].extract_text() or ""
|
||||||
|
|
||||||
# Look for date patterns in the first page
|
|
||||||
# Common format: "Fecha de corte: DD-MMM-YY" or similar
|
|
||||||
date_matches = DATE_PATTERN.findall(first_page_text)
|
date_matches = DATE_PATTERN.findall(first_page_text)
|
||||||
if date_matches:
|
if not date_matches:
|
||||||
# Use the first date found as statement date
|
return None
|
||||||
day, month_abbr, year = date_matches[0]
|
|
||||||
month = SPANISH_MONTHS.get(month_abbr.upper())
|
|
||||||
if month:
|
|
||||||
full_year = 2000 + int(year)
|
|
||||||
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
|
||||||
|
|
||||||
return None
|
day, month_abbr, year = date_matches[0]
|
||||||
|
month = SPANISH_MONTHS.get(month_abbr.upper())
|
||||||
|
if not month:
|
||||||
|
return None
|
||||||
|
|
||||||
|
full_year = 2000 + int(year)
|
||||||
|
return f"{full_year:04d}-{month:02d}-{int(day):02d}"
|
||||||
|
|
||||||
|
|
||||||
def find_section_b_start(page_text: str) -> bool:
|
def find_section_b_start(page_text: str) -> bool:
|
||||||
|
|
@ -128,10 +124,7 @@ def find_section_b_start(page_text: str) -> bool:
|
||||||
r"B\)\s*Detalle\s+de\s+compras",
|
r"B\)\s*Detalle\s+de\s+compras",
|
||||||
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
r"Detalle\s+de\s+compras\s+del\s+periodo",
|
||||||
]
|
]
|
||||||
for pattern in patterns:
|
return any(re.search(p, page_text, re.IGNORECASE) for p in patterns)
|
||||||
if re.search(pattern, page_text, re.IGNORECASE):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def is_section_end(text: str) -> bool:
|
def is_section_end(text: str) -> bool:
|
||||||
|
|
@ -142,10 +135,7 @@ def is_section_end(text: str) -> bool:
|
||||||
r"Detalle\s+de\s+intereses",
|
r"Detalle\s+de\s+intereses",
|
||||||
r"D\)\s*Detalle",
|
r"D\)\s*Detalle",
|
||||||
]
|
]
|
||||||
for pattern in end_patterns:
|
return any(re.search(p, text, re.IGNORECASE) for p in end_patterns)
|
||||||
if re.search(pattern, text, re.IGNORECASE):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
def extract_card_holder(row_text: str) -> Optional[tuple[str, str]]:
|
||||||
|
|
@ -176,18 +166,16 @@ def parse_transaction_line(line: str) -> Optional[dict]:
|
||||||
|
|
||||||
reference = match.group(1)
|
reference = match.group(1)
|
||||||
date_str = match.group(2)
|
date_str = match.group(2)
|
||||||
desc_and_loc = match.group(3).strip()
|
description = match.group(3).strip()
|
||||||
currency = match.group(4).upper()
|
currency = match.group(4).upper()
|
||||||
amount_str = match.group(5)
|
amount_str = match.group(5)
|
||||||
is_negative = match.group(6) == "-"
|
is_negative = match.group(6) == "-"
|
||||||
|
|
||||||
# Parse date
|
|
||||||
date = parse_spanish_date(date_str)
|
date = parse_spanish_date(date_str)
|
||||||
if not date:
|
if not date:
|
||||||
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
|
logger.warning(f"Could not parse date '{date_str}' for reference {reference}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Parse amount
|
|
||||||
amount = parse_amount(amount_str)
|
amount = parse_amount(amount_str)
|
||||||
if amount is None:
|
if amount is None:
|
||||||
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
|
logger.warning(f"Could not parse amount '{amount_str}' for reference {reference}")
|
||||||
|
|
@ -195,34 +183,23 @@ def parse_transaction_line(line: str) -> Optional[dict]:
|
||||||
if is_negative:
|
if is_negative:
|
||||||
amount = -amount
|
amount = -amount
|
||||||
|
|
||||||
# Split description and location
|
|
||||||
# Location is typically at the end, often a short suffix like "ANILL", "San Jose"
|
|
||||||
# For now, keep everything as description
|
|
||||||
description = desc_and_loc
|
|
||||||
location = None
|
|
||||||
|
|
||||||
# Set amount in appropriate currency field
|
|
||||||
amount_crc = amount if currency == "CRC" else None
|
|
||||||
amount_usd = amount if currency == "USD" else None
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reference": reference,
|
"reference": reference,
|
||||||
"date": date,
|
"date": date,
|
||||||
"description": description,
|
"description": description,
|
||||||
"location": location,
|
"location": None,
|
||||||
"currency": currency,
|
"currency": currency,
|
||||||
"amount_crc": amount_crc,
|
"amount_crc": amount if currency == "CRC" else None,
|
||||||
"amount_usd": amount_usd,
|
"amount_usd": amount if currency == "USD" else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False) -> dict:
|
def extract_transactions(pdf_path: Path, verbose: bool = False) -> dict:
|
||||||
"""
|
"""
|
||||||
Extract transactions from a BAC credit card statement PDF.
|
Extract transactions from a BAC credit card statement PDF.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path: Path to the PDF file
|
pdf_path: Path to the PDF file
|
||||||
card_suffix: Last 4 digits of card to filter
|
|
||||||
verbose: Enable verbose logging
|
verbose: Enable verbose logging
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -241,11 +218,10 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
|
||||||
statement_date = extract_statement_date(pdf)
|
statement_date = extract_statement_date(pdf)
|
||||||
|
|
||||||
transactions = []
|
transactions = []
|
||||||
current_card_suffix = None
|
card_suffix = None
|
||||||
current_card_name = None
|
card_holder_name = None
|
||||||
in_section_b = False
|
in_section_b = False
|
||||||
section_b_found = False
|
section_b_found = False
|
||||||
card_suffix_found = False
|
|
||||||
|
|
||||||
# Start from page 2 (index 1) as page 1 is summary only
|
# Start from page 2 (index 1) as page 1 is summary only
|
||||||
start_page = 1 if len(pdf.pages) > 1 else 0
|
start_page = 1 if len(pdf.pages) > 1 else 0
|
||||||
|
|
@ -261,67 +237,48 @@ def extract_transactions(pdf_path: Path, card_suffix: str, verbose: bool = False
|
||||||
section_b_found = True
|
section_b_found = True
|
||||||
logger.debug(f"Found section B on page {page_num}")
|
logger.debug(f"Found section B on page {page_num}")
|
||||||
|
|
||||||
# Check for section end
|
|
||||||
if in_section_b and is_section_end(page_text):
|
|
||||||
logger.debug(f"Found section end on page {page_num}")
|
|
||||||
# Still process this page, but mark we're ending
|
|
||||||
|
|
||||||
if not in_section_b:
|
if not in_section_b:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check for section end (still process this page before breaking)
|
||||||
|
reached_section_end = is_section_end(page_text)
|
||||||
|
if reached_section_end:
|
||||||
|
logger.debug(f"Found section end on page {page_num}")
|
||||||
|
|
||||||
# Parse text line by line
|
# Parse text line by line
|
||||||
for line in page_text.split("\n"):
|
for line in page_text.split("\n"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for card holder line
|
|
||||||
card_info = extract_card_holder(line)
|
card_info = extract_card_holder(line)
|
||||||
if card_info:
|
if card_info:
|
||||||
current_card_suffix, current_card_name = card_info
|
card_suffix, card_holder_name = card_info
|
||||||
logger.debug(f"Found card holder: {current_card_suffix} - {current_card_name}")
|
logger.debug(f"Found card holder: {card_suffix} - {card_holder_name}")
|
||||||
if current_card_suffix == card_suffix:
|
|
||||||
card_suffix_found = True
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip if we're not tracking the right card
|
|
||||||
if current_card_suffix != card_suffix:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Try to parse as transaction
|
|
||||||
transaction = parse_transaction_line(line)
|
transaction = parse_transaction_line(line)
|
||||||
if transaction:
|
if transaction:
|
||||||
transactions.append(transaction)
|
transactions.append(transaction)
|
||||||
logger.debug(f"Extracted transaction: {transaction['reference']}")
|
logger.debug(f"Extracted transaction: {transaction['reference']}")
|
||||||
|
|
||||||
# Check if we've passed section B
|
if reached_section_end:
|
||||||
if in_section_b and is_section_end(page_text):
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if not section_b_found:
|
if not section_b_found:
|
||||||
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
raise ValueError("Section 'B) Detalle de compras del periodo' not found in PDF")
|
||||||
|
|
||||||
if not card_suffix_found:
|
|
||||||
raise ValueError(f"Card suffix '{card_suffix}' not found in statement")
|
|
||||||
|
|
||||||
# Calculate summary
|
# Calculate summary
|
||||||
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
|
total_crc = sum(t["amount_crc"] or 0 for t in transactions)
|
||||||
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
|
total_usd = sum(t["amount_usd"] or 0 for t in transactions)
|
||||||
|
|
||||||
# Get card holder info
|
card_holder = {"card_suffix": card_suffix, "name": card_holder_name} if card_suffix else None
|
||||||
card_holder = None
|
|
||||||
if card_suffix_found:
|
|
||||||
card_holder = {
|
|
||||||
"card_suffix": card_suffix,
|
|
||||||
"name": current_card_name if current_card_suffix == card_suffix else None
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"source_file": pdf_path.name,
|
"source_file": pdf_path.name,
|
||||||
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
"extraction_date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||||
"statement_date": statement_date,
|
"statement_date": statement_date,
|
||||||
"card_filter": card_suffix,
|
|
||||||
"total_transactions": len(transactions)
|
"total_transactions": len(transactions)
|
||||||
},
|
},
|
||||||
"card_holder": card_holder,
|
"card_holder": card_holder,
|
||||||
|
|
@ -340,8 +297,8 @@ def main():
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
python bac_extract.py EstadodeCuenta.pdf 1234 --pretty
|
python bac_extract.py EstadodeCuenta.pdf --pretty
|
||||||
python bac_extract.py statement.pdf 1234 -o output.json -v
|
python bac_extract.py statement.pdf -o output.json -v
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -351,12 +308,6 @@ Examples:
|
||||||
help="Path to the BAC statement PDF"
|
help="Path to the BAC statement PDF"
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"card_suffix",
|
|
||||||
type=str,
|
|
||||||
help="Last 4 digits of card to filter (e.g., 1234)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-o", "--output",
|
"-o", "--output",
|
||||||
type=Path,
|
type=Path,
|
||||||
|
|
@ -378,11 +329,6 @@ Examples:
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Validate card suffix
|
|
||||||
if not args.card_suffix.isdigit() or len(args.card_suffix) != 4:
|
|
||||||
print(f"Error: Card suffix must be exactly 4 digits, got '{args.card_suffix}'", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Validate PDF file exists
|
# Validate PDF file exists
|
||||||
if not args.pdf_file.exists():
|
if not args.pdf_file.exists():
|
||||||
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
|
print(f"Error: File not found: {args.pdf_file}", file=sys.stderr)
|
||||||
|
|
@ -393,7 +339,7 @@ Examples:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = extract_transactions(args.pdf_file, args.card_suffix, args.verbose)
|
result = extract_transactions(args.pdf_file, args.verbose)
|
||||||
|
|
||||||
# Write output
|
# Write output
|
||||||
indent = 2 if args.pretty else None
|
indent = 2 if args.pretty else None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue