diff --git a/.gitignore b/.gitignore index 4fe82f1..fcd11ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ *.pdf *.json -*.png __pycache__/ testStatements/ diff --git a/CLAUDE.md b/CLAUDE.md index e4f77b3..5b058a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,14 +4,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Python tools for BAC Costa Rica credit card statement processing: -- `bac_extract.py`: Extracts transactions from statement PDFs to JSON -- `bac_analyze.py`: Analyzes JSON output with categorization and graphs +Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON. ## Dependencies -- pdfplumber (>=0.10.0) - PDF extraction -- matplotlib (>=3.5.0) - graphs (optional, only for bac_analyze.py --graph) +- pdfplumber (>=0.10.0) ## Commands @@ -19,22 +16,22 @@ Python tools for BAC Costa Rica credit card statement processing: # Run tests python testStatements/run_tests.py -# Extract transactions from PDF -python bac_extract.py statement.pdf --pretty -python bac_extract.py statement.pdf -o output.json -v +# Run extractor +python bac_extract.py [options] -# Analyze transactions (supports multiple JSON files) -python bac_analyze.py transactions.json -python bac_analyze.py *.json --graph all -python bac_analyze.py *.json --graph bar -o spending.png -python bac_analyze.py *.json --categories my_categories.json +# Examples +python bac_extract.py EstadodeCuenta.pdf --pretty +python bac_extract.py statement.pdf -o output.json -v ``` +Options: +- `-o, --output`: Output JSON path (default: transactions.json) +- `--pretty`: Pretty-print JSON +- `-v, --verbose`: Enable debug logging + ## Architecture -### bac_extract.py - -Extraction pipeline: +The extraction pipeline: 1. Validates PDF is a BAC statement (`is_bac_statement`) 2. Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns 3. Parses transactions matching `TRANSACTION_PATTERN` regex @@ -44,10 +41,7 @@ Key data structures: - `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys - `SPANISH_MONTHS`: Spanish month abbreviations for date parsing -### bac_analyze.py - -Analysis pipeline: -1. Loads transactions from one or more JSON files (purchases only) -2. Categorizes by matching description against patterns in `categories.json` -3. Aggregates by category and month, keeping CRC/USD separate -4. Outputs text summary and optional graphs (bar/pie/timeline/all) +Key parsing functions: +- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" +- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-" +- `matches_patterns`: Generic regex pattern matcher for section detection diff --git a/README.md b/README.md index 709c00d..a56332a 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,13 @@ -# BAC Statement Tools +# BAC Statement Extractor -Tools for processing BAC Costa Rica credit card statement PDFs. +Extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON. ## Dependencies - Python 3.10+ - pdfplumber (>=0.10.0) -- matplotlib (>=3.5.0) - optional, for graphs -## Extraction - -Extract transactions from statement PDFs to JSON. +## Usage ```bash python bac_extract.py [options] @@ -27,56 +24,6 @@ python bac_extract.py statement.pdf --pretty python bac_extract.py statement.pdf -o output.json -v ``` -## Analysis - -Analyze extracted transactions with category breakdowns and graphs. - -```bash -python bac_analyze.py [options] -``` - -**Options:** -- `--graph {bar,pie,timeline,all}`: Generate graph(s) -- `-o, --output`: Output file for graph (default: spending_.png) -- `--show`: Display graph interactively -- `--categories`: Custom categories file (default: categories.json) - -**Examples:** -```bash -# Text summary -python bac_analyze.py transactions.json - -# Analyze multiple statements -python bac_analyze.py *.json - -# Generate all graphs -python bac_analyze.py *.json --graph all - -# Generate bar chart with custom output -python bac_analyze.py *.json --graph bar -o spending.png - -# Use custom categories -python bac_analyze.py *.json --categories my_categories.json -``` - -## Categories - -Create a `categories.json` file to customize spending categories. Each category maps to a list of merchant name patterns (case-insensitive substring match). - -```json -{ - "Groceries": ["SUPERMARKET", "WALMART", "FRESH MARKET"], - "Gas": ["SERVICENTRO", "DELTA", "SHELL"], - "Restaurants": ["RESTAURANT", "CAFE", "PIZZA", "SUSHI"], - "Transportation": ["UBER", "TAXI", "PARKING"], - "Entertainment": ["CINEMA", "NETFLIX", "STEAM"], - "Utilities": ["ELECTRIC", "WATER", "INTERNET"], - "Subscriptions": ["SPOTIFY", "YOUTUBE", "CHATGPT"] -} -``` - -Transactions not matching any pattern are categorized as "Other". - ## Output Format ```json diff --git a/bac_analyze.py b/bac_analyze.py deleted file mode 100755 index c3808c0..0000000 --- a/bac_analyze.py +++ /dev/null @@ -1,251 +0,0 @@ -#!/usr/bin/env python3 -""" -BAC Spending Analysis Tool - -Analyzes transaction JSON output from bac_extract.py. -Provides spending categorization and visualization. -""" - -import argparse -import json -import sys -from collections import defaultdict -from pathlib import Path - -try: - import matplotlib.pyplot as plt - HAS_MATPLOTLIB = True -except ImportError: - HAS_MATPLOTLIB = False - - -def load_transactions(json_files: list[Path]) -> list[dict]: - """Load and merge transactions from multiple JSON files.""" - transactions = [] - for path in json_files: - with open(path, encoding="utf-8") as f: - data = json.load(f) - # Only include purchases, skip other_charges and voluntary_services - transactions.extend(data.get("purchases", [])) - return transactions - - -def load_categories(path: Path) -> dict[str, list[str]]: - """Load category patterns from JSON file.""" - with open(path, encoding="utf-8") as f: - return json.load(f) - - -def categorize(description: str, categories: dict[str, list[str]]) -> str: - """Return category for a transaction description.""" - desc_upper = description.upper() - for category, patterns in categories.items(): - for pattern in patterns: - if pattern.upper() in desc_upper: - return category - return "Other" - - -def aggregate_by_category( - transactions: list[dict], categories: dict[str, list[str]] -) -> dict[str, dict[str, float]]: - """Sum spending per category, separate CRC/USD.""" - result = defaultdict(lambda: {"crc": 0.0, "usd": 0.0}) - for txn in transactions: - cat = categorize(txn["description"], categories) - if txn["amount_crc"]: - result[cat]["crc"] += txn["amount_crc"] - if txn["amount_usd"]: - result[cat]["usd"] += txn["amount_usd"] - return dict(result) - - -def aggregate_by_month(transactions: list[dict]) -> dict[str, dict[str, float]]: - """Sum spending per month (YYYY-MM), separate CRC/USD.""" - result = defaultdict(lambda: {"crc": 0.0, "usd": 0.0}) - for txn in transactions: - month = txn["date"][:7] # YYYY-MM - if txn["amount_crc"]: - result[month]["crc"] += txn["amount_crc"] - if txn["amount_usd"]: - result[month]["usd"] += txn["amount_usd"] - return dict(result) - - -def print_summary(by_category: dict, by_month: dict): - """Print text summary to stdout.""" - print("=== Spending by Category ===") - - # Sort by CRC amount descending - sorted_cats = sorted(by_category.items(), key=lambda x: x[1]["crc"], reverse=True) - total_crc = 0.0 - total_usd = 0.0 - - for cat, amounts in sorted_cats: - crc, usd = amounts["crc"], amounts["usd"] - total_crc += crc - total_usd += usd - print(f"{cat:20} CRC {crc:>12,.2f} USD {usd:>8,.2f}") - - print("-" * 50) - print(f"{'Total':20} CRC {total_crc:>12,.2f} USD {total_usd:>8,.2f}") - - print("\n=== Monthly Spending ===") - for month in sorted(by_month.keys()): - amounts = by_month[month] - print(f"{month}: CRC {amounts['crc']:>12,.2f} USD {amounts['usd']:>8,.2f}") - - -def plot_bar(data: dict, output: Path, show: bool): - """Bar chart of category spending (CRC).""" - # Sort by amount descending - sorted_items = sorted(data.items(), key=lambda x: x[1]["crc"], reverse=True) - categories = [item[0] for item in sorted_items] - amounts = [item[1]["crc"] for item in sorted_items] - - fig, ax = plt.subplots(figsize=(10, 6)) - bars = ax.barh(categories, amounts, color="steelblue") - ax.set_xlabel("Amount (CRC)") - ax.set_title("Spending by Category") - ax.invert_yaxis() - - # Add value labels - for bar, amount in zip(bars, amounts): - ax.text(bar.get_width() + max(amounts) * 0.01, bar.get_y() + bar.get_height() / 2, - f"{amount:,.0f}", va="center", fontsize=9) - - plt.tight_layout() - plt.savefig(output, dpi=150) - print(f"Saved bar chart to {output}") - - if show: - plt.show() - plt.close() - - -def plot_pie(data: dict, output: Path, show: bool): - """Pie chart of category distribution (CRC).""" - # Filter out zero/negative and sort - filtered = {k: v["crc"] for k, v in data.items() if v["crc"] > 0} - sorted_items = sorted(filtered.items(), key=lambda x: x[1], reverse=True) - - categories = [item[0] for item in sorted_items] - amounts = [item[1] for item in sorted_items] - - fig, ax = plt.subplots(figsize=(10, 8)) - wedges, texts, autotexts = ax.pie( - amounts, labels=categories, autopct="%1.1f%%", - startangle=90, pctdistance=0.75 - ) - ax.set_title("Spending Distribution by Category (CRC)") - - plt.tight_layout() - plt.savefig(output, dpi=150) - print(f"Saved pie chart to {output}") - - if show: - plt.show() - plt.close() - - -def plot_timeline(data: dict, output: Path, show: bool): - """Line chart of monthly spending (CRC).""" - months = sorted(data.keys()) - amounts = [data[m]["crc"] for m in months] - - fig, ax = plt.subplots(figsize=(10, 6)) - ax.plot(months, amounts, marker="o", linewidth=2, markersize=8, color="steelblue") - ax.fill_between(months, amounts, alpha=0.3, color="steelblue") - - ax.set_xlabel("Month") - ax.set_ylabel("Amount (CRC)") - ax.set_title("Monthly Spending") - ax.tick_params(axis="x", rotation=45) - - # Add value labels - for month, amount in zip(months, amounts): - ax.annotate(f"{amount:,.0f}", (month, amount), - textcoords="offset points", xytext=(0, 10), - ha="center", fontsize=9) - - plt.tight_layout() - plt.savefig(output, dpi=150) - print(f"Saved timeline chart to {output}") - - if show: - plt.show() - plt.close() - - -def main(): - parser = argparse.ArgumentParser( - description="Analyze spending from BAC transaction JSON files" - ) - parser.add_argument( - "json_files", type=Path, nargs="+", - help="JSON files from bac_extract.py" - ) - parser.add_argument( - "--graph", choices=["bar", "pie", "timeline", "all"], - help="Generate graph type (use 'all' for all graphs)" - ) - parser.add_argument( - "-o", "--output", type=Path, - help="Output file for graph (default: spending_.png)" - ) - parser.add_argument( - "--show", action="store_true", - help="Display graph interactively" - ) - parser.add_argument( - "--categories", type=Path, default=Path("categories.json"), - help="Custom categories file (default: categories.json)" - ) - args = parser.parse_args() - - # Validate input files - for path in args.json_files: - if not path.exists(): - sys.exit(f"Error: File not found: {path}") - - # Check matplotlib early if graph requested - if args.graph and not HAS_MATPLOTLIB: - sys.exit("Error: matplotlib is required for graphs. Install with: pip install matplotlib") - - # Load categories - if not args.categories.exists(): - sys.exit(f"Error: Categories file not found: {args.categories}") - categories = load_categories(args.categories) - - # Load transactions - transactions = load_transactions(args.json_files) - if not transactions: - sys.exit("Error: No transactions found in input files") - - # Aggregate data - by_category = aggregate_by_category(transactions, categories) - by_month = aggregate_by_month(transactions) - - # Print summary - print_summary(by_category, by_month) - - # Generate graph if requested - if args.graph: - if args.graph == "all": - prefix = args.output.stem if args.output else "spending" - suffix = args.output.suffix if args.output else ".png" - plot_bar(by_category, Path(f"{prefix}_bar{suffix}"), args.show) - plot_pie(by_category, Path(f"{prefix}_pie{suffix}"), args.show) - plot_timeline(by_month, Path(f"{prefix}_timeline{suffix}"), args.show) - else: - output = args.output or Path(f"spending_{args.graph}.png") - if args.graph == "bar": - plot_bar(by_category, output, args.show) - elif args.graph == "pie": - plot_pie(by_category, output, args.show) - elif args.graph == "timeline": - plot_timeline(by_month, output, args.show) - - -if __name__ == "__main__": - main()