diff --git a/.gitignore b/.gitignore index fcd11ea..4fe82f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pdf *.json +*.png __pycache__/ testStatements/ diff --git a/CLAUDE.md b/CLAUDE.md index 5b058a9..e4f77b3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,11 +4,14 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Single-script Python tool that extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON. +Python tools for BAC Costa Rica credit card statement processing: +- `bac_extract.py`: Extracts transactions from statement PDFs to JSON +- `bac_analyze.py`: Analyzes JSON output with categorization and graphs ## Dependencies -- pdfplumber (>=0.10.0) +- pdfplumber (>=0.10.0) - PDF extraction +- matplotlib (>=3.5.0) - graphs (optional, only for bac_analyze.py --graph) ## Commands @@ -16,22 +19,22 @@ Single-script Python tool that extracts credit card transactions from BAC Costa # Run tests python testStatements/run_tests.py -# Run extractor -python bac_extract.py [options] - -# Examples -python bac_extract.py EstadodeCuenta.pdf --pretty +# Extract transactions from PDF +python bac_extract.py statement.pdf --pretty python bac_extract.py statement.pdf -o output.json -v -``` -Options: -- `-o, --output`: Output JSON path (default: transactions.json) -- `--pretty`: Pretty-print JSON -- `-v, --verbose`: Enable debug logging +# Analyze transactions (supports multiple JSON files) +python bac_analyze.py transactions.json +python bac_analyze.py *.json --graph all +python bac_analyze.py *.json --graph bar -o spending.png +python bac_analyze.py *.json --categories my_categories.json +``` ## Architecture -The extraction pipeline: +### bac_extract.py + +Extraction pipeline: 1. Validates PDF is a BAC statement (`is_bac_statement`) 2. 
Iterates pages line-by-line, detecting section boundaries via `SECTIONS` dict patterns 3. Parses transactions matching `TRANSACTION_PATTERN` regex @@ -41,7 +44,10 @@ Key data structures: - `SECTIONS`: Maps section IDs (B/D/E) to start/end regex patterns and output keys - `SPANISH_MONTHS`: Spanish month abbreviations for date parsing -Key parsing functions: -- `parse_spanish_date`: Converts "15-ENE-25" to "2025-01-15" -- `parse_amount`: Handles "1,234.56" and trailing negatives "100.00-" -- `matches_patterns`: Generic regex pattern matcher for section detection +### bac_analyze.py + +Analysis pipeline: +1. Loads transactions from one or more JSON files (purchases only) +2. Categorizes by matching description against patterns in `categories.json` +3. Aggregates by category and month, keeping CRC/USD separate +4. Outputs text summary and optional graphs (bar/pie/timeline/all) diff --git a/README.md b/README.md index a56332a..709c00d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ -# BAC Statement Extractor +# BAC Statement Tools -Extracts credit card transactions from BAC Costa Rica statement PDFs. Parses sections B (purchases), D (other charges), and E (voluntary services) and outputs JSON. +Tools for processing BAC Costa Rica credit card statement PDFs. ## Dependencies - Python 3.10+ - pdfplumber (>=0.10.0) +- matplotlib (>=3.5.0) - optional, for graphs -## Usage +## Extraction + +Extract transactions from statement PDFs to JSON. ```bash python bac_extract.py [options] @@ -24,6 +27,56 @@ python bac_extract.py statement.pdf --pretty python bac_extract.py statement.pdf -o output.json -v ``` +## Analysis + +Analyze extracted transactions with category breakdowns and graphs. 
+ +```bash +python bac_analyze.py <json_files> [options] +``` + +**Options:** +- `--graph {bar,pie,timeline,all}`: Generate graph(s) +- `-o, --output`: Output file for graph (default: `spending_<type>.png`) +- `--show`: Display graph interactively +- `--categories`: Custom categories file (default: categories.json) + +**Examples:** +```bash +# Text summary +python bac_analyze.py transactions.json + +# Analyze multiple statements +python bac_analyze.py *.json + +# Generate all graphs +python bac_analyze.py *.json --graph all + +# Generate bar chart with custom output +python bac_analyze.py *.json --graph bar -o spending.png + +# Use custom categories +python bac_analyze.py *.json --categories my_categories.json +``` + +## Categories + +Create a `categories.json` file to customize spending categories. Each category maps to a list of merchant name patterns (case-insensitive substring match). + +```json +{ + "Groceries": ["SUPERMARKET", "WALMART", "FRESH MARKET"], + "Gas": ["SERVICENTRO", "DELTA", "SHELL"], + "Restaurants": ["RESTAURANT", "CAFE", "PIZZA", "SUSHI"], + "Transportation": ["UBER", "TAXI", "PARKING"], + "Entertainment": ["CINEMA", "NETFLIX", "STEAM"], + "Utilities": ["ELECTRIC", "WATER", "INTERNET"], + "Subscriptions": ["SPOTIFY", "YOUTUBE", "CHATGPT"] +} +``` + +Transactions not matching any pattern are categorized as "Other". + ## Output Format ```json diff --git a/bac_analyze.py b/bac_analyze.py new file mode 100755 index 0000000..c3808c0 --- /dev/null +++ b/bac_analyze.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +BAC Spending Analysis Tool + +Analyzes transaction JSON output from bac_extract.py. +Provides spending categorization and visualization. 
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path

try:
    import matplotlib.pyplot as plt
    HAS_MATPLOTLIB = True
except ImportError:
    # Graphs are optional: main() refuses --graph when matplotlib is missing.
    HAS_MATPLOTLIB = False


def load_transactions(json_files: list[Path]) -> list[dict]:
    """Load and merge the "purchases" lists from several JSON files.

    Only the "purchases" key is read; any other top-level keys are ignored.

    Args:
        json_files: Paths of the JSON files to read, in order.

    Returns:
        A single list with the purchases of every file concatenated.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a file is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or its top level is not a JSON object.
    """
    transactions: list[dict] = []
    for path in json_files:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError(f"{path}: expected a JSON object at the top level")
        # "or []" also tolerates an explicit `"purchases": null`.
        transactions.extend(data.get("purchases") or [])
    return transactions


def load_categories(path: Path) -> dict[str, list[str]]:
    """Load the category -> patterns mapping from a JSON file.

    Args:
        path: Location of the categories JSON file.

    Returns:
        The parsed mapping of category name to list of patterns.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or is not a JSON object.
    """
    with open(path, encoding="utf-8") as f:
        categories = json.load(f)
    if not isinstance(categories, dict):
        raise ValueError(
            f"{path}: expected a JSON object mapping categories to pattern lists"
        )
    return categories


def categorize(description: str, categories: dict[str, list[str]]) -> str:
    """Return the first category with a pattern contained in the description.

    Matching is a case-insensitive substring test. Categories are tried in the
    mapping's iteration order, so the first matching category wins.

    Args:
        description: Transaction description; None or "" never matches a
            non-empty pattern.
        categories: Mapping of category name to list of patterns.

    Returns:
        The matching category name, or "Other" when nothing matches.
    """
    desc_upper = (description or "").upper()
    for category, patterns in categories.items():
        if any(pattern.upper() in desc_upper for pattern in patterns):
            return category
    return "Other"


def _add_amounts(totals: dict[str, float], txn: dict) -> None:
    """Add a transaction's CRC/USD amounts to a running totals bucket."""
    for currency in ("crc", "usd"):
        amount = txn.get(f"amount_{currency}")
        # Missing, None and 0 amounts contribute nothing.
        if amount:
            totals[currency] += amount


def aggregate_by_category(
    transactions: list[dict], categories: dict[str, list[str]]
) -> dict[str, dict[str, float]]:
    """Sum spending per category, keeping CRC and USD separate.

    Args:
        transactions: Transaction dicts; "description", "amount_crc" and
            "amount_usd" are read, and each may be missing or None.
        categories: Mapping of category name to list of patterns.

    Returns:
        Mapping of category name to {"crc": total, "usd": total}.
    """
    result = defaultdict(lambda: {"crc": 0.0, "usd": 0.0})
    for txn in transactions:
        cat = categorize(txn.get("description", ""), categories)
        _add_amounts(result[cat], txn)
    return dict(result)


def aggregate_by_month(transactions: list[dict]) -> dict[str, dict[str, float]]:
    """Sum spending per month, keeping CRC and USD separate.

    The month key is the first seven characters of the "date" value
    (YYYY-MM for ISO dates). Transactions without a date are grouped under
    "unknown" instead of aborting the whole analysis.

    Args:
        transactions: Transaction dicts; "date", "amount_crc" and
            "amount_usd" are read.

    Returns:
        Mapping of month key to {"crc": total, "usd": total}.
    """
    result = defaultdict(lambda: {"crc": 0.0, "usd": 0.0})
    for txn in transactions:
        date = txn.get("date")
        month = date[:7] if date else "unknown"
        _add_amounts(result[month], txn)
    return dict(result)


def print_summary(by_category: dict, by_month: dict) -> None:
    """Print the category and monthly spending tables to stdout.

    Categories are listed by CRC amount, largest first, followed by a totals
    row; months are listed in ascending key order.

    Args:
        by_category: Output of aggregate_by_category().
        by_month: Output of aggregate_by_month().
    """
    print("=== Spending by Category ===")

    sorted_cats = sorted(by_category.items(), key=lambda x: x[1]["crc"], reverse=True)
    for cat, amounts in sorted_cats:
        crc, usd = amounts["crc"], amounts["usd"]
        print(f"{cat:20} CRC {crc:>12,.2f} USD {usd:>8,.2f}")

    total_crc = sum(amounts["crc"] for _, amounts in sorted_cats)
    total_usd = sum(amounts["usd"] for _, amounts in sorted_cats)
    print("-" * 50)
    print(f"{'Total':20} CRC {total_crc:>12,.2f} USD {total_usd:>8,.2f}")

    print("\n=== Monthly Spending ===")
    for month in sorted(by_month):
        amounts = by_month[month]
        print(f"{month}: CRC {amounts['crc']:>12,.2f} USD {amounts['usd']:>8,.2f}")


def _finish_figure(output: Path, show: bool, label: str) -> None:
    """Save the current figure, optionally display it, and always close it."""
    try:
        plt.tight_layout()
        plt.savefig(output, dpi=150)
        print(f"Saved {label} to {output}")
        if show:
            plt.show()
    finally:
        # Close even when saving fails so the figure is not left open.
        plt.close()


def plot_bar(data: dict, output: Path, show: bool):
    """Draw a horizontal bar chart of CRC spending per category.

    Args:
        data: Output of aggregate_by_category().
        output: File the chart is saved to.
        show: When true, also display the chart interactively.
    """
    ranked = sorted(data.items(), key=lambda x: x[1]["crc"], reverse=True)
    categories = [name for name, _ in ranked]
    amounts = [totals["crc"] for _, totals in ranked]

    _, ax = plt.subplots(figsize=(10, 6))
    bars = ax.barh(categories, amounts, color="steelblue")
    ax.set_xlabel("Amount (CRC)")
    ax.set_title("Spending by Category")
    ax.invert_yaxis()

    # Label offset is 1% of the largest bar; computed once, not per bar.
    label_offset = max(amounts, default=0) * 0.01
    for bar, amount in zip(bars, amounts):
        ax.text(bar.get_width() + label_offset, bar.get_y() + bar.get_height() / 2,
                f"{amount:,.0f}", va="center", fontsize=9)

    _finish_figure(output, show, "bar chart")


def plot_pie(data: dict, output: Path, show: bool):
    """Draw a pie chart of the CRC spending share per category.

    Categories whose CRC total is zero or negative are left out.

    Args:
        data: Output of aggregate_by_category().
        output: File the chart is saved to.
        show: When true, also display the chart interactively.
    """
    positive = {name: totals["crc"] for name, totals in data.items() if totals["crc"] > 0}
    ranked = sorted(positive.items(), key=lambda x: x[1], reverse=True)
    categories = [name for name, _ in ranked]
    amounts = [amount for _, amount in ranked]

    _, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        amounts, labels=categories, autopct="%1.1f%%",
        startangle=90, pctdistance=0.75
    )
    ax.set_title("Spending Distribution by Category (CRC)")

    _finish_figure(output, show, "pie chart")


def plot_timeline(data: dict, output: Path, show: bool):
    """Draw a line chart of CRC spending per month.

    Args:
        data: Output of aggregate_by_month().
        output: File the chart is saved to.
        show: When true, also display the chart interactively.
    """
    months = sorted(data)
    amounts = [data[m]["crc"] for m in months]

    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(months, amounts, marker="o", linewidth=2, markersize=8, color="steelblue")
    ax.fill_between(months, amounts, alpha=0.3, color="steelblue")

    ax.set_xlabel("Month")
    ax.set_ylabel("Amount (CRC)")
    ax.set_title("Monthly Spending")
    ax.tick_params(axis="x", rotation=45)

    for month, amount in zip(months, amounts):
        ax.annotate(f"{amount:,.0f}", (month, amount),
                    textcoords="offset points", xytext=(0, 10),
                    ha="center", fontsize=9)

    _finish_figure(output, show, "timeline chart")


def _suffixed_output(base: Path, kind: str) -> Path:
    """Return base with "_<kind>" inserted before its suffix, same directory."""
    return base.parent / f"{base.stem}_{kind}{base.suffix}"


def main():
    """Command-line entry point: load inputs, print the summary, draw graphs.

    Exits with an error message when an input file is missing or unreadable,
    when matplotlib is needed but not installed, or when no transactions
    are found.
    """
    parser = argparse.ArgumentParser(
        description="Analyze spending from BAC transaction JSON files"
    )
    parser.add_argument(
        "json_files", type=Path, nargs="+",
        help="JSON files from bac_extract.py"
    )
    parser.add_argument(
        "--graph", choices=["bar", "pie", "timeline", "all"],
        help="Generate graph type (use 'all' for all graphs)"
    )
    parser.add_argument(
        "-o", "--output", type=Path,
        help="Output file for graph (default: spending_<type>.png)"
    )
    parser.add_argument(
        "--show", action="store_true",
        help="Display graph interactively"
    )
    parser.add_argument(
        "--categories", type=Path, default=Path("categories.json"),
        help="Custom categories file (default: categories.json)"
    )
    args = parser.parse_args()

    # Validate input files
    for path in args.json_files:
        if not path.exists():
            sys.exit(f"Error: File not found: {path}")

    # Check matplotlib early if graph requested
    if args.graph and not HAS_MATPLOTLIB:
        sys.exit("Error: matplotlib is required for graphs. Install with: pip install matplotlib")

    # Load categories
    if not args.categories.exists():
        sys.exit(f"Error: Categories file not found: {args.categories}")
    try:
        categories = load_categories(args.categories)
    except (OSError, ValueError) as err:
        sys.exit(f"Error: Could not read categories file {args.categories}: {err}")

    # Load transactions
    try:
        transactions = load_transactions(args.json_files)
    except (OSError, ValueError) as err:
        sys.exit(f"Error: Could not read transactions: {err}")
    if not transactions:
        sys.exit("Error: No transactions found in input files")

    # Aggregate data
    by_category = aggregate_by_category(transactions, categories)
    by_month = aggregate_by_month(transactions)

    # Print summary
    print_summary(by_category, by_month)

    # Generate graph if requested
    if args.graph:
        plotters = {
            "bar": (plot_bar, by_category),
            "pie": (plot_pie, by_category),
            "timeline": (plot_timeline, by_month),
        }
        if args.graph == "all":
            # One file per graph type, named after -o and kept in its directory.
            base = args.output or Path("spending.png")
            for kind, (plot, data) in plotters.items():
                plot(data, _suffixed_output(base, kind), args.show)
        else:
            plot, data = plotters[args.graph]
            output = args.output or Path(f"spending_{args.graph}.png")
            plot(data, output, args.show)


if __name__ == "__main__":
    main()