Commit
·
1d54def
0
Parent(s):
Initial commit
Browse files- README.md +31 -0
- app.py +44 -0
- requirements.txt +8 -0
- utils/__pycache__/data_processor.cpython-39.pyc +0 -0
- utils/__pycache__/invoice_parser.cpython-39.pyc +0 -0
- utils/__pycache__/llm_analyzer.cpython-39.pyc +0 -0
- utils/__pycache__/report_generator.cpython-39.pyc +0 -0
- utils/data_processor.py +16 -0
- utils/invoice_parser.py +33 -0
- utils/llm_analyzer.py +71 -0
- utils/report_generator.py +55 -0
README.md
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Invoice Reader & Budget Categorizer
|
2 |
+
|
3 |
+
This project creates an application that processes uploaded invoice PDFs, categorizes expenses, analyzes spending patterns using LLMs, and provides budget optimization recommendations.
|
4 |
+
|
5 |
+
## Objective
|
6 |
+
Create an application that:
|
7 |
+
- Processes invoice PDFs to extract transaction details.
|
8 |
+
- Categorizes expenses into predefined categories.
|
9 |
+
- Analyzes spending trends over time.
|
10 |
+
- Provides actionable budget recommendations.
|
11 |
+
|
12 |
+
## Features
|
13 |
+
- PDF upload and parsing.
|
14 |
+
- Transaction categorization (Utilities, Entertainment, Groceries, Travel, Shopping, Other).
|
15 |
+
- Spending pattern analysis over time.
|
16 |
+
- Budget optimization suggestions.
|
17 |
+
- Visualizations for spending insights.
|
18 |
+
|
19 |
+
## Technical Stack
|
20 |
+
- **PDF Parsing**: `pdfplumber`
|
21 |
+
- **Data Processing**: `pandas`
|
22 |
+
- **LLM**: Hugging Face Transformers (`facebook/bart-large`)
|
23 |
+
- **Visualization**: `matplotlib`
|
24 |
+
- **Interface**: Gradio
|
25 |
+
- **Deployment**: Hugging Face Spaces
|
26 |
+
|
27 |
+
## Setup Instructions
|
28 |
+
1. Clone the repository:
|
29 |
+
```bash
|
30 |
+
git clone <your-repo-url>
|
31 |
+
cd invoice-reader-budget-categorizer
```
|
app.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from utils.invoice_parser import parse_invoice
|
3 |
+
from utils.data_processor import process_data
|
4 |
+
from utils.llm_analyzer import LLMAnalyzer
|
5 |
+
from utils.report_generator import generate_report
|
6 |
+
|
7 |
+
# Initialize the LLM analyzer
|
8 |
+
# Built once at import time: LLMAnalyzer's constructor loads its model
# pipelines, and process_invoice reuses this single instance on every call.
analyzer = LLMAnalyzer()
|
9 |
+
|
10 |
+
def process_invoice(pdf_file):
    """Run the invoice pipeline end to end for the Gradio callback.

    Args:
        pdf_file: Value delivered by the upload component (a path string);
            a falsy value means nothing was uploaded.

    Returns:
        str: The Markdown report, or a short human-readable message when
        there is nothing to report or any step raises.
    """
    try:
        if not pdf_file:
            return "No file uploaded."

        parsed_rows = parse_invoice(pdf_file)
        if not parsed_rows:
            return "No transactions found in the invoice."

        frame = process_data(parsed_rows)
        if frame.empty:
            return "No valid transactions after processing."

        categorized = analyzer.categorize_transactions(frame)
        insights = analyzer.analyze_spending_patterns(categorized)
        advice = analyzer.generate_budget_recommendations(insights)
        return generate_report(categorized, insights, advice)
    except Exception as exc:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"An error occurred: {exc}"
|
34 |
+
|
35 |
+
# Gradio UI: one PDF upload in, one Markdown report out.
interface = gr.Interface(
    title="Invoice Reader & Budget Categorizer",
    description="Upload your invoice PDF to categorize transactions, analyze spending patterns, and get budget optimization recommendations.",
    fn=process_invoice,
    # type="filepath" hands the callback a path string rather than a file object.
    inputs=gr.File(type="filepath", label="Upload Invoice (PDF)"),
    outputs=gr.Markdown(label="Invoice Analysis Report"),
)

if __name__ == "__main__":
    # share=True requests a public link in addition to the local server.
    interface.launch(share=True)
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pdfplumber==0.11.4
|
2 |
+
pandas==2.2.3
|
3 |
+
transformers==4.44.2
|
4 |
+
torch==2.4.1
|
5 |
+
gradio==4.44.0
|
6 |
+
matplotlib==3.9.2
|
7 |
+
numpy==1.26.4
|
8 |
+
tabulate
|
utils/__pycache__/data_processor.cpython-39.pyc
ADDED
Binary file (538 Bytes). View file
|
|
utils/__pycache__/invoice_parser.cpython-39.pyc
ADDED
Binary file (1 kB). View file
|
|
utils/__pycache__/llm_analyzer.cpython-39.pyc
ADDED
Binary file (2.78 kB). View file
|
|
utils/__pycache__/report_generator.cpython-39.pyc
ADDED
Binary file (1.81 kB). View file
|
|
utils/data_processor.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
def process_data(transactions):
    """Turn parsed transaction records into a typed DataFrame.

    Args:
        transactions: Records (dicts) that each carry at least ``date`` and
            ``amount`` entries.

    Returns:
        pandas.DataFrame: The records with ``date`` converted to datetimes,
        ``amount`` converted to float, and two derived columns: ``month``
        (monthly period) and ``week`` (ISO week number). An empty frame is
        returned untouched.
    """
    frame = pd.DataFrame(transactions)

    if not frame.empty:
        # Normalise the column types first ...
        parsed_dates = pd.to_datetime(frame["date"])
        frame["date"] = parsed_dates
        frame["amount"] = frame["amount"].astype(float)

        # ... then derive the time buckets used for grouping later on.
        frame["month"] = parsed_dates.dt.to_period("M")
        frame["week"] = parsed_dates.dt.isocalendar().week

    return frame
|
utils/invoice_parser.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pdfplumber
|
2 |
+
import re
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
def parse_invoice(pdf_file):
    """Extract transaction rows from the text of an invoice PDF.

    Every text line of every page is matched against the layout
    ``DD/MM/YYYY <vendor> <amount> <description>``. Lines that do not match
    are ignored; lines that match but whose date or amount cannot be
    converted are reported on stdout and skipped.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the caller in app.py
            passes a path string.

    Returns:
        list[dict]: One dict per parsed line with the keys ``date``
        (``datetime``), ``vendor`` (str), ``amount`` (float) and
        ``description`` (str).
    """
    # Compiled once per call instead of being rebuilt for every line.
    line_pattern = re.compile(r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)")
    transactions = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without extractable text must not crash the whole parse
            # (extract_text may return None in some pdfplumber versions).
            text = page.extract_text() or ""

            for line in text.split("\n"):
                match = line_pattern.match(line.strip())
                if not match:
                    continue

                date_str, vendor, amount, description = match.groups()
                try:
                    date = datetime.strptime(date_str, "%d/%m/%Y")
                    # Drop thousands separators before converting.
                    amount = float(amount.replace(",", ""))
                except ValueError as e:
                    # Both conversions signal bad input with ValueError.
                    print(f"Error parsing line: {line}, Error: {e}")
                    continue

                transactions.append({
                    "date": date,
                    "vendor": vendor.strip(),
                    "amount": amount,
                    "description": description.strip(),
                })

    return transactions
|
utils/llm_analyzer.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
class LLMAnalyzer:
    """Categorize transactions by keyword and describe spending with LLM pipelines."""

    # Ordered (keywords, category) rules. The first rule with a keyword found
    # in the lower-cased description wins; anything unmatched is "Other".
    _CATEGORY_RULES = (
        (("electricity", "water"), "Utilities"),
        (("movie", "game"), "Entertainment"),
        (("grocery", "food"), "Groceries"),
        (("travel", "flight"), "Travel"),
        (("shop", "amazon"), "Shopping"),
    )

    def __init__(self):
        """Load the summarization and text-generation pipelines."""
        # NOTE(review): both pipelines load the same facebook/bart-large
        # checkpoint; confirm it is suitable for the "text-generation" task.
        self.summarizer = pipeline("summarization", model="facebook/bart-large", clean_up_tokenization_spaces=True)
        self.text_generator = pipeline("text-generation", model="facebook/bart-large", clean_up_tokenization_spaces=True)
        # Labels that categorize_transactions can assign.
        self.predefined_categories = ["Utilities", "Entertainment", "Groceries", "Travel", "Shopping", "Other"]

    def _category_for(self, description):
        """Return the category of the first rule matching *description*, else "Other"."""
        lowered = description.lower()
        for keywords, category in self._CATEGORY_RULES:
            if any(keyword in lowered for keyword in keywords):
                return category
        return "Other"

    def categorize_transactions(self, df):
        """Attach a ``category`` to every transaction via keyword matching.

        Args:
            df: DataFrame with the columns ``date``, ``vendor``, ``amount``,
                ``description``, ``month`` and ``week``.

        Returns:
            pandas.DataFrame: A new frame holding exactly those six columns
            plus ``category``. An empty input is returned unchanged.
        """
        if df.empty:
            return df

        categorized = [
            {
                "date": t["date"],
                "vendor": t["vendor"],
                "amount": t["amount"],
                "description": t["description"],
                "month": t["month"],
                "week": t["week"],
                "category": self._category_for(t["description"]),
            }
            for t in df.to_dict(orient="records")
        ]

        return pd.DataFrame(categorized)

    def analyze_spending_patterns(self, df):
        """Summarize monthly, weekly and per-category totals as text.

        Args:
            df: Categorized transactions with ``month``, ``week``,
                ``category`` and ``amount`` columns.

        Returns:
            str: The summarizer's output for the aggregated totals, or a
            fixed message when *df* is empty.
        """
        if df.empty:
            return "No transactions available for analysis."

        # Month labels are converted to strings for better summarization.
        monthly_spending = df.groupby("month").agg({"amount": "sum"}).rename(index=str).to_dict()["amount"]
        weekly_spending = df.groupby("week").agg({"amount": "sum"}).to_dict()["amount"]
        category_summary = df.groupby("category").agg({"amount": "sum"}).to_dict()["amount"]

        analysis_text = (
            f"Monthly Spending Trends: {monthly_spending}\n"
            f"Weekly Spending Trends: {weekly_spending}\n"
            f"Category-wise Spending: {category_summary}"
        )

        summary = self.summarizer(analysis_text, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        return summary

    def generate_budget_recommendations(self, spending_analysis):
        """Ask the text-generation pipeline for budget recommendations.

        Args:
            spending_analysis: Text describing the spending, embedded
                verbatim in the prompt.

        Returns:
            str: The ``generated_text`` of the first returned sequence.
        """
        prompt = (
            f"Based on the following spending analysis:\n{spending_analysis}\n"
            f"Provide 3 actionable budget optimization recommendations to manage and reduce expenses. "
            f"Each recommendation should be concise, specific, and practical (e.g., 'Reduce dining out by 20%')."
        )

        # NOTE(review): temperature normally only takes effect when sampling
        # is enabled (do_sample=True) - confirm whether sampling was intended.
        recommendations = self.text_generator(prompt, max_length=200, num_return_sequences=1, temperature=0.7)[0]["generated_text"]
        return recommendations
|
utils/report_generator.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
import pandas as pd
|
3 |
+
import io
|
4 |
+
import base64
|
5 |
+
|
6 |
+
def _figure_to_base64(fig):
    """Render *fig* as PNG, close it, and return the image base64-encoded as text."""
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png")
    finally:
        # Close the figure even when rendering fails so it is not left open.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def generate_report(df, spending_analysis, recommendations):
    """Build the Markdown report shown to the user.

    Args:
        df: Categorized transactions; ``category``, ``month`` and ``amount``
            columns are read when the frame is not empty.
        spending_analysis: Text placed under "Spending Insights".
        recommendations: Text placed under "Budget Recommendations".

    Returns:
        str: Markdown containing the transaction table and both text
        sections. For a non-empty *df*, a pie chart of spending per category
        and a line chart of spending per month are appended as inline
        base64 PNG images.
    """
    report = f"""
## Invoice Reader & Budget Categorizer Report

### Categorized Transactions
{df.to_markdown(index=False)}

### Spending Insights
{spending_analysis}

### Budget Recommendations
{recommendations}
"""

    if df.empty:
        return report

    # Category-wise spending pie chart, drawn on its own figure.
    category_spending = df.groupby("category")["amount"].sum()
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(category_spending, labels=category_spending.index, autopct="%1.1f%%")
    ax.set_title("Category-wise Spending")
    category_plot = _figure_to_base64(fig)

    # Monthly spending trend line chart.
    monthly_spending = df.groupby("month")["amount"].sum()
    fig, ax = plt.subplots(figsize=(8, 4))
    monthly_spending.plot(kind="line", marker="o", ax=ax)
    ax.set_title("Monthly Spending Trend")
    ax.set_xlabel("Month")
    ax.set_ylabel("Amount")
    monthly_plot = _figure_to_base64(fig)

    report += f"""
### Visualizations
#### Category-wise Spending


#### Monthly Spending Trend

"""

    return report
|