varun321 committed on
Commit
1d54def
·
0 Parent(s):

Initial commit

Browse files
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Invoice Reader & Budget Categorizer
2
+
3
+ This project creates an application that processes uploaded invoice PDFs, categorizes expenses, analyzes spending patterns using LLMs, and provides budget optimization recommendations.
4
+
5
+ ## Objective
6
+ Create an application that:
7
+ - Processes invoice PDFs to extract transaction details.
8
+ - Categorizes expenses into predefined categories.
9
+ - Analyzes spending trends over time.
10
+ - Provides actionable budget recommendations.
11
+
12
+ ## Features
13
+ - PDF upload and parsing.
14
+ - Transaction categorization (Utilities, Entertainment, Groceries, Travel, Shopping, Other).
15
+ - Spending pattern analysis over time.
16
+ - Budget optimization suggestions.
17
+ - Visualizations for spending insights.
18
+
19
+ ## Technical Stack
20
+ - **PDF Parsing**: `pdfplumber`
21
+ - **Data Processing**: `pandas`
22
+ - **LLM**: Hugging Face Transformers (`facebook/bart-large`)
23
+ - **Visualization**: `matplotlib`
24
+ - **Interface**: Gradio
25
+ - **Deployment**: Hugging Face Spaces
26
+
27
+ ## Setup Instructions
28
+ 1. Clone the repository:
29
+ ```bash
30
+ git clone <your-repo-url>
31
+ cd invoice-reader-budget-categorizer
32
+ ```
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils.invoice_parser import parse_invoice
3
+ from utils.data_processor import process_data
4
+ from utils.llm_analyzer import LLMAnalyzer
5
+ from utils.report_generator import generate_report
6
+
7
# Create the shared analyzer once at import time; its constructor builds the
# transformers pipelines, and process_invoice reuses this single instance.
analyzer = LLMAnalyzer()
9
+
10
def process_invoice(pdf_file):
    """Run the invoice pipeline for one upload and return the text to display.

    Args:
        pdf_file: Path of the uploaded PDF (the Gradio input is configured
            with ``type="filepath"``), or a falsy value when nothing was
            uploaded.

    Returns:
        The Markdown report on success, otherwise a short human-readable
        message describing why no report could be produced.
    """
    try:
        if not pdf_file:
            return "No file uploaded."

        parsed_rows = parse_invoice(pdf_file)
        if not parsed_rows:
            return "No transactions found in the invoice."

        frame = process_data(parsed_rows)
        if frame.empty:
            return "No valid transactions after processing."

        # Categorize first: both the spending summary and the report need
        # the "category" column.
        categorized = analyzer.categorize_transactions(frame)
        insights = analyzer.analyze_spending_patterns(categorized)
        advice = analyzer.generate_budget_recommendations(insights)

        return generate_report(categorized, insights, advice)
    except Exception as exc:
        # UI boundary: show the failure to the user instead of raising.
        return f"An error occurred: {exc}"
34
+
35
# Gradio UI: one PDF upload in, one Markdown report out.
interface = gr.Interface(
    fn=process_invoice,
    # type="filepath" makes Gradio pass the upload to process_invoice as a path.
    inputs=gr.File(label="Upload Invoice (PDF)", type="filepath"),
    outputs=gr.Markdown(label="Invoice Analysis Report"),
    title="Invoice Reader & Budget Categorizer",
    description=(
        "Upload your invoice PDF to categorize transactions, analyze spending "
        "patterns, and get budget optimization recommendations."
    ),
)

if __name__ == "__main__":
    # share=True asks Gradio to generate a public link.
    interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pdfplumber==0.11.4
2
+ pandas==2.2.3
3
+ transformers==4.44.2
4
+ torch==2.4.1
5
+ gradio==4.44.0
6
+ matplotlib==3.9.2
7
+ numpy==1.26.4
8
+ tabulate
utils/__pycache__/data_processor.cpython-39.pyc ADDED
Binary file (538 Bytes). View file
 
utils/__pycache__/invoice_parser.cpython-39.pyc ADDED
Binary file (1 kB). View file
 
utils/__pycache__/llm_analyzer.cpython-39.pyc ADDED
Binary file (2.78 kB). View file
 
utils/__pycache__/report_generator.cpython-39.pyc ADDED
Binary file (1.81 kB). View file
 
utils/data_processor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
def process_data(transactions):
    """Build a typed transactions DataFrame with month/week helper columns.

    Rows whose ``date`` or ``amount`` cannot be parsed are dropped instead of
    aborting the whole batch, so a single malformed row no longer discards
    every other transaction. If nothing survives, the returned frame is empty
    and the caller can report that no valid transactions remain.

    Args:
        transactions: Iterable of records (e.g. dicts) that provide at least
            ``date`` and ``amount``.

    Returns:
        A DataFrame with ``date`` as datetime, ``amount`` as float, plus
        ``month`` (monthly period) and ``week`` (ISO week number). An empty
        input yields the empty DataFrame unchanged.

    Raises:
        KeyError: If the records are non-empty but lack ``date`` or ``amount``.
    """
    df = pd.DataFrame(transactions)

    if df.empty:
        return df

    # Coerce bad values to NaT/NaN so they can be filtered out below.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce").astype(float)
    df = df.dropna(subset=["date", "amount"]).reset_index(drop=True)

    # Time buckets used for the trend analysis.
    df["month"] = df["date"].dt.to_period("M")
    df["week"] = df["date"].dt.isocalendar().week

    return df
utils/invoice_parser.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import re
3
+ from datetime import datetime
4
+
5
# One transaction per line: "DD/MM/YYYY <vendor> <amount> <description>".
# Compiled once here instead of being rebuilt for every line of every page.
_TRANSACTION_LINE = re.compile(r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)")


def _parse_transaction_line(line):
    """Return the transaction dict described by ``line``, or ``None``."""
    match = _TRANSACTION_LINE.match(line.strip())
    if match is None:
        return None

    date_str, vendor, amount, description = match.groups()
    try:
        return {
            "date": datetime.strptime(date_str, "%d/%m/%Y"),
            "vendor": vendor.strip(),
            # Commas are stripped so amounts like "1,234.50" parse as floats.
            "amount": float(amount.replace(",", "")),
            "description": description.strip(),
        }
    except ValueError as e:
        # The pattern can match text that is not a real date or number
        # (e.g. "99/99/2024" or "1.2.3"); report the line and skip it.
        print(f"Error parsing line: {line}, Error: {e}")
        return None


def parse_invoice(pdf_file):
    """Extract transaction records from the text of an invoice PDF.

    Each page's extracted text is scanned line by line. Lines that match the
    ``DD/MM/YYYY vendor amount description`` layout become records; lines
    that do not match, or whose date/amount cannot be converted, are skipped.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        A list of dicts with ``date`` (``datetime``), ``vendor`` (``str``),
        ``amount`` (``float``) and ``description`` (``str``). The list is
        empty when no line matches.
    """
    transactions = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Treat a page without extractable text as empty instead of
            # failing on ``None.split``.
            text = page.extract_text() or ""
            for line in text.split("\n"):
                transaction = _parse_transaction_line(line)
                if transaction is not None:
                    transactions.append(transaction)

    return transactions
utils/llm_analyzer.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import pandas as pd
3
+
4
class LLMAnalyzer:
    """Categorize transactions and produce model-written spending text.

    Categorization is keyword-based; the two transformers pipelines are only
    used to summarize spending totals and to generate recommendations.
    """

    # Ordered (category, keywords) rules. The first rule with a keyword found
    # in the lower-cased description wins; anything unmatched is "Other".
    _CATEGORY_KEYWORDS = (
        ("Utilities", ("electricity", "water")),
        ("Entertainment", ("movie", "game")),
        ("Groceries", ("grocery", "food")),
        ("Travel", ("travel", "flight")),
        ("Shopping", ("shop", "amazon")),
    )

    def __init__(self):
        """Load the summarization and text-generation pipelines."""
        # NOTE(review): the same checkpoint is loaded twice, and
        # facebook/bart-large is an encoder-decoder model; confirm it is a
        # suitable choice for the "text-generation" pipeline.
        self.summarizer = pipeline("summarization", model="facebook/bart-large", clean_up_tokenization_spaces=True)
        self.text_generator = pipeline("text-generation", model="facebook/bart-large", clean_up_tokenization_spaces=True)
        self.predefined_categories = ["Utilities", "Entertainment", "Groceries", "Travel", "Shopping", "Other"]

    def _categorize_description(self, description):
        """Return the category for one description via the keyword rules."""
        # Missing or non-text descriptions (None/NaN) cannot be matched.
        if not isinstance(description, str):
            return "Other"

        desc = description.lower()
        for category, keywords in self._CATEGORY_KEYWORDS:
            if any(keyword in desc for keyword in keywords):
                return category
        return "Other"

    @staticmethod
    def _totals_by(df, column, key_type):
        """Sum ``amount`` per ``column`` value as a plain ``{key: float}`` dict."""
        totals = df.groupby(column)["amount"].sum()
        # Convert to built-in types so the prompt text shows clean values.
        return {key_type(key): float(total) for key, total in totals.items()}

    def categorize_transactions(self, df):
        """Return a copy of ``df`` with a ``category`` column added.

        Args:
            df: Transactions with at least a ``description`` column.

        Returns:
            A new DataFrame (fresh 0..n-1 index) with every input column plus
            ``category``; the input frame is not modified. An empty input is
            returned as-is.
        """
        if df.empty:
            return df

        categorized_df = df.reset_index(drop=True)
        categorized_df["category"] = [
            self._categorize_description(description)
            for description in categorized_df["description"]
        ]
        return categorized_df

    def analyze_spending_patterns(self, df):
        """Summarize monthly, weekly and per-category totals as text.

        Args:
            df: Categorized transactions with ``month``, ``week``,
                ``category`` and ``amount`` columns.

        Returns:
            The summarizer's text, or a fixed message when ``df`` is empty.
        """
        if df.empty:
            return "No transactions available for analysis."

        # Period objects are rendered as strings for better summarization.
        monthly_spending = self._totals_by(df, "month", str)
        weekly_spending = self._totals_by(df, "week", int)
        category_summary = self._totals_by(df, "category", str)

        analysis_text = (
            f"Monthly Spending Trends: {monthly_spending}\n"
            f"Weekly Spending Trends: {weekly_spending}\n"
            f"Category-wise Spending: {category_summary}"
        )

        # truncation=True keeps long inputs within the model's input limit
        # instead of failing.
        summary = self.summarizer(
            analysis_text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        return summary

    def generate_budget_recommendations(self, spending_analysis):
        """Generate budget recommendations from a spending analysis text.

        Args:
            spending_analysis: Text produced by ``analyze_spending_patterns``.

        Returns:
            The ``generated_text`` of the first sequence returned by the
            text-generation pipeline.
        """
        prompt = (
            f"Based on the following spending analysis:\n{spending_analysis}\n"
            f"Provide 3 actionable budget optimization recommendations to manage and reduce expenses. "
            f"Each recommendation should be concise, specific, and practical (e.g., 'Reduce dining out by 20%')."
        )

        # max_new_tokens bounds only the generated continuation; max_length
        # would also count the prompt, which can already be near 200 tokens.
        # temperature only takes effect when sampling is enabled.
        recommendations = self.text_generator(
            prompt,
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
        )[0]["generated_text"]
        return recommendations
utils/report_generator.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import pandas as pd
3
+ import io
4
+ import base64
5
+
6
def _current_figure_as_base64():
    """Encode the active pyplot figure as base64 PNG text, then close it."""
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format="png")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    plt.close()
    return encoded


def generate_report(df, spending_analysis, recommendations):
    """Assemble the Markdown report shown to the user.

    Args:
        df: Categorized transactions; rendered as a Markdown table and, when
            non-empty, charted by ``category`` and by ``month``.
        spending_analysis: Text for the "Spending Insights" section.
        recommendations: Text for the "Budget Recommendations" section.

    Returns:
        The report as a Markdown string. When ``df`` is not empty, a
        "Visualizations" section with two inline base64 PNG images is
        appended.
    """
    report = f"""
## Invoice Reader & Budget Categorizer Report

### Categorized Transactions
{df.to_markdown(index=False)}

### Spending Insights
{spending_analysis}

### Budget Recommendations
{recommendations}
"""

    # Nothing to chart without transactions.
    if df.empty:
        return report

    # Pie chart: share of total spending per category.
    per_category = df.groupby("category")["amount"].sum()
    plt.figure(figsize=(6, 6))
    plt.pie(per_category, labels=per_category.index, autopct="%1.1f%%")
    plt.title("Category-wise Spending")
    category_plot = _current_figure_as_base64()

    # Line chart: total spending per month.
    per_month = df.groupby("month")["amount"].sum()
    plt.figure(figsize=(8, 4))
    per_month.plot(kind="line", marker="o")
    plt.title("Monthly Spending Trend")
    plt.xlabel("Month")
    plt.ylabel("Amount")
    monthly_plot = _current_figure_as_base64()

    report += f"""
### Visualizations
#### Category-wise Spending
![Category-wise Spending](data:image/png;base64,{category_plot})

#### Monthly Spending Trend
![Monthly Spending Trend](data:image/png;base64,{monthly_plot})
"""

    return report