Commit ec8033f · committed by cosmoruler
Parent: 6487a68
Add new components and update app.py
Files changed:
- agent_chain.py +35 -0
- app.py +7 -0
- classifier.py +91 -0
- main.py +66 -0
- ocr_extractor.py +27 -0
- prompts/agentic_reasoning.md +26 -0
- recommender.py +35 -0
- requirements.txt +11 -0
- visualisation.py +45 -0
agent_chain.py
ADDED
@@ -0,0 +1,35 @@
# app/agent_chain.py

from langchain import OpenAI, LLMChain
from langchain.prompts import PromptTemplate
import json

# Load OpenAI model or use Hugging Face if preferred
llm = OpenAI(temperature=0.1)

def agentic_reasoning(bill_input):
    # Accept either raw bill text (as passed from main.py) or an uploaded file object
    if isinstance(bill_input, str):
        bill_text = bill_input
    else:
        # Read bill as text or structured JSON
        try:
            if bill_input.name.endswith(".json"):
                bill_data = json.load(bill_input)
                bill_text = json.dumps(bill_data)
            else:
                bill_text = bill_input.read().decode("utf-8", errors="ignore")
        except Exception:
            bill_text = "Could not read file properly."

    # Load prompt
    with open("prompts/agentic_reasoning.md", "r") as f:
        prompt_template = f.read()

    prompt = PromptTemplate(
        input_variables=["bill_details"],
        template=prompt_template
    )

    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the agent
    result = chain.run({"bill_details": bill_text})

    return result.strip()
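For a quick check of this fallback in isolation, a minimal sketch like the one below could be used. The sample bill text and the expected label are invented for illustration; it also assumes an OPENAI_API_KEY is set and that prompts/agentic_reasoning.md sits next to the app, since the OpenAI client is constructed at import time.

# Hypothetical smoke test for the agentic fallback (not part of this commit).
from agent_chain import agentic_reasoning  # adjust the import path to your layout

sample_bill = (
    "Bill To: Acme Robotics Pte Ltd\n"
    "UEN: 201812345A\n"
    "Attention: Jane Tan\n"
    "Total due: SGD 1,240.50\n"
)

label = agentic_reasoning(sample_bill)  # expected to return "SME", "Enterprise", or "Personal"
print(label)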
app.py
CHANGED
@@ -0,0 +1,7 @@
import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
classifier.py
ADDED
@@ -0,0 +1,91 @@
# app/classifier.py

import pandas as pd
import json
import re

# Load UEN lookup table
# data/uen_lookup.csv format: uen,company_name,category (SME/Enterprise)
uen_lookup = pd.read_csv("data/uen_lookup.csv")


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    Inputs:
    - bill_file: uploaded Streamlit file (if JSON provided)
    - extracted_fields: dict from OCR extraction (if using PDF)

    Returns:
    - category: 'SME', 'Enterprise', 'Personal', or 'Unknown'
    - details: dict with reasoning info
    """
    if extracted_fields:
        # Use fields from OCR
        uen = extracted_fields.get("uen", "")
        company_name = extracted_fields.get("company_name", "").lower()
        poc_name = extracted_fields.get("poc_name", "").lower()
        address = extracted_fields.get("address", "").lower()
    else:
        # Use raw file input (assumes JSON format)
        try:
            bill_data = json.load(bill_file)
        except Exception:
            return "Unknown", {"error": "Unable to read bill file."}

        uen = bill_data.get("uen", "")
        company_name = bill_data.get("company_name", "").lower()
        poc_name = bill_data.get("poc_name", "").lower()
        address = bill_data.get("address", "").lower()

    # Check UEN against lookup
    if uen and (uen in uen_lookup["uen"].values):
        category = uen_lookup[uen_lookup["uen"] == uen]["category"].values[0]
        return category, {"uen": uen, "match_type": "UEN Lookup Match"}

    # Heuristic rules if no UEN match
    heuristics = []

    if company_name:
        if any(keyword in company_name for keyword in ["pte", "ltd", "inc", "corporation", "co.", "company", "corp"]):
            heuristics.append("Company name suggests corporate account")
            return "SME", {"heuristics": heuristics}

    if address:
        if any(keyword in address for keyword in ["business park", "tech park", "industrial", "tower", "suite"]):
            heuristics.append("Address suggests business premises")
            return "SME", {"heuristics": heuristics}

    if poc_name:
        if len(poc_name.split()) == 2:  # Likely a personal name
            heuristics.append("POC looks like a personal name")
            return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches
    return "Unknown", {"heuristics": heuristics, "note": "Could not classify from available data."}


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    Inputs:
    - text: raw OCR text

    Returns:
    - dict with extracted fields
    """
    uen_match = re.search(r'\b[0-9]{8}[A-Z]\b', text)
    company_match = re.search(r'Bill (To|Company):\s*(.*)', text, re.IGNORECASE)
    poc_match = re.search(r'Attention:\s*(.*)', text, re.IGNORECASE)
    address_match = re.search(r'Address:\s*(.*)', text, re.IGNORECASE)

    return {
        "uen": uen_match.group(0) if uen_match else "",
        "company_name": company_match.group(2) if company_match else "",
        "poc_name": poc_match.group(1) if poc_match else "",
        "address": address_match.group(1) if address_match else ""
    }
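Because the UEN lookup is read at module import time, classifier.py expects data/uen_lookup.csv to exist before it is imported. A minimal sketch with an invented fixture follows; the UENs, names, and categories are placeholders, not real data, and the import path may differ depending on the final layout.

# Hypothetical fixture and call, using the documented columns uen,company_name,category.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
pd.DataFrame(
    [
        {"uen": "201812345A", "company_name": "Acme Robotics Pte Ltd", "category": "SME"},
        {"uen": "199001234K", "company_name": "Globex Holdings", "category": "Enterprise"},
    ]
).to_csv("data/uen_lookup.csv", index=False)

from classifier import classify_bill  # import after the CSV exists; adjust path to your layout

category, details = classify_bill(
    extracted_fields={
        "uen": "201812345A",
        "company_name": "acme robotics pte ltd",
        "poc_name": "jane tan",
        "address": "71 ayer rajah crescent",
    }
)
print(category, details)  # -> SME {'uen': '201812345A', 'match_type': 'UEN Lookup Match'}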
main.py
ADDED
@@ -0,0 +1,66 @@
# app/main.py

import streamlit as st
from app.ocr_extractor import extract_text_from_pdf
from app.classifier import classify_bill, extract_fields
from app.recommender import recommend_plans
from app.visualisation import show_comparison_chart
from app.agent_chain import agentic_reasoning

st.set_page_config(page_title="Telco Bill Recommender", layout="wide")

st.title("📄 Telco Bill Scanner & Plan Recommender")

# Upload Bill
uploaded_file = st.file_uploader("Upload your Telco Bill (PDF or JSON)", type=["pdf", "json"])

if uploaded_file:
    st.success("Bill uploaded successfully!")

    # Step 0: OCR Extraction for PDFs
    if uploaded_file.name.endswith(".pdf"):
        st.subheader("Step 0: OCR Extraction")
        extracted_text = extract_text_from_pdf(uploaded_file)
        st.text_area("Extracted Text (Preview)", extracted_text[:1000])

        # Extract structured fields from OCR text
        fields = extract_fields(extracted_text)
    else:
        extracted_text = None
        fields = None

    # Step 1: Customer Type Classification
    st.subheader("Step 1: Customer Type Identification")

    if fields:
        customer_type, details = classify_bill(None, fields)
    else:
        customer_type, details = classify_bill(uploaded_file)

    st.write(f"**Detected Type:** {customer_type}")
    st.json(details)

    # Fallback to Agentic AI if classification is uncertain
    if customer_type == "Unknown":
        st.warning("Classification uncertain. Using Agentic AI fallback reasoning...")

        # Use extracted text if available; else read the file as text
        if extracted_text:
            agent_input = extracted_text
        else:
            try:
                uploaded_file.seek(0)  # rewind: classification may have consumed the stream
                agent_input = uploaded_file.read().decode("utf-8", errors="ignore")
            except Exception:
                agent_input = "Could not read file."

        customer_type = agentic_reasoning(agent_input)
        st.write(f"**Agentic AI suggests:** {customer_type}")

    # Step 2: Plan Recommendation
    st.subheader("Step 2: Plan Recommendations")
    uploaded_file.seek(0)  # rewind so the recommender can re-parse the upload
    recommendations = recommend_plans(uploaded_file, customer_type)
    st.table(recommendations)

    # Step 3: Visualisation of Savings
    st.subheader("Step 3: Usage & Cost Comparison")
    uploaded_file.seek(0)  # rewind again before the chart re-reads the bill
    show_comparison_chart(uploaded_file, recommendations)
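The JSON path through the app expects a single object combining the fields read by classifier.py and recommender.py. A hypothetical payload for manual testing (all values invented):

# Write a sample bill that exercises Steps 1-3 without OCR; upload it through the UI.
import json

sample_bill = {
    "uen": "201812345A",
    "company_name": "Acme Robotics Pte Ltd",
    "poc_name": "Jane Tan",
    "address": "71 Ayer Rajah Crescent, Tech Park",
    "lines": 25,
    "data_usage_gb": 180,
    "current_cost": 1250,
}

with open("sample_bill.json", "w") as f:
    json.dump(sample_bill, f, indent=2)

With the file in place, the app can be started with streamlit run main.py (path depending on the final layout) and the JSON selected in the file picker.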
ocr_extractor.py
ADDED
@@ -0,0 +1,27 @@
# app/ocr_extractor.py

import pdfplumber
import pytesseract
from PIL import Image
import io

def extract_text_from_pdf(file):
    """
    Extract text from PDF using pdfplumber and pytesseract fallback for images.
    """
    text_output = ""

    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Extract text if available
            page_text = page.extract_text()
            if page_text:
                text_output += page_text + "\n"
            else:
                # If no text layer, use OCR on image
                image = page.to_image(resolution=300)
                pil_image = image.original
                ocr_text = pytesseract.image_to_string(pil_image)
                text_output += ocr_text + "\n"

    return text_output
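The OCR branch needs the Tesseract binary on the host, since pytesseract is only a wrapper; on Hugging Face Spaces that typically means listing tesseract-ocr in a packages.txt. A minimal local check, assuming some sample_bill.pdf exists (the path is a placeholder):

# Hypothetical standalone check of the extractor.
from ocr_extractor import extract_text_from_pdf  # adjust the import path to your layout

with open("sample_bill.pdf", "rb") as f:
    text = extract_text_from_pdf(f)

print(text[:500])  # preview the first 500 characters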
prompts/agentic_reasoning.md
ADDED
@@ -0,0 +1,26 @@
You are an AI agent tasked with classifying telco bills into three categories:

- SME (Small & Medium Enterprise)
- Enterprise
- Personal

Given the following bill details, reason step-by-step to determine the correct category.

Guidelines:

- If the bill contains a UEN and the company name is present, check the size:
  - SME: 10-100 employees, no dedicated account manager
  - Enterprise: >500 employees, has dedicated account manager
- If no company name is found and the bill is addressed to an individual, classify as Personal.
- If details are ambiguous, make your best guess and explain why.

Return ONLY the final classification label: **SME**, **Enterprise**, or **Personal**.

---

Bill Details:
{bill_details}

---

Final Answer:
recommender.py
ADDED
@@ -0,0 +1,35 @@
# app/recommender.py

import pandas as pd
import json

def recommend_plans(bill_file, customer_type):
    """
    Recommend SME or Enterprise plans based on usage patterns.
    """
    # Load plan data
    if customer_type == "SME":
        plans = pd.read_csv("data/plans_sme.csv")
    elif customer_type == "Enterprise":
        plans = pd.read_csv("data/plans_enterprise.csv")
    else:
        return pd.DataFrame([{"message": "No plans available for this customer type"}])

    # Load bill data (simulate usage capture for now)
    try:
        if bill_file.name.endswith(".json"):
            bill_data = json.load(bill_file)
        else:
            bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    except Exception:
        bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}

    # Simple matching logic: filter plans based on number of lines / data usage
    recommended = plans[
        (plans["min_lines"] <= bill_data["lines"]) &
        (plans["max_lines"] >= bill_data["lines"])
    ]

    recommended = recommended.sort_values(by="price_per_line")

    return recommended[["plan_name", "price_per_line", "data_quota_gb", "notes"]].head(5)
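recommend_plans assumes plan catalogues at data/plans_sme.csv and data/plans_enterprise.csv with at least plan_name, min_lines, max_lines, price_per_line, data_quota_gb, and notes columns. An invented fixture matching that shape:

# Hypothetical SME plan catalogue; every row is made up for demonstration.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
pd.DataFrame(
    [
        {"plan_name": "Biz Lite", "min_lines": 1, "max_lines": 15,
         "price_per_line": 28, "data_quota_gb": 60, "notes": "Entry SME bundle"},
        {"plan_name": "Biz Plus", "min_lines": 10, "max_lines": 50,
         "price_per_line": 24, "data_quota_gb": 100, "notes": "Pooled data"},
        {"plan_name": "Biz Max", "min_lines": 30, "max_lines": 200,
         "price_per_line": 21, "data_quota_gb": 200, "notes": "Includes roaming"},
    ]
).to_csv("data/plans_sme.csv", index=False)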
requirements.txt
ADDED
@@ -0,0 +1,11 @@
streamlit
langchain
openai
sentence-transformers
pandas
tqdm
pytesseract
pdfplumber
scikit-learn
python-docx
pydantic
# imported by app.py, visualisation.py, and ocr_extractor.py
gradio
altair
Pillow
visualisation.py
ADDED
@@ -0,0 +1,45 @@
# app/visualisation.py

import streamlit as st
import pandas as pd
import json
import altair as alt

def show_comparison_chart(bill_file, recommended_plans):
    """
    Visualises current vs recommended plans in a bar chart.
    """
    # Simulate current usage
    try:
        if bill_file.name.endswith(".json"):
            bill_data = json.load(bill_file)
        else:
            bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    except Exception:
        bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}

    current_cost = bill_data["current_cost"]
    current_lines = bill_data["lines"]

    data = {
        "Plan": ["Current Bill"] + list(recommended_plans["plan_name"]),
        "Cost (Est)": [current_cost] + list(recommended_plans["price_per_line"] * current_lines)
    }

    df = pd.DataFrame(data)

    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('Plan', sort=None),
        y='Cost (Est)',
        color=alt.condition(
            alt.datum.Plan == "Current Bill",
            alt.value('red'),
            alt.value('green')
        )
    ).properties(
        width=600,
        height=400,
        title="Cost Comparison: Current vs Recommended Plans"
    )

    st.altair_chart(chart, use_container_width=True)
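A standalone sketch of the chart with a made-up recommendation table and bill (same columns the recommender returns); since it calls st.altair_chart, it only renders inside a running Streamlit app, and all figures below are invented.

# Hypothetical inputs for show_comparison_chart.
import io
import json
import pandas as pd

from visualisation import show_comparison_chart  # adjust the import path to your layout

fake_plans = pd.DataFrame({
    "plan_name": ["Biz Lite", "Biz Plus"],
    "price_per_line": [28, 24],
    "data_quota_gb": [60, 100],
    "notes": ["Entry SME bundle", "Pooled data"],
})

class FakeUpload(io.BytesIO):
    # Mimics the two things the function uses: a .name attribute and a readable stream.
    name = "bill.json"

bill = FakeUpload(json.dumps({"lines": 25, "data_usage_gb": 180, "current_cost": 1250}).encode())

show_comparison_chart(bill, fake_plans)  # draws the red/green cost comparison bars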