cosmoruler committed on
Commit
ec8033f
·
1 Parent(s): 6487a68

Add new components and update app.py

Browse files
Files changed (9) hide show
  1. agent_chain.py +35 -0
  2. app.py +7 -0
  3. classifier.py +91 -0
  4. main.py +66 -0
  5. ocr_extractor.py +27 -0
  6. prompts/agentic_reasoning.md +26 -0
  7. recommender.py +35 -0
  8. requirements.txt +11 -0
  9. visualisation.py +45 -0
agent_chain.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/agent_chain.py
2
+
3
+ from langchain import OpenAI, LLMChain
4
+ from langchain.prompts import PromptTemplate
5
+ import json
6
+
7
+ # Load OpenAI model or use Hugging Face if preferred
8
+ llm = OpenAI(temperature=0.1)
9
+
10
def _read_bill_text(bill_file):
    """
    Turn the caller's bill input into plain text for the prompt.

    Inputs:
    - bill_file: a str (already-extracted text), raw bytes, or a file-like
      object that has a `name` attribute and a `read()` method

    Returns:
    - the bill as text, or the placeholder "Could not read file properly."
      when a file-like input cannot be read or parsed
    """
    # Already-extracted text is used as-is instead of being treated as a file.
    if isinstance(bill_file, str):
        return bill_file
    if isinstance(bill_file, (bytes, bytearray)):
        return bytes(bill_file).decode("utf-8", errors="ignore")

    try:
        # An earlier reader may have left the stream at EOF; start from the top.
        if hasattr(bill_file, "seek"):
            bill_file.seek(0)

        if str(getattr(bill_file, "name", "")).endswith(".json"):
            # Round-trip through json so the prompt gets normalised JSON text.
            return json.dumps(json.load(bill_file))

        raw = bill_file.read()
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw
    except (AttributeError, OSError, ValueError):
        # Best effort: the chain still runs, just without usable bill details.
        return "Could not read file properly."


def agentic_reasoning(bill_file):
    """
    Ask the LLM chain to classify a bill using the agentic reasoning prompt.

    Inputs:
    - bill_file: bill text (str), raw bytes, or an uploaded file-like object
      (JSON files are parsed and re-serialised; anything else is decoded as
      UTF-8 with undecodable bytes dropped)

    Returns:
    - the model's response with surrounding whitespace stripped
    """
    bill_text = _read_bill_text(bill_file)

    # Load prompt; it must contain the {bill_details} placeholder.
    with open("prompts/agentic_reasoning.md", "r", encoding="utf-8") as f:
        prompt_template = f.read()

    prompt = PromptTemplate(
        input_variables=["bill_details"],
        template=prompt_template
    )

    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the agent
    result = chain.run({"bill_details": bill_text})

    return result.strip()
app.py CHANGED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def greet(name):
    """Return the greeting shown by the demo interface for *name*."""
    return f"Hello {name}!!"
5
+
6
# Text-in / text-out interface around greet().
demo = gr.Interface(fn=greet, inputs="text", outputs="text")

# Only start the server when this file is executed directly, so that
# importing the module (e.g. `import app`) does not launch the UI.
if __name__ == "__main__":
    demo.launch()
classifier.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/classifier.py
2
+
3
+ import pandas as pd
4
+ import json
5
+
6
+ # Load UEN lookup table
7
+ # data/uen_lookup.csv format: uen,company_name,category (SME/Enterprise)
8
+ uen_lookup = pd.read_csv("data/uen_lookup.csv")
9
+
10
+
11
def _starts_word_with(text, keywords):
    """
    Return True if any keyword occurs in *text* at the start of a word.

    A plain substring test would let "inc" fire inside "vincent"; anchoring
    only the left edge still accepts "towers" and "incorporated".
    """
    import re

    return any(
        re.search(r"(?<![a-z0-9])" + re.escape(keyword), text)
        for keyword in keywords
    )


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    Inputs:
    - bill_file: uploaded file-like object containing JSON (used when
      extracted_fields is empty or not given)
    - extracted_fields: dict from OCR extraction (if using PDF)

    Returns:
    - category: the lookup table's category on a UEN match, otherwise
      'SME', 'Personal', or 'Unknown' from the heuristics
    - details: dict with reasoning info (or an "error" entry when the
      bill file cannot be read as a JSON object)
    """
    if extracted_fields:
        # Use fields from OCR
        fields = extracted_fields
    else:
        # Use raw file input (assumes JSON format)
        try:
            # The stream may already have been read by another step.
            if hasattr(bill_file, "seek"):
                bill_file.seek(0)
            fields = json.load(bill_file)
        except (AttributeError, OSError, TypeError, ValueError):
            return "Unknown", {"error": "Unable to read bill file."}
        if not isinstance(fields, dict):
            return "Unknown", {"error": "Unable to read bill file."}

    def _text(key):
        # Missing keys and explicit nulls both become an empty string.
        return str(fields.get(key) or "").strip()

    uen = _text("uen")
    company_name = _text("company_name").lower()
    poc_name = _text("poc_name").lower()
    address = _text("address").lower()

    # Check UEN against lookup
    if uen:
        matches = uen_lookup.loc[uen_lookup["uen"] == uen, "category"]
        if not matches.empty:
            return matches.iloc[0], {"uen": uen, "match_type": "UEN Lookup Match"}

    # Heuristic rules if no UEN match
    heuristics = []

    corporate_markers = ["pte", "ltd", "inc", "corporation", "co.", "company", "corp"]
    if company_name and _starts_word_with(company_name, corporate_markers):
        heuristics.append("Company name suggests corporate account")
        return "SME", {"heuristics": heuristics}

    business_premises = ["business park", "tech park", "industrial", "tower", "suite"]
    if address and _starts_word_with(address, business_premises):
        heuristics.append("Address suggests business premises")
        return "SME", {"heuristics": heuristics}

    # Exactly two words is treated as a likely personal name.
    if poc_name and len(poc_name.split()) == 2:
        heuristics.append("POC looks like a personal name")
        return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches
    return "Unknown", {"heuristics": heuristics, "note": "Could not classify from available data."}
66
+
67
+
68
def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    Inputs:
    - text: raw OCR text (None is treated as empty)

    Returns:
    - dict with keys "uen", "company_name", "poc_name", "address"; a field
      that is not found is an empty string, and found values are stripped
      of surrounding whitespace
    """
    import re

    text = text or ""

    # Whitespace that stays on the label's own line, so an empty
    # "Bill To:" does not swallow the following line as its value.
    gap = r"[^\S\r\n]*"

    # Accepts 8 or 9 digits plus a check letter, or the
    # R/S/T + yy + two letters + four digits + check letter form.
    uen_match = re.search(
        r'\b(?:[0-9]{8,9}|[RST][0-9]{2}[A-Z]{2}[0-9]{4})[A-Z]\b', text
    )
    company_match = re.search(r'Bill (To|Company):' + gap + r'(.*)', text, re.IGNORECASE)
    poc_match = re.search(r'Attention:' + gap + r'(.*)', text, re.IGNORECASE)
    address_match = re.search(r'Address:' + gap + r'(.*)', text, re.IGNORECASE)

    def _value(match, group):
        # "." matches "\r", so CRLF input leaves a trailing "\r" to strip.
        return match.group(group).strip() if match else ""

    return {
        "uen": _value(uen_match, 0),
        "company_name": _value(company_match, 2),
        "poc_name": _value(poc_match, 1),
        "address": _value(address_match, 1)
    }
main.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main.py
2
+
3
+ import streamlit as st
4
+ from app.ocr_extractor import extract_text_from_pdf
5
+ from app.classifier import classify_bill, extract_fields
6
+ from app.recommender import recommend_plans
7
+ from app.visualisation import show_comparison_chart
8
+ from app.agent_chain import agentic_reasoning
9
+
10
st.set_page_config(page_title="Telco Bill Recommender", layout="wide")

st.title("📄 Telco Bill Scanner & Plan Recommender")

# Upload Bill
uploaded_file = st.file_uploader("Upload your Telco Bill (PDF or JSON)", type=["pdf", "json"])

if uploaded_file:
    st.success("Bill uploaded successfully!")

    # Step 0: OCR Extraction for PDFs
    if uploaded_file.name.endswith(".pdf"):
        st.subheader("Step 0: OCR Extraction")
        extracted_text = extract_text_from_pdf(uploaded_file)
        st.text_area("Extracted Text (Preview)", extracted_text[:1000])

        # Extract structured fields from OCR text
        fields = extract_fields(extracted_text)
    else:
        extracted_text = None
        fields = None

    # Step 1: Customer Type Classification
    st.subheader("Step 1: Customer Type Identification")

    if fields:
        customer_type, details = classify_bill(None, fields)
    else:
        customer_type, details = classify_bill(uploaded_file)

    st.write(f"**Detected Type:** {customer_type}")
    st.json(details)

    # Fallback to Agentic AI if classification uncertain
    if customer_type == "Unknown":
        st.warning("Classification uncertain. Using Agentic AI fallback reasoning...")

        # Use extracted text if available; else read file as text
        if extracted_text:
            agent_input = extracted_text
        else:
            try:
                # Classification above has already read the upload; without
                # rewinding, read() would return nothing.
                uploaded_file.seek(0)
                agent_input = uploaded_file.read().decode("utf-8", errors="ignore")
            except (AttributeError, OSError, ValueError):
                agent_input = "Could not read file."

        customer_type = agentic_reasoning(agent_input)
        st.write(f"**Agentic AI suggests:** {customer_type}")

    # Step 2: Plan Recommendation
    st.subheader("Step 2: Plan Recommendations")
    # Each consumer parses the upload itself, so hand it over from the start.
    uploaded_file.seek(0)
    recommendations = recommend_plans(uploaded_file, customer_type)
    st.table(recommendations)

    # Step 3: Visualisation of Savings
    st.subheader("Step 3: Usage & Cost Comparison")
    uploaded_file.seek(0)
    show_comparison_chart(uploaded_file, recommendations)
ocr_extractor.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/ocr_extractor.py
2
+
3
+ import pdfplumber
4
+ import pytesseract
5
+ from PIL import Image
6
+ import io
7
+
8
def extract_text_from_pdf(file):
    """
    Return the text of every page of a PDF, one newline-terminated chunk per page.

    Pages with a text layer are read through pdfplumber; pages without one
    are rendered at resolution 300 and passed to pytesseract for OCR.
    """
    page_chunks = []

    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if not content:
                # No text layer on this page: OCR the rendered image instead.
                rendered = page.to_image(resolution=300)
                content = pytesseract.image_to_string(rendered.original)
            page_chunks.append(content + "\n")

    return "".join(page_chunks)
prompts/agentic_reasoning.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI agent tasked with classifying telco bills into three categories:
2
+
3
+ - SME (Small & Medium Enterprise)
4
+ - Enterprise
5
+ - Personal
6
+
7
+ Given the following bill details, reason step-by-step to determine the correct category.
8
+
9
+ Guidelines:
10
+
11
+ - If the bill contains a UEN and the company name is present, check the size:
12
+ - SME: 10-100 employees, no dedicated account manager
13
+ - Enterprise: >500 employees, has dedicated account manager
14
+ - If no company name is found and the bill is addressed to an individual, classify as Personal.
15
+ - If details are ambiguous, choose the single most likely category.
16
+
17
+ Return ONLY the final classification label as plain text, with no formatting or explanation: SME, Enterprise, or Personal.
18
+
19
+ ---
20
+
21
+ Bill Details:
22
+ {bill_details}
23
+
24
+ ---
25
+
26
+ Final Answer:
recommender.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/recommender.py
2
+
3
+ import pandas as pd
4
+ import json
5
+
6
def _load_bill_usage(bill_file):
    """
    Return usage figures for the bill as a dict.

    Starts from simulated defaults (10 lines, 50 GB, cost 500) and overlays
    whatever keys a readable ".json" bill provides; any other input, or an
    unreadable file, keeps the defaults.
    """
    usage = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    try:
        if str(getattr(bill_file, "name", "")).endswith(".json"):
            # An earlier step may have read the stream to EOF already.
            if hasattr(bill_file, "seek"):
                bill_file.seek(0)
            parsed = json.load(bill_file)
            if isinstance(parsed, dict):
                usage.update(parsed)
    except (AttributeError, OSError, TypeError, ValueError):
        # Deliberate best effort: fall back to the simulated usage.
        pass
    return usage


def recommend_plans(bill_file, customer_type):
    """
    Recommend SME or Enterprise plans based on usage patterns.

    Inputs:
    - bill_file: uploaded file-like object with a `name`; only ".json"
      bills are parsed, everything else uses simulated usage
    - customer_type: "SME" or "Enterprise" selects the plan table

    Returns:
    - DataFrame with up to five plans whose min_lines/max_lines range
      contains the bill's line count, cheapest price_per_line first; for
      any other customer type, a one-row DataFrame with a "message" column
    """
    # Load plan data
    if customer_type == "SME":
        plans = pd.read_csv("data/plans_sme.csv")
    elif customer_type == "Enterprise":
        plans = pd.read_csv("data/plans_enterprise.csv")
    else:
        return pd.DataFrame([{"message": "No plans available for this customer type"}])

    # Load bill data (simulate usage capture for now)
    bill_data = _load_bill_usage(bill_file)
    lines = bill_data["lines"]

    # Simple matching logic: keep plans whose line range covers the bill
    recommended = plans[
        (plans["min_lines"] <= lines) &
        (plans["max_lines"] >= lines)
    ]

    recommended = recommended.sort_values(by="price_per_line")

    return recommended[["plan_name", "price_per_line", "data_quota_gb", "notes"]].head(5)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ openai
4
+ sentence-transformers
5
+ pandas
6
+ tqdm
7
+ pytesseract
8
+ pdfplumber
9
+ scikit-learn
10
+ python-docx
11
+ pydantic
visualisation.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/visualisation.py
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import json
6
+ import altair as alt
7
+
8
def _load_bill_usage(bill_file):
    """
    Return usage figures for the bill as a dict.

    Starts from simulated defaults (10 lines, 50 GB, cost 500) and overlays
    whatever keys a readable ".json" bill provides; any other input, or an
    unreadable file, keeps the defaults.
    """
    usage = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    try:
        if str(getattr(bill_file, "name", "")).endswith(".json"):
            # An earlier step may have read the stream to EOF already.
            if hasattr(bill_file, "seek"):
                bill_file.seek(0)
            parsed = json.load(bill_file)
            if isinstance(parsed, dict):
                usage.update(parsed)
    except (AttributeError, OSError, TypeError, ValueError):
        # Deliberate best effort: fall back to the simulated usage.
        pass
    return usage


def show_comparison_chart(bill_file, recommended_plans):
    """
    Visualises current vs recommended plans in a bar chart.

    Inputs:
    - bill_file: uploaded file-like object with a `name`; only ".json"
      bills are parsed, everything else uses simulated usage
    - recommended_plans: DataFrame with "plan_name" and "price_per_line"

    Renders the chart through Streamlit and returns nothing. When
    recommended_plans lacks the plan columns, an info message is shown
    instead of a chart.
    """
    # A frame without plan columns (e.g. a message-only frame) has nothing to plot.
    needed = {"plan_name", "price_per_line"}
    if recommended_plans is None or not needed.issubset(recommended_plans.columns):
        st.info("No recommended plans available to compare.")
        return

    bill_data = _load_bill_usage(bill_file)
    current_cost = bill_data["current_cost"]
    current_lines = bill_data["lines"]

    # Estimated cost of each plan = price per line x the bill's line count.
    data = {
        "Plan": ["Current Bill"] + list(recommended_plans["plan_name"]),
        "Cost (Est)": [current_cost] + list(recommended_plans["price_per_line"] * current_lines)
    }

    df = pd.DataFrame(data)

    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('Plan', sort=None),
        y='Cost (Est)',
        # Current bill in red, every recommended plan in green.
        color=alt.condition(
            alt.datum.Plan == "Current Bill",
            alt.value('red'),
            alt.value('green')
        )
    ).properties(
        width=600,
        height=400,
        title="Cost Comparison: Current vs Recommended Plans"
    )

    st.altair_chart(chart, use_container_width=True)