Commit ec8033f · committed by cosmoruler
Parent: 6487a68
Add new components and update app.py
Files changed:
- agent_chain.py +35 -0
- app.py +7 -0
- classifier.py +91 -0
- main.py +66 -0
- ocr_extractor.py +27 -0
- prompts/agentic_reasoning.md +26 -0
- recommender.py +35 -0
- requirements.txt +11 -0
- visualisation.py +45 -0
agent_chain.py
ADDED
@@ -0,0 +1,35 @@
# app/agent_chain.py

from langchain import OpenAI, LLMChain
from langchain.prompts import PromptTemplate
import json

# Load OpenAI model or use Hugging Face if preferred
llm = OpenAI(temperature=0.1)

def agentic_reasoning(bill_input):
    # Accept either raw bill text (as passed from main.py) or an uploaded file object
    if isinstance(bill_input, str):
        bill_text = bill_input
    else:
        # Read bill as text or structured JSON
        try:
            if bill_input.name.endswith(".json"):
                bill_data = json.load(bill_input)
                bill_text = json.dumps(bill_data)
            else:
                bill_text = bill_input.read().decode("utf-8", errors="ignore")
        except Exception:
            bill_text = "Could not read file properly."

    # Load prompt
    with open("prompts/agentic_reasoning.md", "r") as f:
        prompt_template = f.read()

    prompt = PromptTemplate(
        input_variables=["bill_details"],
        template=prompt_template
    )

    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the agent
    result = chain.run({"bill_details": bill_text})

    return result.strip()
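For a quick check of this fallback in isolation, a minimal sketch like the one below could be used. The sample bill text and the expected label are invented for illustration; it also assumes an OPENAI_API_KEY is set and that prompts/agentic_reasoning.md sits next to the app, since the OpenAI client is constructed at import time.

# Hypothetical smoke test for the agentic fallback (not part of this commit).
from agent_chain import agentic_reasoning  # adjust the import path to your layout

sample_bill = (
    "Bill To: Acme Robotics Pte Ltd\n"
    "UEN: 201812345A\n"
    "Attention: Jane Tan\n"
    "Total due: SGD 1,240.50\n"
)

label = agentic_reasoning(sample_bill)  # expected to return "SME", "Enterprise", or "Personal"
print(label)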
app.py
CHANGED
@@ -0,0 +1,7 @@
import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
classifier.py
ADDED
@@ -0,0 +1,91 @@
# app/classifier.py

import pandas as pd
import json
import re

# Load UEN lookup table
# data/uen_lookup.csv format: uen,company_name,category (SME/Enterprise)
uen_lookup = pd.read_csv("data/uen_lookup.csv")


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    Inputs:
    - bill_file: uploaded Streamlit file (if JSON provided)
    - extracted_fields: dict from OCR extraction (if using PDF)

    Returns:
    - category: 'SME', 'Enterprise', 'Personal', or 'Unknown'
    - details: dict with reasoning info
    """
    if extracted_fields:
        # Use fields from OCR
        uen = extracted_fields.get("uen", "")
        company_name = extracted_fields.get("company_name", "").lower()
        poc_name = extracted_fields.get("poc_name", "").lower()
        address = extracted_fields.get("address", "").lower()
    else:
        # Use raw file input (assumes JSON format)
        try:
            bill_data = json.load(bill_file)
        except Exception:
            return "Unknown", {"error": "Unable to read bill file."}

        uen = bill_data.get("uen", "")
        company_name = bill_data.get("company_name", "").lower()
        poc_name = bill_data.get("poc_name", "").lower()
        address = bill_data.get("address", "").lower()

    # Check UEN against lookup
    if uen and (uen in uen_lookup["uen"].values):
        category = uen_lookup[uen_lookup["uen"] == uen]["category"].values[0]
        return category, {"uen": uen, "match_type": "UEN Lookup Match"}

    # Heuristic rules if no UEN match
    heuristics = []

    if company_name:
        if any(keyword in company_name for keyword in ["pte", "ltd", "inc", "corporation", "co.", "company", "corp"]):
            heuristics.append("Company name suggests corporate account")
            return "SME", {"heuristics": heuristics}

    if address:
        if any(keyword in address for keyword in ["business park", "tech park", "industrial", "tower", "suite"]):
            heuristics.append("Address suggests business premises")
            return "SME", {"heuristics": heuristics}

    if poc_name:
        if len(poc_name.split()) == 2:  # Likely a personal name
            heuristics.append("POC looks like a personal name")
            return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches
    return "Unknown", {"heuristics": heuristics, "note": "Could not classify from available data."}


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    Inputs:
    - text: raw OCR text

    Returns:
    - dict with extracted fields
    """
    uen_match = re.search(r'\b[0-9]{8}[A-Z]\b', text)
    company_match = re.search(r'Bill (To|Company):\s*(.*)', text, re.IGNORECASE)
    poc_match = re.search(r'Attention:\s*(.*)', text, re.IGNORECASE)
    address_match = re.search(r'Address:\s*(.*)', text, re.IGNORECASE)

    return {
        "uen": uen_match.group(0) if uen_match else "",
        "company_name": company_match.group(2) if company_match else "",
        "poc_name": poc_match.group(1) if poc_match else "",
        "address": address_match.group(1) if address_match else ""
    }
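Because the UEN lookup is read at module import time, classifier.py expects data/uen_lookup.csv to exist before it is imported. A minimal sketch with an invented fixture follows; the UENs, names, and categories are placeholders, not real data, and the import path may differ depending on the final layout.

# Hypothetical fixture and call, using the documented columns uen,company_name,category.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
pd.DataFrame(
    [
        {"uen": "201812345A", "company_name": "Acme Robotics Pte Ltd", "category": "SME"},
        {"uen": "199001234K", "company_name": "Globex Holdings", "category": "Enterprise"},
    ]
).to_csv("data/uen_lookup.csv", index=False)

from classifier import classify_bill  # import after the CSV exists; adjust path to your layout

category, details = classify_bill(
    extracted_fields={
        "uen": "201812345A",
        "company_name": "acme robotics pte ltd",
        "poc_name": "jane tan",
        "address": "71 ayer rajah crescent",
    }
)
print(category, details)  # -> SME {'uen': '201812345A', 'match_type': 'UEN Lookup Match'}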
main.py
ADDED
@@ -0,0 +1,66 @@
# app/main.py

import streamlit as st
from app.ocr_extractor import extract_text_from_pdf
from app.classifier import classify_bill, extract_fields
from app.recommender import recommend_plans
from app.visualisation import show_comparison_chart
from app.agent_chain import agentic_reasoning

st.set_page_config(page_title="Telco Bill Recommender", layout="wide")

st.title("📄 Telco Bill Scanner & Plan Recommender")

# Upload Bill
uploaded_file = st.file_uploader("Upload your Telco Bill (PDF or JSON)", type=["pdf", "json"])

if uploaded_file:
    st.success("Bill uploaded successfully!")

    # Step 0: OCR Extraction for PDFs
    if uploaded_file.name.endswith(".pdf"):
        st.subheader("Step 0: OCR Extraction")
        extracted_text = extract_text_from_pdf(uploaded_file)
        st.text_area("Extracted Text (Preview)", extracted_text[:1000])

        # Extract structured fields from OCR text
        fields = extract_fields(extracted_text)
    else:
        extracted_text = None
        fields = None

    # Step 1: Customer Type Classification
    st.subheader("Step 1: Customer Type Identification")

    if fields:
        customer_type, details = classify_bill(None, fields)
    else:
        customer_type, details = classify_bill(uploaded_file)

    st.write(f"**Detected Type:** {customer_type}")
    st.json(details)

    # Fallback to Agentic AI if classification is uncertain
    if customer_type == "Unknown":
        st.warning("Classification uncertain. Using Agentic AI fallback reasoning...")

        # Use extracted text if available; else read the file as text
        if extracted_text:
            agent_input = extracted_text
        else:
            try:
                uploaded_file.seek(0)  # rewind: classification may have consumed the stream
                agent_input = uploaded_file.read().decode("utf-8", errors="ignore")
            except Exception:
                agent_input = "Could not read file."

        customer_type = agentic_reasoning(agent_input)
        st.write(f"**Agentic AI suggests:** {customer_type}")

    # Step 2: Plan Recommendation
    st.subheader("Step 2: Plan Recommendations")
    uploaded_file.seek(0)  # rewind so the recommender can re-parse the upload
    recommendations = recommend_plans(uploaded_file, customer_type)
    st.table(recommendations)

    # Step 3: Visualisation of Savings
    st.subheader("Step 3: Usage & Cost Comparison")
    uploaded_file.seek(0)  # rewind again before the chart re-reads the bill
    show_comparison_chart(uploaded_file, recommendations)
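The JSON path through the app expects a single object combining the fields read by classifier.py and recommender.py. A hypothetical payload for manual testing (all values invented):

# Write a sample bill that exercises Steps 1-3 without OCR; upload it through the UI.
import json

sample_bill = {
    "uen": "201812345A",
    "company_name": "Acme Robotics Pte Ltd",
    "poc_name": "Jane Tan",
    "address": "71 Ayer Rajah Crescent, Tech Park",
    "lines": 25,
    "data_usage_gb": 180,
    "current_cost": 1250,
}

with open("sample_bill.json", "w") as f:
    json.dump(sample_bill, f, indent=2)

With the file in place, the app can be started with streamlit run main.py (path depending on the final layout) and the JSON selected in the file picker.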
ocr_extractor.py
ADDED
@@ -0,0 +1,27 @@
# app/ocr_extractor.py

import pdfplumber
import pytesseract
from PIL import Image
import io

def extract_text_from_pdf(file):
    """
    Extract text from PDF using pdfplumber and pytesseract fallback for images.
    """
    text_output = ""

    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Extract text if available
            page_text = page.extract_text()
            if page_text:
                text_output += page_text + "\n"
            else:
                # If no text layer, use OCR on image
                image = page.to_image(resolution=300)
                pil_image = image.original
                ocr_text = pytesseract.image_to_string(pil_image)
                text_output += ocr_text + "\n"

    return text_output
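The OCR branch needs the Tesseract binary on the host, since pytesseract is only a wrapper; on Hugging Face Spaces that typically means listing tesseract-ocr in a packages.txt. A minimal local check, assuming some sample_bill.pdf exists (the path is a placeholder):

# Hypothetical standalone check of the extractor.
from ocr_extractor import extract_text_from_pdf  # adjust the import path to your layout

with open("sample_bill.pdf", "rb") as f:
    text = extract_text_from_pdf(f)

print(text[:500])  # preview the first 500 characters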
prompts/agentic_reasoning.md
ADDED
@@ -0,0 +1,26 @@
You are an AI agent tasked with classifying telco bills into three categories:

- SME (Small & Medium Enterprise)
- Enterprise
- Personal

Given the following bill details, reason step-by-step to determine the correct category.

Guidelines:

- If the bill contains a UEN and the company name is present, check the size:
  - SME: 10-100 employees, no dedicated account manager
  - Enterprise: >500 employees, has dedicated account manager
- If no company name is found and the bill is addressed to an individual, classify as Personal.
- If details are ambiguous, make your best guess and explain why.

Return ONLY the final classification label: **SME**, **Enterprise**, or **Personal**.

---

Bill Details:
{bill_details}

---

Final Answer:
recommender.py
ADDED
@@ -0,0 +1,35 @@
# app/recommender.py

import pandas as pd
import json

def recommend_plans(bill_file, customer_type):
    """
    Recommend SME or Enterprise plans based on usage patterns.
    """
    # Load plan data
    if customer_type == "SME":
        plans = pd.read_csv("data/plans_sme.csv")
    elif customer_type == "Enterprise":
        plans = pd.read_csv("data/plans_enterprise.csv")
    else:
        return pd.DataFrame([{"message": "No plans available for this customer type"}])

    # Load bill data (simulate usage capture for now)
    try:
        if bill_file.name.endswith(".json"):
            bill_data = json.load(bill_file)
        else:
            bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    except Exception:
        bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}

    # Simple matching logic: filter plans based on number of lines / data usage
    recommended = plans[
        (plans["min_lines"] <= bill_data["lines"]) &
        (plans["max_lines"] >= bill_data["lines"])
    ]

    recommended = recommended.sort_values(by="price_per_line")

    return recommended[["plan_name", "price_per_line", "data_quota_gb", "notes"]].head(5)
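recommend_plans assumes plan catalogues at data/plans_sme.csv and data/plans_enterprise.csv with at least plan_name, min_lines, max_lines, price_per_line, data_quota_gb, and notes columns. An invented fixture matching that shape:

# Hypothetical SME plan catalogue; every row is made up for demonstration.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
pd.DataFrame(
    [
        {"plan_name": "Biz Lite", "min_lines": 1, "max_lines": 15,
         "price_per_line": 28, "data_quota_gb": 60, "notes": "Entry SME bundle"},
        {"plan_name": "Biz Plus", "min_lines": 10, "max_lines": 50,
         "price_per_line": 24, "data_quota_gb": 100, "notes": "Pooled data"},
        {"plan_name": "Biz Max", "min_lines": 30, "max_lines": 200,
         "price_per_line": 21, "data_quota_gb": 200, "notes": "Includes roaming"},
    ]
).to_csv("data/plans_sme.csv", index=False)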
requirements.txt
ADDED
@@ -0,0 +1,11 @@
streamlit
langchain
openai
sentence-transformers
pandas
tqdm
pytesseract
pdfplumber
scikit-learn
python-docx
pydantic
# imported by app.py, visualisation.py, and ocr_extractor.py
gradio
altair
Pillow
visualisation.py
ADDED
@@ -0,0 +1,45 @@
# app/visualisation.py

import streamlit as st
import pandas as pd
import json
import altair as alt

def show_comparison_chart(bill_file, recommended_plans):
    """
    Visualises current vs recommended plans in a bar chart.
    """
    # Simulate current usage
    try:
        if bill_file.name.endswith(".json"):
            bill_data = json.load(bill_file)
        else:
            bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}
    except Exception:
        bill_data = {"lines": 10, "data_usage_gb": 50, "current_cost": 500}

    current_cost = bill_data["current_cost"]
    current_lines = bill_data["lines"]

    data = {
        "Plan": ["Current Bill"] + list(recommended_plans["plan_name"]),
        "Cost (Est)": [current_cost] + list(recommended_plans["price_per_line"] * current_lines)
    }

    df = pd.DataFrame(data)

    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('Plan', sort=None),
        y='Cost (Est)',
        color=alt.condition(
            alt.datum.Plan == "Current Bill",
            alt.value('red'),
            alt.value('green')
        )
    ).properties(
        width=600,
        height=400,
        title="Cost Comparison: Current vs Recommended Plans"
    )

    st.altair_chart(chart, use_container_width=True)
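A standalone sketch of the chart with a made-up recommendation table and bill (same columns the recommender returns); since it calls st.altair_chart, it only renders inside a running Streamlit app, and all figures below are invented.

# Hypothetical inputs for show_comparison_chart.
import io
import json
import pandas as pd

from visualisation import show_comparison_chart  # adjust the import path to your layout

fake_plans = pd.DataFrame({
    "plan_name": ["Biz Lite", "Biz Plus"],
    "price_per_line": [28, 24],
    "data_quota_gb": [60, 100],
    "notes": ["Entry SME bundle", "Pooled data"],
})

class FakeUpload(io.BytesIO):
    # Mimics the two things the function uses: a .name attribute and a readable stream.
    name = "bill.json"

bill = FakeUpload(json.dumps({"lines": 25, "data_usage_gb": 180, "current_cost": 1250}).encode())

show_comparison_chart(bill, fake_plans)  # draws the red/green cost comparison bars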