Singtel_Bill_Prototype / classifier.py
cosmoruler
Add new components and update app.py
ec8033f
# app/classifier.py
import json
import re

import pandas as pd
# Load the UEN lookup table once, at import time.
# Expected CSV layout (data/uen_lookup.csv): uen,company_name,category
# where category is SME or Enterprise. classify_bill() reads the "uen" and
# "category" columns of this frame.
# NOTE(review): the path is relative, so it resolves against the process's
# current working directory; importing this module fails if the file is
# missing. Confirm the app is always launched from the project root.
uen_lookup = pd.read_csv("data/uen_lookup.csv")
# Corporate suffixes are matched as whole words so that personal names that
# merely contain one (e.g. "Vincent" contains "inc") are not misclassified.
# "co." keeps its trailing period, as in the original keyword list.
_CORPORATE_NAME_RE = re.compile(
    r"\b(?:pte|ltd|inc|corporation|company|corp)\b|\bco\."
)

# Address fragments that suggest business premises (substring match).
_BUSINESS_ADDRESS_KEYWORDS = (
    "business park",
    "tech park",
    "industrial",
    "tower",
    "suite",
)


def _clean_text(value):
    """Return *value* as a stripped, lower-cased string ('' for None)."""
    if value is None:
        return ""
    return str(value).strip().lower()


def _read_bill_fields(bill_file):
    """Parse a JSON bill upload into a dict; return None if it is unusable."""
    try:
        bill_data = json.load(bill_file)
    except (AttributeError, TypeError, ValueError, OSError):
        # AttributeError/TypeError: bill_file is None or not file-like.
        # ValueError: malformed JSON or undecodable bytes (JSONDecodeError
        # and UnicodeDecodeError are both ValueError subclasses).
        return None
    # Only a JSON object can carry the named fields we need.
    return bill_data if isinstance(bill_data, dict) else None


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    Inputs:
    - bill_file: uploaded file-like object containing a JSON object
      (used only when extracted_fields is empty or None)
    - extracted_fields: dict from OCR extraction (if using PDF); takes
      precedence over bill_file when non-empty

    Both sources are read through the keys "uen", "company_name",
    "poc_name" and "address". Missing keys and None values are treated as
    empty strings.

    Returns:
    - category: 'SME', 'Enterprise', 'Personal', or 'Unknown'
      (a UEN match returns whatever the lookup table's category column holds)
    - details: dict with reasoning info. One of:
        {"uen": ..., "match_type": "UEN Lookup Match"}
        {"heuristics": [...]}
        {"heuristics": [], "note": ...}
        {"error": ...} when bill_file cannot be parsed as a JSON object

    Decision order: UEN lookup, company-name keywords, address keywords,
    two-word POC name, then 'Unknown'.
    """
    if extracted_fields:
        fields = extracted_fields
    else:
        fields = _read_bill_fields(bill_file)
        if fields is None:
            return "Unknown", {"error": "Unable to read bill file."}

    # Normalise the UEN so stray whitespace or lower-case input still matches.
    raw_uen = fields.get("uen")
    uen = "" if raw_uen is None else str(raw_uen).strip().upper()
    company_name = _clean_text(fields.get("company_name"))
    poc_name = _clean_text(fields.get("poc_name"))
    address = _clean_text(fields.get("address"))

    # Check UEN against lookup (authoritative when present).
    if uen:
        known_uens = uen_lookup["uen"].astype(str).str.strip().str.upper()
        categories = uen_lookup.loc[known_uens == uen, "category"]
        if not categories.empty:
            return categories.iloc[0], {
                "uen": uen,
                "match_type": "UEN Lookup Match",
            }

    # Heuristic rules if no UEN match; the first rule that fires wins.
    if _CORPORATE_NAME_RE.search(company_name):
        return "SME", {"heuristics": ["Company name suggests corporate account"]}

    if any(keyword in address for keyword in _BUSINESS_ADDRESS_KEYWORDS):
        return "SME", {"heuristics": ["Address suggests business premises"]}

    # Exactly two whitespace-separated tokens is treated as "first last".
    if len(poc_name.split()) == 2:
        return "Personal", {"heuristics": ["POC looks like a personal name"]}

    # Fallback if nothing matches
    return "Unknown", {
        "heuristics": [],
        "note": "Could not classify from available data.",
    }
# UEN layouts recognised:
#   nnnnnnnnX   - 8 digits + check letter (businesses)
#   yyyynnnnnX  - 9 digits + check letter (local companies)
#   TyyPQnnnnX  - century letter, 2-digit year, 2-letter entity type,
#                 4 digits, check letter (other entities)
_UEN_RE = re.compile(
    r"\b(?:[0-9]{8,9}[A-Z]|[RST][0-9]{2}[A-Z]{2}[0-9]{4}[A-Z])\b"
)
_COMPANY_RE = re.compile(r"Bill (?:To|Company):\s*(.*)", re.IGNORECASE)
_POC_RE = re.compile(r"Attention:\s*(.*)", re.IGNORECASE)
_ADDRESS_RE = re.compile(r"Address:\s*(.*)", re.IGNORECASE)


def _first_group(pattern, text):
    """Return the stripped first capture of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    # '.' matches '\r', so CRLF text leaves a trailing '\r' unless stripped.
    return match.group(1).strip() if match else ""


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    Inputs:
    - text: raw OCR text (None or empty yields all-empty fields)

    Returns:
    - dict with keys "uen", "company_name", "poc_name", "address"; each
      value is the first match found, stripped of surrounding whitespace,
      or "" when the field is not present. Labelled fields ("Bill To:" /
      "Bill Company:", "Attention:", "Address:") capture the rest of the
      line on which their value starts.
    """
    if not text:
        return {"uen": "", "company_name": "", "poc_name": "", "address": ""}

    uen_match = _UEN_RE.search(text)
    return {
        "uen": uen_match.group(0) if uen_match else "",
        "company_name": _first_group(_COMPANY_RE, text),
        "poc_name": _first_group(_POC_RE, text),
        "address": _first_group(_ADDRESS_RE, text),
    }