Spaces:
Sleeping
Sleeping
# app/classifier.py | |
import json
import re

import pandas as pd
# Load the UEN lookup table once, at import time.
# data/uen_lookup.csv format: uen,company_name,category (SME/Enterprise)
# The uen column is forced to str so that lookups compare string to string
# even if a value in the file happens to look numeric.
uen_lookup = pd.read_csv("data/uen_lookup.csv", dtype={"uen": str})
# Substrings that, when found in a lower-cased company name, suggest a
# corporate account.
_CORPORATE_NAME_KEYWORDS = (
    "pte", "ltd", "inc", "corporation", "co.", "company", "corp",
)

# Substrings that, when found in a lower-cased address, suggest business
# premises.
_BUSINESS_ADDRESS_KEYWORDS = (
    "business park", "tech park", "industrial", "tower", "suite",
)


def _clean_text(record, key):
    """Return ``record[key]`` as a stripped string, or "" if missing or null."""
    value = record.get(key)
    if value is None:
        return ""
    return str(value).strip()


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    The UEN is first checked against the module-level ``uen_lookup`` table.
    When there is no match, keyword heuristics are tried in order on the
    company name, the address, and finally the point-of-contact name.

    Inputs:
    - bill_file: uploaded file object containing a JSON object; only read
      when extracted_fields is empty or None
    - extracted_fields: dict from OCR extraction (if using PDF)

    Returns:
    - category: the lookup table's category on a UEN match, otherwise
      'SME', 'Personal', or 'Unknown'
    - details: dict with reasoning info (or an "error" key when the bill
      file could not be read as a JSON object)
    """
    if extracted_fields:
        # Use fields from OCR.
        record = extracted_fields
    else:
        # Use raw file input (assumes JSON format).
        try:
            record = json.load(bill_file)
        except (AttributeError, TypeError, ValueError, OSError):
            # AttributeError/TypeError: no usable file object (e.g. None);
            # ValueError: invalid JSON or undecodable bytes; OSError: read failure.
            return "Unknown", {"error": "Unable to read bill file."}
        if not isinstance(record, dict):
            # Valid JSON, but not an object, so it has no fields to read.
            return "Unknown", {"error": "Unable to read bill file."}

    # Null / missing values become "" so the checks below never see None.
    uen = _clean_text(record, "uen")
    company_name = _clean_text(record, "company_name").lower()
    poc_name = _clean_text(record, "poc_name").lower()
    address = _clean_text(record, "address").lower()

    # Check UEN against lookup (single scan of the table).
    if uen:
        matches = uen_lookup.loc[uen_lookup["uen"] == uen, "category"]
        if not matches.empty:
            return matches.iloc[0], {"uen": uen, "match_type": "UEN Lookup Match"}

    # Heuristic rules if no UEN match.
    heuristics = []

    if company_name and any(kw in company_name for kw in _CORPORATE_NAME_KEYWORDS):
        heuristics.append("Company name suggests corporate account")
        return "SME", {"heuristics": heuristics}

    if address and any(kw in address for kw in _BUSINESS_ADDRESS_KEYWORDS):
        heuristics.append("Address suggests business premises")
        return "SME", {"heuristics": heuristics}

    # Exactly two words is treated as a likely personal name.
    if poc_name and len(poc_name.split()) == 2:
        heuristics.append("POC looks like a personal name")
        return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches.
    return "Unknown", {
        "heuristics": heuristics,
        "note": "Could not classify from available data.",
    }
# UEN formats recognised (case-sensitive, upper-case letters only):
#   nnnnnnnnX   - 8 digits + check letter
#   yyyynnnnnX  - 9 digits + check letter
#   TyyPQnnnnX  - T/S/R + 2 digits + 2 letters + 4 digits + check letter
_UEN_RE = re.compile(
    r'\b(?:[0-9]{8,9}[A-Z]|[TSR][0-9]{2}[A-Z]{2}[0-9]{4}[A-Z])\b'
)

# Labelled fields: the value is whatever follows the label up to end of line.
_LABELLED_FIELD_RES = {
    "company_name": re.compile(r'Bill (?:To|Company):\s*(.*)', re.IGNORECASE),
    "poc_name": re.compile(r'Attention:\s*(.*)', re.IGNORECASE),
    "address": re.compile(r'Address:\s*(.*)', re.IGNORECASE),
}


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns; the first match of each pattern wins.

    Inputs:
    - text: raw OCR text (None or "" yields all-empty fields)

    Returns:
    - dict with keys "uen", "company_name", "poc_name", "address"; each
      value is the matched text with surrounding whitespace removed, or ""
      when the pattern is not found
    """
    text = text or ""

    uen_match = _UEN_RE.search(text)
    fields = {"uen": uen_match.group(0) if uen_match else ""}

    for field_name, pattern in _LABELLED_FIELD_RES.items():
        match = pattern.search(text)
        # strip() drops trailing spaces and the "\r" left by CRLF line endings.
        fields[field_name] = match.group(1).strip() if match else ""

    return fields