# app/classifier.py
import json
import logging
import re

import pandas as pd

_logger = logging.getLogger(__name__)

# Load UEN lookup table
# data/uen_lookup.csv format: uen,company_name,category (SME/Enterprise)
_UEN_LOOKUP_PATH = "data/uen_lookup.csv"
_UEN_LOOKUP_COLUMNS = ["uen", "company_name", "category"]

# Short corporate suffixes must stand alone as a word; as plain substrings
# they also hit personal names ("inc" in "vincent", "pte" in "chapter").
_COMPANY_TOKEN_RE = re.compile(r"(?<!\w)(?:pte|ltd|inc|co\.)(?!\w)")
# Longer markers are distinctive enough to match anywhere in the name
# (this keeps "megacorp" / "incorporated" matching, as before).
_COMPANY_SUBSTRINGS = ("corporation", "company", "corp")
_BUSINESS_ADDRESS_KEYWORDS = (
    "business park",
    "tech park",
    "industrial",
    "tower",
    "suite",
)

# UEN shapes accepted: 8 digits + letter, 9 digits + letter, and the
# T/S/R-prefixed "TyyPQnnnnX" form.
# NOTE(review): formats taken from the published UEN specification - confirm.
_UEN_RE = re.compile(r"\b(?:\d{8,9}[A-Z]|[TSR]\d{2}[A-Z]{2}\d{4}[A-Z])\b")
_COMPANY_LINE_RE = re.compile(r"Bill (?:To|Company):\s*(.*)", re.IGNORECASE)
_POC_LINE_RE = re.compile(r"Attention:\s*(.*)", re.IGNORECASE)
_ADDRESS_LINE_RE = re.compile(r"Address:\s*(.*)", re.IGNORECASE)


def _load_uen_lookup(path=_UEN_LOOKUP_PATH):
    """Read the UEN lookup CSV, or return an empty table if it is missing."""
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Degrade to heuristics-only classification instead of failing at
        # import time; the warning keeps the misconfiguration visible.
        _logger.warning(
            "UEN lookup table %s not found; UEN matching is disabled.", path
        )
        return pd.DataFrame(columns=_UEN_LOOKUP_COLUMNS)


uen_lookup = _load_uen_lookup()


def _as_text(value):
    """Return *value* as a stripped string, mapping None to ''."""
    if value is None:
        return ""
    return str(value).strip()


def _read_bill_json(bill_file):
    """Parse *bill_file* as a JSON object; return None if that is not possible."""
    if bill_file is None:
        return None
    try:
        data = json.load(bill_file)
    except (ValueError, TypeError, AttributeError, OSError, RecursionError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError;
        # AttributeError/TypeError cover objects that are not readable files.
        return None
    # Valid JSON that is not an object (list, string, ...) has no fields.
    return data if isinstance(data, dict) else None


def _looks_like_company(name):
    """Return True if the lower-cased *name* contains a corporate marker."""
    if _COMPANY_TOKEN_RE.search(name):
        return True
    return any(marker in name for marker in _COMPANY_SUBSTRINGS)


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    Inputs:
    - bill_file: uploaded Streamlit file (if JSON provided); only read when
      extracted_fields is empty or None
    - extracted_fields: dict from OCR extraction (if using PDF)

    Returns:
    - category: 'SME', 'Enterprise', 'Personal', or 'Unknown'
      (a UEN match returns whatever the lookup table's category column holds)
    - details: dict with reasoning info
    """
    if extracted_fields:
        # Use fields from OCR
        fields = extracted_fields
    else:
        # Use raw file input (assumes JSON format)
        fields = _read_bill_json(bill_file)
        if fields is None:
            return "Unknown", {"error": "Unable to read bill file."}

    # Missing keys and explicit None values are both treated as empty.
    uen = _as_text(fields.get("uen"))
    company_name = _as_text(fields.get("company_name")).lower()
    poc_name = _as_text(fields.get("poc_name")).lower()
    address = _as_text(fields.get("address")).lower()

    # Check UEN against lookup (a single pass over the table)
    if uen:
        categories = uen_lookup.loc[uen_lookup["uen"] == uen, "category"]
        if not categories.empty:
            return categories.iloc[0], {
                "uen": uen,
                "match_type": "UEN Lookup Match",
            }

    # Heuristic rules if no UEN match; the first rule that fires wins.
    heuristics = []

    if company_name and _looks_like_company(company_name):
        heuristics.append("Company name suggests corporate account")
        return "SME", {"heuristics": heuristics}

    if address and any(kw in address for kw in _BUSINESS_ADDRESS_KEYWORDS):
        heuristics.append("Address suggests business premises")
        return "SME", {"heuristics": heuristics}

    # Exactly two words is taken as "given name + family name".
    if poc_name and len(poc_name.split()) == 2:
        heuristics.append("POC looks like a personal name")
        return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches
    return "Unknown", {
        "heuristics": heuristics,
        "note": "Could not classify from available data.",
    }


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    Inputs:
    - text: raw OCR text (None is treated as empty text)

    Returns:
    - dict with keys 'uen', 'company_name', 'poc_name', 'address'; a field
      that is not found is returned as an empty string
    """
    text = text or ""

    def first_capture(pattern):
        # Strip so a trailing "\r" from CRLF text or padding is not kept.
        match = pattern.search(text)
        return match.group(1).strip() if match else ""

    uen_match = _UEN_RE.search(text)
    return {
        "uen": uen_match.group(0) if uen_match else "",
        "company_name": first_capture(_COMPANY_LINE_RE),
        "poc_name": first_capture(_POC_LINE_RE),
        "address": first_capture(_ADDRESS_LINE_RE),
    }