File size: 3,264 Bytes
ec8033f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# app/classifier.py

import pandas as pd
import json

# UEN reference table, read once when this module is imported.
# Expected columns in data/uen_lookup.csv: uen, company_name, category (SME/Enterprise)
uen_lookup = pd.read_csv(filepath_or_buffer="data/uen_lookup.csv")


# Substrings that mark a company name as a corporate account.
_CORPORATE_NAME_KEYWORDS = ("pte", "ltd", "inc", "corporation", "co.", "company", "corp")
# Substrings that mark an address as business premises.
_BUSINESS_ADDRESS_KEYWORDS = ("business park", "tech park", "industrial", "tower", "suite")


def _text_field(record, key):
    """Return record[key] as a stripped string, or "" when missing or null."""
    value = record.get(key)
    if value is None:
        return ""
    return str(value).strip()


def classify_bill(bill_file=None, extracted_fields=None):
    """
    Classify the bill into SME, Enterprise, or Personal.

    The UEN is checked against the module-level ``uen_lookup`` table first;
    if it is absent or not found, keyword heuristics on the company name,
    address and point-of-contact name are applied in that order.

    Inputs:
    - bill_file: file-like object containing a JSON object (used only when
      extracted_fields is empty or None)
    - extracted_fields: dict from OCR extraction (if using PDF); takes
      precedence over bill_file when non-empty

    Both sources are read through the keys "uen", "company_name",
    "poc_name" and "address". Missing keys and null values are treated
    as empty strings.

    Returns:
    - category: 'SME', 'Enterprise', 'Personal', or 'Unknown'
      (a UEN match returns whatever the lookup table's category column holds)
    - details: dict with reasoning info; contains "error" when the bill
      file could not be read as a JSON object
    """
    if extracted_fields:
        record = extracted_fields
    else:
        try:
            record = json.load(bill_file)
        except Exception:
            # Best-effort contract: any unreadable upload (missing file,
            # invalid JSON, bad encoding, I/O failure) yields "Unknown".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
            return "Unknown", {"error": "Unable to read bill file."}
        if not isinstance(record, dict):
            # Valid JSON that is not an object (list, number, ...) has no fields.
            return "Unknown", {"error": "Unable to read bill file."}

    uen = _text_field(record, "uen")
    company_name = _text_field(record, "company_name").lower()
    poc_name = _text_field(record, "poc_name").lower()
    address = _text_field(record, "address").lower()

    # Check UEN against lookup (single pass over the table).
    if uen:
        matched = uen_lookup.loc[uen_lookup["uen"] == uen, "category"]
        if not matched.empty:
            return matched.iloc[0], {"uen": uen, "match_type": "UEN Lookup Match"}

    # Heuristic rules if no UEN match.
    # NOTE(review): keywords are matched as substrings, so e.g. "inc" also
    # hits "vincent"; confirm whether whole-word matching is wanted.
    heuristics = []

    if company_name and any(keyword in company_name for keyword in _CORPORATE_NAME_KEYWORDS):
        heuristics.append("Company name suggests corporate account")
        return "SME", {"heuristics": heuristics}

    if address and any(keyword in address for keyword in _BUSINESS_ADDRESS_KEYWORDS):
        heuristics.append("Address suggests business premises")
        return "SME", {"heuristics": heuristics}

    # Exactly two whitespace-separated tokens is treated as "first last".
    if poc_name and len(poc_name.split()) == 2:
        heuristics.append("POC looks like a personal name")
        return "Personal", {"heuristics": heuristics}

    # Fallback if nothing matches
    return "Unknown", {"heuristics": heuristics, "note": "Could not classify from available data."}


def extract_fields(text):
    """
    Extract UEN, company name, POC name, and address from raw OCR text.
    Uses regex patterns.

    The UEN pattern accepts the three published UEN shapes: 8 digits plus a
    check letter, 9 digits plus a check letter, and T/S/R + 2-digit year +
    2-letter entity type + 4 digits + check letter. The first match in the
    text wins. Labelled fields ("Bill To:"/"Bill Company:", "Attention:",
    "Address:") are matched case-insensitively and captured to the end of
    the line, with surrounding whitespace (including a trailing carriage
    return from CRLF text) removed.

    Inputs:
    - text: raw OCR text (None or empty yields all-empty fields)

    Returns:
    - dict with keys "uen", "company_name", "poc_name", "address";
      each value is "" when the field was not found
    """
    import re

    fields = {"uen": "", "company_name": "", "poc_name": "", "address": ""}
    if not text:
        return fields

    uen_match = re.search(
        r'\b(?:[0-9]{8,9}[A-Z]|[TSR][0-9]{2}[A-Z]{2}[0-9]{4}[A-Z])\b', text
    )
    if uen_match:
        fields["uen"] = uen_match.group(0)

    # Label pattern for each field; group 1 is always the value.
    labelled_patterns = {
        "company_name": r'Bill (?:To|Company):\s*(.*)',
        "poc_name": r'Attention:\s*(.*)',
        "address": r'Address:\s*(.*)',
    }
    for key, pattern in labelled_patterns.items():
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            # "." matches "\r", so strip line-ending residue and padding.
            fields[key] = found.group(1).strip()

    return fields