Spaces:

gopichandra
/

SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Sep 8, 2025

Commit

7147400

verified ·

1 Parent(s): a4d0a87

Update utils.py

Browse files

Files changed (1) hide show

utils.py +156 -68

utils.py CHANGED Viewed

@@ -1,17 +1,44 @@
-from paddleocr import PaddleOCR
 import re
-# Initialize OCR
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path, force_type=None):
     try:
         result = ocr.ocr(file_path, cls=True)
         lines = []
         for block in result:
             for line in block:
-                text = line[1][0].strip()
                 if text:
                     lines.append(text)
@@ -20,96 +47,157 @@ def extract_kyc_fields(file_path, force_type=None):
         if force_type:
             card_type = force_type.upper()
         else:
-            pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
-            aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
             card_type = "UNKNOWN"
-            if pan_match:
                 card_type = "PAN"
-            elif aadhaar_match:
                 card_type = "AADHAAR"
         response = {"card_type": card_type}
         if card_type == "PAN":
-            pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
-            if pan_match:
-                response["pan_number"] = pan_match.group(0)
-            response["dob"] = extract_dob(lines)
-            response["name"] = extract_pan_name(lines)
         elif card_type == "AADHAAR":
-            aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
-            if aadhaar_match:
-                response["aadhaar_number"] = aadhaar_match.group(0)
-            response["dob"] = extract_dob(lines)
-            response["gender"] = extract_gender(lines)
-            response["name"] = extract_aadhaar_name(lines)
         else:
             response["error"] = "Could not identify document as PAN or Aadhaar."
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}
-def extract_dob(lines):
-    dob = "Not found"
-    for line in lines:
-        match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
-        if match:
-            return match.group(0)
-    for line in lines:
-        match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
-        if match:
-            return match.group(0)
     for line in lines:
-        match = re.search(r'\b(19|20)\d{2}\b', line)
-        if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
-            return match.group(0)
-    return dob
-def extract_gender(lines):
     for line in lines:
-        if "MALE" in line.upper():
-            return "MALE"
-        elif "FEMALE" in line.upper():
-            return "FEMALE"
-        elif "TRANSGENDER" in line.upper():
-            return "TRANSGENDER"
     return "Not found"
-def extract_pan_name(lines):
-    for i in range(len(lines)):
-        if "INCOME TAX DEPARTMENT" in lines[i].upper():
             for j in range(i + 1, len(lines)):
-                possible = lines[j].strip()
-                if (
-                    re.match(r'^[A-Z\s.]+$', possible)
-                    and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
-                    and not re.search(r'\d', possible)
-                ):
-                    return possible.strip()
     return "Not found"
-def extract_aadhaar_name(lines):
     for i, line in enumerate(lines):
-        if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
-            possible_name = lines[i - 1].strip()
-            if (
-                not re.search(r'\d', possible_name)
-                and len(possible_name.split()) >= 2
-                and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
-            ):
-                return possible_name
     for line in lines:
-        if (
-            not re.search(r'\d', line)
-            and len(line.split()) >= 2
-            and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
-        ):
             return line.strip()
     return "Not found"

+import os
 import re
+from datetime import datetime
+from simple_salesforce import Salesforce
+from paddleocr import PaddleOCR
+# -----------------------------------
+# OCR SETUP
+# -----------------------------------
+os.environ.setdefault("OMP_NUM_THREADS", "1")  # limit threads for stability
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Regex patterns used to recognise identifiers and dates in OCR text.

# PAN: five capital letters, four digits, one capital letter (e.g. ABCDE1234F).
PAN_REGEX = r"\b[A-Z]{5}[0-9]{4}[A-Z]\b"

# Aadhaar: twelve digits in three groups of four, each group optionally
# separated by a single whitespace character or hyphen.
AADHAAR_REGEX = r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"

# Date-of-birth patterns, most specific first. The bare-year pattern must stay
# LAST: _extract_dob treats DOB_REGEXES[-1] separately from DOB_REGEXES[:-1].
DOB_REGEXES = [
    r"\b\d{2}[./-]\d{2}[./-]\d{4}\b",  # DD/MM/YYYY, DD.MM.YYYY, DD-MM-YYYY
    r"\b\d{4}-\d{2}-\d{2}\b",  # YYYY-MM-DD
    r"\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b",  # DD-MON-YYYY
    r"\b(19|20)\d{2}\b",  # year only (19xx / 20xx)
]

# Gender labels; kept for completeness (not stored).
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]
+# -----------------------------------
+# OCR HELPERS
+# -----------------------------------
 def extract_kyc_fields(file_path, force_type=None):
+    """
+    Returns a dict with:
+      card_type: PAN | AADHAAR | UNKNOWN
+      pan_number / aadhaar_number
+      name (best-guess)
+      dob  (best-guess for the detected card)
+    """
     try:
         result = ocr.ocr(file_path, cls=True)
         lines = []
         for block in result:
             for line in block:
+                text = re.sub(r'\s+', ' ', line[1][0].strip())
                 if text:
                     lines.append(text)
         if force_type:
             card_type = force_type.upper()
         else:
             card_type = "UNKNOWN"
+            if re.search(PAN_REGEX, full_text):
                 card_type = "PAN"
+            elif re.search(AADHAAR_REGEX, full_text):
                 card_type = "AADHAAR"
         response = {"card_type": card_type}
         if card_type == "PAN":
+            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
+            response["dob"] = _extract_dob(lines)
+            response["name"] = _extract_pan_name(lines)
         elif card_type == "AADHAAR":
+            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
+            response["dob"] = _extract_dob(lines)
+            response["name"] = _extract_aadhaar_name(lines)
         else:
             response["error"] = "Could not identify document as PAN or Aadhaar."
+            # best-effort generic fields
+            response["dob"] = _extract_dob(lines)
+            response["name"] = _extract_generic_name(lines)
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}
def _first_match(pattern, text, flags=0):
    """Return the first substring of ``text`` that matches ``pattern``.

    ``flags`` is forwarded unchanged to ``re.search``. Returns ``None`` when
    the pattern does not occur in ``text``.
    """
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group(0)
def _extract_dob(lines):
    """Return the first date-of-birth-like string found in ``lines``.

    Two passes are made over the OCR lines:

    1. Every pattern in ``DOB_REGEXES`` except the last is tried
       (case-insensitively) on each line in order; the first hit wins.
    2. Otherwise the last pattern (a bare year) is accepted, but only on a
       line that also contains one of the labels YOB / YEAR / BIRTH / DOB.

    Returns the matched text, or the string ``"Not found"``.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    year_only_pattern = DOB_REGEXES[-1]

    # Pass 1: complete dates in any of the supported layouts.
    for text in lines:
        for pat in full_date_patterns:
            hit = re.search(pat, text, re.IGNORECASE)
            if hit:
                return hit.group(0)

    # Pass 2: a lone year counts only when the line is labelled as a birth year.
    year_labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for text in lines:
        hit = re.search(year_only_pattern, text)
        if not hit:
            continue
        upper_text = text.upper()
        if any(label in upper_text for label in year_labels):
            return hit.group(0)

    return "Not found"
def _extract_pan_name(lines):
    """Guess the holder's name from the OCR lines of a PAN card.

    The name is taken to be the first line after the "INCOME TAX DEPARTMENT"
    header that consists only of capital letters, whitespace and dots and does
    not contain INDIA, GOVT or DEPARTMENT. Returns the stripped line, or the
    string ``"Not found"`` when there is no header or no qualifying line.
    """
    header_idx = next(
        (
            idx
            for idx, text in enumerate(lines)
            if "INCOME TAX DEPARTMENT" in text.upper()
        ),
        None,
    )
    if header_idx is None:
        return "Not found"

    # Scanning once after the FIRST header is sufficient: any later header
    # would only look at a subset of these same lines.
    excluded_words = ("INDIA", "GOVT", "DEPARTMENT")
    for raw in lines[header_idx + 1:]:
        candidate = raw.strip()
        if not re.match(r'^[A-Z\s.]+$', candidate):
            continue
        if re.search(r'\d', candidate):
            continue
        if any(word in candidate.upper() for word in excluded_words):
            continue
        return candidate
    return "Not found"
def _extract_aadhaar_name(lines):
    """Guess the holder's name from the OCR lines of an Aadhaar card.

    Heuristic: the name usually sits on the line directly above the date of
    birth. Each line matching any ``DOB_REGEXES`` pattern (case-insensitive)
    has its predecessor checked with ``_looks_like_name``. If that yields
    nothing, the first line anywhere that looks like a name is used.

    Returns the stripped line, or the string ``"Not found"``.
    """
    def has_date(text):
        return any(re.search(pat, text, re.IGNORECASE) for pat in DOB_REGEXES)

    # Walk (previous line, current line) pairs; the first line has no
    # predecessor and therefore can never contribute a name here.
    for above, current in zip(lines, lines[1:]):
        if not has_date(current):
            continue
        name = above.strip()
        if _looks_like_name(name):
            return name

    # Fallback: first name-like line in reading order.
    for text in lines:
        stripped = text.strip()
        if _looks_like_name(stripped):
            return stripped
    return "Not found"
def _extract_generic_name(lines):
    """Return the first stripped line accepted by ``_looks_like_name``.

    Used when the document type is unknown. Returns the string
    ``"Not found"`` when no line qualifies.
    """
    stripped_lines = (text.strip() for text in lines)
    return next(
        (text for text in stripped_lines if _looks_like_name(text)),
        "Not found",
    )
def _looks_like_name(text):
    """Return True when an OCR line plausibly holds a person's name.

    A line is rejected when it:
      * contains any digit,
      * has fewer than two whitespace-separated words,
      * contains the header words INDIA or GOVERNMENT anywhere
        (substring match, so run-together OCR output is still caught), or
      * contains one of the field labels DOB, MALE or FEMALE as a whole word.

    The field labels are matched on word boundaries rather than as
    substrings: a plain substring test rejects real names such as
    "KAMALESH KUMAR" (contains "MALE") or "RAMESH DOBARIYA" (contains "DOB").
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False

    upper_text = text.upper()
    # Card header text: substring match is kept deliberately.
    if any(header in upper_text for header in ("INDIA", "GOVERNMENT")):
        return False
    # Field labels: whole-word match only, so names embedding them survive.
    return re.search(r'\b(?:DOB|MALE|FEMALE)\b', upper_text) is None
+# -----------------------------------
+# SALESFORCE HELPERS
+# -----------------------------------
# Salesforce credentials are read from the environment once, at import time.
# Each value falls back to the default shown when the variable is unset.
SF_USERNAME = os.environ.get("SF_USERNAME", "")
SF_PASSWORD = os.environ.get("SF_PASSWORD", "")
SF_TOKEN = os.environ.get("SF_TOKEN", "")
# "login" selects production, "test" selects a sandbox.
SF_DOMAIN = os.environ.get("SF_DOMAIN", "login")
def connect_salesforce():
    """Log in to Salesforce with the module-level ``SF_*`` settings.

    Prints a one-line status message either way. Returns the ``Salesforce``
    client on success, or ``None`` when the login raises any exception (the
    exception is printed, not re-raised).
    """
    credentials = {
        "username": SF_USERNAME,
        "password": SF_PASSWORD,
        "security_token": SF_TOKEN,
        "domain": SF_DOMAIN,
    }
    try:
        client = Salesforce(**credentials)
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
    except Exception as exc:
        print("❌ Salesforce login failed:", exc)
        return None
    return client
def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """Create a ``KYC_Record__c`` row from extracted KYC fields.

    Args:
        sf: Connected Salesforce client; a falsy value short-circuits to an
            error result.
        kyc_data: Mapping with ``card_type`` plus ``name``, ``dob`` and
            ``pan_number`` / ``aadhaar_number``. Missing or falsy values
            become ``""`` and the placeholder text "Not found" is removed.
        file_name: Accepted for future use; currently not written anywhere.
        agent_id: When truthy, stored in ``Agent__c``.

    The record always carries all six fields (Aadhaar_Name__c,
    Aadhaar_DOB__c, Aadhaar_Number__c, Pan_Name__c, Pan_DOB__c,
    PAN_Number__c); only the side matching the card type is populated.
    For an unrecognised card type, name and DOB go into the Aadhaar fields
    so the data is not lost.

    Returns:
        ``{"status": "success", "record_id": ...}`` on success, otherwise
        ``{"status": "error", "message": ...}``. No exception escapes.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def clean(field):
            # Blank out missing values and the "Not found" placeholder.
            raw = kyc_data.get(field) or ""
            return raw.replace("Not found", "")

        record = dict.fromkeys(
            (
                "Aadhaar_Name__c",
                "Aadhaar_DOB__c",
                "Aadhaar_Number__c",
                "Pan_Name__c",
                "Pan_DOB__c",
                "PAN_Number__c",
            ),
            "",
        )

        card = (kyc_data.get("card_type") or "").upper()
        if card == "PAN":
            record["Pan_Name__c"] = clean("name")
            record["Pan_DOB__c"] = clean("dob")
            record["PAN_Number__c"] = clean("pan_number")
        else:
            # Aadhaar and unknown cards both use the Aadhaar name/DOB fields;
            # only a confirmed Aadhaar card also supplies the number.
            record["Aadhaar_Name__c"] = clean("name")
            record["Aadhaar_DOB__c"] = clean("dob")
            if card == "AADHAAR":
                record["Aadhaar_Number__c"] = clean("aadhaar_number")

        # Agent__c is optional and must exist in the target org.
        if agent_id:
            record["Agent__c"] = agent_id

        created = sf.KYC_Record__c.create(record)
        return {"status": "success", "record_id": created.get("id")}
    except Exception as exc:
        return {"status": "error", "message": str(exc)}