Spaces:
Sleeping
Sleeping
File size: 7,111 Bytes
7147400 c70099c 7147400 c70099c 7147400 5ebcb93 ae2e698 7147400 254fdf9 7147400 a8683a1 ae2e698 8324e53 7147400 8324e53 7147400 8324e53 ae2e698 8324e53 ae2e698 254fdf9 b07dfbb 254fdf9 7147400 254fdf9 7147400 254fdf9 a726fb2 2c3e33d 7147400 2c3e33d 254fdf9 7147400 a726fb2 254fdf9 7147400 254fdf9 a726fb2 a8683a1 8324e53 254fdf9 7147400 254fdf9 7147400 254fdf9 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import os
import re
from datetime import datetime
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR
# -----------------------------------
# OCR SETUP
# -----------------------------------
# Default OMP_NUM_THREADS to "1" only when the environment has not already
# set it (setdefault never overwrites an existing value).
# NOTE(review): presumably this has to happen before the OCR engine below is
# constructed so its native libraries pick the value up - confirm.
os.environ.setdefault("OMP_NUM_THREADS", "1") # limit threads for stability
# Module-level OCR engine, built once at import time and reused by
# extract_kyc_fields() for every document.
# NOTE(review): use_angle_cls=True presumably loads the text-angle classifier
# that the cls=True argument in extract_kyc_fields() relies on - confirm
# against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Regex patterns
# PAN: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: three groups of four digits, each pair optionally separated by a
# single whitespace character or hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# Individual date-of-birth shapes, named so the list below reads at a glance.
_DOB_NUMERIC = r'\b\d{2}[./-]\d{2}[./-]\d{4}\b'  # e.g. 01/02/1990
_DOB_ISO = r'\b\d{4}-\d{2}-\d{2}\b'  # e.g. 1990-02-01
_DOB_MONTH_NAME = r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b'  # e.g. 01-JAN-1990
_DOB_YEAR_ONLY = r'\b(19|20)\d{2}\b'  # bare year, e.g. 1990

# Order matters: _extract_dob() treats every entry except the last as a full
# date and the last entry as the year-only fallback.
DOB_REGEXES = [
    _DOB_NUMERIC,
    _DOB_ISO,
    _DOB_MONTH_NAME,
    _DOB_YEAR_ONLY,
]

# Kept for completeness; the extracted gender is not stored anywhere.
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]
# -----------------------------------
# OCR HELPERS
# -----------------------------------
def extract_kyc_fields(file_path, force_type=None):
    """
    Run OCR on a document image and extract best-guess KYC fields.

    Args:
        file_path: Image handed unchanged to the module-level ``ocr`` engine.
        force_type: Optional card type. When truthy, its upper-cased value is
            used as the card type and auto-detection is skipped.

    Returns a dict with:
        card_type: PAN | AADHAAR | UNKNOWN (or the upper-cased force_type)
        pan_number / aadhaar_number: for the matching card type,
            "Not found" when the number pattern is absent
        name (best-guess, "Not found" when nothing qualifies)
        dob (best-guess for the detected card, "Not found" likewise)
        error: only when the card type is neither PAN nor AADHAAR
    If anything raises, the dict holds just ``error`` with the exception text.
    """
    try:
        result = ocr.ocr(file_path, cls=True)
        lines = []
        # The OCR result may be None, or contain a None entry, when no text
        # was detected. Skip those so a blank image takes the normal
        # "UNKNOWN / Not found" path instead of failing on iteration.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] carries the recognised text; collapse runs of
                # whitespace so the regexes see single spaces.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.upper()
        else:
            # PAN is checked first; Aadhaar is only considered when no PAN
            # number is present anywhere in the text.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        response = {"card_type": card_type}
        if card_type == "PAN":
            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
            # best-effort generic fields
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_generic_name(lines)
        return response
    except Exception as e:
        # Deliberate catch-all boundary: callers get an error dict, never an
        # exception, whatever the OCR engine or the parsing raises.
        return {"error": f"OCR processing failed: {str(e)}"}
def _first_match(pattern, text, flags=0):
m = re.search(pattern, text, flags)
return m.group(0) if m else None
def _extract_dob(lines):
    """
    Pick a date of birth out of the OCR lines.

    Pass 1 returns the first full-date match (every DOB_REGEXES entry except
    the last, case-insensitive), scanning line by line. Pass 2 accepts a bare
    year, but only from a line that also carries a birth-related label.
    Returns "Not found" when neither pass matches.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        for candidate_pattern in full_date_patterns:
            hit = re.search(candidate_pattern, text, re.IGNORECASE)
            if hit:
                return hit.group(0)

    # Year-only fallback: require a label so arbitrary years are not taken.
    label_hints = ["YOB", "YEAR", "BIRTH", "DOB"]
    for text in lines:
        year = re.search(DOB_REGEXES[-1], text)
        if not year:
            continue
        shouted = text.upper()
        if any(hint in shouted for hint in label_hints):
            return year.group(0)
    return "Not found"
def _extract_pan_name(lines):
for i, line in enumerate(lines):
if "INCOME TAX DEPARTMENT" in line.upper():
for j in range(i + 1, len(lines)):
candidate = lines[j].strip()
if re.match(r'^[A-Z\s.]+$', candidate) and not re.search(r'\d', candidate):
if not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"]):
return candidate
return "Not found"
def _extract_aadhaar_name(lines):
    """
    Find the holder name on an Aadhaar card.

    Heuristic: the name usually sits on the line directly above the date of
    birth, so the line preceding any line that matches a DOB_REGEXES pattern
    is tried first. If none of those looks like a name, the first name-like
    line anywhere is used. Returns "Not found" otherwise.
    """
    for idx, text in enumerate(lines):
        has_date = any(re.search(p, text, re.IGNORECASE) for p in DOB_REGEXES)
        if not has_date or idx == 0:
            # No date here, or nothing above the first line to inspect.
            continue
        above = lines[idx - 1].strip()
        if _looks_like_name(above):
            return above

    # Fallback
    for text in lines:
        stripped = text.strip()
        if _looks_like_name(stripped):
            return stripped
    return "Not found"
def _extract_generic_name(lines):
    """Return the first stripped line that looks like a name, else "Not found"."""
    stripped_lines = (text.strip() for text in lines)
    return next(
        (candidate for candidate in stripped_lines if _looks_like_name(candidate)),
        "Not found",
    )
def _looks_like_name(text):
if re.search(r'\d', text):
return False
if len(text.split()) < 2:
return False
banned = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]
return not any(b in text.upper() for b in banned)
# -----------------------------------
# SALESFORCE HELPERS
# -----------------------------------
# Credentials are read from the environment once, at import time; each one
# falls back to the default shown when the variable is unset.
SF_USERNAME = os.environ.get("SF_USERNAME", "")
SF_PASSWORD = os.environ.get("SF_PASSWORD", "")
SF_TOKEN = os.environ.get("SF_TOKEN", "")
# "login" targets production, "test" targets a sandbox.
SF_DOMAIN = os.environ.get("SF_DOMAIN", "login")
def connect_salesforce():
    """
    Log in to Salesforce with the module-level SF_* settings.

    Returns:
        The connected ``Salesforce`` client, or ``None`` when the login
        raises. The failure is printed rather than propagated, so callers
        must check for ``None``.
    """
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN
        )
        # The status emoji had been mis-decoded, which split this f-string
        # across two lines and made the module unparseable; it is restored
        # here as a single-line literal.
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
        return sf
    except Exception as e:
        print("❌ Salesforce login failed:", e)
        return None
def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """
    Create a KYC_Record__c row in Salesforce from extracted KYC data.

    The record always carries these six fields, blank unless filled below:
        Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
        Pan_Name__c, Pan_DOB__c, PAN_Number__c
    AADHAAR data fills the Aadhaar trio and PAN data fills the PAN trio. Any
    other card type - including input with no card_type at all, such as an
    OCR error dict - stores name and dob on the Aadhaar side so they are not
    lost, and a record is still created. "Not found" placeholders are stored
    as blanks.

    Args:
        sf: Connected Salesforce client; a falsy value yields an error result.
        kyc_data: Dict shaped like the output of extract_kyc_fields().
        file_name: Accepted for callers' convenience; currently not stored.
        agent_id: When truthy, written to Agent__c (the field must exist in
            the org).

    Returns:
        {"status": "success", "record_id": ...} on success, otherwise
        {"status": "error", "message": ...}. Exceptions are never raised.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def cleaned(key):
            # Missing/None values and the "Not found" placeholder become "".
            raw = kyc_data.get(key) or ""
            return raw.replace("Not found", "")

        payload = dict.fromkeys(
            (
                "Aadhaar_Name__c",
                "Aadhaar_DOB__c",
                "Aadhaar_Number__c",
                "Pan_Name__c",
                "Pan_DOB__c",
                "PAN_Number__c",
            ),
            "",
        )

        card = (kyc_data.get("card_type") or "").upper()
        if card == "AADHAAR":
            targets = (
                ("Aadhaar_Name__c", "name"),
                ("Aadhaar_DOB__c", "dob"),
                ("Aadhaar_Number__c", "aadhaar_number"),
            )
        elif card == "PAN":
            targets = (
                ("Pan_Name__c", "name"),
                ("Pan_DOB__c", "dob"),
                ("PAN_Number__c", "pan_number"),
            )
        else:
            # Unknown card: best effort - keep name/dob on the Aadhaar side.
            targets = (
                ("Aadhaar_Name__c", "name"),
                ("Aadhaar_DOB__c", "dob"),
            )
        for field_name, source_key in targets:
            payload[field_name] = cleaned(source_key)

        if agent_id:
            payload["Agent__c"] = agent_id

        created = sf.KYC_Record__c.create(payload)
        return {"status": "success", "record_id": created.get("id")}
    except Exception as exc:
        return {"status": "error", "message": str(exc)}
|