Spaces:
Sleeping
Sleeping
import os | |
import re | |
from datetime import datetime | |
from simple_salesforce import Salesforce | |
from paddleocr import PaddleOCR | |
# ----------------------------------- | |
# OCR SETUP | |
# ----------------------------------- | |
# Cap OpenMP at a single thread for stability. setdefault() leaves any value
# already present in the environment untouched.
# NOTE(review): paddleocr is imported before this line runs -- confirm the
# thread cap is still picked up by the native runtime.
os.environ.setdefault("OMP_NUM_THREADS", "1")

# Module-wide OCR engine: English model with angle classification enabled,
# constructed once at import time.
ocr = PaddleOCR(
    lang='en',
    use_angle_cls=True,
)
# Identifier patterns.
# PAN: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: twelve digits in groups of four, each group optionally separated
# by whitespace or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# Date-of-birth patterns, most specific first. The year-only pattern must stay
# LAST: _extract_dob treats DOB_REGEXES[:-1] as full dates and DOB_REGEXES[-1]
# as the label-gated year fallback.
DOB_REGEXES = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',  # 2-2-4 digit groups split by . / or -
    r'\b\d{4}-\d{2}-\d{2}\b',  # 4-2-2 digit groups split by hyphens
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',  # three-letter month name
    r'\b(19|20)\d{2}\b',  # bare four-digit year starting with 19 or 20
]

# Kept for completeness (not stored).
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]
# ----------------------------------- | |
# OCR HELPERS | |
# ----------------------------------- | |
def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and return best-guess KYC fields.

    Args:
        file_path: Image input handed unchanged to ``ocr.ocr``.
        force_type: Optional card type. When truthy it is upper-cased and
            used as-is instead of auto-detecting the card from the text.

    Returns:
        A dict that always carries ``card_type`` (``"PAN"``, ``"AADHAAR"``,
        ``"UNKNOWN"``, or the upper-cased ``force_type``) plus ``name`` and
        ``dob``. PAN results add ``pan_number``; Aadhaar results add
        ``aadhaar_number``; any other card type adds an ``error`` message.
        Fields that could not be read hold the string ``"Not found"``.
        If anything raises, the dict contains only ``error``.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        lines = []
        # The OCR result is iterated page by page. A page (or the whole
        # result) may be None/empty when no text was detected, so skip those
        # instead of failing with "'NoneType' object is not iterable".
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text; collapse whitespace runs.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        # An explicit type wins; otherwise PAN is checked before Aadhaar.
        if force_type:
            card_type = force_type.upper()
        elif re.search(PAN_REGEX, full_text):
            card_type = "PAN"
        elif re.search(AADHAAR_REGEX, full_text):
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}
        if card_type == "PAN":
            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
            # Best-effort generic fields so the caller still gets something.
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_generic_name(lines)
        return response
    except Exception as e:
        # Deliberate boundary: callers receive an error dict, never a raise.
        return {"error": f"OCR processing failed: {str(e)}"}
def _first_match(pattern, text, flags=0):
    """Return the text of the first ``pattern`` hit in ``text``, else None."""
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group(0)
def _extract_dob(lines):
    """Return the first date-like string found in ``lines``.

    Pass 1 walks the lines in order and, for each line, tries every pattern
    in ``DOB_REGEXES`` except the last one (case-insensitively). Pass 2
    accepts a bare year matched by the last pattern, but only on a line that
    also contains one of the labels YOB / YEAR / BIRTH / DOB. Returns
    ``"Not found"`` when neither pass matches.
    """
    # Pass 1: full dates.
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        hits = (re.search(p, text, re.IGNORECASE) for p in full_date_patterns)
        first_hit = next((hit for hit in hits if hit), None)
        if first_hit is not None:
            return first_hit.group(0)

    # Pass 2: year only, and only next to a birth-related label.
    year_pattern = DOB_REGEXES[-1]
    labels = ["YOB", "YEAR", "BIRTH", "DOB"]
    for text in lines:
        year = re.search(year_pattern, text)
        if not year:
            continue
        shouted = text.upper()
        if any(label in shouted for label in labels):
            return year.group(0)

    return "Not found"
def _extract_pan_name(lines):
    """Pick the holder name that follows the PAN card header.

    After a line containing "INCOME TAX DEPARTMENT" (any case), return the
    first later line that is made only of capital letters, whitespace and
    dots and does not contain INDIA, GOVT or DEPARTMENT. Returns
    ``"Not found"`` if there is no header or no such line.
    """
    header = "INCOME TAX DEPARTMENT"
    boilerplate = ["INDIA", "GOVT", "DEPARTMENT"]

    for position, heading in enumerate(lines):
        if header not in heading.upper():
            continue
        # Scan everything below this header occurrence.
        for raw in lines[position + 1:]:
            candidate = raw.strip()
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            if re.search(r'\d', candidate):
                continue
            shouted = candidate.upper()
            if any(word in shouted for word in boilerplate):
                continue
            return candidate
    return "Not found"
def _extract_aadhaar_name(lines):
    """Guess the holder name on an Aadhaar card.

    Heuristic: the name usually sits directly above the date of birth. For
    every line (after the first) that matches any ``DOB_REGEXES`` pattern,
    the stripped line above it is returned if ``_looks_like_name`` accepts
    it. Otherwise the first line anywhere that looks like a name is
    returned, or ``"Not found"``.
    """
    # Walk (line above, line) pairs; the first line has nothing above it.
    for above, current in zip(lines, lines[1:]):
        has_date = any(
            re.search(pattern, current, re.IGNORECASE) for pattern in DOB_REGEXES
        )
        if not has_date:
            continue
        candidate = above.strip()
        if _looks_like_name(candidate):
            return candidate

    # Fallback: first name-like line in reading order.
    for raw in lines:
        candidate = raw.strip()
        if _looks_like_name(candidate):
            return candidate
    return "Not found"
def _extract_generic_name(lines):
    """Return the first stripped line accepted by ``_looks_like_name``.

    Falls back to ``"Not found"`` when no line qualifies.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return next(
        (candidate for candidate in stripped_lines if _looks_like_name(candidate)),
        "Not found",
    )
def _looks_like_name(text):
    """Return True when ``text`` plausibly holds a person's name.

    A candidate is rejected when it contains any digit, has fewer than two
    whitespace-separated words, or contains one of the card-label words
    DOB / INDIA / MALE / FEMALE / GOVERNMENT (case-insensitive).

    Label words are compared against whole words, not substrings, so real
    names that merely contain a label (e.g. "KAMALESH" contains "MALE")
    are still accepted.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    banned = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")
    # Tokenise on runs of letters so punctuation such as "DOB:" or
    # "MALE/FEMALE" still yields the bare label words.
    words = set(re.findall(r'[A-Z]+', text.upper()))
    return words.isdisjoint(banned)
# ----------------------------------- | |
# SALESFORCE HELPERS | |
# ----------------------------------- | |
# Salesforce credentials are read from the environment at import time.
# Unset credentials fall back to empty strings; the domain falls back to
# "login".
SF_USERNAME = os.environ.get("SF_USERNAME", "")
SF_PASSWORD = os.environ.get("SF_PASSWORD", "")
SF_TOKEN = os.environ.get("SF_TOKEN", "")
SF_DOMAIN = os.environ.get("SF_DOMAIN", "login")  # "login"=prod, "test"=sandbox
def connect_salesforce():
    """Log in to Salesforce using the module-level ``SF_*`` settings.

    Returns:
        The ``Salesforce`` client on success, or ``None`` when the login
        raises. The outcome is also reported on stdout.
    """
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN,
        )
    except Exception as e:
        # Deliberate boundary: callers check for None instead of catching.
        # Plain-ASCII status tags keep the message readable on any console
        # encoding.
        print("[ERROR] Salesforce login failed:", e)
        return None

    print(f"[OK] Connected to Salesforce ({SF_DOMAIN})")
    return sf
def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """Create a ``KYC_Record__c`` row from extracted KYC data.

    The payload always carries these six fields (blank unless filled):
        Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c,
        Pan_Name__c, Pan_DOB__c, PAN_Number__c
    ``Agent__c`` is added only when ``agent_id`` is truthy (the field must
    exist in your org). ``file_name`` is accepted but currently unused.

    Returns:
        ``{"status": "success", "record_id": ...}`` on success, otherwise
        ``{"status": "error", "message": ...}``; this function never raises.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def cleaned(source_key):
            # Missing/None values become ""; the "Not found" marker is removed.
            raw = kyc_data.get(source_key) or ""
            return raw.replace("Not found", "")

        # Start with every field blank, in a fixed order.
        payload = {
            "Aadhaar_Name__c": "",
            "Aadhaar_DOB__c": "",
            "Aadhaar_Number__c": "",
            "Pan_Name__c": "",
            "Pan_DOB__c": "",
            "PAN_Number__c": "",
        }

        # Choose which Salesforce fields receive which extracted values.
        card = (kyc_data.get("card_type") or "").upper()
        if card == "AADHAAR":
            field_sources = [
                ("Aadhaar_Name__c", "name"),
                ("Aadhaar_DOB__c", "dob"),
                ("Aadhaar_Number__c", "aadhaar_number"),
            ]
        elif card == "PAN":
            field_sources = [
                ("Pan_Name__c", "name"),
                ("Pan_DOB__c", "dob"),
                ("PAN_Number__c", "pan_number"),
            ]
        else:
            # Unknown card: best effort, park name/dob on the Aadhaar side
            # so the data is not lost.
            field_sources = [
                ("Aadhaar_Name__c", "name"),
                ("Aadhaar_DOB__c", "dob"),
            ]
        for target_field, source_key in field_sources:
            payload[target_field] = cleaned(source_key)

        # Optional agent link.
        if agent_id:
            payload["Agent__c"] = agent_id

        # A file-name field could be stored here if the org has one:
        # payload["KYC_File_Name__c"] = file_name or ""

        created = sf.KYC_Record__c.create(payload)
        return {"status": "success", "record_id": created.get("id")}
    except Exception as e:
        return {"status": "error", "message": str(e)}