SMART_KYC_OCR / utils.py
gopichandra's picture
Update utils.py
7147400 verified
raw
history blame
7.11 kB
import os
import re
from datetime import datetime
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR
# -----------------------------------
# OCR SETUP
# -----------------------------------
# Set before the OCR engine is built; setdefault keeps any value that is
# already present in the environment.
os.environ.setdefault("OMP_NUM_THREADS", "1") # limit threads for stability
# Single shared OCR engine, constructed once when this module is imported.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Regex patterns
# PAN: five uppercase letters, four digits, one uppercase letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: twelve digits in three groups of four; groups may be separated by
# a single whitespace character or hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth candidates, most specific first. The year-only pattern must
# stay LAST: _extract_dob tries DOB_REGEXES[:-1] on every line and applies
# DOB_REGEXES[-1] only to lines that carry a birth-related label.
DOB_REGEXES = [
# two digits, two digits, four digits, separated by '.', '/' or '-'
r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
# four digits, two digits, two digits, separated by '-'
r'\b\d{4}-\d{2}-\d{2}\b',
# two digits, three-letter month abbreviation, four digits
r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
# bare four-digit year starting with 19 or 20
r'\b(19|20)\d{2}\b'
]
# Not referenced by any function in the visible part of this file.
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"] # kept for completeness (not stored)
# -----------------------------------
# OCR HELPERS
# -----------------------------------
def extract_kyc_fields(file_path, force_type=None):
    """
    Run OCR on a KYC document image and extract the key identity fields.

    Args:
        file_path: image handed straight to the shared PaddleOCR engine.
        force_type: optional card type ("PAN" / "AADHAAR", any case). When
            given, auto-detection is skipped and the upper-cased value is used.

    Returns a dict with:
        card_type: PAN | AADHAAR | UNKNOWN (or the upper-cased force_type)
        pan_number / aadhaar_number: first regex match, or "Not found"
        name (best-guess)
        dob (best-guess for the detected card)
        error: present only when the card type was not recognised, or when
            OCR itself failed (then it is the only key)

    Exceptions are not propagated; they are reported through the "error" key.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        lines = []
        # PaddleOCR can report "no text detected" as None (or a list holding
        # None) instead of an empty list. Treat that as zero lines so a blank
        # image takes the normal "could not identify" path below rather than
        # being misreported as an OCR failure.
        for block in result or []:
            for line in block or []:
                # line[1][0] is the recognised text; collapse runs of whitespace.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.upper()
        else:
            # PAN is checked first; Aadhaar only if no PAN number is present.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        response = {"card_type": card_type}
        if card_type == "PAN":
            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
            # best-effort generic fields
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_generic_name(lines)
        return response
    except Exception as e:
        # Boundary handler: callers receive an error dict instead of an exception.
        return {"error": f"OCR processing failed: {str(e)}"}
def _first_match(pattern, text, flags=0):
    """Return the text of the first match of `pattern` in `text`, or None if there is no match."""
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group(0)
def _extract_dob(lines):
    """
    Pick a date-of-birth string out of the OCR lines.

    Pass 1 returns the first full-date match (every pattern in DOB_REGEXES
    except the last, case-insensitive); earlier lines win over later ones.
    Pass 2 accepts a bare year, but only from a line that also contains one
    of the labels YOB / YEAR / BIRTH / DOB. Returns "Not found" otherwise.
    """
    year_labels = ("YOB", "YEAR", "BIRTH", "DOB")

    # Pass 1: full dates in common formats.
    for text in lines:
        for regex in DOB_REGEXES[:-1]:
            hit = re.search(regex, text, re.IGNORECASE)
            if hit is not None:
                return hit.group(0)

    # Pass 2: year-only, guarded by a label so unrelated years are ignored.
    for text in lines:
        hit = re.search(DOB_REGEXES[-1], text)
        if hit is None:
            continue
        upper_text = text.upper()
        if any(label in upper_text for label in year_labels):
            return hit.group(0)

    return "Not found"
def _extract_pan_name(lines):
    """
    Best-guess holder name from PAN card OCR lines.

    Finds a line containing "INCOME TAX DEPARTMENT" (case-insensitive) and
    returns the first later line that consists solely of uppercase letters,
    whitespace and dots and does not mention INDIA / GOVT / DEPARTMENT.
    Returns "Not found" when nothing qualifies.
    """
    excluded = ("INDIA", "GOVT", "DEPARTMENT")
    for idx, header in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in header.upper():
            continue
        for raw in lines[idx + 1:]:
            candidate = raw.strip()
            if re.match(r'^[A-Z\s.]+$', candidate) is None:
                continue
            if re.search(r'\d', candidate) is not None:
                continue
            upper_candidate = candidate.upper()
            if any(word in upper_candidate for word in excluded):
                continue
            return candidate
    return "Not found"
def _extract_aadhaar_name(lines):
    """
    Best-guess holder name from Aadhaar card OCR lines.

    Heuristic: the name usually sits on the line directly above the DOB, so
    the line preceding any DOB-looking line is tried first. If that finds
    nothing, the first name-like line anywhere is used. Returns "Not found"
    when no line qualifies.
    """
    # Walk (previous, current) pairs; the first line has no predecessor.
    for above, current in zip(lines, lines[1:]):
        has_date = any(
            re.search(regex, current, re.IGNORECASE) for regex in DOB_REGEXES
        )
        if not has_date:
            continue
        candidate = above.strip()
        if _looks_like_name(candidate):
            return candidate

    # Fallback: first name-like line in document order.
    stripped = (entry.strip() for entry in lines)
    return next((text for text in stripped if _looks_like_name(text)), "Not found")
def _extract_generic_name(lines):
    """Return the first OCR line (stripped) that looks like a name, else "Not found"."""
    stripped = (entry.strip() for entry in lines)
    return next((text for text in stripped if _looks_like_name(text)), "Not found")
def _looks_like_name(text):
    """
    Heuristic check that an OCR line could be a person's name.

    A line qualifies when it has no digits, at least two whitespace-separated
    words, and is not a known card label or header.

    Label words (DOB, MALE, FEMALE) are compared as whole words. Plain
    substring matching rejected real names that merely contain them, such as
    "KAMALESH KUMAR" (contains MALE) or "Ramesh Dobriyal" (contains DOB).
    Header words (GOVERNMENT, INDIA) are still matched as substrings so that
    headers whose words were run together by OCR are still filtered out.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False

    upper_text = text.upper()
    header_fragments = ("INDIA", "GOVERNMENT")
    if any(fragment in upper_text for fragment in header_fragments):
        return False

    label_words = {"DOB", "MALE", "FEMALE"}
    # Split on anything that is not A-Z so "MALE/FEMALE" or "DOB:" still count.
    words = set(re.findall(r'[A-Z]+', upper_text))
    return label_words.isdisjoint(words)
# -----------------------------------
# SALESFORCE HELPERS
# -----------------------------------
# Salesforce credentials are read from the environment once, at import time.
# Unset variables fall back to "" (or "login" for the domain).
SF_USERNAME = os.environ.get("SF_USERNAME", "")
SF_PASSWORD = os.environ.get("SF_PASSWORD", "")
SF_TOKEN = os.environ.get("SF_TOKEN", "")
SF_DOMAIN = os.environ.get("SF_DOMAIN", "login")  # "login"=prod, "test"=sandbox
def connect_salesforce():
    """
    Log in to Salesforce using the module-level SF_* settings.

    Returns:
        The connected ``Salesforce`` client, or ``None`` when login fails.
        The failure reason is printed rather than raised, so callers check
        the result for ``None`` (create_kyc_record does exactly that).
    """
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN,
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with None.
        print("❌ Salesforce login failed:", e)
        return None
    else:
        # The check mark was mis-encoded ("βœ…") in the original message.
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
        return sf
def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """
    Insert one KYC_Record__c row built from an extracted-fields dict.

    The record always carries these six fields (blank unless filled below):
        Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
        Pan_Name__c, Pan_DOB__c, PAN_Number__c
    kyc_data["card_type"] decides which side receives name / dob / number;
    any other card type stores name and dob on the Aadhaar side.
    Agent__c is added only when agent_id is truthy (the field must exist in
    the org). file_name is accepted but currently unused.

    Returns {"status": "success", "record_id": ...} on success, otherwise
    {"status": "error", "message": ...}. Exceptions are not propagated.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def cleaned(source_key):
            # Missing / None values become "", and the "Not found" placeholder
            # produced by the extractors is stripped out.
            raw = kyc_data.get(source_key) or ""
            return raw.replace("Not found", "")

        aadhaar_columns = ("Aadhaar_Name__c", "Aadhaar_DOB__c", "Aadhaar_Number__c")
        pan_columns = ("Pan_Name__c", "Pan_DOB__c", "PAN_Number__c")
        record = dict.fromkeys(aadhaar_columns + pan_columns, "")

        card = (kyc_data.get("card_type") or "").upper()
        if card == "PAN":
            assignments = zip(pan_columns, ("name", "dob", "pan_number"))
        elif card == "AADHAAR":
            assignments = zip(aadhaar_columns, ("name", "dob", "aadhaar_number"))
        else:
            # Unknown: best effort - fill name/dob into Aadhaar side to avoid losing data
            assignments = zip(aadhaar_columns, ("name", "dob"))
        for column, source_key in assignments:
            record[column] = cleaned(source_key)

        # Optionally include Agent__c if provided (and exists in your org)
        if agent_id:
            record["Agent__c"] = agent_id

        # A file-name text field (e.g. KYC_File_Name__c) could be stored here
        # if the org has one; it is intentionally not sent today.

        created = sf.KYC_Record__c.create(record)
        return {"status": "success", "record_id": created.get("id")}
    except Exception as e:
        return {"status": "error", "message": str(e)}