"""KYC helpers: OCR field extraction for PAN/Aadhaar cards and Salesforce upload."""
import os
import re
from datetime import datetime

from simple_salesforce import Salesforce
from paddleocr import PaddleOCR

# -----------------------------------
# OCR SETUP
# -----------------------------------
os.environ.setdefault("OMP_NUM_THREADS", "1")  # limit threads for stability
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Regex patterns
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# The LAST entry is the year-only pattern; _extract_dob relies on that position.
DOB_REGEXES = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
    r'\b\d{4}-\d{2}-\d{2}\b',
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
    r'\b(19|20)\d{2}\b'
]
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]  # kept for completeness (not stored)

# Whole words that disqualify an OCR line from being treated as a person's name.
_NAME_STOPWORDS = frozenset({"DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"})


# -----------------------------------
# OCR HELPERS
# -----------------------------------
def extract_kyc_fields(file_path, force_type=None):
    """
    Run OCR on ``file_path`` and pull out KYC fields.

    Args:
        file_path: Path of the image handed to ``ocr.ocr``.
        force_type: Optional card type; when truthy it is upper-cased and used
            instead of auto-detection.

    Returns a dict with:
      card_type: PAN | AADHAAR | UNKNOWN (or the upper-cased ``force_type``)
      pan_number / aadhaar_number ("Not found" when the pattern is absent)
      name (best-guess)
      dob  (best-guess for the detected card)
      error: present when the card type is not PAN/AADHAAR.
    If anything raises, the dict contains only ``error``.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        lines = []
        for block in result or []:
            # A page without any detected text can come back as None/empty;
            # skip it so a blank image yields "Not found" values instead of
            # a TypeError reported as an OCR failure.
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text; collapse whitespace runs.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.upper()
        else:
            # PAN is checked first; Aadhaar only if no PAN number is present.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        response = {"card_type": card_type}

        if card_type == "PAN":
            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
            # best-effort generic fields
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_generic_name(lines)

        return response
    except Exception as e:
        # Deliberate catch-all: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}


def _first_match(pattern, text, flags=0):
    """Return the first substring of ``text`` matching ``pattern``, or None."""
    m = re.search(pattern, text, flags)
    return m.group(0) if m else None


def _extract_dob(lines):
    """
    Return the first date-like string found in ``lines``, or "Not found".

    Full-date patterns are tried on every line first. A bare four-digit year
    is accepted only from a line that also carries a birth-related label.
    """
    # Try common formats
    for line in lines:
        for pattern in DOB_REGEXES[:-1]:
            m = re.search(pattern, line, re.IGNORECASE)
            if m:
                return m.group(0)

    # Year-only with labels
    for line in lines:
        m = re.search(DOB_REGEXES[-1], line)
        if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH", "DOB"]):
            return m.group(0)

    return "Not found"


def _extract_pan_name(lines):
    """
    Return the first name-like line after an "INCOME TAX DEPARTMENT" header.

    A candidate must consist only of capital letters, whitespace and dots, and
    must not contain INDIA, GOVT or DEPARTMENT. Returns "Not found" otherwise.
    """
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in line.upper():
            for j in range(i + 1, len(lines)):
                candidate = lines[j].strip()
                if re.match(r'^[A-Z\s.]+$', candidate) and not re.search(r'\d', candidate):
                    if not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"]):
                        return candidate
    return "Not found"


def _extract_aadhaar_name(lines):
    """
    Best-guess the holder name from Aadhaar OCR lines.

    Returns the line directly above a date-like line when it looks like a
    name; otherwise the first name-like line; otherwise "Not found".
    """
    # Heuristic: Name usually above DOB
    for i, line in enumerate(lines):
        if any(re.search(p, line, re.IGNORECASE) for p in DOB_REGEXES):
            if i > 0:
                candidate = lines[i - 1].strip()
                if _looks_like_name(candidate):
                    return candidate

    # Fallback
    for line in lines:
        if _looks_like_name(line.strip()):
            return line.strip()

    return "Not found"


def _extract_generic_name(lines):
    """Return the first line that looks like a name, or "Not found"."""
    for line in lines:
        if _looks_like_name(line.strip()):
            return line.strip()
    return "Not found"


def _looks_like_name(text):
    """
    Return True if ``text`` plausibly is a person's name.

    Requires no digits, at least two whitespace-separated tokens, and none of
    the stop-words (DOB, INDIA, MALE, FEMALE, GOVERNMENT) as a whole word.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    # Compare whole alphabetic words, not substrings: a substring test would
    # reject real names such as "KAMALESH KUMAR" because they contain "MALE".
    words = set(re.findall(r'[A-Z]+', text.upper()))
    return words.isdisjoint(_NAME_STOPWORDS)


# -----------------------------------
# SALESFORCE HELPERS
# -----------------------------------
SF_USERNAME = os.getenv("SF_USERNAME", "")
SF_PASSWORD = os.getenv("SF_PASSWORD", "")
SF_TOKEN = os.getenv("SF_TOKEN", "")
SF_DOMAIN = os.getenv("SF_DOMAIN", "login")  # "login"=prod, "test"=sandbox


def connect_salesforce():
    """
    Log in to Salesforce using the SF_* environment settings.

    Returns the ``Salesforce`` client, or None if login raised (the error is
    printed rather than propagated).
    """
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN
        )
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
        return sf
    except Exception as e:
        print("❌ Salesforce login failed:", e)
        return None


def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """
    Creates a record in KYC_Record__c with the fields:
      Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
      Pan_Name__c, Pan_DOB__c, PAN_Number__c
    Optionally includes Agent__c if you pass agent_id and that field exists.

    Args:
        sf: Salesforce client as returned by ``connect_salesforce`` (may be None).
        kyc_data: Dict in the shape produced by ``extract_kyc_fields``.
        file_name: Currently unused; kept for interface compatibility.
        agent_id: Optional value stored in Agent__c.

    Returns:
        {"status": "success", "record_id": ...} on success, otherwise
        {"status": "error", "message": ...}. Never raises.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        # Normalize values
        def val_or_blank(key):
            # Missing/None values and the "Not found" placeholder become "".
            return (kyc_data.get(key) or "").replace("Not found", "")

        record = {
            "Aadhaar_Name__c": "",
            "Aadhaar_DOB__c": "",
            "Aadhaar_Number__c": "",
            "Pan_Name__c": "",
            "Pan_DOB__c": "",
            "PAN_Number__c": "",
        }

        ct = (kyc_data.get("card_type") or "").upper()
        if ct == "AADHAAR":
            record["Aadhaar_Name__c"] = val_or_blank("name")
            record["Aadhaar_DOB__c"] = val_or_blank("dob")
            record["Aadhaar_Number__c"] = val_or_blank("aadhaar_number")
        elif ct == "PAN":
            record["Pan_Name__c"] = val_or_blank("name")
            record["Pan_DOB__c"] = val_or_blank("dob")
            record["PAN_Number__c"] = val_or_blank("pan_number")
        else:
            # Unknown: best effort — fill name/dob into Aadhaar side to avoid losing data
            record["Aadhaar_Name__c"] = val_or_blank("name")
            record["Aadhaar_DOB__c"] = val_or_blank("dob")

        # Optionally include Agent__c if provided (and exists in your org)
        if agent_id:
            record["Agent__c"] = agent_id

        # Optionally store file name in a text field if you have one (not required by you):
        # record["KYC_File_Name__c"] = file_name or ""

        resp = sf.KYC_Record__c.create(record)
        return {"status": "success", "record_id": resp.get("id")}
    except Exception as e:
        # Deliberate catch-all: API and data errors are reported in the result dict.
        return {"status": "error", "message": str(e)}