from paddleocr import PaddleOCR
import re

# Initialize OCR once (English). The model download happens the first time it
# is used. To support other languages, change `lang` (e.g. 'hi', 'mr') or use
# a multilingual model.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Placeholder returned for any field that could not be located.
_NOT_FOUND = "Not found"

# Document-number patterns, searched in the newline-joined OCR text.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')

# Date patterns use digit look-arounds rather than \b so that a label glued to
# the digits by OCR (e.g. "DOB01/01/1990") still matches, while a date embedded
# in a longer digit run does not.
_DMY_DATE_RE = re.compile(r'(?<!\d)\d{2}[./-]\d{2}[./-]\d{4}(?!\d)')
_ISO_DATE_RE = re.compile(r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)')
_YEAR_RE = re.compile(r'(?<!\d)(19|20)\d{2}(?!\d)')
_YEAR_LABELS = ("YOB", "YEAR", "BIRTH")

# Gender labels in priority order: "FEMALE" contains "MALE", so it must be
# tested first.
_GENDERS = ("TRANSGENDER", "FEMALE", "MALE")
_GENDER_WORD_RES = tuple(
    (gender, re.compile(rf'\b{gender}\b')) for gender in _GENDERS
)

# PAN holder names are printed in capitals (letters, spaces and dots only).
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
_PAN_NAME_STOP_SUBSTRINGS = ("INDIA", "GOVT", "DEPARTMENT")

# Aadhaar name filtering. Header words are matched as substrings so they are
# still caught when OCR drops spaces ("AuthorityofIndia"). Short labels are
# matched as whole words so they do not reject real names that merely contain
# them (e.g. "Kamalesh" contains "MALE", "Dobriyal" contains "DOB").
_AADHAAR_NAME_STOP_SUBSTRINGS = ("INDIA", "GOVERNMENT")
_AADHAAR_NAME_STOP_WORDS_RE = re.compile(r'\b(?:DOB|MALE|FEMALE)\b')


def _collect_text_lines(result) -> list:
    """Flatten an ``ocr.ocr()`` result into a list of non-empty text lines.

    ``result`` is iterated as a sequence of blocks whose entries carry the
    recognized text at ``entry[1][0]``. A ``None``/empty result or block
    (which OCR may produce when no text is detected) is skipped instead of
    raising, so a blank image ends up as an empty list.
    """
    lines = []
    for block in result or []:
        if not block:
            continue
        for entry in block:
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _extract_dob(lines: list) -> str:
    """Return the first date of birth found in ``lines``.

    Formats are tried in this order, each over all lines:
      - dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy
      - yyyy-mm-dd
      - a bare 19xx/20xx year, but only on a line labelled YOB / YEAR / BIRTH

    Returns the matched text, or "Not found".
    """
    for pattern in (_DMY_DATE_RE, _ISO_DATE_RE):
        for line in lines:
            match = pattern.search(line)
            if match:
                return match.group(0)

    # Year only, and only when the line carries a birth-year label.
    for line in lines:
        match = _YEAR_RE.search(line)
        if match and any(label in line.upper() for label in _YEAR_LABELS):
            return match.group(0)

    return _NOT_FOUND


def _extract_gender(lines: list) -> str:
    """Return "TRANSGENDER", "FEMALE", "MALE" or "Not found".

    Whole-word matches are preferred so that a name line containing "MALE"
    (e.g. "SHYAMALEE DAS") cannot shadow the real gender line further down.
    If no whole-word match exists, fall back to plain substring matching so
    labels fused with neighbouring characters by OCR are still recognized.
    """
    upper_lines = [line.upper() for line in lines]

    for text in upper_lines:
        for gender, pattern in _GENDER_WORD_RES:
            if pattern.search(text):
                return gender

    for text in upper_lines:
        for gender in _GENDERS:
            if gender in text:
                return gender

    return _NOT_FOUND


def _extract_pan_name(lines: list) -> str:
    """Return the holder name from PAN card text, or "Not found".

    Heuristic: after the first line containing "INCOME TAX DEPARTMENT", take
    the first all-capitals line of at least 3 characters that is not another
    header line (INDIA / GOVT / DEPARTMENT).
    """
    for idx, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for following in lines[idx + 1:]:
            candidate = following.strip()
            if (
                len(candidate) >= 3
                and _PAN_NAME_RE.match(candidate)
                and not any(
                    word in candidate for word in _PAN_NAME_STOP_SUBSTRINGS
                )
            ):
                return candidate
        # Only the first header line is considered.
        return _NOT_FOUND
    return _NOT_FOUND


def _is_aadhaar_name_candidate(candidate: str) -> bool:
    """Return True if ``candidate`` looks like a person's name on an Aadhaar card.

    A candidate has no digits, at least two whitespace-separated words, and
    is not a header or label line.
    """
    if re.search(r'\d', candidate) or len(candidate.split()) < 2:
        return False
    upper = candidate.upper()
    if any(word in upper for word in _AADHAAR_NAME_STOP_SUBSTRINGS):
        return False
    return not _AADHAAR_NAME_STOP_WORDS_RE.search(upper)


def _extract_aadhaar_name(lines: list) -> str:
    """Return the holder name from Aadhaar card text, or "Not found".

    First try the line immediately before each dd/mm/yyyy date line; if none
    of those qualifies, fall back to the first line that looks like a name.
    """
    for idx, line in enumerate(lines):
        if idx > 0 and _DMY_DATE_RE.search(line):
            candidate = lines[idx - 1].strip()
            if _is_aadhaar_name_candidate(candidate):
                return candidate

    for line in lines:
        candidate = line.strip()
        if _is_aadhaar_name_candidate(candidate):
            return candidate

    return _NOT_FOUND


def extract_kyc_fields(file_path: str) -> dict:
    """Run OCR on ``file_path`` and extract KYC fields from a PAN or Aadhaar card.

    The document type is detected from the number pattern found in the text;
    a PAN number takes precedence over an Aadhaar number.

    Returns:
        - PAN:     {"card_type": "PAN", "pan_number", "dob", "name"}
        - Aadhaar: {"card_type": "AADHAAR", "aadhaar_number", "dob",
                    "gender", "name"}
        - neither: {"card_type": "UNKNOWN", "error": ...}
        - failure: {"error": "OCR processing failed: ..."}

        Fields that cannot be located hold the string "Not found".

    This function does not raise: any exception from OCR or parsing is
    converted into the failure dict above.
    """
    try:
        result = ocr.ocr(file_path, cls=True)
        lines = _collect_text_lines(result)
        full_text = "\n".join(lines)

        pan_match = _PAN_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _extract_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _extract_gender(lines),
                "name": _extract_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not identify document as PAN or Aadhaar."
        }
    except Exception as e:
        # Top-level boundary: callers expect an error dict, never an exception.
        return {"error": f"OCR processing failed: {e}"}