# Spaces: Sleeping
import re

from paddleocr import PaddleOCR
# Shared OCR engine, created once at import time.
# PaddleOCR's `lang` takes a single language code; the pipe-joined
# 'en|hi|ta' is not one of them. The English model is used because the
# PAN/Aadhaar patterns in this file match Latin letters and digits only.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Two digits, separator, two digits, separator, four digits (e.g. 01/02/1990).
_DOB_PATTERN = r'\d{2}[/-]\d{2}[/-]\d{4}'

# Upper-case substrings that mark an Aadhaar line as boilerplate, not a name.
_AADHAAR_NON_NAME_WORDS = ("GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE")

# Substrings that disqualify a PAN line from being the holder's name.
_PAN_NON_NAME_WORDS = ("INDIA", "DEPARTMENT", "GOVT")


def _find_dob(lines):
    """Return the first date-like token found in *lines*, else "Not found"."""
    for line in lines:
        match = re.search(_DOB_PATTERN, line)
        if match:
            return match.group(0)
    return "Not found"


def _find_gender(lines):
    """Return the gender label of the first line mentioning one, else "Not found"."""
    for line in lines:
        upper = line.upper()
        # "FEMALE" contains "MALE", so it must be tested before "MALE".
        for label in ("FEMALE", "MALE", "TRANSGENDER"):
            if label in upper:
                return label
    return "Not found"


def _find_pan_name(lines):
    """Return the first name-like line after the "INCOME TAX DEPARTMENT" header."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for raw in lines[index + 1:]:
            candidate = raw.strip()
            if (
                re.match(r'^[A-Z\s.]+$', candidate)
                and not any(word in candidate for word in _PAN_NON_NAME_WORDS)
                and not re.search(r'\d', candidate)
            ):
                return candidate
        # Only the first header occurrence is considered.
        break
    return "Not found"


def _looks_like_aadhaar_name(text):
    """Return True for a digit-free, multi-word line without boilerplate words."""
    upper = text.upper()
    return (
        not re.search(r'\d', text)
        and len(text.split()) >= 2
        and not any(word in upper for word in _AADHAAR_NON_NAME_WORDS)
    )


def _find_aadhaar_name(lines):
    """Return the most likely holder name from Aadhaar OCR lines, else "Not found"."""
    # First attempt: the line printed directly above a date of birth.
    for index, line in enumerate(lines):
        if index > 0 and re.search(_DOB_PATTERN, line):
            candidate = lines[index - 1].strip()
            if _looks_like_aadhaar_name(candidate):
                return candidate
    # Fallback: first plausible line that starts with an upper-case letter.
    for line in lines:
        if line[0].isupper() and _looks_like_aadhaar_name(line):
            return line.strip()
    return "Not found"


def extract_kyc_fields(file_path):
    """Run OCR on a document image and extract PAN or Aadhaar card fields.

    The document type is decided from the recognised text: a PAN number
    (5 letters, 4 digits, 1 letter) takes precedence over an Aadhaar number
    (12 digits, optionally grouped in fours by spaces or hyphens).

    Args:
        file_path: Input handed unchanged to ``ocr.ocr``.

    Returns:
        A dict that always contains ``card_type`` ("PAN", "AADHAAR" or
        "UNKNOWN"), plus:

        * PAN: ``pan_number``, ``dob``, ``name``
        * Aadhaar: ``aadhaar_number``, ``dob``, ``gender``, ``name``
        * Unknown: ``error``

        Fields that cannot be located hold the string "Not found". If
        anything raises during processing, the result is
        ``{"error": "OCR processing failed: ..."}`` instead.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        # Each entry is indexed as entry[1][0] for its text. An empty result
        # or page may come back as None, so treat both as "no lines" rather
        # than letting the loop raise.
        lines = []
        for block in result or []:
            for entry in block or []:
                text = entry[1][0].strip()
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)
        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)

        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Unable to determine document type (PAN/Aadhaar).",
        }
    except Exception as e:
        # Boundary handler: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}