Spaces:
Sleeping
Sleeping
File size: 4,317 Bytes
ae2e698 c70099c e2cb16a ae2e698 a8683a1 ae2e698 8324e53 ae2e698 8324e53 ae2e698 e2cb16a a726fb2 e2cb16a a726fb2 e2cb16a 2c3e33d e2cb16a 2c3e33d e2cb16a 2c3e33d e2cb16a 2c3e33d e2cb16a 2c3e33d a726fb2 2c3e33d a726fb2 e2cb16a a726fb2 e2cb16a a726fb2 e2cb16a a726fb2 e2cb16a a726fb2 e2cb16a a726fb2 e2cb16a ae2e698 a726fb2 a8683a1 8324e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
from paddleocr import PaddleOCR
import re
# PaddleOCR's `lang` argument selects ONE recognition model and must be a
# single supported language code; the combined value 'en|hi|ta' is not such a
# code. The extraction logic in this file only matches Latin-script text and
# digits (PAN/Aadhaar numbers, dates, English labels and names), so the
# English model is the one that is needed.
# NOTE(review): if Hindi/Tamil text must also be read, build separate
# PaddleOCR instances (e.g. lang='hi', lang='ta') rather than combining codes.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Pre-compiled patterns and constants shared by the KYC extractors below.
# PAN: 5 letters + 4 digits + 1 letter.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
# Aadhaar: 12 digits in three groups of 4, optionally separated by space/hyphen.
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
# Date of birth written as DD/MM/YYYY or DD-MM-YYYY.
_DOB_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
# A PAN holder name: upper-case letters, whitespace and dots only.
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
_DIGIT_RE = re.compile(r'\d')
# Gender keyword that is not embedded in a longer Latin word. The look-arounds
# stop "MALE" from matching inside "FEMALE" or inside a name like "KAMALESH".
_GENDER_RE = re.compile(r'(?<![A-Z])(TRANSGENDER|FEMALE|MALE)(?![A-Z])')

_NOT_FOUND = "Not found"
_PAN_NAME_STOPWORDS = ("INDIA", "DEPARTMENT", "GOVT")
_AADHAAR_NAME_STOPWORDS = ("GOVERNMENT", "INDIA", "DOB")


def _collect_text_lines(result):
    """Flatten an OCR result into a list of stripped, non-empty text lines."""
    lines = []
    # The OCR engine may return None (or a list containing None) when no text
    # is detected; treat that as "no lines" instead of failing.
    for block in result or []:
        if not block:
            continue
        for entry in block:
            # Each entry is indexed as entry[1][0] to obtain the text.
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines):
    """Return the first DD/MM/YYYY-style date found in *lines*, else 'Not found'."""
    for line in lines:
        match = _DOB_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return the first gender keyword found in *lines*, else 'Not found'."""
    for line in lines:
        match = _GENDER_RE.search(line.upper())
        if match:
            return match.group(1)
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first name-like line after the 'INCOME TAX DEPARTMENT' header."""
    for idx, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[idx + 1:]:
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in _PAN_NAME_STOPWORDS
            ):
                return candidate
        # Only the first header occurrence is considered.
        return _NOT_FOUND
    return _NOT_FOUND


def _is_aadhaar_name_candidate(text):
    """Return True if *text* looks like a person's name on an Aadhaar card."""
    upper = text.upper()
    return (
        not _DIGIT_RE.search(text)
        and len(text.split()) >= 2
        and not any(word in upper for word in _AADHAAR_NAME_STOPWORDS)
        # Reject gender labels, but not names that merely contain "MALE".
        and not _GENDER_RE.search(upper)
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar OCR lines, else 'Not found'."""
    # First attempt: the line directly above a date-of-birth line.
    for idx, line in enumerate(lines):
        if idx > 0 and _DOB_RE.search(line):
            previous = lines[idx - 1]
            if _is_aadhaar_name_candidate(previous):
                return previous
    # Fallback: first multi-word, digit-free line starting with an upper-case
    # character.
    for line in lines:
        if _is_aadhaar_name_candidate(line) and line[0].isupper():
            return line
    return _NOT_FOUND


def extract_kyc_fields(file_path, *, ocr_engine=None):
    """Run OCR on a PAN or Aadhaar card image and extract its key fields.

    Args:
        file_path: Image path handed to the OCR engine's ``ocr`` method.
        ocr_engine: Optional object exposing ``ocr(file_path, cls=True)``.
            Defaults to the module-level ``ocr`` instance.

    Returns:
        A dict. For a PAN card: ``card_type``, ``pan_number``, ``dob``,
        ``name``. For an Aadhaar card: ``card_type``, ``aadhaar_number``,
        ``dob``, ``gender``, ``name``. Fields that cannot be located hold
        ``"Not found"``. If neither number pattern is present, the dict has
        ``card_type == "UNKNOWN"`` and an ``error`` message. If anything
        raises, the dict contains only ``error``; this function never raises.
    """
    try:
        engine = ocr if ocr_engine is None else ocr_engine
        lines = _collect_text_lines(engine.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        # A PAN number takes priority over an Aadhaar number when both match.
        pan_match = _PAN_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Unable to determine document type (PAN/Aadhaar).",
        }
    except Exception as e:
        # Deliberate catch-all: callers receive an error dict, not an exception.
        return {"error": f"OCR processing failed: {e}"}
|