Spaces:
Sleeping
Sleeping
File size: 4,916 Bytes
ae2e698 c70099c b07dfbb 5ebcb93 ae2e698 b07dfbb a8683a1 655e2d6 ae2e698 b07dfbb 8324e53 ae2e698 8324e53 ae2e698 b07dfbb a726fb2 b07dfbb a726fb2 655e2d6 2c3e33d b07dfbb 2c3e33d b07dfbb 2c3e33d 655e2d6 2c3e33d e2cb16a b07dfbb e2cb16a b07dfbb 2c3e33d b07dfbb 2c3e33d 655e2d6 b07dfbb a726fb2 b07dfbb a726fb2 5ebcb93 a726fb2 349558e b07dfbb a726fb2 b07dfbb a726fb2 b07dfbb e2cb16a a726fb2 b07dfbb a726fb2 b07dfbb a726fb2 655e2d6 b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a a726fb2 a8683a1 8324e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
from paddleocr import PaddleOCR
import re
# Create the shared OCR engine once, at import time; extract_kyc_fields() reuses
# this single instance for every call. Per the original author's note, the model
# download happens the first time the engine is used.
# The engine is configured for English (lang='en'). To support another language,
# pass a different language code as `lang` (e.g. 'hi' or 'mr' -- confirm the exact
# codes against PaddleOCR's supported-language list before relying on them).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Date patterns use digit lookarounds instead of \b so that a date the OCR glued
# to neighbouring letters (e.g. "DOB01/01/1990" or "01/01/1990MALE") still matches.
# Everything the original \b-anchored patterns matched is matched by these too.
_DOB_DMY_RE = re.compile(r'(?<!\d)\d{2}[./-]\d{2}[./-]\d{4}(?!\d)')
_DOB_YMD_RE = re.compile(r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)')
_DOB_YEAR_RE = re.compile(r'(?<!\d)(?:19|20)\d{2}(?!\d)')

# Substrings (compared against the upper-cased line) that mark a line as
# carrying a full date of birth / a bare year of birth.
_DOB_DATE_LABELS = ("DOB", "D.O.B", "BIRTH")
_DOB_YEAR_LABELS = ("YOB", "YEAR", "BIRTH")


def _extract_dob(lines):
    """
    Return the date (or year) of birth found in OCR text lines.

    Search order, first hit wins:
      1. a dd/mm/yyyy, dd-mm-yyyy, dd.mm.yyyy or yyyy-mm-dd date on a line that
         is labelled DOB / D.O.B / BIRTH, so that an unrelated date printed
         earlier on the card (e.g. an issue date) is not mistaken for the DOB
      2. the first dd{sep}mm{sep}yyyy date on any line
      3. the first yyyy-mm-dd date on any line
      4. a bare 19xx/20xx year on a line labelled YOB / YEAR / BIRTH

    Args:
        lines: iterable of text lines (strings) produced by the OCR step.

    Returns:
        The matched text exactly as it appears in the line, or the string
        "Not found" when nothing matches.
    """
    lines = list(lines)  # scanned several times; tolerate one-shot iterables

    # 1. Dates on explicitly labelled lines take priority over unlabelled ones.
    for line in lines:
        upper = line.upper()
        if any(label in upper for label in _DOB_DATE_LABELS):
            match = _DOB_DMY_RE.search(line) or _DOB_YMD_RE.search(line)
            if match:
                return match.group(0)

    # 2. + 3. Unlabelled dates: every line is tried for dd{sep}mm{sep}yyyy
    # before any line is tried for yyyy-mm-dd (same order as before).
    for pattern in (_DOB_DMY_RE, _DOB_YMD_RE):
        for line in lines:
            match = pattern.search(line)
            if match:
                return match.group(0)

    # 4. A bare year only counts when the line itself says it is a birth year.
    for line in lines:
        match = _DOB_YEAR_RE.search(line)
        if match and any(label in line.upper() for label in _DOB_YEAR_LABELS):
            return match.group(0)

    return "Not found"
_PAN_NUMBER_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_NUMBER_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
_ANY_DMY_DATE_RE = re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}')
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')

# Substrings that disqualify a line from being treated as the holder's name.
_PAN_NAME_STOPWORDS = ("INDIA", "GOVT", "DEPARTMENT")
_AADHAAR_NAME_STOPWORDS = ("DOB", "INDIA", "GOVERNMENT")

# Checked in this order so the longer labels win over "MALE".
_GENDER_LABELS = ("TRANSGENDER", "FEMALE", "MALE")
# Whole-word forms: "MALE" inside a name such as "SHYAMALEE" must not count.
_GENDER_WORD_RES = {
    label: re.compile(rf'(?<![A-Z]){label}(?![A-Z])') for label in _GENDER_LABELS
}
_GENDER_WORD_IN_NAME_RE = re.compile(r'(?<![A-Z])(?:FE)?MALE(?![A-Z])')


def _ocr_text_lines(result):
    """Flatten an OCR result into a list of non-empty, stripped text strings."""
    lines = []
    for page in result or []:
        # Some PaddleOCR releases return None for a page with no detected text;
        # treat that as "no lines" instead of failing on iteration.
        for detection in page or []:
            text = detection[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_aadhaar_number(lines, full_text):
    """Return a regex match for a 12-digit Aadhaar-style number, or None."""
    # Prefer a number that sits on a single OCR line: on the newline-joined text
    # the optional \s separator can bridge two lines and glue an unrelated
    # 4-digit value (e.g. a year of birth) onto the front of the real number.
    for line in lines:
        match = _AADHAAR_NUMBER_RE.search(line)
        if match:
            return match
    # Fallback keeps the previous behaviour for numbers the OCR split over lines.
    return _AADHAAR_NUMBER_RE.search(full_text)


def _find_gender(lines):
    """Return "TRANSGENDER", "FEMALE", "MALE" or "Not found" for Aadhaar text."""
    upper_lines = [line.upper() for line in lines]
    # Strict pass: the label must stand alone as a word.
    for upper in upper_lines:
        for label in _GENDER_LABELS:
            if _GENDER_WORD_RES[label].search(upper):
                return label
    # Lenient pass: plain substring scan, for OCR output where the label is
    # fused to other letters; only reached when no stand-alone label exists.
    for upper in upper_lines:
        for label in _GENDER_LABELS:
            if label in upper:
                return label
    return "Not found"


def _looks_like_aadhaar_name(candidate):
    """Tell whether a line is a plausible holder name on an Aadhaar card."""
    upper = candidate.upper()
    return (
        not re.search(r'\d', candidate)
        and len(candidate.split()) >= 2
        and not any(word in upper for word in _AADHAAR_NAME_STOPWORDS)
        # whole-word test so names containing "male" (e.g. "Kamalesh") survive
        and not _GENDER_WORD_IN_NAME_RE.search(upper)
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar text lines, or "Not found"."""
    # First choice: the line printed directly above a dd{sep}mm{sep}yyyy date.
    for index, line in enumerate(lines):
        if index > 0 and _ANY_DMY_DATE_RE.search(line):
            candidate = lines[index - 1].strip()
            if _looks_like_aadhaar_name(candidate):
                return candidate
    # Otherwise: the first line anywhere that passes the same plausibility test.
    for line in lines:
        candidate = line.strip()
        if _looks_like_aadhaar_name(candidate):
            return candidate
    return "Not found"


def _find_pan_name(lines):
    """Return the holder name from PAN text lines, or "Not found"."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        # The name is taken to be the first all-caps, digit-free line after the
        # header that is not itself part of the header wording.
        for following in lines[index + 1:]:
            candidate = following.strip()
            if (
                _PAN_NAME_RE.match(candidate)
                and not any(word in candidate.upper() for word in _PAN_NAME_STOPWORDS)
                and not re.search(r'\d', candidate)
                and len(candidate) >= 3
            ):
                return candidate
        break  # only the first header occurrence is considered
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """
    Run OCR on an identity-document image and extract KYC fields from the text.

    The document is classified by the number pattern found in the OCR text: a
    PAN-style number (5 letters, 4 digits, 1 letter) takes precedence over a
    12-digit Aadhaar-style number.

    Args:
        file_path: path of the image handed to the shared ``ocr`` engine.

    Returns:
        One of the following dictionaries:
          - PAN:     {"card_type": "PAN", "pan_number", "dob", "name"}
          - Aadhaar: {"card_type": "AADHAAR", "aadhaar_number", "dob",
                      "gender", "name"}
          - neither pattern found (including an image with no readable text):
                     {"card_type": "UNKNOWN", "error": ...}
          - any exception during OCR or parsing: {"error": "OCR processing failed: ..."}
        Fields that cannot be located hold the string "Not found".

    This function does not raise; failures are reported through the "error" key.
    """
    try:
        lines = _ocr_text_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        pan_match = _PAN_NUMBER_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = _find_aadhaar_number(lines, full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not identify document as PAN or Aadhaar."
        }
    except Exception as e:
        # Deliberate catch-all at the API boundary: callers receive an error
        # dictionary instead of an exception.
        return {"error": f"OCR processing failed: {str(e)}"}
|