Spaces:
Sleeping
Sleeping
File size: 4,362 Bytes
ae2e698 c70099c 5ebcb93 ae2e698 a8683a1 5ebcb93 ae2e698 5ebcb93 8324e53 ae2e698 5ebcb93 8324e53 ae2e698 5ebcb93 a726fb2 e2cb16a a726fb2 5ebcb93 2c3e33d 5ebcb93 2c3e33d 5ebcb93 2c3e33d e2cb16a 2c3e33d 5ebcb93 2c3e33d a726fb2 5ebcb93 a726fb2 5ebcb93 a726fb2 e2cb16a a726fb2 5ebcb93 a726fb2 e2cb16a 5ebcb93 e2cb16a a726fb2 5ebcb93 a726fb2 5ebcb93 ae2e698 a726fb2 a8683a1 8324e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
from paddleocr import PaddleOCR
import re
# Module-level OCR engine: built once when this module is imported and reused
# by extract_kyc_fields() on every call instead of being re-created per file.
# lang='en' selects the English model (safe default for Aadhaar and PAN).
# NOTE(review): use_angle_cls=True presumably turns on PaddleOCR's text-angle
# classifier so rotated text can be read -- confirm against the installed
# PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Placeholder stored in the response for any field that could not be located.
_NOT_FOUND = "Not found"

# PAN: five capital letters, four digits, one capital letter.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
# Aadhaar: twelve digits in groups of four, optionally separated by
# whitespace or a hyphen.
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
# Date of birth written as DD/MM/YYYY or DD-MM-YYYY.
_DOB_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
# Gender keywords are matched as whole words: a plain substring test would
# find "MALE" inside "FEMALE" and inside names such as "KAMALESH".
_GENDER_RE = re.compile(r'\b(FEMALE|MALE|TRANSGENDER)\b')
# A PAN holder name is printed in capitals: letters, whitespace and dots only.
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
_DIGIT_RE = re.compile(r'\d')

# Words that mark a PAN line as card boilerplate rather than a person's name.
_PAN_BOILERPLATE = ("INDIA", "DEPARTMENT", "GOVT")
# Words that mark an Aadhaar line as a header/label rather than a name.
_AADHAAR_BOILERPLATE = ("GOVERNMENT", "INDIA", "DOB")


def _ocr_text_lines(result):
    """Flatten an OCR result into a list of stripped, non-empty text lines."""
    lines = []
    for block in result or []:
        # NOTE(review): PaddleOCR appears to yield None for a page on which no
        # text was detected -- confirm against the installed version. Skipping
        # such pages lets the caller report an unknown document instead of
        # failing with a TypeError.
        if not block:
            continue
        for entry in block:
            # Each entry is indexed as entry[1][0] to obtain the recognised text.
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines):
    """Return the first DD/MM/YYYY or DD-MM-YYYY date found, else "Not found"."""
    for line in lines:
        match = _DOB_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return the first whole-word gender keyword (upper-cased), else "Not found"."""
    for line in lines:
        match = _GENDER_RE.search(line.upper())
        if match:
            return match.group(1)
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first all-caps name line after the PAN header, else "Not found"."""
    for idx, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[idx + 1:]:
            candidate = candidate.strip()
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in _PAN_BOILERPLATE
            ):
                return candidate
        # Only the first header occurrence is used as the anchor.
        return _NOT_FOUND
    return _NOT_FOUND


def _looks_like_aadhaar_name(text):
    """Tell whether a line is plausibly a holder name on an Aadhaar card."""
    upper = text.upper()
    return (
        not _DIGIT_RE.search(text)
        and len(text.split()) >= 2
        and not any(word in upper for word in _AADHAAR_BOILERPLATE)
        # Whole-word test so that names containing "male" are not rejected.
        and not _GENDER_RE.search(upper)
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar text lines, else "Not found"."""
    # Primary heuristic: the name is the line directly above a date line.
    for idx, line in enumerate(lines):
        if idx > 0 and _DOB_RE.search(line):
            candidate = lines[idx - 1].strip()
            if _looks_like_aadhaar_name(candidate):
                return candidate
    # Fallback: the first line anywhere that looks like a name.
    for line in lines:
        if _looks_like_aadhaar_name(line):
            return line.strip()
    return _NOT_FOUND


def extract_kyc_fields(file_path):
    """Run OCR on a card image and extract PAN or Aadhaar KYC fields.

    The document type is decided from the recognised text: a PAN number
    pattern takes precedence over an Aadhaar number pattern.

    Args:
        file_path: Image location, passed unchanged to the module-level
            ``ocr`` engine.

    Returns:
        A dict. For a PAN card: ``card_type``, ``pan_number``, ``dob``,
        ``name``. For an Aadhaar card: ``card_type``, ``aadhaar_number``,
        ``dob``, ``gender``, ``name``. Fields that cannot be located hold
        ``"Not found"``. If neither number pattern is present the dict holds
        ``card_type == "UNKNOWN"`` and an ``error`` message. If anything
        raises during processing the dict holds only an ``error`` message.

    Raises:
        Nothing: every exception is converted into an ``error`` response.
    """
    try:
        lines = _ocr_text_lines(ocr.ocr(file_path, cls=True))
        # Patterns are searched over all lines at once to detect the card type.
        full_text = "\n".join(lines)

        pan_match = _PAN_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not detect document type (PAN or Aadhaar).",
        }
    except Exception as e:
        # Deliberate catch-all: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}
|