Spaces:
Sleeping
Sleeping
File size: 3,888 Bytes
ae2e698 c70099c f7a759e 5ebcb93 ae2e698 b07dfbb f7a759e b07dfbb f7a759e b07dfbb a8683a1 ae2e698 8324e53 ae2e698 8324e53 ae2e698 a726fb2 b07dfbb f7a759e a726fb2 2c3e33d b07dfbb 2c3e33d 655e2d6 2c3e33d e2cb16a b07dfbb e2cb16a b07dfbb 2c3e33d b07dfbb 2c3e33d f7a759e a726fb2 b07dfbb a726fb2 349558e b07dfbb f7a759e b07dfbb f7a759e b07dfbb f7a759e a726fb2 655e2d6 b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a b07dfbb e2cb16a a726fb2 a8683a1 8324e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
from paddleocr import PaddleOCR
import re
# Shared PaddleOCR engine, built once when this module is imported and reused
# by every extract_kyc_fields() call. It is configured for English text
# (lang='en') with the use_angle_cls option switched on.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
def _extract_dob(lines):
    """Return the date (or year) of birth found in OCR text lines.

    Search order:
      1. A full date on a line that carries a birth label ("DOB", "D.O.B",
         "BIRTH"), so that an issue/download date appearing earlier in the
         text is not mistaken for the date of birth.
      2. The first full date on any line (DD/MM/YYYY style, with '.', '/'
         or '-' separators, is tried before YYYY-MM-DD).
      3. A bare four-digit year (19xx / 20xx) on a line labelled "YOB",
         "YEAR" or "BIRTH".

    Args:
        lines: iterable of text strings; None is treated as empty.

    Returns:
        The matched text exactly as it appears in the line, or the string
        "Not found" when nothing matches.
    """
    full_date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',  # day-first: 01/02/1990, 01-02-1990
        r'\b\d{4}-\d{2}-\d{2}\b',          # ISO: 1990-02-01
    )
    # Materialise once: the lines are scanned several times below.
    lines = list(lines or [])

    birth_labelled = [
        line for line in lines
        if any(label in line.upper() for label in ("DOB", "D.O.B", "BIRTH"))
    ]

    # Labelled lines get first refusal; the second round reproduces the
    # plain "first date anywhere" scan.
    for candidates in (birth_labelled, lines):
        for pattern in full_date_patterns:
            for line in candidates:
                match = re.search(pattern, line)
                if match:
                    return match.group(0)

    # Year-only fallback: accepted only next to an explicit label, because a
    # lone four-digit number is too ambiguous otherwise.
    for line in lines:
        match = re.search(r'\b(19|20)\d{2}\b', line)
        if match and any(label in line.upper() for label in ("YOB", "YEAR", "BIRTH")):
            return match.group(0)

    return "Not found"
def _collect_ocr_lines(result):
    """Flatten a PaddleOCR result into a list of non-empty, stripped strings."""
    lines = []
    for block in result or []:
        # A page on which nothing was detected can come back as None rather
        # than an empty list; treat it as "no lines" instead of crashing.
        for entry in block or []:
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_pan_name(lines):
    """Return the first name-like line after the PAN card header, or "Not found"."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for raw in lines[index + 1:]:
            possible = raw.strip()
            # Upper-case letters, whitespace and dots only (so no digits),
            # at least three characters, and not another header line.
            if (
                re.match(r'^[A-Z\s.]+$', possible)
                and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
                and len(possible) >= 3
            ):
                return possible
        # Only the first header occurrence is considered.
        break
    return "Not found"


def _find_aadhaar_gender(lines):
    """Return the gender printed on an Aadhaar card, or "Not found"."""
    for line in lines:
        upper = line.upper()
        # Order matters: "MALE" is a substring of "FEMALE".
        for label in ("TRANSGENDER", "FEMALE", "MALE"):
            if label in upper:
                return label
    return "Not found"


def _is_aadhaar_name_candidate(text):
    """Return True when *text* looks like a holder name (2+ words, no digits, no labels)."""
    upper = text.upper()
    return (
        not re.search(r'\d', text)
        and len(text.split()) >= 2
        and not any(x in upper for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar text lines, or "Not found"."""
    # Preferred: the line immediately above a line that contains a date.
    for previous, current in zip(lines, lines[1:]):
        if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', current):
            candidate = previous.strip()
            if _is_aadhaar_name_candidate(candidate):
                return candidate
    # Fallback: the first line anywhere that looks like a name.
    for line in lines:
        candidate = line.strip()
        if _is_aadhaar_name_candidate(candidate):
            return candidate
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """Run OCR on a document image and extract PAN or Aadhaar fields.

    The document is classified by the first identifier found in the OCR
    text: a PAN number (five letters, four digits, one letter) takes
    precedence over a 12-digit Aadhaar number.

    Args:
        file_path: path of the image handed to the module-level ``ocr`` engine.

    Returns:
        One of the following dictionaries:
          * PAN:     ``card_type``, ``pan_number``, ``dob``, ``name``
          * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
            ``name``
          * neither: ``card_type`` = "UNKNOWN" plus an ``error`` message
          * failure: only an ``error`` message describing the exception

        Fields that cannot be located hold the string "Not found".
    """
    try:
        lines = _collect_ocr_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _find_aadhaar_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}
    except Exception as e:
        # Deliberate boundary: callers receive an error dictionary instead of
        # an exception, whatever went wrong inside OCR or parsing.
        return {"error": f"OCR processing failed: {str(e)}"}
|