SMART_KYC_OCR / utils.py
gopichandra's picture
Update utils.py
b07dfbb verified
raw
history blame
4.92 kB
from paddleocr import PaddleOCR
import re
# Create the OCR engine once, at import time, so every call in this module
# shares the same instance. Per the original note, the model download happens
# the first time it is used.
# lang='en' selects the English model. To support another language, pass a
# different `lang` code (the original note gives 'hi' and 'mr' as examples;
# confirm the exact codes against PaddleOCR's supported-language list).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
def _extract_dob(lines):
"""
Try multiple formats:
- dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy
- yyyy-mm-dd
- Year of Birth lines (YOB / YEAR / BIRTH)
"""
# dd{sep}mm{sep}yyyy
for line in lines:
m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
if m:
return m.group(0)
# yyyy-mm-dd
for line in lines:
m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
if m:
return m.group(0)
# Year only if labeled as YOB/Year/Birth
for line in lines:
m = re.search(r'\b(19|20)\d{2}\b', line)
if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
return m.group(0)
return "Not found"
def _ocr_text_lines(result):
    """Flatten a PaddleOCR result into a list of non-empty, stripped text lines."""
    lines = []
    # PaddleOCR can hand back None (for the whole result, or for a page on
    # which no text was detected); treat that as "no lines" instead of
    # letting the iteration raise TypeError.
    for block in result or []:
        for entry in block or []:
            # Each entry is indexed as entry[1][0] to reach the recognized text.
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_pan_name(lines):
    """Return the first name-like line after "INCOME TAX DEPARTMENT", else "Not found"."""
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for possible in lines[i + 1:]:
            possible = possible.strip()
            # Upper-case letters, whitespace and dots only (which already
            # rules out digits), at least 3 characters, and not one of the
            # card's header lines.
            if (
                re.match(r'^[A-Z\s.]+$', possible)
                and not any(x in possible for x in ("INDIA", "GOVT", "DEPARTMENT"))
                and len(possible) >= 3
            ):
                return possible
        # Only the first occurrence of the header is considered.
        break
    return "Not found"


def _find_gender(lines):
    """Return "TRANSGENDER", "FEMALE" or "MALE" from the OCR lines, else "Not found"."""
    # Order matters: "FEMALE" contains "MALE", so longer labels are checked first.
    labels = ("TRANSGENDER", "FEMALE", "MALE")
    upper_lines = [line.upper() for line in lines]

    # Pass 1: whole-word matches only, so a name line that merely contains
    # the letters "MALE" (e.g. "KAMALESH") cannot decide the gender before
    # the real gender line is reached.
    for up in upper_lines:
        for label in labels:
            if re.search(rf'\b{label}\b', up):
                return label

    # Pass 2: plain substring match, kept as a fallback for OCR output where
    # the label is glued to neighbouring characters.
    for up in upper_lines:
        for label in labels:
            if label in up:
                return label

    return "Not found"


def _is_aadhaar_name_candidate(text):
    """Return True if text has no digits, at least two words, and no excluded keyword."""
    upper = text.upper()
    # NOTE: the keyword test is a substring test, so a name that happens to
    # contain one of these keywords is rejected as well.
    return (
        not re.search(r'\d', text)
        and len(text.split()) >= 2
        and not any(x in upper for x in ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"))
    )


def _find_aadhaar_name(lines):
    """Return the holder name guessed from Aadhaar OCR lines, else "Not found"."""
    # Preferred: the line directly above a dd/mm/yyyy-style date line.
    for i, line in enumerate(lines):
        if i > 0 and re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line):
            candidate = lines[i - 1].strip()
            if _is_aadhaar_name_candidate(candidate):
                return candidate

    # Fallback: the first line anywhere that looks like a name.
    for line in lines:
        candidate = line.strip()
        if _is_aadhaar_name_candidate(candidate):
            return candidate

    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """
    Run OCR on a document image and extract PAN or Aadhaar KYC fields.

    The document type is detected from the recognized text: a PAN-shaped
    token (5 letters, 4 digits, 1 letter) takes precedence over a 12-digit
    Aadhaar-shaped number.

    Args:
        file_path: Path of the image, passed straight to the OCR engine.

    Returns:
        One of the following dicts:
          - PAN:     {"card_type": "PAN", "pan_number", "dob", "name"}
          - Aadhaar: {"card_type": "AADHAAR", "aadhaar_number", "dob",
                      "gender", "name"}
          - Neither pattern found (including when no text was detected):
                     {"card_type": "UNKNOWN", "error": ...}
          - Any exception during processing:
                     {"error": "OCR processing failed: ..."}
        Fields that cannot be located are set to the string "Not found".
    """
    # Broad catch is deliberate: this function is the boundary that turns
    # any OCR/parsing failure into an error dict instead of raising.
    try:
        # OCR text extraction, flattened to text lines
        result = ocr.ocr(file_path, cls=True)
        lines = _ocr_text_lines(result)
        full_text = "\n".join(lines)

        # ===================== PAN CARD =====================
        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _find_pan_name(lines),
            }

        # ===================== AADHAAR CARD =====================
        # Three groups of four digits, optionally separated by whitespace
        # (including the newline between OCR lines) or a hyphen.
        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not identify document as PAN or Aadhaar."
        }
    except Exception as e:
        return {"error": f"OCR processing failed: {str(e)}"}