Spaces:
Sleeping
Sleeping
File size: 3,509 Bytes
ae2e698 c70099c 73f6d86 ae2e698 a8683a1 ae2e698 8324e53 ae2e698 8324e53 ae2e698 2c3e33d a726fb2 2c3e33d a726fb2 2c3e33d a726fb2 2c3e33d a726fb2 ae2e698 a726fb2 a8683a1 8324e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
from paddleocr import PaddleOCR
import re
# Shared OCR engine: built once at import time and reused by every call to
# extract_kyc_fields instead of being re-created per request.
# NOTE(review): use_angle_cls=True presumably enables PaddleOCR's text-angle
# classifier and lang='en' selects the English model — confirm against the
# PaddleOCR documentation for the installed version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Placeholder stored in the response when a field cannot be located.
_NOT_FOUND = "Not found"

# PAN: five letters, four digits, one letter (e.g. ABCDE1234F).
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
# Aadhaar: twelve digits, optionally grouped 4-4-4 by whitespace or hyphens.
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
# Date of birth written as DD/MM/YYYY or DD-MM-YYYY.
_DOB_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
# A PAN holder name line: upper-case letters, whitespace and dots only
# (which also rules out any line containing digits).
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')

# Header words that disqualify an upper-case line from being a PAN name.
_PAN_HEADER_WORDS = ("INDIA", "DEPARTMENT", "GOVT")
# Words that disqualify the line above "DOB" from being an Aadhaar name.
_AADHAAR_NON_NAME_WORDS = ("GOVERNMENT", "MALE", "FEMALE")


def _collect_text_lines(result):
    """Flatten an OCR result into a list of stripped, non-empty text lines."""
    lines = []
    # NOTE(review): PaddleOCR is believed to return None (or a None page
    # block) when no text is detected; treat both as "no lines" so the caller
    # reports an unknown document instead of an OCR failure.
    for block in result or []:
        for entry in block or []:
            # Each entry is indexed as entry[1][0] for its recognised text.
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines):
    """Return the first DD/MM/YYYY-style date found in ``lines``."""
    for line in lines:
        match = _DOB_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return "MALE" or "FEMALE" from the first line that mentions either."""
    for line in lines:
        upper = line.upper()
        # "FEMALE" contains "MALE", so it must be tested first; otherwise
        # every female card would be reported as MALE.
        if "FEMALE" in upper:
            return "FEMALE"
        if "MALE" in upper:
            return "MALE"
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first name-like line after the "INCOME TAX DEPARTMENT" header."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[index + 1:]:
            candidate = candidate.strip()
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in _PAN_HEADER_WORDS
            ):
                return candidate
        # Only the first occurrence of the header is considered.
        return _NOT_FOUND
    return _NOT_FOUND


def _find_aadhaar_name(lines):
    """Return the line directly above a "DOB" line when it looks like a name."""
    for index, line in enumerate(lines):
        # A DOB on the very first line has no preceding line to inspect.
        if index == 0 or "DOB" not in line.upper():
            continue
        candidate = lines[index - 1]
        upper = candidate.upper()
        if not any(word in upper for word in _AADHAAR_NON_NAME_WORDS) and not re.search(
            r'\d', candidate
        ):
            return candidate.strip()
    return _NOT_FOUND


def extract_kyc_fields(file_path):
    """Extract KYC fields from an image of a PAN or Aadhaar card.

    Runs the module-level ``ocr`` engine on ``file_path``, flattens the
    recognised text into lines and classifies the document by the identifier
    found in that text. A PAN-shaped token takes precedence over a 12-digit
    Aadhaar-shaped number.

    Args:
        file_path: Passed unchanged to ``ocr.ocr``.

    Returns:
        dict: One of the following shapes.

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``.
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``.
        * Unrecognised: ``card_type`` set to ``"UNKNOWN"`` plus ``error``.
        * Any exception during processing: only ``error``, prefixed with
          ``"OCR processing failed: "``.

        Fields that cannot be located hold the string ``"Not found"``.
    """
    # The broad except is deliberate: this function is the boundary that turns
    # any OCR/parsing failure into an error payload instead of raising.
    try:
        lines = _collect_text_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        pan_match = _PAN_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Unable to determine document type.",
        }
    except Exception as e:
        return {"error": f"OCR processing failed: {str(e)}"}
|