Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -16,13 +16,10 @@ def extract_kyc_fields(file_path):
|
|
16 |
|
17 |
full_text = "\n".join(lines)
|
18 |
|
19 |
-
# PAN
|
20 |
pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
|
21 |
-
|
22 |
-
# Aadhaar pattern: 12 digits
|
23 |
aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
|
24 |
|
25 |
-
# Check which type
|
26 |
if pan_match:
|
27 |
card_type = "PAN"
|
28 |
elif aadhaar_match:
|
@@ -32,11 +29,38 @@ def extract_kyc_fields(file_path):
|
|
32 |
|
33 |
response = {"card_type": card_type}
|
34 |
|
35 |
-
if card_type == "
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
response["aadhaar_number"] = aadhaar_match.group(0)
|
38 |
|
39 |
-
# DOB
|
40 |
dob = "Not found"
|
41 |
for line in lines:
|
42 |
match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
|
@@ -56,7 +80,7 @@ def extract_kyc_fields(file_path):
|
|
56 |
break
|
57 |
response["gender"] = gender
|
58 |
|
59 |
-
# Name
|
60 |
name = "Not found"
|
61 |
for i, line in enumerate(lines):
|
62 |
if "DOB" in line.upper():
|
@@ -70,31 +94,6 @@ def extract_kyc_fields(file_path):
|
|
70 |
break
|
71 |
response["name"] = name
|
72 |
|
73 |
-
elif card_type == "PAN":
|
74 |
-
# PAN
|
75 |
-
response["pan_number"] = pan_match.group(0)
|
76 |
-
|
77 |
-
# DOB
|
78 |
-
dob = "Not found"
|
79 |
-
for line in lines:
|
80 |
-
if "DATE OF BIRTH" in line.upper():
|
81 |
-
match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
|
82 |
-
if match:
|
83 |
-
dob = match.group(0)
|
84 |
-
break
|
85 |
-
response["dob"] = dob
|
86 |
-
|
87 |
-
# Name: first line after heading usually
|
88 |
-
name = "Not found"
|
89 |
-
for i, line in enumerate(lines):
|
90 |
-
if "INCOME TAX DEPARTMENT" in line.upper():
|
91 |
-
if i + 1 < len(lines):
|
92 |
-
possible_name = lines[i + 1]
|
93 |
-
if not re.search(r'\d', possible_name):
|
94 |
-
name = possible_name.strip()
|
95 |
-
break
|
96 |
-
response["name"] = name
|
97 |
-
|
98 |
else:
|
99 |
response["error"] = "Unable to determine document type."
|
100 |
|
|
|
16 |
|
17 |
full_text = "\n".join(lines)
|
18 |
|
19 |
+
# PAN Number Detection
|
20 |
pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
|
|
|
|
|
21 |
aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
|
22 |
|
|
|
23 |
if pan_match:
|
24 |
card_type = "PAN"
|
25 |
elif aadhaar_match:
|
|
|
29 |
|
30 |
response = {"card_type": card_type}
|
31 |
|
32 |
+
if card_type == "PAN":
|
33 |
+
response["pan_number"] = pan_match.group(0)
|
34 |
+
|
35 |
+
# Extract DOB as any line with DD/MM/YYYY
|
36 |
+
dob = "Not found"
|
37 |
+
for line in lines:
|
38 |
+
match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
|
39 |
+
if match:
|
40 |
+
dob = match.group(0)
|
41 |
+
break
|
42 |
+
response["dob"] = dob
|
43 |
+
|
44 |
+
# Improved name extraction: find first uppercase name-like line after "INCOME TAX DEPARTMENT"
|
45 |
+
name = "Not found"
|
46 |
+
for i in range(len(lines)):
|
47 |
+
if "INCOME TAX DEPARTMENT" in lines[i].upper():
|
48 |
+
for j in range(i+1, len(lines)):
|
49 |
+
possible = lines[j].strip()
|
50 |
+
if (
|
51 |
+
re.match(r'^[A-Z\s.]+$', possible) and
|
52 |
+
not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"]) and
|
53 |
+
not re.search(r'\d', possible)
|
54 |
+
):
|
55 |
+
name = possible.strip()
|
56 |
+
break
|
57 |
+
break
|
58 |
+
response["name"] = name
|
59 |
+
|
60 |
+
elif card_type == "AADHAAR":
|
61 |
response["aadhaar_number"] = aadhaar_match.group(0)
|
62 |
|
63 |
+
# Extract DOB
|
64 |
dob = "Not found"
|
65 |
for line in lines:
|
66 |
match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
|
|
|
80 |
break
|
81 |
response["gender"] = gender
|
82 |
|
83 |
+
# Name logic for Aadhaar (same as before)
|
84 |
name = "Not found"
|
85 |
for i, line in enumerate(lines):
|
86 |
if "DOB" in line.upper():
|
|
|
94 |
break
|
95 |
response["name"] = name
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
else:
|
98 |
response["error"] = "Unable to determine document type."
|
99 |
|