gopichandra committed on
Commit
2c3e33d
·
verified ·
1 Parent(s): a726fb2

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +32 -33
utils.py CHANGED
@@ -16,13 +16,10 @@ def extract_kyc_fields(file_path):
16
 
17
  full_text = "\n".join(lines)
18
 
19
- # PAN pattern: 5 letters + 4 digits + 1 letter
20
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
21
-
22
- # Aadhaar pattern: 12 digits
23
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
24
 
25
- # Check which type
26
  if pan_match:
27
  card_type = "PAN"
28
  elif aadhaar_match:
@@ -32,11 +29,38 @@ def extract_kyc_fields(file_path):
32
 
33
  response = {"card_type": card_type}
34
 
35
- if card_type == "AADHAAR":
36
- # Aadhaar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  response["aadhaar_number"] = aadhaar_match.group(0)
38
 
39
- # DOB
40
  dob = "Not found"
41
  for line in lines:
42
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -56,7 +80,7 @@ def extract_kyc_fields(file_path):
56
  break
57
  response["gender"] = gender
58
 
59
- # Name
60
  name = "Not found"
61
  for i, line in enumerate(lines):
62
  if "DOB" in line.upper():
@@ -70,31 +94,6 @@ def extract_kyc_fields(file_path):
70
  break
71
  response["name"] = name
72
 
73
- elif card_type == "PAN":
74
- # PAN
75
- response["pan_number"] = pan_match.group(0)
76
-
77
- # DOB
78
- dob = "Not found"
79
- for line in lines:
80
- if "DATE OF BIRTH" in line.upper():
81
- match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
82
- if match:
83
- dob = match.group(0)
84
- break
85
- response["dob"] = dob
86
-
87
- # Name: first line after heading usually
88
- name = "Not found"
89
- for i, line in enumerate(lines):
90
- if "INCOME TAX DEPARTMENT" in line.upper():
91
- if i + 1 < len(lines):
92
- possible_name = lines[i + 1]
93
- if not re.search(r'\d', possible_name):
94
- name = possible_name.strip()
95
- break
96
- response["name"] = name
97
-
98
  else:
99
  response["error"] = "Unable to determine document type."
100
 
 
16
 
17
  full_text = "\n".join(lines)
18
 
19
+ # PAN Number Detection
20
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
 
 
21
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
22
 
 
23
  if pan_match:
24
  card_type = "PAN"
25
  elif aadhaar_match:
 
29
 
30
  response = {"card_type": card_type}
31
 
32
+ if card_type == "PAN":
33
+ response["pan_number"] = pan_match.group(0)
34
+
35
+ # Extract DOB as any line with DD/MM/YYYY
36
+ dob = "Not found"
37
+ for line in lines:
38
+ match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
39
+ if match:
40
+ dob = match.group(0)
41
+ break
42
+ response["dob"] = dob
43
+
44
+ # Improved name extraction: find first uppercase name-like line after "INCOME TAX DEPARTMENT"
45
+ name = "Not found"
46
+ for i in range(len(lines)):
47
+ if "INCOME TAX DEPARTMENT" in lines[i].upper():
48
+ for j in range(i+1, len(lines)):
49
+ possible = lines[j].strip()
50
+ if (
51
+ re.match(r'^[A-Z\s.]+$', possible) and
52
+ not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"]) and
53
+ not re.search(r'\d', possible)
54
+ ):
55
+ name = possible.strip()
56
+ break
57
+ break
58
+ response["name"] = name
59
+
60
+ elif card_type == "AADHAAR":
61
  response["aadhaar_number"] = aadhaar_match.group(0)
62
 
63
+ # Extract DOB
64
  dob = "Not found"
65
  for line in lines:
66
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
 
80
  break
81
  response["gender"] = gender
82
 
83
+ # Name logic for Aadhaar (same as before)
84
  name = "Not found"
85
  for i, line in enumerate(lines):
86
  if "DOB" in line.upper():
 
94
  break
95
  response["name"] = name
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  else:
98
  response["error"] = "Unable to determine document type."
99