gopichandra committed on
Commit
e2cb16a
·
verified ·
1 Parent(s): 2c3e33d

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +40 -21
utils.py CHANGED
@@ -1,7 +1,8 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
 
5
 
6
  def extract_kyc_fields(file_path):
7
  try:
@@ -16,23 +17,23 @@ def extract_kyc_fields(file_path):
16
 
17
  full_text = "\n".join(lines)
18
 
19
- # PAN Number Detection
20
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
21
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
22
 
 
23
  if pan_match:
24
  card_type = "PAN"
25
  elif aadhaar_match:
26
  card_type = "AADHAAR"
27
- else:
28
- card_type = "UNKNOWN"
29
 
30
  response = {"card_type": card_type}
31
 
 
32
  if card_type == "PAN":
33
  response["pan_number"] = pan_match.group(0)
34
 
35
- # Extract DOB as any line with DD/MM/YYYY
36
  dob = "Not found"
37
  for line in lines:
38
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -41,22 +42,23 @@ def extract_kyc_fields(file_path):
41
  break
42
  response["dob"] = dob
43
 
44
- # Improved name extraction: find first uppercase name-like line after "INCOME TAX DEPARTMENT"
45
  name = "Not found"
46
  for i in range(len(lines)):
47
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
48
  for j in range(i+1, len(lines)):
49
  possible = lines[j].strip()
50
  if (
51
- re.match(r'^[A-Z\s.]+$', possible) and
52
- not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"]) and
53
- not re.search(r'\d', possible)
54
  ):
55
  name = possible.strip()
56
  break
57
  break
58
  response["name"] = name
59
 
 
60
  elif card_type == "AADHAAR":
61
  response["aadhaar_number"] = aadhaar_match.group(0)
62
 
@@ -69,7 +71,7 @@ def extract_kyc_fields(file_path):
69
  break
70
  response["dob"] = dob
71
 
72
- # Gender
73
  gender = "Not found"
74
  for line in lines:
75
  if "MALE" in line.upper():
@@ -78,24 +80,41 @@ def extract_kyc_fields(file_path):
78
  elif "FEMALE" in line.upper():
79
  gender = "FEMALE"
80
  break
 
 
 
81
  response["gender"] = gender
82
 
83
- # Name logic for Aadhaar (same as before)
84
  name = "Not found"
 
85
  for i, line in enumerate(lines):
86
- if "DOB" in line.upper():
87
- if i > 0:
88
- possible_name = lines[i - 1]
89
- if (
90
- not any(x in possible_name.upper() for x in ["GOVERNMENT", "MALE", "FEMALE"])
91
- and not re.search(r'\d', possible_name)
92
- ):
93
- name = possible_name.strip()
94
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  response["name"] = name
96
 
97
  else:
98
- response["error"] = "Unable to determine document type."
99
 
100
  return response
101
 
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Enable multilingual support (English + Hindi + Tamil)
5
+ ocr = PaddleOCR(use_angle_cls=True, lang='en|hi|ta')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
 
17
 
18
  full_text = "\n".join(lines)
19
 
20
+ # PAN pattern: 5 letters + 4 digits + 1 letter
21
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
22
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
23
 
24
+ card_type = "UNKNOWN"
25
  if pan_match:
26
  card_type = "PAN"
27
  elif aadhaar_match:
28
  card_type = "AADHAAR"
 
 
29
 
30
  response = {"card_type": card_type}
31
 
32
+ # --------- PAN CARD LOGIC ---------
33
  if card_type == "PAN":
34
  response["pan_number"] = pan_match.group(0)
35
 
36
+ # Extract DOB
37
  dob = "Not found"
38
  for line in lines:
39
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
 
42
  break
43
  response["dob"] = dob
44
 
45
+ # Extract Name
46
  name = "Not found"
47
  for i in range(len(lines)):
48
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
49
  for j in range(i+1, len(lines)):
50
  possible = lines[j].strip()
51
  if (
52
+ re.match(r'^[A-Z\s.]+$', possible)
53
+ and not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"])
54
+ and not re.search(r'\d', possible)
55
  ):
56
  name = possible.strip()
57
  break
58
  break
59
  response["name"] = name
60
 
61
+ # --------- AADHAAR CARD LOGIC ---------
62
  elif card_type == "AADHAAR":
63
  response["aadhaar_number"] = aadhaar_match.group(0)
64
 
 
71
  break
72
  response["dob"] = dob
73
 
74
+ # Extract Gender
75
  gender = "Not found"
76
  for line in lines:
77
  if "MALE" in line.upper():
 
80
  elif "FEMALE" in line.upper():
81
  gender = "FEMALE"
82
  break
83
+ elif "TRANSGENDER" in line.upper():
84
+ gender = "TRANSGENDER"
85
+ break
86
  response["gender"] = gender
87
 
88
+ # Robust name extraction
89
  name = "Not found"
90
+ # First attempt: line before DOB
91
  for i, line in enumerate(lines):
92
+ if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
93
+ possible_name = lines[i - 1].strip()
94
+ if (
95
+ not any(x in possible_name.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
96
+ and not re.search(r'\d', possible_name)
97
+ and len(possible_name.split()) >= 2
98
+ ):
99
+ name = possible_name
100
+ break
101
+
102
+ # Fallback: best guess line with title-cased text and no digits
103
+ if name == "Not found":
104
+ for line in lines:
105
+ if (
106
+ not re.search(r'\d', line)
107
+ and len(line.split()) >= 2
108
+ and line[0].isupper()
109
+ and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
110
+ ):
111
+ name = line.strip()
112
+ break
113
+
114
  response["name"] = name
115
 
116
  else:
117
+ response["error"] = "Unable to determine document type (PAN/Aadhaar)."
118
 
119
  return response
120