gopichandra committed on
Commit
b07dfbb
·
verified ·
1 Parent(s): 349558e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +64 -68
utils.py CHANGED
@@ -1,15 +1,43 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR once (English)
 
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
- def extract_kyc_fields(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  try:
9
  # OCR text extraction
10
  result = ocr.ocr(file_path, cls=True)
11
 
12
- # Clean up lines
13
  lines = []
14
  for block in result:
15
  for line in block:
@@ -17,18 +45,21 @@ def extract_kyc_fields(file_path):
17
  if text:
18
  lines.append(text)
19
 
20
- # Combine all text
21
  full_text = "\n".join(lines)
22
 
23
- # Detect card type
24
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
25
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
26
 
27
- card_type = "UNKNOWN"
28
  if pan_match:
29
  card_type = "PAN"
30
  elif aadhaar_match:
31
  card_type = "AADHAAR"
 
 
 
 
 
32
 
33
  response = {"card_type": card_type}
34
 
@@ -36,28 +67,10 @@ def extract_kyc_fields(file_path):
36
  if card_type == "PAN":
37
  response["pan_number"] = pan_match.group(0)
38
 
39
- # DOB extraction with multiple formats
40
- dob = "Not found"
41
- for line in lines:
42
- match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
43
- if match:
44
- dob = match.group(0)
45
- break
46
- if dob == "Not found":
47
- for line in lines:
48
- match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
49
- if match:
50
- dob = match.group(0)
51
- break
52
- if dob == "Not found":
53
- for line in lines:
54
- match = re.search(r'\b(19|20)\d{2}\b', line)
55
- if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
56
- dob = match.group(0)
57
- break
58
- response["dob"] = dob
59
 
60
- # Name detection
61
  name = "Not found"
62
  for i in range(len(lines)):
63
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -65,80 +78,63 @@ def extract_kyc_fields(file_path):
65
  possible = lines[j].strip()
66
  if (
67
  re.match(r'^[A-Z\s.]+$', possible)
68
- and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
69
  and not re.search(r'\d', possible)
 
70
  ):
71
- name = possible.strip()
72
  break
73
  break
74
  response["name"] = name
75
 
76
  # ===================== AADHAAR CARD =====================
77
- elif card_type == "AADHAAR":
78
  response["aadhaar_number"] = aadhaar_match.group(0)
79
 
80
- # DOB extraction with multiple formats
81
- dob = "Not found"
82
- for line in lines:
83
- match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
84
- if match:
85
- dob = match.group(0)
86
- break
87
- if dob == "Not found":
88
- for line in lines:
89
- match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
90
- if match:
91
- dob = match.group(0)
92
- break
93
- if dob == "Not found":
94
- for line in lines:
95
- match = re.search(r'\b(19|20)\d{2}\b', line)
96
- if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
97
- dob = match.group(0)
98
- break
99
- response["dob"] = dob
100
 
101
  # Gender
102
  gender = "Not found"
103
  for line in lines:
104
  up = line.upper()
105
- if "MALE" in up:
106
- gender = "MALE"
107
  break
108
- elif "FEMALE" in up:
109
  gender = "FEMALE"
110
  break
111
- elif "TRANSGENDER" in up:
112
- gender = "TRANSGENDER"
113
  break
114
  response["gender"] = gender
115
 
116
- # Name detection: before DOB line
117
  name = "Not found"
 
118
  for i, line in enumerate(lines):
119
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
120
- possible_name = lines[i - 1].strip()
121
  if (
122
- not re.search(r'\d', possible_name)
123
- and len(possible_name.split()) >= 2
124
- and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
125
  ):
126
- name = possible_name
127
  break
 
128
  if name == "Not found":
129
  for line in lines:
 
130
  if (
131
- not re.search(r'\d', line)
132
- and len(line.split()) >= 2
133
- and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
134
  ):
135
- name = line.strip()
136
  break
137
  response["name"] = name
138
 
139
- else:
140
- response["error"] = "Could not identify document as PAN or Aadhaar."
141
-
142
  return response
143
 
144
  except Exception as e:
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize OCR once (English). Download happens first time it's used.
5
+ # To support another language, change lang from 'en' to another PaddleOCR language code (e.g. 'hi', 'mr'), or use one of PaddleOCR's multilingual models.
6
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
7
 
8
+ def _extract_dob(lines):
9
+ """
10
+ Try multiple formats:
11
+ - dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy
12
+ - yyyy-mm-dd
13
+ - Year of Birth lines (YOB / YEAR / BIRTH)
14
+ """
15
+ # dd{sep}mm{sep}yyyy
16
+ for line in lines:
17
+ m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
18
+ if m:
19
+ return m.group(0)
20
+
21
+ # yyyy-mm-dd
22
+ for line in lines:
23
+ m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
24
+ if m:
25
+ return m.group(0)
26
+
27
+ # Year only if labeled as YOB/Year/Birth
28
+ for line in lines:
29
+ m = re.search(r'\b(19|20)\d{2}\b', line)
30
+ if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
31
+ return m.group(0)
32
+
33
+ return "Not found"
34
+
35
+ def extract_kyc_fields(file_path: str) -> dict:
36
  try:
37
  # OCR text extraction
38
  result = ocr.ocr(file_path, cls=True)
39
 
40
+ # Flatten to text lines
41
  lines = []
42
  for block in result:
43
  for line in block:
 
45
  if text:
46
  lines.append(text)
47
 
 
48
  full_text = "\n".join(lines)
49
 
50
+ # Detect card type by patterns
51
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
52
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
53
 
 
54
  if pan_match:
55
  card_type = "PAN"
56
  elif aadhaar_match:
57
  card_type = "AADHAAR"
58
+ else:
59
+ return {
60
+ "card_type": "UNKNOWN",
61
+ "error": "Could not identify document as PAN or Aadhaar."
62
+ }
63
 
64
  response = {"card_type": card_type}
65
 
 
67
  if card_type == "PAN":
68
  response["pan_number"] = pan_match.group(0)
69
 
70
+ # DOB
71
+ response["dob"] = _extract_dob(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ # Name (heuristic: next lines after "INCOME TAX DEPARTMENT")
74
  name = "Not found"
75
  for i in range(len(lines)):
76
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
 
78
  possible = lines[j].strip()
79
  if (
80
  re.match(r'^[A-Z\s.]+$', possible)
81
+ and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
82
  and not re.search(r'\d', possible)
83
+ and len(possible) >= 3
84
  ):
85
+ name = possible
86
  break
87
  break
88
  response["name"] = name
89
 
90
  # ===================== AADHAAR CARD =====================
91
+ else:
92
  response["aadhaar_number"] = aadhaar_match.group(0)
93
 
94
+ # DOB / YOB
95
+ response["dob"] = _extract_dob(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Gender
98
  gender = "Not found"
99
  for line in lines:
100
  up = line.upper()
101
+ if "TRANSGENDER" in up:
102
+ gender = "TRANSGENDER"
103
  break
104
+ if "FEMALE" in up:
105
  gender = "FEMALE"
106
  break
107
+ if "MALE" in up:
108
+ gender = "MALE"
109
  break
110
  response["gender"] = gender
111
 
112
+ # Name: usually line before DOB or first reasonable line without digits
113
  name = "Not found"
114
+ # try line before a date line
115
  for i, line in enumerate(lines):
116
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
117
+ candidate = lines[i - 1].strip()
118
  if (
119
+ not re.search(r'\d', candidate)
120
+ and len(candidate.split()) >= 2
121
+ and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
122
  ):
123
+ name = candidate
124
  break
125
+ # fallback
126
  if name == "Not found":
127
  for line in lines:
128
+ candidate = line.strip()
129
  if (
130
+ not re.search(r'\d', candidate)
131
+ and len(candidate.split()) >= 2
132
+ and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
133
  ):
134
+ name = candidate
135
  break
136
  response["name"] = name
137
 
 
 
 
138
  return response
139
 
140
  except Exception as e: