gopichandra committed on
Commit
254fdf9
·
verified ·
1 Parent(s): 8dcb382

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +75 -48
utils.py CHANGED
@@ -1,26 +1,15 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR for English
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
def _extract_dob(lines):
    """Return the first date-of-birth-like string found in OCR text lines.

    Formats are tried in priority order, each over *all* lines before the
    next format is considered:

    1. ``DD/MM/YYYY`` with ``.``, ``/`` or ``-`` as separators.
    2. ISO style ``YYYY-MM-DD``.
    3. A bare four-digit year (19xx/20xx), accepted only when the same line
       also carries a "YOB", "YEAR" or "BIRTH" label.

    Args:
        lines: Iterable of OCR text lines. ``None``, an empty iterable and
            non-string entries are tolerated and simply ignored.

    Returns:
        The matched date/year text, or the string ``"Not found"``.
    """
    if not lines:
        return "Not found"

    # Materialise once: the input is scanned several times, and non-string
    # entries (e.g. None from a failed OCR line) must not crash re.search.
    text_lines = [line for line in lines if isinstance(line, str)]

    # Full dates win over a year-only hit, whichever line they appear on.
    for pattern in (r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', r'\b\d{4}-\d{2}-\d{2}\b'):
        for line in text_lines:
            m = re.search(pattern, line)
            if m:
                return m.group(0)

    # Year-only fallback: require a birth-related label on the same line so
    # unrelated years (issue dates, reference numbers) are not picked up.
    for line in text_lines:
        m = re.search(r'\b(19|20)\d{2}\b', line)
        if m and any(lbl in line.upper() for lbl in ("YOB", "YEAR", "BIRTH")):
            return m.group(0)
    return "Not found"
19
-
20
- def extract_kyc_fields(file_path: str) -> dict:
21
  try:
 
22
  result = ocr.ocr(file_path, cls=True)
23
 
 
24
  lines = []
25
  for block in result:
26
  for line in block:
@@ -30,22 +19,31 @@ def extract_kyc_fields(file_path: str) -> dict:
30
 
31
  full_text = "\n".join(lines)
32
 
33
- pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
34
- aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
35
-
36
- if pan_match:
37
- card_type = "PAN"
38
- elif aadhaar_match:
39
- card_type = "AADHAAR"
40
  else:
41
- return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}
 
 
 
 
 
 
42
 
43
  response = {"card_type": card_type}
44
 
 
45
  if card_type == "PAN":
46
- response["pan_number"] = pan_match.group(0)
47
- response["dob"] = _extract_dob(lines)
 
48
 
 
 
 
 
 
49
  name = "Not found"
50
  for i in range(len(lines)):
51
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -53,54 +51,83 @@ def extract_kyc_fields(file_path: str) -> dict:
53
  possible = lines[j].strip()
54
  if (
55
  re.match(r'^[A-Z\s.]+$', possible)
56
- and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
57
  and not re.search(r'\d', possible)
58
- and len(possible) >= 3
59
  ):
60
- name = possible
61
  break
62
  break
63
  response["name"] = name
64
 
65
- else: # AADHAAR
66
- response["aadhaar_number"] = aadhaar_match.group(0)
67
- response["dob"] = _extract_dob(lines)
 
 
 
 
 
 
68
 
 
69
  gender = "Not found"
70
  for line in lines:
71
- up = line.upper()
72
- if "TRANSGENDER" in up:
73
- gender = "TRANSGENDER"; break
74
- if "FEMALE" in up:
75
- gender = "FEMALE"; break
76
- if "MALE" in up:
77
- gender = "MALE"; break
 
 
78
  response["gender"] = gender
79
 
 
80
  name = "Not found"
81
  for i, line in enumerate(lines):
82
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
83
- candidate = lines[i - 1].strip()
84
  if (
85
- not re.search(r'\d', candidate)
86
- and len(candidate.split()) >= 2
87
- and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
88
  ):
89
- name = candidate
90
  break
91
  if name == "Not found":
92
  for line in lines:
93
- candidate = line.strip()
94
  if (
95
- not re.search(r'\d', candidate)
96
- and len(candidate.split()) >= 2
97
- and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
98
  ):
99
- name = candidate
100
  break
101
  response["name"] = name
102
 
 
 
 
103
  return response
104
 
105
  except Exception as e:
106
  return {"error": f"OCR processing failed: {str(e)}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize OCR
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
+ def extract_kyc_fields(file_path, force_type=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  try:
9
+ # OCR text extraction
10
  result = ocr.ocr(file_path, cls=True)
11
 
12
+ # Clean up lines
13
  lines = []
14
  for block in result:
15
  for line in block:
 
19
 
20
  full_text = "\n".join(lines)
21
 
22
+ # Detect card type (if not forced)
23
+ if force_type:
24
+ card_type = force_type.upper()
 
 
 
 
25
  else:
26
+ pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
27
+ aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
28
+ card_type = "UNKNOWN"
29
+ if pan_match:
30
+ card_type = "PAN"
31
+ elif aadhaar_match:
32
+ card_type = "AADHAAR"
33
 
34
  response = {"card_type": card_type}
35
 
36
+ # ===================== PAN CARD =====================
37
  if card_type == "PAN":
38
+ pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
39
+ if pan_match:
40
+ response["pan_number"] = pan_match.group(0)
41
 
42
+ # DOB extraction
43
+ dob = extract_dob(lines)
44
+ response["dob"] = dob
45
+
46
+ # Name detection
47
  name = "Not found"
48
  for i in range(len(lines)):
49
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
 
51
  possible = lines[j].strip()
52
  if (
53
  re.match(r'^[A-Z\s.]+$', possible)
54
+ and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
55
  and not re.search(r'\d', possible)
 
56
  ):
57
+ name = possible.strip()
58
  break
59
  break
60
  response["name"] = name
61
 
62
+ # ===================== AADHAAR CARD =====================
63
+ elif card_type == "AADHAAR":
64
+ aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
65
+ if aadhaar_match:
66
+ response["aadhaar_number"] = aadhaar_match.group(0)
67
+
68
+ # DOB extraction
69
+ dob = extract_dob(lines)
70
+ response["dob"] = dob
71
 
72
+ # Gender detection
73
  gender = "Not found"
74
  for line in lines:
75
+ if "MALE" in line.upper():
76
+ gender = "MALE"
77
+ break
78
+ elif "FEMALE" in line.upper():
79
+ gender = "FEMALE"
80
+ break
81
+ elif "TRANSGENDER" in line.upper():
82
+ gender = "TRANSGENDER"
83
+ break
84
  response["gender"] = gender
85
 
86
+ # Name detection: before DOB
87
  name = "Not found"
88
  for i, line in enumerate(lines):
89
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
90
+ possible_name = lines[i - 1].strip()
91
  if (
92
+ not re.search(r'\d', possible_name)
93
+ and len(possible_name.split()) >= 2
94
+ and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
95
  ):
96
+ name = possible_name
97
  break
98
  if name == "Not found":
99
  for line in lines:
 
100
  if (
101
+ not re.search(r'\d', line)
102
+ and len(line.split()) >= 2
103
+ and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
104
  ):
105
+ name = line.strip()
106
  break
107
  response["name"] = name
108
 
109
+ else:
110
+ response["error"] = "Could not identify document as PAN or Aadhaar."
111
+
112
  return response
113
 
114
  except Exception as e:
115
  return {"error": f"OCR processing failed: {str(e)}"}
116
+
117
+
118
def extract_dob(lines):
    """Extract a date of birth from OCR lines, trying several formats.

    Formats are tried in priority order, each over *all* lines before the
    next format is considered:

    1. ``DD/MM/YYYY`` with ``.``, ``/`` or ``-`` as separators.
    2. ISO style ``YYYY-MM-DD``.
    3. A bare four-digit year (19xx/20xx), accepted only when the same line
       also carries a "YOB", "YEAR" or "BIRTH" label.

    Args:
        lines: Iterable of OCR text lines. ``None``, an empty iterable and
            non-string entries are tolerated and simply ignored.

    Returns:
        The matched date/year text, or the string ``"Not found"``.
    """
    if not lines:
        return "Not found"

    # Materialise once: the input is scanned several times (a one-shot
    # iterable would be exhausted after the first pass), and non-string
    # entries must not crash re.search.
    text_lines = [line for line in lines if isinstance(line, str)]

    # Full dates win over a year-only hit, whichever line they appear on.
    for pattern in (r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', r'\b\d{4}-\d{2}-\d{2}\b'):
        for line in text_lines:
            match = re.search(pattern, line)
            if match:
                return match.group(0)

    # Year-only fallback: require a birth-related label on the same line so
    # unrelated years (issue dates, reference numbers) are not picked up.
    for line in text_lines:
        match = re.search(r'\b(19|20)\d{2}\b', line)
        if match and any(label in line.upper() for label in ("YOB", "YEAR", "BIRTH")):
            return match.group(0)
    return "Not found"