gopichandra committed on
Commit
65bef46
·
verified ·
1 Parent(s): 254fdf9

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +50 -68
utils.py CHANGED
@@ -6,10 +6,8 @@ ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path, force_type=None):
8
  try:
9
- # OCR text extraction
10
  result = ocr.ocr(file_path, cls=True)
11
 
12
- # Clean up lines
13
  lines = []
14
  for block in result:
15
  for line in block:
@@ -19,7 +17,6 @@ def extract_kyc_fields(file_path, force_type=None):
19
 
20
  full_text = "\n".join(lines)
21
 
22
- # Detect card type (if not forced)
23
  if force_type:
24
  card_type = force_type.upper()
25
  else:
@@ -33,90 +30,30 @@ def extract_kyc_fields(file_path, force_type=None):
33
 
34
  response = {"card_type": card_type}
35
 
36
- # ===================== PAN CARD =====================
37
  if card_type == "PAN":
38
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
39
  if pan_match:
40
  response["pan_number"] = pan_match.group(0)
 
 
41
 
42
- # DOB extraction
43
- dob = extract_dob(lines)
44
- response["dob"] = dob
45
-
46
- # Name detection
47
- name = "Not found"
48
- for i in range(len(lines)):
49
- if "INCOME TAX DEPARTMENT" in lines[i].upper():
50
- for j in range(i + 1, len(lines)):
51
- possible = lines[j].strip()
52
- if (
53
- re.match(r'^[A-Z\s.]+$', possible)
54
- and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
55
- and not re.search(r'\d', possible)
56
- ):
57
- name = possible.strip()
58
- break
59
- break
60
- response["name"] = name
61
-
62
- # ===================== AADHAAR CARD =====================
63
  elif card_type == "AADHAAR":
64
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
65
  if aadhaar_match:
66
  response["aadhaar_number"] = aadhaar_match.group(0)
67
-
68
- # DOB extraction
69
- dob = extract_dob(lines)
70
- response["dob"] = dob
71
-
72
- # Gender detection
73
- gender = "Not found"
74
- for line in lines:
75
- if "MALE" in line.upper():
76
- gender = "MALE"
77
- break
78
- elif "FEMALE" in line.upper():
79
- gender = "FEMALE"
80
- break
81
- elif "TRANSGENDER" in line.upper():
82
- gender = "TRANSGENDER"
83
- break
84
- response["gender"] = gender
85
-
86
- # Name detection: before DOB
87
- name = "Not found"
88
- for i, line in enumerate(lines):
89
- if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
90
- possible_name = lines[i - 1].strip()
91
- if (
92
- not re.search(r'\d', possible_name)
93
- and len(possible_name.split()) >= 2
94
- and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
95
- ):
96
- name = possible_name
97
- break
98
- if name == "Not found":
99
- for line in lines:
100
- if (
101
- not re.search(r'\d', line)
102
- and len(line.split()) >= 2
103
- and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
104
- ):
105
- name = line.strip()
106
- break
107
- response["name"] = name
108
 
109
  else:
110
  response["error"] = "Could not identify document as PAN or Aadhaar."
111
 
112
  return response
113
-
114
  except Exception as e:
115
  return {"error": f"OCR processing failed: {str(e)}"}
116
 
117
 
118
  def extract_dob(lines):
119
- """Extract DOB from OCR lines in multiple formats."""
120
  dob = "Not found"
121
  for line in lines:
122
  match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
@@ -131,3 +68,48 @@ def extract_dob(lines):
131
  if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
132
  return match.group(0)
133
  return dob
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def extract_kyc_fields(file_path, force_type=None):
8
  try:
 
9
  result = ocr.ocr(file_path, cls=True)
10
 
 
11
  lines = []
12
  for block in result:
13
  for line in block:
 
17
 
18
  full_text = "\n".join(lines)
19
 
 
20
  if force_type:
21
  card_type = force_type.upper()
22
  else:
 
30
 
31
  response = {"card_type": card_type}
32
 
 
33
  if card_type == "PAN":
34
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
35
  if pan_match:
36
  response["pan_number"] = pan_match.group(0)
37
+ response["dob"] = extract_dob(lines)
38
+ response["name"] = extract_pan_name(lines)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  elif card_type == "AADHAAR":
41
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
42
  if aadhaar_match:
43
  response["aadhaar_number"] = aadhaar_match.group(0)
44
+ response["dob"] = extract_dob(lines)
45
+ response["gender"] = extract_gender(lines)
46
+ response["name"] = extract_aadhaar_name(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  else:
49
  response["error"] = "Could not identify document as PAN or Aadhaar."
50
 
51
  return response
 
52
  except Exception as e:
53
  return {"error": f"OCR processing failed: {str(e)}"}
54
 
55
 
56
  def extract_dob(lines):
 
57
  dob = "Not found"
58
  for line in lines:
59
  match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
 
68
  if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
69
  return match.group(0)
70
  return dob
71
+
72
+
73
def extract_gender(lines):
    """Return the first gender label found in the OCR lines.

    Each line is upper-cased and searched for a gender keyword as a
    substring; the first line containing one decides the result.

    Args:
        lines: iterable of OCR text lines (strings).

    Returns:
        "FEMALE", "MALE" or "TRANSGENDER", or "Not found" when no line
        contains any of those keywords.
    """
    # Order matters: "FEMALE" contains the substring "MALE", so it must be
    # tested first or every female card would be reported as "MALE".
    labels = ("FEMALE", "MALE", "TRANSGENDER")
    for line in lines:
        text = line.upper()
        for label in labels:
            if label in text:
                return label
    return "Not found"
82
+
83
+
84
def extract_pan_name(lines):
    """Pick the holder name from the OCR lines of a PAN card.

    The name is taken to be the first line after the
    "INCOME TAX DEPARTMENT" header (matched case-insensitively) that,
    once stripped, consists only of capital letters, whitespace and dots
    and does not contain the boilerplate words INDIA, GOVT or DEPARTMENT.

    Args:
        lines: list of OCR text lines (strings).

    Returns:
        The stripped candidate line, or "Not found" when there is no
        header or no line after it qualifies.
    """
    boilerplate = ("INDIA", "GOVT", "DEPARTMENT")

    # Only the first header matters: anything a later header could see is
    # already part of the lines scanned after the first one.
    header_at = next(
        (
            idx
            for idx, text in enumerate(lines)
            if "INCOME TAX DEPARTMENT" in text.upper()
        ),
        None,
    )
    if header_at is None:
        return "Not found"

    for raw in lines[header_at + 1:]:
        candidate = raw.strip()
        if not re.match(r'^[A-Z\s.]+$', candidate):
            continue
        if any(word in candidate for word in boilerplate):
            continue
        if re.search(r'\d', candidate):
            continue
        return candidate
    return "Not found"
96
+
97
+
98
def extract_aadhaar_name(lines):
    """Pick the holder name from the OCR lines of an Aadhaar card.

    Two passes are made:
      1. For every line (other than the first) that contains a
         dd/mm/yyyy-style date, the line just above it is tried.
      2. If that yields nothing, every line is tried in order.

    A line qualifies as a name when it has no digits, at least two
    whitespace-separated words, and none of the stop words DOB, INDIA,
    MALE, FEMALE or GOVERNMENT (compared upper-cased, as substrings).

    Args:
        lines: list of OCR text lines (strings).

    Returns:
        The stripped qualifying line, or "Not found".
    """
    stop_words = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")

    def looks_like_name(text):
        # NOTE(review): stop words are matched as substrings, so a name that
        # happens to contain one (e.g. "...MALE..." inside a word) is rejected.
        if re.search(r'\d', text):
            return False
        if len(text.split()) < 2:
            return False
        shouted = text.upper()
        return not any(word in shouted for word in stop_words)

    # Pass 1: the line directly above a date line.
    for idx, text in enumerate(lines):
        if not re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', text) or idx == 0:
            continue
        above = lines[idx - 1].strip()
        if looks_like_name(above):
            return above

    # Pass 2: first line anywhere that looks like a name.
    for text in lines:
        if looks_like_name(text):
            return text.strip()
    return "Not found"