gopichandra committed on
Commit
5ebcb93
·
verified ·
1 Parent(s): e2cb16a

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +16 -14
utils.py CHANGED
@@ -1,13 +1,15 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Enable multilingual support (English + Hindi + Tamil)
5
- ocr = PaddleOCR(use_angle_cls=True, lang='en|hi|ta')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
 
9
  result = ocr.ocr(file_path, cls=True)
10
 
 
11
  lines = []
12
  for block in result:
13
  for line in block:
@@ -15,9 +17,10 @@ def extract_kyc_fields(file_path):
15
  if text:
16
  lines.append(text)
17
 
 
18
  full_text = "\n".join(lines)
19
 
20
- # PAN pattern: 5 letters + 4 digits + 1 letter
21
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
22
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
23
 
@@ -29,11 +32,11 @@ def extract_kyc_fields(file_path):
29
 
30
  response = {"card_type": card_type}
31
 
32
- # --------- PAN CARD LOGIC ---------
33
  if card_type == "PAN":
34
  response["pan_number"] = pan_match.group(0)
35
 
36
- # Extract DOB
37
  dob = "Not found"
38
  for line in lines:
39
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -42,7 +45,7 @@ def extract_kyc_fields(file_path):
42
  break
43
  response["dob"] = dob
44
 
45
- # Extract Name
46
  name = "Not found"
47
  for i in range(len(lines)):
48
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -58,11 +61,11 @@ def extract_kyc_fields(file_path):
58
  break
59
  response["name"] = name
60
 
61
- # --------- AADHAAR CARD LOGIC ---------
62
  elif card_type == "AADHAAR":
63
  response["aadhaar_number"] = aadhaar_match.group(0)
64
 
65
- # Extract DOB
66
  dob = "Not found"
67
  for line in lines:
68
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -71,7 +74,7 @@ def extract_kyc_fields(file_path):
71
  break
72
  response["dob"] = dob
73
 
74
- # Extract Gender
75
  gender = "Not found"
76
  for line in lines:
77
  if "MALE" in line.upper():
@@ -85,9 +88,8 @@ def extract_kyc_fields(file_path):
85
  break
86
  response["gender"] = gender
87
 
88
- # Robust name extraction
89
  name = "Not found"
90
- # First attempt: line before DOB
91
  for i, line in enumerate(lines):
92
  if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
93
  possible_name = lines[i - 1].strip()
@@ -99,13 +101,12 @@ def extract_kyc_fields(file_path):
99
  name = possible_name
100
  break
101
 
102
- # Fallback: best guess line with title-cased text and no digits
103
  if name == "Not found":
104
  for line in lines:
105
  if (
106
  not re.search(r'\d', line)
107
  and len(line.split()) >= 2
108
- and line[0].isupper()
109
  and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
110
  ):
111
  name = line.strip()
@@ -113,8 +114,9 @@ def extract_kyc_fields(file_path):
113
 
114
  response["name"] = name
115
 
 
116
  else:
117
- response["error"] = "Unable to determine document type (PAN/Aadhaar)."
118
 
119
  return response
120
 
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize OCR for English (safe default for Aadhaar and PAN)
5
+ ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
9
+ # Run OCR
10
  result = ocr.ocr(file_path, cls=True)
11
 
12
+ # Extract lines from result
13
  lines = []
14
  for block in result:
15
  for line in block:
 
17
  if text:
18
  lines.append(text)
19
 
20
+ # Combine for pattern searches
21
  full_text = "\n".join(lines)
22
 
23
+ # Detect document type
24
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
25
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
26
 
 
32
 
33
  response = {"card_type": card_type}
34
 
35
+ # ===================== PAN CARD LOGIC =====================
36
  if card_type == "PAN":
37
  response["pan_number"] = pan_match.group(0)
38
 
39
+ # DOB
40
  dob = "Not found"
41
  for line in lines:
42
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
 
45
  break
46
  response["dob"] = dob
47
 
48
+ # Name
49
  name = "Not found"
50
  for i in range(len(lines)):
51
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
 
61
  break
62
  response["name"] = name
63
 
64
+ # ===================== AADHAAR CARD LOGIC =====================
65
  elif card_type == "AADHAAR":
66
  response["aadhaar_number"] = aadhaar_match.group(0)
67
 
68
+ # DOB
69
  dob = "Not found"
70
  for line in lines:
71
  match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
 
74
  break
75
  response["dob"] = dob
76
 
77
+ # Gender
78
  gender = "Not found"
79
  for line in lines:
80
  if "MALE" in line.upper():
 
88
  break
89
  response["gender"] = gender
90
 
91
+ # Name try before DOB or other heuristics
92
  name = "Not found"
 
93
  for i, line in enumerate(lines):
94
  if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
95
  possible_name = lines[i - 1].strip()
 
101
  name = possible_name
102
  break
103
 
104
+ # Fallback if above fails
105
  if name == "Not found":
106
  for line in lines:
107
  if (
108
  not re.search(r'\d', line)
109
  and len(line.split()) >= 2
 
110
  and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
111
  ):
112
  name = line.strip()
 
114
 
115
  response["name"] = name
116
 
117
+ # ===================== UNKNOWN DOC =====================
118
  else:
119
+ response["error"] = "Could not detect document type (PAN or Aadhaar)."
120
 
121
  return response
122