gopichandra committed on
Commit
f7a759e
·
verified ·
1 Parent(s): 0edec5e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +8 -43
utils.py CHANGED
@@ -1,43 +1,26 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR once (English). Download happens first time it's used.
5
- # If you want to support other langs, set lang='en' -> 'en'|'hi'|'mr'... etc, or 'en'+'multilang models'.
6
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
7
 
8
  def _extract_dob(lines):
9
- """
10
- Try multiple formats:
11
- - dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy
12
- - yyyy-mm-dd
13
- - Year of Birth lines (YOB / YEAR / BIRTH)
14
- """
15
- # dd{sep}mm{sep}yyyy
16
  for line in lines:
17
  m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
18
- if m:
19
- return m.group(0)
20
-
21
- # yyyy-mm-dd
22
  for line in lines:
23
  m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
24
- if m:
25
- return m.group(0)
26
-
27
- # Year only if labeled as YOB/Year/Birth
28
  for line in lines:
29
  m = re.search(r'\b(19|20)\d{2}\b', line)
30
  if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
31
  return m.group(0)
32
-
33
  return "Not found"
34
 
35
  def extract_kyc_fields(file_path: str) -> dict:
36
  try:
37
- # OCR text extraction
38
  result = ocr.ocr(file_path, cls=True)
39
 
40
- # Flatten to text lines
41
  lines = []
42
  for block in result:
43
  for line in block:
@@ -47,7 +30,6 @@ def extract_kyc_fields(file_path: str) -> dict:
47
 
48
  full_text = "\n".join(lines)
49
 
50
- # Detect card type by patterns
51
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
52
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
53
 
@@ -56,21 +38,14 @@ def extract_kyc_fields(file_path: str) -> dict:
56
  elif aadhaar_match:
57
  card_type = "AADHAAR"
58
  else:
59
- return {
60
- "card_type": "UNKNOWN",
61
- "error": "Could not identify document as PAN or Aadhaar."
62
- }
63
 
64
  response = {"card_type": card_type}
65
 
66
- # ===================== PAN CARD =====================
67
  if card_type == "PAN":
68
  response["pan_number"] = pan_match.group(0)
69
-
70
- # DOB
71
  response["dob"] = _extract_dob(lines)
72
 
73
- # Name (heuristic: next lines after "INCOME TAX DEPARTMENT")
74
  name = "Not found"
75
  for i in range(len(lines)):
76
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -87,31 +62,22 @@ def extract_kyc_fields(file_path: str) -> dict:
87
  break
88
  response["name"] = name
89
 
90
- # ===================== AADHAAR CARD =====================
91
- else:
92
  response["aadhaar_number"] = aadhaar_match.group(0)
93
-
94
- # DOB / YOB
95
  response["dob"] = _extract_dob(lines)
96
 
97
- # Gender
98
  gender = "Not found"
99
  for line in lines:
100
  up = line.upper()
101
  if "TRANSGENDER" in up:
102
- gender = "TRANSGENDER"
103
- break
104
  if "FEMALE" in up:
105
- gender = "FEMALE"
106
- break
107
  if "MALE" in up:
108
- gender = "MALE"
109
- break
110
  response["gender"] = gender
111
 
112
- # Name: usually line before DOB or first reasonable line without digits
113
  name = "Not found"
114
- # try line before a date line
115
  for i, line in enumerate(lines):
116
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
117
  candidate = lines[i - 1].strip()
@@ -122,7 +88,6 @@ def extract_kyc_fields(file_path: str) -> dict:
122
  ):
123
  name = candidate
124
  break
125
- # fallback
126
  if name == "Not found":
127
  for line in lines:
128
  candidate = line.strip()
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize OCR for English
 
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def _extract_dob(lines):
 
 
 
 
 
 
 
8
  for line in lines:
9
  m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
10
+ if m: return m.group(0)
 
 
 
11
  for line in lines:
12
  m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
13
+ if m: return m.group(0)
 
 
 
14
  for line in lines:
15
  m = re.search(r'\b(19|20)\d{2}\b', line)
16
  if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
17
  return m.group(0)
 
18
  return "Not found"
19
 
20
  def extract_kyc_fields(file_path: str) -> dict:
21
  try:
 
22
  result = ocr.ocr(file_path, cls=True)
23
 
 
24
  lines = []
25
  for block in result:
26
  for line in block:
 
30
 
31
  full_text = "\n".join(lines)
32
 
 
33
  pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
34
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
35
 
 
38
  elif aadhaar_match:
39
  card_type = "AADHAAR"
40
  else:
41
+ return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}
 
 
 
42
 
43
  response = {"card_type": card_type}
44
 
 
45
  if card_type == "PAN":
46
  response["pan_number"] = pan_match.group(0)
 
 
47
  response["dob"] = _extract_dob(lines)
48
 
 
49
  name = "Not found"
50
  for i in range(len(lines)):
51
  if "INCOME TAX DEPARTMENT" in lines[i].upper():
 
62
  break
63
  response["name"] = name
64
 
65
+ else: # AADHAAR
 
66
  response["aadhaar_number"] = aadhaar_match.group(0)
 
 
67
  response["dob"] = _extract_dob(lines)
68
 
 
69
  gender = "Not found"
70
  for line in lines:
71
  up = line.upper()
72
  if "TRANSGENDER" in up:
73
+ gender = "TRANSGENDER"; break
 
74
  if "FEMALE" in up:
75
+ gender = "FEMALE"; break
 
76
  if "MALE" in up:
77
+ gender = "MALE"; break
 
78
  response["gender"] = gender
79
 
 
80
  name = "Not found"
 
81
  for i, line in enumerate(lines):
82
  if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
83
  candidate = lines[i - 1].strip()
 
88
  ):
89
  name = candidate
90
  break
 
91
  if name == "Not found":
92
  for line in lines:
93
  candidate = line.strip()