gopichandra committed on
Commit
a726fb2
·
verified ·
1 Parent(s): 8324e53

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +81 -45
utils.py CHANGED
@@ -1,7 +1,6 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR with English and Tamil (or just 'en' if you want)
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path):
@@ -15,54 +14,91 @@ def extract_kyc_fields(file_path):
15
  if text:
16
  lines.append(text)
17
 
18
- # Combine all lines into one big string
19
  full_text = "\n".join(lines)
20
 
21
- # Aadhaar Number strictly 12 digits (grouped or not)
22
- aadhaar = next((line for line in lines if re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', line)), "Not found")
23
-
24
- # DOB with or without label
25
- dob = "Not found"
26
- for line in lines:
27
- match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
28
- if match:
29
- dob = match.group(0)
30
- break
31
-
32
- # Gender – look for common gender keywords
33
- gender = "Not found"
34
- for line in lines:
35
- if "MALE" in line.upper():
36
- gender = "MALE"
37
- break
38
- elif "FEMALE" in line.upper():
39
- gender = "FEMALE"
40
- break
41
- elif "TRANSGENDER" in line.upper():
42
- gender = "TRANSGENDER"
43
- break
44
-
45
- # Name – find most probable name line (usually near DOB)
46
- name = "Not found"
47
- for i, line in enumerate(lines):
48
- # Assume name is just above DOB or gender
49
- if "DOB" in line.upper() or "MALE" in line.upper() or "FEMALE" in line.upper():
50
- if i > 0:
51
- possible_name = lines[i - 1]
52
- # Filter to avoid accidental text
53
- if (
54
- not any(x in possible_name.upper() for x in ["GOVERNMENT", "DOB", "MALE", "FEMALE", "YEAR"])
55
- and not re.search(r'\d', possible_name)
56
- ):
57
- name = possible_name.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- return {
61
- "aadhaar_number": aadhaar,
62
- "dob": dob,
63
- "gender": gender,
64
- "name": name
65
- }
66
 
67
  except Exception as e:
68
  return {"error": f"OCR processing failed: {str(e)}"}
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
 
4
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
5
 
6
  def extract_kyc_fields(file_path):
 
14
  if text:
15
  lines.append(text)
16
 
 
17
  full_text = "\n".join(lines)
18
 
19
+ # PAN pattern: 5 letters + 4 digits + 1 letter
20
+ pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
21
+
22
+ # Aadhaar pattern: 12 digits
23
+ aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
24
+
25
+ # Check which type
26
+ if pan_match:
27
+ card_type = "PAN"
28
+ elif aadhaar_match:
29
+ card_type = "AADHAAR"
30
+ else:
31
+ card_type = "UNKNOWN"
32
+
33
+ response = {"card_type": card_type}
34
+
35
+ if card_type == "AADHAAR":
36
+ # Aadhaar
37
+ response["aadhaar_number"] = aadhaar_match.group(0)
38
+
39
+ # DOB
40
+ dob = "Not found"
41
+ for line in lines:
42
+ match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
43
+ if match:
44
+ dob = match.group(0)
45
+ break
46
+ response["dob"] = dob
47
+
48
+ # Gender
49
+ gender = "Not found"
50
+ for line in lines:
51
+ if "MALE" in line.upper():
52
+ gender = "MALE"
53
+ break
54
+ elif "FEMALE" in line.upper():
55
+ gender = "FEMALE"
56
+ break
57
+ response["gender"] = gender
58
+
59
+ # Name
60
+ name = "Not found"
61
+ for i, line in enumerate(lines):
62
+ if "DOB" in line.upper():
63
+ if i > 0:
64
+ possible_name = lines[i - 1]
65
+ if (
66
+ not any(x in possible_name.upper() for x in ["GOVERNMENT", "MALE", "FEMALE"])
67
+ and not re.search(r'\d', possible_name)
68
+ ):
69
+ name = possible_name.strip()
70
+ break
71
+ response["name"] = name
72
+
73
+ elif card_type == "PAN":
74
+ # PAN
75
+ response["pan_number"] = pan_match.group(0)
76
+
77
+ # DOB
78
+ dob = "Not found"
79
+ for line in lines:
80
+ if "DATE OF BIRTH" in line.upper():
81
+ match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
82
+ if match:
83
+ dob = match.group(0)
84
  break
85
+ response["dob"] = dob
86
+
87
+ # Name: first line after heading usually
88
+ name = "Not found"
89
+ for i, line in enumerate(lines):
90
+ if "INCOME TAX DEPARTMENT" in line.upper():
91
+ if i + 1 < len(lines):
92
+ possible_name = lines[i + 1]
93
+ if not re.search(r'\d', possible_name):
94
+ name = possible_name.strip()
95
+ break
96
+ response["name"] = name
97
+
98
+ else:
99
+ response["error"] = "Unable to determine document type."
100
 
101
+ return response
 
 
 
 
 
102
 
103
  except Exception as e:
104
  return {"error": f"OCR processing failed: {str(e)}"}