gopichandra committed on
Commit
8324e53
·
verified ·
1 Parent(s): 443f4db

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +49 -20
utils.py CHANGED
@@ -1,39 +1,68 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR model
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
9
  result = ocr.ocr(file_path, cls=True)
10
 
11
- # Combine text lines
12
- all_text = ""
13
- for line_group in result:
14
- for line in line_group:
15
- all_text += line[1][0] + "\n"
 
16
 
17
- # Aadhaar pattern
18
- aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
19
 
20
- # DOB pattern
21
- dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
22
 
23
- # Name logic
24
- name = "Not found"
25
- for line in all_text.split("\n"):
26
- if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
27
- name = line.split(":")[-1].strip() if ":" in line else line.strip()
 
 
 
 
 
 
 
 
 
 
 
28
  break
29
- if name == "Not found" and all_text.strip():
30
- name = all_text.split("\n")[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  return {
33
- "aadhaar_number": aadhaar_match.group(0) if aadhaar_match else "Not found",
34
- "dob": dob_match.group(0) if dob_match else "Not found",
 
35
  "name": name
36
  }
37
 
38
  except Exception as e:
39
- return {"error": f"PaddleOCR failed: {str(e)}"}
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize the OCR engine with the English model only (lang='en')
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
9
  result = ocr.ocr(file_path, cls=True)
10
 
11
+ lines = []
12
+ for block in result:
13
+ for line in block:
14
+ text = line[1][0].strip()
15
+ if text:
16
+ lines.append(text)
17
 
18
+ # Combine all lines into one big string
19
+ full_text = "\n".join(lines)
20
 
21
+ # Aadhaar Number – strictly 12 digits (grouped or not)
22
+ aadhaar = next((line for line in lines if re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', line)), "Not found")
23
 
24
+ # DOB – with or without label
25
+ dob = "Not found"
26
+ for line in lines:
27
+ match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
28
+ if match:
29
+ dob = match.group(0)
30
+ break
31
+
32
+ # Gender – look for common gender keywords
33
+ gender = "Not found"
34
+ for line in lines:
35
+ if "MALE" in line.upper():
36
+ gender = "MALE"
37
+ break
38
+ elif "FEMALE" in line.upper():
39
+ gender = "FEMALE"
40
  break
41
+ elif "TRANSGENDER" in line.upper():
42
+ gender = "TRANSGENDER"
43
+ break
44
+
45
+ # Name – find most probable name line (usually near DOB)
46
+ name = "Not found"
47
+ for i, line in enumerate(lines):
48
+ # Assume name is just above DOB or gender
49
+ if "DOB" in line.upper() or "MALE" in line.upper() or "FEMALE" in line.upper():
50
+ if i > 0:
51
+ possible_name = lines[i - 1]
52
+ # Filter to avoid accidental text
53
+ if (
54
+ not any(x in possible_name.upper() for x in ["GOVERNMENT", "DOB", "MALE", "FEMALE", "YEAR"])
55
+ and not re.search(r'\d', possible_name)
56
+ ):
57
+ name = possible_name.strip()
58
+ break
59
 
60
  return {
61
+ "aadhaar_number": aadhaar,
62
+ "dob": dob,
63
+ "gender": gender,
64
  "name": name
65
  }
66
 
67
  except Exception as e:
68
+ return {"error": f"OCR processing failed: {str(e)}"}