gopichandra committed on
Commit
73f6d86
·
verified ·
1 Parent(s): 7538705

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +7 -8
utils.py CHANGED
@@ -1,33 +1,32 @@
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
- # Initialize OCR model only once
5
- ocr = PaddleOCR(use_angle_cls=True, lang='en') # lang='en' for English documents
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
9
- # Run OCR
10
  result = ocr.ocr(file_path, cls=True)
11
 
 
12
  all_text = ""
13
  for line_group in result:
14
  for line in line_group:
15
  all_text += line[1][0] + "\n"
16
 
17
- # Aadhaar number (format with or without space/dash)
18
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
19
 
20
- # DOB (any DD/MM/YYYY or similar)
21
  dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
22
 
23
- # Name: try to detect a line with 'Name' or fallback to top line
24
  name = "Not found"
25
  for line in all_text.split("\n"):
26
  if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
27
  name = line.split(":")[-1].strip() if ":" in line else line.strip()
28
  break
29
-
30
- if name == "Not found":
31
  name = all_text.split("\n")[0].strip()
32
 
33
  return {
 
1
  from paddleocr import PaddleOCR
2
  import re
3
 
4
+ # Initialize OCR model
5
+ ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
7
  def extract_kyc_fields(file_path):
8
  try:
 
9
  result = ocr.ocr(file_path, cls=True)
10
 
11
+ # Combine text lines
12
  all_text = ""
13
  for line_group in result:
14
  for line in line_group:
15
  all_text += line[1][0] + "\n"
16
 
17
+ # Aadhaar pattern
18
  aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
19
 
20
+ # DOB pattern
21
  dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
22
 
23
+ # Name logic
24
  name = "Not found"
25
  for line in all_text.split("\n"):
26
  if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
27
  name = line.split(":")[-1].strip() if ":" in line else line.strip()
28
  break
29
+ if name == "Not found" and all_text.strip():
 
30
  name = all_text.split("\n")[0].strip()
31
 
32
  return {