gopichandra committed on
Commit
ae2e698
·
verified ·
1 Parent(s): a87236e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +30 -30
utils.py CHANGED
@@ -1,40 +1,40 @@
1
- import pytesseract
2
- from PIL import Image
3
  import re
4
 
 
 
 
def extract_kyc_fields(file_path):
    """Extract Aadhaar number, date of birth and name from an image via Tesseract.

    Args:
        file_path: Location of the image; passed straight to ``Image.open``.

    Returns:
        A dict with the keys ``"aadhaar_number"``, ``"dob"`` and ``"name"``.
        Each value is the text that was found, or ``"Not found"``. If anything
        raises along the way, the dict is ``{"error": "OCR failed: ..."}``.
    """
    try:
        # Tesseract is fed an RGB copy of the image.
        rgb_image = Image.open(file_path).convert("RGB")
        ocr_text = pytesseract.image_to_string(rgb_image)

        # 12 digits in groups of four, optionally separated by whitespace or a dash.
        aadhaar_match = re.search(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', ocr_text)

        # DD-MM-YYYY or DD/MM/YYYY.
        dob_match = re.search(r'\b\d{2}[/-]\d{2}[/-]\d{4}\b', ocr_text)

        # The first line carrying a "name"-like label is taken as the name line.
        labelled_line = ""
        for candidate in ocr_text.split("\n"):
            if re.search(r'\b(name|naam|namf)\b', candidate, re.IGNORECASE):
                labelled_line = candidate
                break

        # With a colon present, keep only what follows the last one.
        if ":" in labelled_line:
            name = labelled_line.split(":")[-1].strip()
        else:
            name = labelled_line.strip()

        if aadhaar_match:
            aadhaar_value = aadhaar_match.group(0)
        else:
            aadhaar_value = "Not found"

        if dob_match:
            dob_value = dob_match.group(0)
        else:
            dob_value = "Not found"

        if not name:
            name = "Not found"

        return {
            "aadhaar_number": aadhaar_value,
            "dob": dob_value,
            "name": name,
        }

    except Exception as e:
        # Failures are handed back as a dict so the caller can display them.
        return {"error": f"OCR failed: {str(e)}"}
 
1
+ from paddleocr import PaddleOCR
 
2
  import re
3
 
4
# Create the PaddleOCR engine once at import time and reuse it for every call.
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # lang='en' for English documents

# Label words that mark the line holding the card holder's name.
_NAME_LABEL_RE = re.compile(r'\b(name|naam|namf)\b', re.IGNORECASE)


def _ocr_text_lines(result):
    """Flatten an ``ocr.ocr`` result into a flat list of recognised text strings."""
    texts = []
    # NOTE(review): PaddleOCR is understood to return one entry per page, and
    # some releases use None for a page with no detected text -- guard both
    # levels so an empty scan yields "Not found" fields instead of a TypeError.
    for page in result or []:
        for detection in page or []:
            # The recognised string sits at detection[1][0].
            texts.append(detection[1][0])
    return texts


def _guess_name(lines):
    """Pick the name from a labelled line, else the first non-blank line."""
    for line in lines:
        if _NAME_LABEL_RE.search(line):
            # Keep what follows the last colon, or the whole line without one.
            value = line.split(":")[-1].strip() if ":" in line else line.strip()
            # A bare label such as "Name:" carries no usable value.
            return value or "Not found"

    # No label anywhere: fall back to the top-most non-blank line of text.
    for line in lines:
        stripped = line.strip()
        if stripped:
            return stripped
    return "Not found"


def extract_kyc_fields(file_path):
    """Extract Aadhaar number, date of birth and name from a document image.

    Args:
        file_path: Image location, passed unchanged to ``ocr.ocr``.

    Returns:
        A dict with the keys ``"aadhaar_number"``, ``"dob"`` and ``"name"``.
        Each value is the text that was found, or ``"Not found"``. If OCR or
        parsing raises, the dict is ``{"error": "PaddleOCR failed: ..."}``.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        all_text = "\n".join(_ocr_text_lines(result))

        # 12 digits in groups of four, optionally separated by whitespace or a dash.
        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)

        # DD/MM/YYYY or DD-MM-YYYY.
        dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)

        return {
            "aadhaar_number": aadhaar_match.group(0) if aadhaar_match else "Not found",
            "dob": dob_match.group(0) if dob_match else "Not found",
            "name": _guess_name(all_text.split("\n")),
        }

    except Exception as e:
        # Deliberately broad: any failure is reported to the caller as a dict.
        return {"error": f"PaddleOCR failed: {e}"}