gopichandra committed on
Commit
a87236e
·
verified ·
1 Parent(s): a9b1adf

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +24 -6
utils.py CHANGED
@@ -4,19 +4,37 @@ import re
4
 
5
  def extract_kyc_fields(file_path):
6
  try:
 
7
  image = Image.open(file_path).convert("RGB")
 
 
8
  text = pytesseract.image_to_string(image)
 
 
 
 
 
 
9
 
10
- aadhaar = re.search(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', text)
11
- dob = re.search(r'\d{2}[\/\-]\d{2}[\/\-]\d{4}', text)
12
- name_line = next((line for line in text.split("\n") if re.search(r'(?i)name', line)), "")
 
 
 
 
 
 
 
13
  name = name_line.split(":")[-1].strip() if ":" in name_line else name_line.strip()
14
 
 
15
  return {
16
- "aadhaar_number": aadhaar.group(0) if aadhaar else "Not found",
17
- "dob": dob.group(0) if dob else "Not found",
18
  "name": name if name else "Not found"
19
  }
20
 
21
  except Exception as e:
22
- return {"error": str(e)}
 
 
4
 
5
  def extract_kyc_fields(file_path):
6
  try:
7
+ # Open and convert image to RGB for OCR
8
  image = Image.open(file_path).convert("RGB")
9
+
10
+ # Run Tesseract OCR
11
  text = pytesseract.image_to_string(image)
12
+
13
+ # Aadhaar pattern: 12-digit, with or without space or dash
14
+ aadhaar_match = re.search(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', text)
15
+
16
+ # DOB pattern: formats like DD-MM-YYYY or DD/MM/YYYY
17
+ dob_match = re.search(r'\b\d{2}[/-]\d{2}[/-]\d{4}\b', text)
18
 
19
+ # Try to extract name line heuristically (line with "Name", "Naam", etc.)
20
+ name_line = next(
21
+ (
22
+ line for line in text.split("\n")
23
+ if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE)
24
+ ),
25
+ ""
26
+ )
27
+
28
+ # Extract name text
29
  name = name_line.split(":")[-1].strip() if ":" in name_line else name_line.strip()
30
 
31
+ # Return structured KYC data
32
  return {
33
+ "aadhaar_number": aadhaar_match.group(0) if aadhaar_match else "Not found",
34
+ "dob": dob_match.group(0) if dob_match else "Not found",
35
  "name": name if name else "Not found"
36
  }
37
 
38
  except Exception as e:
39
+ # Return error as dict to show in Gradio
40
+ return {"error": f"OCR failed: {str(e)}"}