gopichandra committed on
Commit
a1903a9
·
verified ·
1 Parent(s): de2660a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -4,30 +4,39 @@ import pytesseract
4
  import re
5
 
6
  def extract_kyc_fields(image):
7
- text = pytesseract.image_to_string(image)
8
-
9
- result = {}
10
-
11
- # Aadhaar number pattern
12
- aadhaar_match = re.search(r'\d{4}[\s-]?\d{4}[\s-]?\d{4}', text)
13
- if aadhaar_match:
14
- aadhaar_number = aadhaar_match.group().replace(" ", "-")
15
- result["aadhaar_number"] = aadhaar_number
16
-
17
- # Name: first line (assuming top of image is name)
18
- lines = text.split('\n')
19
- for line in lines:
20
- clean_line = line.strip()
21
- if clean_line and clean_line.isalpha() and len(clean_line.split()) >= 2:
22
- result["name"] = clean_line
23
- break
24
-
25
- # DOB or Birth Date
26
- dob_match = re.search(r'(\d{2}[/-]\d{2}[/-]\d{4})|(\d{4}-\d{2}-\d{2})', text)
27
- if dob_match:
28
- result["dob"] = dob_match.group()
29
-
30
- return result if result else {"error": "Unable to extract KYC details"}
 
 
 
 
 
 
 
 
 
31
 
32
  # Gradio UI
33
  iface = gr.Interface(
@@ -35,7 +44,7 @@ iface = gr.Interface(
35
  inputs=gr.Image(type="pil"),
36
  outputs="json",
37
  title="🧠 Smart KYC OCR Extractor",
38
- description="Upload an Aadhaar/PAN image to extract structured KYC fields using OCR"
39
  )
40
 
41
  if __name__ == "__main__":
 
4
  import re
5
 
6
  def extract_kyc_fields(image):
7
+ try:
8
+ text = pytesseract.image_to_string(image)
9
+
10
+ result = {}
11
+
12
+ # Aadhaar number pattern
13
+ aadhaar_match = re.search(r'\d{4}[\s-]?\d{4}[\s-]?\d{4}', text)
14
+ if aadhaar_match:
15
+ aadhaar_number = aadhaar_match.group().replace(" ", "-")
16
+ result["aadhaar_number"] = aadhaar_number
17
+ else:
18
+ result["aadhaar_number"] = "Not found"
19
+
20
+ # Name (first all-alphabetic line with 2+ words)
21
+ lines = text.split('\n')
22
+ for line in lines:
23
+ clean_line = line.strip()
24
+ if clean_line and re.match(r'^[A-Za-z ]+$', clean_line) and len(clean_line.split()) >= 2:
25
+ result["name"] = clean_line
26
+ break
27
+ if "name" not in result:
28
+ result["name"] = "Not found"
29
+
30
+ # DOB or Birth Date
31
+ dob_match = re.search(r'(\d{2}[/-]\d{2}[/-]\d{4})|(\d{4}-\d{2}-\d{2})', text)
32
+ if dob_match:
33
+ result["dob"] = dob_match.group()
34
+ else:
35
+ result["dob"] = "Not found"
36
+
37
+ return result
38
+ except Exception as e:
39
+ return {"error": str(e)}
40
 
41
  # Gradio UI
42
  iface = gr.Interface(
 
44
  inputs=gr.Image(type="pil"),
45
  outputs="json",
46
  title="🧠 Smart KYC OCR Extractor",
47
+ description="Upload Aadhaar/PAN image and extract KYC fields using Tesseract"
48
  )
49
 
50
  if __name__ == "__main__":