Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,30 +4,39 @@ import pytesseract
|
|
4 |
import re
|
5 |
|
6 |
def extract_kyc_fields(image):
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
# Gradio UI
|
33 |
iface = gr.Interface(
|
@@ -35,7 +44,7 @@ iface = gr.Interface(
|
|
35 |
inputs=gr.Image(type="pil"),
|
36 |
outputs="json",
|
37 |
title="🧠 Smart KYC OCR Extractor",
|
38 |
-
description="Upload
|
39 |
)
|
40 |
|
41 |
if __name__ == "__main__":
|
|
|
4 |
import re
|
5 |
|
6 |
# Placeholder stored for any field that could not be located in the OCR text.
_NOT_FOUND = "Not found"

# Three groups of four digits, optionally separated by ONE space, tab or
# hyphen. Separators are restricted to same-line characters so a match can
# never straddle two OCR lines, and the digit look-arounds keep the pattern
# from matching inside a longer unbroken run of digits.
_AADHAAR_RE = re.compile(
    r'(?<!\d)(\d{4})[ \t-]?(\d{4})[ \t-]?(\d{4})(?!\d)'
)

# Either NN/NN/NNNN (with "/" or "-" separators) or NNNN-NN-NN. The values are
# not validated as real calendar dates; the look-arounds only stop the
# pattern from matching inside a longer digit run.
_DOB_RE = re.compile(
    r'(?<!\d)(?:\d{2}[/-]\d{2}[/-]\d{4}|\d{4}-\d{2}-\d{2})(?!\d)'
)

# A candidate name line consists solely of ASCII letters and spaces.
_NAME_LINE_RE = re.compile(r'[A-Za-z ]+')

# Purely alphabetic multi-word lines that are printed boilerplate rather than
# a holder's name. NOTE(review): best-effort list based on headers/labels
# commonly seen on Aadhaar and PAN cards -- extend it as real samples require.
_BOILERPLATE_LINES = frozenset({
    "government of india",
    "govt of india",
    "unique identification authority of india",
    "income tax department",
    "permanent account number",
    "permanent account number card",
    "date of birth",
    "year of birth",
})


def _find_aadhaar_number(text):
    """Return the first Aadhaar-style number in *text* as NNNN-NNNN-NNNN."""
    match = _AADHAAR_RE.search(text)
    if match is None:
        return _NOT_FOUND
    # Joining the captured groups yields the same hyphenated form whether the
    # OCR text used spaces, tabs, hyphens or no separator at all.
    return "-".join(match.groups())


def _find_name(text):
    """Return the first letters-only line of 2+ words that is not boilerplate."""
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate or not _NAME_LINE_RE.fullmatch(candidate):
            continue
        words = candidate.split()
        if len(words) < 2:
            continue
        # Compare case-insensitively with collapsed whitespace so OCR spacing
        # and capitalisation differences do not defeat the boilerplate filter.
        if " ".join(words).casefold() in _BOILERPLATE_LINES:
            continue
        return candidate
    return _NOT_FOUND


def _find_dob(text):
    """Return the first date-shaped token in *text*, exactly as it appears."""
    match = _DOB_RE.search(text)
    return match.group() if match else _NOT_FOUND


def extract_kyc_fields(image):
    """Run OCR on *image* and pull basic KYC fields out of the text.

    Args:
        image: Passed unchanged to ``pytesseract.image_to_string``. The
            Gradio interface in this file supplies it via
            ``gr.Image(type="pil")``.

    Returns:
        On success, a dict with the keys ``"aadhaar_number"``, ``"name"`` and
        ``"dob"``; a field that cannot be located holds ``"Not found"``. The
        Aadhaar number is always normalised to ``NNNN-NNNN-NNNN``. If any
        step raises, a dict of the form ``{"error": "<message>"}`` is
        returned instead.
    """
    try:
        text = pytesseract.image_to_string(image)
        return {
            "aadhaar_number": _find_aadhaar_number(text),
            "name": _find_name(text),
            "dob": _find_dob(text),
        }
    except Exception as e:
        # Deliberately broad: this function is the UI callback boundary, and
        # a failure is reported in the JSON output rather than raised.
        return {"error": str(e)}
|
40 |
|
41 |
# Gradio UI
|
42 |
iface = gr.Interface(
|
|
|
44 |
inputs=gr.Image(type="pil"),
|
45 |
outputs="json",
|
46 |
title="🧠 Smart KYC OCR Extractor",
|
47 |
+
description="Upload Aadhaar/PAN image and extract KYC fields using Tesseract"
|
48 |
)
|
49 |
|
50 |
if __name__ == "__main__":
|