Spaces:

gopichandra
/

SMART_KYC_OCR

Sleeping

gopichandra committed on Aug 5

Commit

a87236e

verified ·

1 Parent(s): a9b1adf

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -4,19 +4,37 @@ import re
 def extract_kyc_fields(file_path):
     try:
         image = Image.open(file_path).convert("RGB")
         text = pytesseract.image_to_string(image)
-        aadhaar = re.search(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', text)
-        dob = re.search(r'\d{2}[\/\-]\d{2}[\/\-]\d{4}', text)
-        name_line = next((line for line in text.split("\n") if re.search(r'(?i)name', line)), "")
         name = name_line.split(":")[-1].strip() if ":" in name_line else name_line.strip()
         return {
-            "aadhaar_number": aadhaar.group(0) if aadhaar else "Not found",
-            "dob": dob.group(0) if dob else "Not found",
             "name": name if name else "Not found"
         }
     except Exception as e:
-        return {"error": str(e)}

 def extract_kyc_fields(file_path):
     try:
+        # Open and convert image to RGB for OCR
         image = Image.open(file_path).convert("RGB")
+        # Run Tesseract OCR
         text = pytesseract.image_to_string(image)
+        # Aadhaar pattern: 12-digit, with or without space or dash
+        aadhaar_match = re.search(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', text)
+        # DOB pattern: formats like DD-MM-YYYY or DD/MM/YYYY
+        dob_match = re.search(r'\b\d{2}[/-]\d{2}[/-]\d{4}\b', text)
+        # Try to extract name line heuristically (line with "Name", "Naam", etc.)
+        name_line = next(
+            (
+                line for line in text.split("\n")
+                if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE)
+            ),
+            ""
+        )
+        # Extract name text
         name = name_line.split(":")[-1].strip() if ":" in name_line else name_line.strip()
+        # Return structured KYC data
         return {
+            "aadhaar_number": aadhaar_match.group(0) if aadhaar_match else "Not found",
+            "dob": dob_match.group(0) if dob_match else "Not found",
             "name": name if name else "Not found"
         }
     except Exception as e:
+        # Return error as dict to show in Gradio
+        return {"error": f"OCR failed: {str(e)}"}