SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Aug 6

Commit

8324e53

verified ·

1 Parent(s): 443f4db

Update utils.py

Browse files

Files changed (1) hide show

utils.py +49 -20

utils.py CHANGED Viewed

@@ -1,39 +1,68 @@
 from paddleocr import PaddleOCR
 import re
-# Initialize OCR model
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path):
     try:
         result = ocr.ocr(file_path, cls=True)
-        # Combine text lines
-        all_text = ""
-        for line_group in result:
-            for line in line_group:
-                all_text += line[1][0] + "\n"
-        # Aadhaar pattern
-        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
-        # DOB pattern
-        dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
-        # Name logic
-        name = "Not found"
-        for line in all_text.split("\n"):
-            if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
-                name = line.split(":")[-1].strip() if ":" in line else line.strip()
                 break
-        if name == "Not found" and all_text.strip():
-            name = all_text.split("\n")[0].strip()
         return {
-            "aadhaar_number": aadhaar_match.group(0) if aadhaar_match else "Not found",
-            "dob": dob_match.group(0) if dob_match else "Not found",
             "name": name
         }
     except Exception as e:
-        return {"error": f"PaddleOCR failed: {str(e)}"}

 from paddleocr import PaddleOCR
 import re
+# Initialize the shared PaddleOCR engine (English model only, angle classification enabled)
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path):
     try:
         result = ocr.ocr(file_path, cls=True)
+        lines = []
+        for block in result:
+            for line in block:
+                text = line[1][0].strip()
+                if text:
+                    lines.append(text)
+        # Combine all lines into one big string
+        full_text = "\n".join(lines)
+        # Aadhaar Number – strictly 12 digits (grouped or not)
+        aadhaar = next((line for line in lines if re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', line)), "Not found")
+        # DOB – with or without label
+        dob = "Not found"
+        for line in lines:
+            match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
+            if match:
+                dob = match.group(0)
+                break
+        # Gender – look for common gender keywords
+        gender = "Not found"
+        for line in lines:
+            if "MALE" in line.upper():
+                gender = "MALE"
+                break
+            elif "FEMALE" in line.upper():
+                gender = "FEMALE"
                 break
+            elif "TRANSGENDER" in line.upper():
+                gender = "TRANSGENDER"
+                break
+        # Name – find most probable name line (usually near DOB)
+        name = "Not found"
+        for i, line in enumerate(lines):
+            # Assume name is just above DOB or gender
+            if "DOB" in line.upper() or "MALE" in line.upper() or "FEMALE" in line.upper():
+                if i > 0:
+                    possible_name = lines[i - 1]
+                    # Filter to avoid accidental text
+                    if (
+                        not any(x in possible_name.upper() for x in ["GOVERNMENT", "DOB", "MALE", "FEMALE", "YEAR"])
+                        and not re.search(r'\d', possible_name)
+                    ):
+                        name = possible_name.strip()
+                        break
         return {
+            "aadhaar_number": aadhaar,
+            "dob": dob,
+            "gender": gender,
             "name": name
         }
     except Exception as e:
+        return {"error": f"OCR processing failed: {str(e)}"}