SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Aug 21

Commit

65bef46

verified ·

1 Parent(s): 254fdf9

Update utils.py

Browse files

Files changed (1) hide show

utils.py +50 -68

utils.py CHANGED Viewed

@@ -6,10 +6,8 @@ ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path, force_type=None):
     try:
-        # OCR text extraction
         result = ocr.ocr(file_path, cls=True)
-        # Clean up lines
         lines = []
         for block in result:
             for line in block:
@@ -19,7 +17,6 @@ def extract_kyc_fields(file_path, force_type=None):
         full_text = "\n".join(lines)
-        # Detect card type (if not forced)
         if force_type:
             card_type = force_type.upper()
         else:
@@ -33,90 +30,30 @@ def extract_kyc_fields(file_path, force_type=None):
         response = {"card_type": card_type}
-        # ===================== PAN CARD =====================
         if card_type == "PAN":
             pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
             if pan_match:
                 response["pan_number"] = pan_match.group(0)
-            # DOB extraction
-            dob = extract_dob(lines)
-            response["dob"] = dob
-            # Name detection
-            name = "Not found"
-            for i in range(len(lines)):
-                if "INCOME TAX DEPARTMENT" in lines[i].upper():
-                    for j in range(i + 1, len(lines)):
-                        possible = lines[j].strip()
-                        if (
-                            re.match(r'^[A-Z\s.]+$', possible)
-                            and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
-                            and not re.search(r'\d', possible)
-                        ):
-                            name = possible.strip()
-                            break
-                    break
-            response["name"] = name
-        # ===================== AADHAAR CARD =====================
         elif card_type == "AADHAAR":
             aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
             if aadhaar_match:
                 response["aadhaar_number"] = aadhaar_match.group(0)
-            # DOB extraction
-            dob = extract_dob(lines)
-            response["dob"] = dob
-            # Gender detection
-            gender = "Not found"
-            for line in lines:
-                if "MALE" in line.upper():
-                    gender = "MALE"
-                    break
-                elif "FEMALE" in line.upper():
-                    gender = "FEMALE"
-                    break
-                elif "TRANSGENDER" in line.upper():
-                    gender = "TRANSGENDER"
-                    break
-            response["gender"] = gender
-            # Name detection: before DOB
-            name = "Not found"
-            for i, line in enumerate(lines):
-                if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
-                    possible_name = lines[i - 1].strip()
-                    if (
-                        not re.search(r'\d', possible_name)
-                        and len(possible_name.split()) >= 2
-                        and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
-                    ):
-                        name = possible_name
-                        break
-            if name == "Not found":
-                for line in lines:
-                    if (
-                        not re.search(r'\d', line)
-                        and len(line.split()) >= 2
-                        and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
-                    ):
-                        name = line.strip()
-                        break
-            response["name"] = name
         else:
             response["error"] = "Could not identify document as PAN or Aadhaar."
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}
 def extract_dob(lines):
-    """Extract DOB from OCR lines in multiple formats."""
     dob = "Not found"
     for line in lines:
         match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
@@ -131,3 +68,48 @@ def extract_dob(lines):
         if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
             return match.group(0)
     return dob

 def extract_kyc_fields(file_path, force_type=None):
     try:
         result = ocr.ocr(file_path, cls=True)
         lines = []
         for block in result:
             for line in block:
         full_text = "\n".join(lines)
         if force_type:
             card_type = force_type.upper()
         else:
         response = {"card_type": card_type}
         if card_type == "PAN":
             pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
             if pan_match:
                 response["pan_number"] = pan_match.group(0)
+            response["dob"] = extract_dob(lines)
+            response["name"] = extract_pan_name(lines)
         elif card_type == "AADHAAR":
             aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
             if aadhaar_match:
                 response["aadhaar_number"] = aadhaar_match.group(0)
+            response["dob"] = extract_dob(lines)
+            response["gender"] = extract_gender(lines)
+            response["name"] = extract_aadhaar_name(lines)
         else:
             response["error"] = "Could not identify document as PAN or Aadhaar."
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}
 def extract_dob(lines):
     dob = "Not found"
     for line in lines:
         match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
         if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
             return match.group(0)
     return dob
+def extract_gender(lines):
+    """Return the gender mentioned in the OCR text lines.
+
+    Lines are scanned in order and compared case-insensitively. The result
+    is "FEMALE", "MALE" or "TRANSGENDER" for the first line that mentions
+    one of them, or "Not found" when no line does.
+    """
+    for line in lines:
+        upper = line.upper()
+        # "FEMALE" contains "MALE" as a substring, so the longer label has
+        # to be tested first; otherwise female cards are reported as "MALE".
+        if "FEMALE" in upper:
+            return "FEMALE"
+        if "MALE" in upper:
+            return "MALE"
+        if "TRANSGENDER" in upper:
+            return "TRANSGENDER"
+    return "Not found"
+def extract_pan_name(lines):
+    """Find the holder name in the OCR text lines of a PAN card.
+
+    For every line that contains "INCOME TAX DEPARTMENT" (compared
+    case-insensitively), the lines after it are searched for the first one
+    that, once stripped, consists only of capital letters, whitespace and
+    dots and does not contain "INDIA", "GOVT" or "DEPARTMENT". That
+    stripped line is returned; "Not found" is returned when nothing
+    qualifies.
+    """
+    header_words = ("INDIA", "GOVT", "DEPARTMENT")
+    for index, text in enumerate(lines):
+        if "INCOME TAX DEPARTMENT" not in text.upper():
+            continue
+        # Only lines below the header are candidates for the name.
+        for following in lines[index + 1:]:
+            candidate = following.strip()
+            if not re.match(r'^[A-Z\s.]+$', candidate):
+                continue
+            if any(word in candidate for word in header_words):
+                continue
+            if re.search(r'\d', candidate):
+                continue
+            return candidate
+    return "Not found"
+def extract_aadhaar_name(lines):
+    """Find the holder name in the OCR text lines of an Aadhaar card.
+
+    A line qualifies as a name when it has no digits, at least two
+    whitespace-separated words, and none of the label words DOB, INDIA,
+    MALE, FEMALE or GOVERNMENT (compared case-insensitively).
+
+    The line directly above a dd/mm/yyyy-style date is preferred; if no
+    such line qualifies, the first qualifying line anywhere is used. The
+    stripped line is returned, or "Not found" when nothing qualifies.
+    """
+    # Labels are matched as whole words. A plain substring test rejects
+    # real names that merely contain a label, e.g. "Kamalesh" contains
+    # "MALE".
+    label_pattern = re.compile(r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b')
+
+    def looks_like_name(text):
+        if re.search(r'\d', text):
+            return False
+        if len(text.split()) < 2:
+            return False
+        return label_pattern.search(text.upper()) is None
+
+    # Preferred position: the line immediately above the date of birth.
+    for position in range(1, len(lines)):
+        if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', lines[position]):
+            above = lines[position - 1].strip()
+            if looks_like_name(above):
+                return above
+
+    # Fallback: the first line anywhere that looks like a name.
+    for text in lines:
+        if looks_like_name(text):
+            return text.strip()
+    return "Not found"