Spaces:

gopichandra
/

SMART_KYC_OCR

Sleeping

App Files Community

gopichandra committed on Aug 21

Commit

254fdf9

verified ·

1 Parent(s): 8dcb382

Update utils.py

Browse files

Files changed (1) hide show

utils.py +75 -48

utils.py CHANGED Viewed

@@ -1,26 +1,15 @@
 from paddleocr import PaddleOCR
 import re
-# Initialize OCR for English
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
-def _extract_dob(lines):
-    for line in lines:
-        m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
-        if m: return m.group(0)
-    for line in lines:
-        m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
-        if m: return m.group(0)
-    for line in lines:
-        m = re.search(r'\b(19|20)\d{2}\b', line)
-        if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
-            return m.group(0)
-    return "Not found"
-def extract_kyc_fields(file_path: str) -> dict:
     try:
         result = ocr.ocr(file_path, cls=True)
         lines = []
         for block in result:
             for line in block:
@@ -30,22 +19,31 @@ def extract_kyc_fields(file_path: str) -> dict:
         full_text = "\n".join(lines)
-        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
-        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
-        if pan_match:
-            card_type = "PAN"
-        elif aadhaar_match:
-            card_type = "AADHAAR"
         else:
-            return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}
         response = {"card_type": card_type}
         if card_type == "PAN":
-            response["pan_number"] = pan_match.group(0)
-            response["dob"] = _extract_dob(lines)
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -53,54 +51,83 @@ def extract_kyc_fields(file_path: str) -> dict:
                         possible = lines[j].strip()
                         if (
                             re.match(r'^[A-Z\s.]+$', possible)
-                            and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
                             and not re.search(r'\d', possible)
-                            and len(possible) >= 3
                         ):
-                            name = possible
                             break
                     break
             response["name"] = name
-        else:  # AADHAAR
-            response["aadhaar_number"] = aadhaar_match.group(0)
-            response["dob"] = _extract_dob(lines)
             gender = "Not found"
             for line in lines:
-                up = line.upper()
-                if "TRANSGENDER" in up:
-                    gender = "TRANSGENDER"; break
-                if "FEMALE" in up:
-                    gender = "FEMALE"; break
-                if "MALE" in up:
-                    gender = "MALE"; break
             response["gender"] = gender
             name = "Not found"
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
-                    candidate = lines[i - 1].strip()
                     if (
-                        not re.search(r'\d', candidate)
-                        and len(candidate.split()) >= 2
-                        and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
-                        name = candidate
                         break
             if name == "Not found":
                 for line in lines:
-                    candidate = line.strip()
                     if (
-                        not re.search(r'\d', candidate)
-                        and len(candidate.split()) >= 2
-                        and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
-                        name = candidate
                         break
             response["name"] = name
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}

 from paddleocr import PaddleOCR
 import re
+# Initialize OCR
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
+def extract_kyc_fields(file_path, force_type=None):
     try:
+        # OCR text extraction
         result = ocr.ocr(file_path, cls=True)
+        # Clean up lines
         lines = []
         for block in result:
             for line in block:
         full_text = "\n".join(lines)
+        # Detect card type (if not forced)
+        if force_type:
+            card_type = force_type.upper()
         else:
+            pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
+            aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
+            card_type = "UNKNOWN"
+            if pan_match:
+                card_type = "PAN"
+            elif aadhaar_match:
+                card_type = "AADHAAR"
         response = {"card_type": card_type}
+        # ===================== PAN CARD =====================
         if card_type == "PAN":
+            pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
+            if pan_match:
+                response["pan_number"] = pan_match.group(0)
+            # DOB extraction
+            dob = extract_dob(lines)
+            response["dob"] = dob
+            # Name detection
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
                         possible = lines[j].strip()
                         if (
                             re.match(r'^[A-Z\s.]+$', possible)
+                            and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
                             and not re.search(r'\d', possible)
                         ):
+                            name = possible.strip()
                             break
                     break
             response["name"] = name
+        # ===================== AADHAAR CARD =====================
+        elif card_type == "AADHAAR":
+            aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
+            if aadhaar_match:
+                response["aadhaar_number"] = aadhaar_match.group(0)
+            # DOB extraction
+            dob = extract_dob(lines)
+            response["dob"] = dob
+            # Gender detection
             gender = "Not found"
             for line in lines:
+                if "MALE" in line.upper():
+                    gender = "MALE"
+                    break
+                elif "FEMALE" in line.upper():
+                    gender = "FEMALE"
+                    break
+                elif "TRANSGENDER" in line.upper():
+                    gender = "TRANSGENDER"
+                    break
             response["gender"] = gender
+            # Name detection: before DOB
             name = "Not found"
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
+                    possible_name = lines[i - 1].strip()
                     if (
+                        not re.search(r'\d', possible_name)
+                        and len(possible_name.split()) >= 2
+                        and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
+                        name = possible_name
                         break
             if name == "Not found":
                 for line in lines:
                     if (
+                        not re.search(r'\d', line)
+                        and len(line.split()) >= 2
+                        and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
+                        name = line.strip()
                         break
             response["name"] = name
+        else:
+            response["error"] = "Could not identify document as PAN or Aadhaar."
         return response
     except Exception as e:
         return {"error": f"OCR processing failed: {str(e)}"}
+def extract_dob(lines):
+    """Extract DOB from OCR lines in multiple formats."""
+    dob = "Not found"
+    for line in lines:
+        match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
+        if match:
+            return match.group(0)
+    for line in lines:
+        match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
+        if match:
+            return match.group(0)
+    for line in lines:
+        match = re.search(r'\b(19|20)\d{2}\b', line)
+        if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
+            return match.group(0)
+    return dob