SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Aug 20

Commit

b07dfbb

verified ·

1 Parent(s): 349558e

Update utils.py

Browse files

Files changed (1) hide show

utils.py +64 -68

utils.py CHANGED Viewed

@@ -1,15 +1,43 @@
 from paddleocr import PaddleOCR
 import re
-# Initialize OCR once (English)
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
-def extract_kyc_fields(file_path):
     try:
         # OCR text extraction
         result = ocr.ocr(file_path, cls=True)
-        # Clean up lines
         lines = []
         for block in result:
             for line in block:
@@ -17,18 +45,21 @@ def extract_kyc_fields(file_path):
                 if text:
                     lines.append(text)
-        # Combine all text
         full_text = "\n".join(lines)
-        # Detect card type
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
-        card_type = "UNKNOWN"
         if pan_match:
             card_type = "PAN"
         elif aadhaar_match:
             card_type = "AADHAAR"
         response = {"card_type": card_type}
@@ -36,28 +67,10 @@ def extract_kyc_fields(file_path):
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
-            # DOB extraction with multiple formats
-            dob = "Not found"
-            for line in lines:
-                match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
-                if match:
-                    dob = match.group(0)
-                    break
-            if dob == "Not found":
-                for line in lines:
-                    match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
-                    if match:
-                        dob = match.group(0)
-                        break
-            if dob == "Not found":
-                for line in lines:
-                    match = re.search(r'\b(19|20)\d{2}\b', line)
-                    if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
-                        dob = match.group(0)
-                        break
-            response["dob"] = dob
-            # Name detection
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -65,80 +78,63 @@ def extract_kyc_fields(file_path):
                         possible = lines[j].strip()
                         if (
                             re.match(r'^[A-Z\s.]+$', possible)
-                            and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
                             and not re.search(r'\d', possible)
                         ):
-                            name = possible.strip()
                             break
                     break
             response["name"] = name
         # ===================== AADHAAR CARD =====================
-        elif card_type == "AADHAAR":
             response["aadhaar_number"] = aadhaar_match.group(0)
-            # DOB extraction with multiple formats
-            dob = "Not found"
-            for line in lines:
-                match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
-                if match:
-                    dob = match.group(0)
-                    break
-            if dob == "Not found":
-                for line in lines:
-                    match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
-                    if match:
-                        dob = match.group(0)
-                        break
-            if dob == "Not found":
-                for line in lines:
-                    match = re.search(r'\b(19|20)\d{2}\b', line)
-                    if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
-                        dob = match.group(0)
-                        break
-            response["dob"] = dob
             # Gender
             gender = "Not found"
             for line in lines:
                 up = line.upper()
-                if "MALE" in up:
-                    gender = "MALE"
                     break
-                elif "FEMALE" in up:
                     gender = "FEMALE"
                     break
-                elif "TRANSGENDER" in up:
-                    gender = "TRANSGENDER"
                     break
             response["gender"] = gender
-            # Name detection: before DOB line
             name = "Not found"
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
-                    possible_name = lines[i - 1].strip()
                     if (
-                        not re.search(r'\d', possible_name)
-                        and len(possible_name.split()) >= 2
-                        and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
-                        name = possible_name
                         break
             if name == "Not found":
                 for line in lines:
                     if (
-                        not re.search(r'\d', line)
-                        and len(line.split()) >= 2
-                        and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
-                        name = line.strip()
                         break
             response["name"] = name
-        else:
-            response["error"] = "Could not identify document as PAN or Aadhaar."
         return response
     except Exception as e:

 from paddleocr import PaddleOCR
 import re
+# Initialize OCR once at import time (English). Model files are downloaded the first time it is used.
+# To support other languages, change lang='en' to another language code (e.g. 'hi', 'mr'), or use a multilingual model.
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
+def _extract_dob(lines):
+    """
+    Return the first date-of-birth-like string found in the given text lines.
+
+    Formats are tried in priority order; each pass scans every line before
+    the next format is attempted:
+      - dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy
+      - yyyy-mm-dd
+      - Year of Birth lines (YOB / YEAR / BIRTH)
+
+    Args:
+        lines: sequence of text strings. It is iterated up to three times,
+            so a one-shot iterator will not work; each item is searched
+            with ``re`` and upper-cased, so it must be a ``str``.
+
+    Returns:
+        The matched substring exactly as it appears in the line (it is not
+        validated or normalized), or the string "Not found" if no pass matches.
+    """
+    # Pass 1: dd{sep}mm{sep}yyyy; each sep is '.', '/' or '-' and the two are matched independently
+    for line in lines:
+        m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
+        if m:
+            return m.group(0)
+    # Pass 2: yyyy-mm-dd (hyphen separators only)
+    for line in lines:
+        m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
+        if m:
+            return m.group(0)
+    # Pass 3: bare year (19xx/20xx), accepted only if the upper-cased line contains YOB, YEAR or BIRTH
+    for line in lines:
+        m = re.search(r'\b(19|20)\d{2}\b', line)
+        if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
+            return m.group(0)
+    return "Not found"
+def extract_kyc_fields(file_path: str) -> dict:
     try:
         # OCR text extraction
         result = ocr.ocr(file_path, cls=True)
+        # Flatten to text lines
         lines = []
         for block in result:
             for line in block:
                 if text:
                     lines.append(text)
         full_text = "\n".join(lines)
+        # Detect card type by patterns
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
         if pan_match:
             card_type = "PAN"
         elif aadhaar_match:
             card_type = "AADHAAR"
+        else:
+            return {
+                "card_type": "UNKNOWN",
+                "error": "Could not identify document as PAN or Aadhaar."
+            }
         response = {"card_type": card_type}
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
+            # DOB
+            response["dob"] = _extract_dob(lines)
+            # Name (heuristic: next lines after "INCOME TAX DEPARTMENT")
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
                         possible = lines[j].strip()
                         if (
                             re.match(r'^[A-Z\s.]+$', possible)
+                            and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
                             and not re.search(r'\d', possible)
+                            and len(possible) >= 3
                         ):
+                            name = possible
                             break
                     break
             response["name"] = name
         # ===================== AADHAAR CARD =====================
+        else:
             response["aadhaar_number"] = aadhaar_match.group(0)
+            # DOB / YOB
+            response["dob"] = _extract_dob(lines)
             # Gender
             gender = "Not found"
             for line in lines:
                 up = line.upper()
+                if "TRANSGENDER" in up:
+                    gender = "TRANSGENDER"
                     break
+                if "FEMALE" in up:
                     gender = "FEMALE"
                     break
+                if "MALE" in up:
+                    gender = "MALE"
                     break
             response["gender"] = gender
+            # Name: usually line before DOB or first reasonable line without digits
             name = "Not found"
+            # try line before a date line
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
+                    candidate = lines[i - 1].strip()
                     if (
+                        not re.search(r'\d', candidate)
+                        and len(candidate.split()) >= 2
+                        and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
+                        name = candidate
                         break
+            # fallback
             if name == "Not found":
                 for line in lines:
+                    candidate = line.strip()
                     if (
+                        not re.search(r'\d', candidate)
+                        and len(candidate.split()) >= 2
+                        and not any(x in candidate.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
                     ):
+                        name = candidate
                         break
             response["name"] = name
         return response
     except Exception as e: