SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Aug 6

Commit

e2cb16a

verified ·

1 Parent(s): 2c3e33d

Update utils.py

Browse files

Files changed (1) hide show

utils.py +40 -21

utils.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from paddleocr import PaddleOCR
 import re
-ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path):
     try:
@@ -16,23 +17,23 @@ def extract_kyc_fields(file_path):
         full_text = "\n".join(lines)
-        # PAN Number Detection
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
         if pan_match:
             card_type = "PAN"
         elif aadhaar_match:
             card_type = "AADHAAR"
-        else:
-            card_type = "UNKNOWN"
         response = {"card_type": card_type}
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
-            # Extract DOB as any line with DD/MM/YYYY
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -41,22 +42,23 @@ def extract_kyc_fields(file_path):
                     break
             response["dob"] = dob
-            # Improved name extraction: find first uppercase name-like line after "INCOME TAX DEPARTMENT"
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
                     for j in range(i+1, len(lines)):
                         possible = lines[j].strip()
                         if (
-                            re.match(r'^[A-Z\s.]+$', possible) and
-                            not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"]) and
-                            not re.search(r'\d', possible)
                         ):
                             name = possible.strip()
                             break
                     break
             response["name"] = name
         elif card_type == "AADHAAR":
             response["aadhaar_number"] = aadhaar_match.group(0)
@@ -69,7 +71,7 @@ def extract_kyc_fields(file_path):
                     break
             response["dob"] = dob
-            # Gender
             gender = "Not found"
             for line in lines:
                 if "MALE" in line.upper():
@@ -78,24 +80,41 @@ def extract_kyc_fields(file_path):
                 elif "FEMALE" in line.upper():
                     gender = "FEMALE"
                     break
             response["gender"] = gender
-            # Name logic for Aadhaar (same as before)
             name = "Not found"
             for i, line in enumerate(lines):
-                if "DOB" in line.upper():
-                    if i > 0:
-                        possible_name = lines[i - 1]
-                        if (
-                            not any(x in possible_name.upper() for x in ["GOVERNMENT", "MALE", "FEMALE"])
-                            and not re.search(r'\d', possible_name)
-                        ):
-                            name = possible_name.strip()
-                            break
             response["name"] = name
         else:
-            response["error"] = "Unable to determine document type."
         return response

 from paddleocr import PaddleOCR
 import re
+# Enable multilingual support (English + Hindi + Tamil)
+ocr = PaddleOCR(use_angle_cls=True, lang='en|hi|ta')
 def extract_kyc_fields(file_path):
     try:
         full_text = "\n".join(lines)
+        # PAN pattern: 5 letters + 4 digits + 1 letter
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
+        card_type = "UNKNOWN"
         if pan_match:
             card_type = "PAN"
         elif aadhaar_match:
             card_type = "AADHAAR"
         response = {"card_type": card_type}
+        # --------- PAN CARD LOGIC ---------
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
+            # Extract DOB
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
                     break
             response["dob"] = dob
+            # Extract Name
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
                     for j in range(i+1, len(lines)):
                         possible = lines[j].strip()
                         if (
+                            re.match(r'^[A-Z\s.]+$', possible)
+                            and not any(x in possible for x in ["INDIA", "DEPARTMENT", "GOVT"])
+                            and not re.search(r'\d', possible)
                         ):
                             name = possible.strip()
                             break
                     break
             response["name"] = name
+        # --------- AADHAAR CARD LOGIC ---------
         elif card_type == "AADHAAR":
             response["aadhaar_number"] = aadhaar_match.group(0)
                     break
             response["dob"] = dob
+            # Extract Gender
             gender = "Not found"
             for line in lines:
                 if "MALE" in line.upper():
                 elif "FEMALE" in line.upper():
                     gender = "FEMALE"
                     break
+                elif "TRANSGENDER" in line.upper():
+                    gender = "TRANSGENDER"
+                    break
             response["gender"] = gender
+            # Robust name extraction
             name = "Not found"
+            # First attempt: line before DOB
             for i, line in enumerate(lines):
+                if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
+                    possible_name = lines[i - 1].strip()
+                    if (
+                        not any(x in possible_name.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
+                        and not re.search(r'\d', possible_name)
+                        and len(possible_name.split()) >= 2
+                    ):
+                        name = possible_name
+                        break
+            # Fallback: best guess line with title-cased text and no digits
+            if name == "Not found":
+                for line in lines:
+                    if (
+                        not re.search(r'\d', line)
+                        and len(line.split()) >= 2
+                        and line[0].isupper()
+                        and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
+                    ):
+                        name = line.strip()
+                        break
             response["name"] = name
         else:
+            response["error"] = "Unable to determine document type (PAN/Aadhaar)."
         return response