SMART_KYC_OCR

Sleeping

App Files Files Community

gopichandra committed on Aug 6

Commit

5ebcb93

verified ·

1 Parent(s): e2cb16a

Update utils.py

Browse files

Files changed (1) hide show

utils.py +16 -14

utils.py CHANGED Viewed

@@ -1,13 +1,15 @@
 from paddleocr import PaddleOCR
 import re
-# Enable multilingual support (English + Hindi + Tamil)
-ocr = PaddleOCR(use_angle_cls=True, lang='en|hi|ta')
 def extract_kyc_fields(file_path):
     try:
         result = ocr.ocr(file_path, cls=True)
         lines = []
         for block in result:
             for line in block:
@@ -15,9 +17,10 @@ def extract_kyc_fields(file_path):
                 if text:
                     lines.append(text)
         full_text = "\n".join(lines)
-        # PAN pattern: 5 letters + 4 digits + 1 letter
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
@@ -29,11 +32,11 @@ def extract_kyc_fields(file_path):
         response = {"card_type": card_type}
-        # --------- PAN CARD LOGIC ---------
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
-            # Extract DOB
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -42,7 +45,7 @@ def extract_kyc_fields(file_path):
                     break
             response["dob"] = dob
-            # Extract Name
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
@@ -58,11 +61,11 @@ def extract_kyc_fields(file_path):
                     break
             response["name"] = name
-        # --------- AADHAAR CARD LOGIC ---------
         elif card_type == "AADHAAR":
             response["aadhaar_number"] = aadhaar_match.group(0)
-            # Extract DOB
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
@@ -71,7 +74,7 @@ def extract_kyc_fields(file_path):
                     break
             response["dob"] = dob
-            # Extract Gender
             gender = "Not found"
             for line in lines:
                 if "MALE" in line.upper():
@@ -85,9 +88,8 @@ def extract_kyc_fields(file_path):
                     break
             response["gender"] = gender
-            # Robust name extraction
             name = "Not found"
-            # First attempt: line before DOB
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
                     possible_name = lines[i - 1].strip()
@@ -99,13 +101,12 @@ def extract_kyc_fields(file_path):
                         name = possible_name
                         break
-            # Fallback: best guess line with title-cased text and no digits
             if name == "Not found":
                 for line in lines:
                     if (
                         not re.search(r'\d', line)
                         and len(line.split()) >= 2
-                        and line[0].isupper()
                         and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
                     ):
                         name = line.strip()
@@ -113,8 +114,9 @@ def extract_kyc_fields(file_path):
             response["name"] = name
         else:
-            response["error"] = "Unable to determine document type (PAN/Aadhaar)."
         return response

 from paddleocr import PaddleOCR
 import re
+# Initialize OCR for English (safe default for Aadhaar and PAN)
+ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path):
     try:
+        # Run OCR
         result = ocr.ocr(file_path, cls=True)
+        # Extract lines from result
         lines = []
         for block in result:
             for line in block:
                 if text:
                     lines.append(text)
+        # Combine for pattern searches
         full_text = "\n".join(lines)
+        # Detect document type
         pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
         response = {"card_type": card_type}
+        # ===================== PAN CARD LOGIC =====================
         if card_type == "PAN":
             response["pan_number"] = pan_match.group(0)
+            # DOB
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
                     break
             response["dob"] = dob
+            # Name
             name = "Not found"
             for i in range(len(lines)):
                 if "INCOME TAX DEPARTMENT" in lines[i].upper():
                     break
             response["name"] = name
+        # ===================== AADHAAR CARD LOGIC =====================
         elif card_type == "AADHAAR":
             response["aadhaar_number"] = aadhaar_match.group(0)
+            # DOB
             dob = "Not found"
             for line in lines:
                 match = re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line)
                     break
             response["dob"] = dob
+            # Gender
             gender = "Not found"
             for line in lines:
                 if "MALE" in line.upper():
                     break
             response["gender"] = gender
+            # Name – try before DOB or other heuristics
             name = "Not found"
             for i, line in enumerate(lines):
                 if re.search(r'\d{2}[/-]\d{2}[/-]\d{4}', line) and i > 0:
                     possible_name = lines[i - 1].strip()
                         name = possible_name
                         break
+            # Fallback if above fails
             if name == "Not found":
                 for line in lines:
                     if (
                         not re.search(r'\d', line)
                         and len(line.split()) >= 2
                         and not any(x in line.upper() for x in ["GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE"])
                     ):
                         name = line.strip()
             response["name"] = name
+        # ===================== UNKNOWN DOC =====================
         else:
+            response["error"] = "Could not detect document type (PAN or Aadhaar)."
         return response