SMART_KYC_OCR

Sleeping

gopichandra committed on Aug 5

Commit

73f6d86

verified ·

1 Parent(s): 7538705

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -1,33 +1,32 @@
 from paddleocr import PaddleOCR
 import re
-# Initialize OCR model only once
-ocr = PaddleOCR(use_angle_cls=True, lang='en')  # lang='en' for English documents
 def extract_kyc_fields(file_path):
     try:
-        # Run OCR
         result = ocr.ocr(file_path, cls=True)
         all_text = ""
         for line_group in result:
             for line in line_group:
                 all_text += line[1][0] + "\n"
-        # Aadhaar number (format with or without space/dash)
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
-        # DOB (any DD/MM/YYYY or similar)
         dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
-        # Name: try to detect a line with 'Name' or fallback to top line
         name = "Not found"
         for line in all_text.split("\n"):
             if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
                 name = line.split(":")[-1].strip() if ":" in line else line.strip()
                 break
-        if name == "Not found":
             name = all_text.split("\n")[0].strip()
         return {

 from paddleocr import PaddleOCR
 import re
+# Initialize OCR model
+ocr = PaddleOCR(use_angle_cls=True, lang='en')
 def extract_kyc_fields(file_path):
     try:
         result = ocr.ocr(file_path, cls=True)
+        # Combine text lines
         all_text = ""
         for line_group in result:
             for line in line_group:
                 all_text += line[1][0] + "\n"
+        # Aadhaar pattern
         aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', all_text)
+        # DOB pattern
         dob_match = re.search(r'\b\d{2}[\/\-]\d{2}[\/\-]\d{4}\b', all_text)
+        # Name logic
         name = "Not found"
         for line in all_text.split("\n"):
             if re.search(r'\b(name|naam|namf)\b', line, re.IGNORECASE):
                 name = line.split(":")[-1].strip() if ":" in line else line.strip()
                 break
+        if name == "Not found" and all_text.strip():
             name = all_text.split("\n")[0].strip()
         return {