SMART_KYC_OCR

Sleeping

File size: 4,317 Bytes

ae2e698
c70099c
 
e2cb16a
 
ae2e698
a8683a1
 
ae2e698
 
8324e53
 
 
 
 
 
ae2e698
8324e53
ae2e698
e2cb16a
a726fb2
 
 
e2cb16a
a726fb2
 
 
 
 
 
 
e2cb16a
2c3e33d
 
 
e2cb16a
2c3e33d
 
 
 
 
 
 
 
e2cb16a
2c3e33d
 
 
 
 
 
e2cb16a
 
 
2c3e33d
 
 
 
 
 
e2cb16a
2c3e33d
a726fb2
 
2c3e33d
a726fb2
 
 
 
 
 
 
 
e2cb16a
a726fb2
 
 
 
 
 
 
 
e2cb16a
 
 
a726fb2
 
e2cb16a
a726fb2
e2cb16a
a726fb2
e2cb16a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a726fb2
 
 
e2cb16a
ae2e698
a726fb2
a8683a1
 
8324e53

from paddleocr import PaddleOCR
import re

# Enable multilingual support (English + Hindi + Tamil)
# Module-level OCR engine, created once at import time and used by
# extract_kyc_fields(); use_angle_cls=True turns on the angle classifier that
# is requested again per call via cls=True.
# NOTE(review): `lang` is passed as the single string 'en|hi|ta'. PaddleOCR's
# `lang` argument is normally one language code per engine -- TODO confirm the
# installed version accepts this combined value and really loads all three.
ocr = PaddleOCR(use_angle_cls=True, lang='en|hi|ta')

# PAN pattern: 5 letters + 4 digits + 1 letter
_PAN_PATTERN = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
# Aadhaar pattern: 12 digits in groups of 4, optionally separated by whitespace or '-'
_AADHAAR_PATTERN = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
# Date of birth printed as DD/MM/YYYY or DD-MM-YYYY
_DOB_PATTERN = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
# Candidate PAN holder name: capital letters, whitespace and dots only
_PAN_NAME_PATTERN = re.compile(r'^[A-Z\s.]+$')
# Gender keywords. FEMALE is listed before MALE because "FEMALE" contains
# "MALE"; the whole-word form keeps names such as "KAMALESH" from matching.
_GENDER_WORD_PATTERN = re.compile(r'\b(FEMALE|TRANSGENDER|MALE)\b')
_GENDER_LOOSE_PATTERN = re.compile(r'(FEMALE|TRANSGENDER|MALE)')

_NOT_FOUND = "Not found"


def _collect_text_lines(result):
    """Flatten an OCR result into a list of stripped, non-empty text strings."""
    lines = []
    # Tolerate a missing result or a None page entry (no text detected on
    # that page) instead of failing with a TypeError.
    for block in result or []:
        for entry in block or []:
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines):
    """Return the first DD/MM/YYYY-style date found in *lines*, or "Not found"."""
    for line in lines:
        match = _DOB_PATTERN.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return "MALE", "FEMALE" or "TRANSGENDER" from *lines*, or "Not found"."""
    upper_lines = [line.upper() for line in lines]
    # Pass 1: whole-word keywords only, so a name line that merely contains
    # the letters "MALE" cannot decide the gender.
    # Pass 2: loose substring search (FEMALE still wins over MALE) for OCR
    # output where the keyword is glued to neighbouring characters.
    for pattern in (_GENDER_WORD_PATTERN, _GENDER_LOOSE_PATTERN):
        for text in upper_lines:
            match = pattern.search(text)
            if match:
                return match.group(1)
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first name-like line after the PAN header, or "Not found"."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[index + 1:]:
            candidate = candidate.strip()
            if _PAN_NAME_PATTERN.match(candidate) and not any(
                word in candidate for word in ("INDIA", "DEPARTMENT", "GOVT")
            ):
                return candidate
        # Only the first header occurrence is considered.
        return _NOT_FOUND
    return _NOT_FOUND


def _is_plausible_aadhaar_name(text):
    """Return True if *text* has >= 2 words, no digits and no card keywords."""
    upper = text.upper()
    if any(word in upper for word in ("GOVERNMENT", "INDIA", "DOB")):
        return False
    # Gender keywords are matched as whole words so that names containing
    # the letters "MALE" (e.g. "Kamalesh") are not discarded.
    if _GENDER_WORD_PATTERN.search(upper):
        return False
    return not re.search(r'\d', text) and len(text.split()) >= 2


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar text lines, or "Not found"."""
    # First attempt: the line directly above a line that carries a date.
    for index, line in enumerate(lines):
        if index > 0 and _DOB_PATTERN.search(line):
            candidate = lines[index - 1].strip()
            if _is_plausible_aadhaar_name(candidate):
                return candidate
    # Fallback: first plausible line that starts with an uppercase letter.
    for line in lines:
        if line[:1].isupper() and _is_plausible_aadhaar_name(line):
            return line.strip()
    return _NOT_FOUND


def extract_kyc_fields(file_path, ocr_engine=None):
    """Run OCR on a PAN or Aadhaar card image and extract its key fields.

    The document type is decided from the recognised text: a PAN-number match
    wins over an Aadhaar-number match.

    Args:
        file_path: Image input handed unchanged to the engine's ``ocr`` method.
        ocr_engine: Optional object exposing ``ocr(file_path, cls=True)`` and
            returning pages of ``[box, (text, score)]`` entries. Defaults to
            the module-level ``ocr`` engine.

    Returns:
        A dict, never an exception:
        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``.
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``.
        * Unrecognised: ``card_type`` set to ``"UNKNOWN"`` plus ``error``.
        * Failure: only ``error`` with the exception text.
        Fields that cannot be located hold the string ``"Not found"``.
    """
    try:
        engine = ocr if ocr_engine is None else ocr_engine
        result = engine.ocr(file_path, cls=True)

        lines = _collect_text_lines(result)
        full_text = "\n".join(lines)

        pan_match = _PAN_PATTERN.search(full_text)
        aadhaar_match = _AADHAAR_PATTERN.search(full_text)

        # --------- PAN CARD LOGIC ---------
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        # --------- AADHAAR CARD LOGIC ---------
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Unable to determine document type (PAN/Aadhaar).",
        }

    except Exception as e:
        # Deliberate catch-all at the API boundary: callers receive an error
        # dict rather than an exception from the OCR backend.
        return {"error": f"OCR processing failed: {str(e)}"}