# SMART_KYC_OCR
#
# Sleeping
#
# File size: 3,509 bytes

from paddleocr import PaddleOCR
import re

# Shared OCR engine, constructed once at import time and reused by
# extract_kyc_fields() for every call. Configured for English text
# (lang='en') with use_angle_cls=True, which presumably enables PaddleOCR's
# text-angle classifier - confirm against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Patterns are compiled once at import time instead of on every call.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
_DOB_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
_DIGIT_RE = re.compile(r'\d')

# Placeholder stored in the response when a field cannot be located.
_NOT_FOUND = "Not found"


def _find_dob(lines):
    """Return the first DD/MM/YYYY or DD-MM-YYYY date found in *lines*."""
    for line in lines:
        match = _DOB_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return "FEMALE" or "MALE" from the first line mentioning either."""
    for line in lines:
        upper = line.upper()
        # "FEMALE" contains "MALE", so it has to be tested first; testing
        # "MALE" first would classify every female card as male.
        if "FEMALE" in upper:
            return "FEMALE"
        if "MALE" in upper:
            return "MALE"
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first name-like line after the "INCOME TAX DEPARTMENT" header."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[index + 1:]:
            # Name-like means only uppercase letters, whitespace and dots
            # (which already rules out digits) and none of the header words.
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in ("INDIA", "DEPARTMENT", "GOVT")
            ):
                return candidate
        # Only the first header occurrence is considered.
        return _NOT_FOUND
    return _NOT_FOUND


def _find_aadhaar_name(lines):
    """Return the line directly above a "DOB" line, if it looks like a name."""
    for index, line in enumerate(lines):
        if index == 0 or "DOB" not in line.upper():
            continue
        candidate = lines[index - 1]
        upper = candidate.upper()
        if any(word in upper for word in ("GOVERNMENT", "MALE", "FEMALE")):
            continue
        if _DIGIT_RE.search(candidate):
            continue
        return candidate.strip()
    return _NOT_FOUND


def extract_kyc_fields(file_path):
    """Run OCR on a KYC document and extract the fields of a PAN or Aadhaar card.

    The recognised text lines are searched for a PAN number first and an
    Aadhaar number second; the first one found decides the card type.

    Args:
        file_path: Input handed unchanged to ``ocr.ocr`` (typically an image
            path; see the PaddleOCR documentation for accepted inputs).

    Returns:
        A dict that always contains either ``"card_type"`` or ``"error"``:

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``.
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``.
        * Unrecognised document (including one with no text):
          ``card_type`` set to ``"UNKNOWN"`` plus an ``error`` message.
        * Any exception during processing: only ``error``.

        Fields that cannot be located hold the string ``"Not found"``.
    """
    # Deliberately broad: this function is the error boundary, and callers
    # receive an error dict instead of an exception.
    try:
        result = ocr.ocr(file_path, cls=True)

        # Each entry of `result` is iterated as a list of (box, (text, score))
        # items. PaddleOCR is reported to yield None for a page on which no
        # text was detected (verify against the installed version), so None
        # entries are treated as empty rather than crashing.
        lines = []
        for block in result or []:
            for line in block or []:
                text = line[1][0].strip()
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        # A PAN match takes priority over an Aadhaar match.
        pan_match = _PAN_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _find_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _find_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Unable to determine document type.",
        }

    except Exception as e:
        return {"error": f"OCR processing failed: {str(e)}"}