SMART_KYC_OCR

Sleeping

File size: 4,362 Bytes

ae2e698
c70099c
 
5ebcb93
 
ae2e698
a8683a1
 
5ebcb93
ae2e698
 
5ebcb93
8324e53
 
 
 
 
 
ae2e698
5ebcb93
8324e53
ae2e698
5ebcb93
a726fb2
 
 
e2cb16a
a726fb2
 
 
 
 
 
 
5ebcb93
2c3e33d
 
 
5ebcb93
2c3e33d
 
 
 
 
 
 
 
5ebcb93
2c3e33d
 
 
 
 
 
e2cb16a
 
 
2c3e33d
 
 
 
 
 
5ebcb93
2c3e33d
a726fb2
 
5ebcb93
a726fb2
 
 
 
 
 
 
 
5ebcb93
a726fb2
 
 
 
 
 
 
 
e2cb16a
 
 
a726fb2
 
5ebcb93
a726fb2
 
e2cb16a
 
 
 
 
 
 
 
 
 
5ebcb93
e2cb16a
 
 
 
 
 
 
 
 
 
a726fb2
 
5ebcb93
a726fb2
5ebcb93
ae2e698
a726fb2
a8683a1
 
8324e53

from paddleocr import PaddleOCR
import re

# Shared OCR engine, created once at import time and reused by every call to
# extract_kyc_fields(). English is used as a safe default for Aadhaar and PAN.
# use_angle_cls=True goes together with the cls=True flag passed at call time.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Patterns are compiled once and shared by the PAN and Aadhaar extractors.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
_DATE_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
_DIGIT_RE = re.compile(r'\d')
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')

_NOT_FOUND = "Not found"
_PAN_HEADER = "INCOME TAX DEPARTMENT"
_PAN_NAME_STOPWORDS = ("INDIA", "DEPARTMENT", "GOVT")
_AADHAAR_NAME_STOPWORDS = ("GOVERNMENT", "INDIA", "DOB", "MALE", "FEMALE")
# "FEMALE" contains "MALE", so the longer label has to be tested first;
# otherwise every female card would be reported as "MALE".
_GENDER_LABELS = ("FEMALE", "TRANSGENDER", "MALE")


def _collect_text_lines(result):
    """Flatten a PaddleOCR result into a list of stripped, non-empty strings."""
    lines = []
    # The engine may hand back None (or a list holding None) when it detects
    # no text at all; treat that as "no lines" instead of crashing.
    for block in result or ():
        for entry in block or ():
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines):
    """Return the first dd/mm/yyyy (or dd-mm-yyyy) date found, else "Not found"."""
    for line in lines:
        match = _DATE_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines):
    """Return the gender label of the first line that mentions one."""
    for line in lines:
        upper = line.upper()
        for label in _GENDER_LABELS:
            if label in upper:
                return label
    return _NOT_FOUND


def _find_pan_name(lines):
    """Return the first name-like line after the PAN header line."""
    for idx, line in enumerate(lines):
        if _PAN_HEADER not in line.upper():
            continue
        for candidate in lines[idx + 1:]:
            candidate = candidate.strip()
            # The pattern admits only capitals, whitespace and dots, so it
            # already rules out digits and mixed-case label lines.
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in _PAN_NAME_STOPWORDS
            ):
                return candidate
        # Only the first header occurrence is considered.
        break
    return _NOT_FOUND


def _looks_like_aadhaar_name(text):
    """Return True for a digit-free, multi-word line without card boilerplate."""
    upper = text.upper()
    return (
        not _DIGIT_RE.search(text)
        and len(text.split()) >= 2
        and not any(word in upper for word in _AADHAAR_NAME_STOPWORDS)
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar lines, else "Not found"."""
    # Preferred heuristic: the line directly above a line that carries a date.
    for previous, current in zip(lines, lines[1:]):
        if _DATE_RE.search(current) and _looks_like_aadhaar_name(previous):
            return previous.strip()
    # Fallback: the first line anywhere that looks like a name.
    for line in lines:
        if _looks_like_aadhaar_name(line):
            return line.strip()
    return _NOT_FOUND


def _parse_kyc_lines(lines):
    """Build the response dict from already-extracted OCR text lines."""
    full_text = "\n".join(lines)
    pan_match = _PAN_RE.search(full_text)
    aadhaar_match = _AADHAAR_RE.search(full_text)

    # A PAN number takes precedence when both patterns are present.
    if pan_match:
        return {
            "card_type": "PAN",
            "pan_number": pan_match.group(0),
            "dob": _find_dob(lines),
            "name": _find_pan_name(lines),
        }
    if aadhaar_match:
        return {
            "card_type": "AADHAAR",
            "aadhaar_number": aadhaar_match.group(0),
            "dob": _find_dob(lines),
            "gender": _find_gender(lines),
            "name": _find_aadhaar_name(lines),
        }
    return {
        "card_type": "UNKNOWN",
        "error": "Could not detect document type (PAN or Aadhaar).",
    }


def extract_kyc_fields(file_path, *, ocr_engine=None):
    """Run OCR on a PAN or Aadhaar card image and extract its key fields.

    Args:
        file_path: Image input forwarded unchanged to the engine's ``ocr()``.
        ocr_engine: Optional object exposing ``ocr(file_path, cls=True)``.
            When omitted, the module-level ``ocr`` instance is used.

    Returns:
        A dict whose shape depends on the detected document:

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``
        * neither: ``card_type`` set to ``"UNKNOWN"`` plus ``error``

        Fields that cannot be located hold the string ``"Not found"``. If
        OCR or parsing raises, the dict contains only an ``error`` message.
    """
    # This function is the boundary for callers: it reports failures through
    # the returned dict and never raises.
    try:
        engine = ocr if ocr_engine is None else ocr_engine
        result = engine.ocr(file_path, cls=True)
        return _parse_kyc_lines(_collect_text_lines(result))
    except Exception as e:
        return {"error": f"OCR processing failed: {str(e)}"}