SMART_KYC_OCR

Sleeping

File size: 4,916 Bytes

ae2e698
c70099c
 
b07dfbb
 
5ebcb93
ae2e698
b07dfbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8683a1
655e2d6
ae2e698
 
b07dfbb
8324e53
 
 
 
 
 
ae2e698
8324e53
ae2e698
b07dfbb
a726fb2
 
 
 
 
 
 
b07dfbb
 
 
 
 
a726fb2
 
 
655e2d6
2c3e33d
 
 
b07dfbb
 
2c3e33d
b07dfbb
2c3e33d
 
 
655e2d6
2c3e33d
 
e2cb16a
b07dfbb
e2cb16a
b07dfbb
2c3e33d
b07dfbb
2c3e33d
 
 
 
655e2d6
b07dfbb
a726fb2
 
b07dfbb
 
a726fb2
5ebcb93
a726fb2
 
349558e
b07dfbb
 
a726fb2
b07dfbb
a726fb2
 
b07dfbb
 
e2cb16a
a726fb2
 
b07dfbb
a726fb2
b07dfbb
a726fb2
655e2d6
b07dfbb
e2cb16a
b07dfbb
 
 
e2cb16a
b07dfbb
e2cb16a
b07dfbb
e2cb16a
 
b07dfbb
e2cb16a
b07dfbb
 
 
e2cb16a
b07dfbb
e2cb16a
a726fb2
 
 
a8683a1
 
8324e53

from paddleocr import PaddleOCR
import re

# Create one module-level PaddleOCR instance (English model, angle classifier
# enabled) so that every call to extract_kyc_fields() reuses the same engine.
# Per the original author's note, the model download happens the first time
# the engine is used.
# To target another language, change the `lang` argument; the original note
# mentions 'hi' and 'mr' as examples -- confirm the exact codes against the
# PaddleOCR documentation before relying on them.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def _extract_dob(lines):
    """
    Pull a date of birth (or year of birth) out of OCR text lines.

    The formats are tried in priority order, and each format is checked
    against every line before the next format is considered:
      1. dd/mm/yyyy, dd-mm-yyyy or dd.mm.yyyy
      2. yyyy-mm-dd
      3. a bare 19xx/20xx year, accepted only when the same line also
         contains "YOB", "YEAR" or "BIRTH" (case-insensitive)

    Returns the matched text exactly as it appears in the line, or the
    string "Not found" when nothing matches.
    """
    full_date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',  # day first
        r'\b\d{4}-\d{2}-\d{2}\b',          # ISO-like, year first
    )
    for pattern in full_date_patterns:
        for text in lines:
            hit = re.search(pattern, text)
            if hit is not None:
                return hit.group(0)

    # A lone year is too ambiguous on its own, so demand a birth-related label.
    year_labels = ("YOB", "YEAR", "BIRTH")
    for text in lines:
        hit = re.search(r'\b(19|20)\d{2}\b', text)
        if hit is None:
            continue
        shouted = text.upper()
        if any(label in shouted for label in year_labels):
            return hit.group(0)

    return "Not found"

# A dd/mm/yyyy-style date ('.', '/' or '-' separators) anywhere in a line.
_DATE_IN_LINE_RE = re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}')
# PAN number shape: five capitals, four digits, one capital.
_PAN_NUMBER_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
# Aadhaar number shape: three groups of four digits, optionally separated by
# whitespace or '-'.
_AADHAAR_NUMBER_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
# Capitals, whitespace and dots only -- the shape the PAN name heuristic accepts.
_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
# Checked in this order on every line.
_GENDER_LABELS = ("TRANSGENDER", "FEMALE", "MALE")


def _flatten_ocr_lines(result):
    """Return the non-empty, stripped text of every OCR entry in *result*."""
    lines = []
    # PaddleOCR can report an image with no detected text as None instead of an
    # empty list; treat that (and a None overall result) as "no lines" so the
    # caller answers UNKNOWN rather than failing on iteration.
    for page in result or []:
        for entry in page or []:
            text = entry[1][0].strip()  # entry[1] holds the text first
            if text:
                lines.append(text)
    return lines


def _has_standalone_word(upper_text, word):
    """Return True if *word* occurs in *upper_text* with no A-Z letter on either side."""
    pattern = rf'(?<![A-Z]){re.escape(word)}(?![A-Z])'
    return re.search(pattern, upper_text) is not None


def _extract_pan_name(lines):
    """Return the first name-like line after the "INCOME TAX DEPARTMENT" header."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for following in lines[index + 1:]:
            possible = following.strip()
            # Header words are matched as substrings on purpose: OCR often
            # glues them together (e.g. "GOVT.OFINDIA").
            if (
                _PAN_NAME_RE.match(possible)
                and not any(x in possible for x in ("INDIA", "GOVT", "DEPARTMENT"))
                and len(possible) >= 3
            ):
                return possible
        # Only the first header occurrence is considered.
        break
    return "Not found"


def _extract_gender(lines):
    """Return "TRANSGENDER", "FEMALE", "MALE" or "Not found" for the given lines."""
    # Pass 1: accept only standalone labels, so a name that merely contains
    # the letters (e.g. "KAMALESH") cannot be mistaken for a gender.
    for line in lines:
        upper = line.upper()
        for label in _GENDER_LABELS:
            if _has_standalone_word(upper, label):
                return label
    # Pass 2: fall back to plain substring matching for OCR output where the
    # label was glued to neighbouring letters.
    for line in lines:
        upper = line.upper()
        for label in _GENDER_LABELS:
            if label in upper:
                return label
    return "Not found"


def _looks_like_aadhaar_name(candidate):
    """Return True if *candidate* is digit-free, has 2+ words and is not a label/header line."""
    if re.search(r'\d', candidate) or len(candidate.split()) < 2:
        return False
    upper = candidate.upper()
    # Header words: substring match, because OCR may merge them with neighbours.
    if any(header in upper for header in ("INDIA", "GOVERNMENT")):
        return False
    # Field labels: standalone match only, so names such as "Kamalesh Kumar"
    # or "Dobriyal" are not discarded for containing "MALE" / "DOB".
    return not any(
        _has_standalone_word(upper, label) for label in ("DOB", "MALE", "FEMALE")
    )


def _extract_aadhaar_name(lines):
    """Return the line just before a date line if name-like, else the first name-like line."""
    for previous, current in zip(lines, lines[1:]):
        if _DATE_IN_LINE_RE.search(current):
            candidate = previous.strip()
            if _looks_like_aadhaar_name(candidate):
                return candidate
    for line in lines:
        candidate = line.strip()
        if _looks_like_aadhaar_name(candidate):
            return candidate
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """
    Run OCR on *file_path* and extract KYC fields from a PAN or Aadhaar card.

    The document type is decided from the recognised text: a PAN-shaped
    number wins, otherwise an Aadhaar-shaped 12-digit number is looked for.

    Args:
        file_path: passed unchanged to ``ocr.ocr``.

    Returns:
        One of the following dicts:
          * PAN:     ``card_type``, ``pan_number``, ``dob``, ``name``
          * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``, ``name``
          * neither: ``card_type`` = "UNKNOWN" plus an ``error`` message
            (this now includes images in which no text was detected)
          * failure: only ``error``, carrying the exception text

        Fields that cannot be located hold the string "Not found".
        Exceptions are not propagated; they are reported through ``error``.
    """
    try:
        lines = _flatten_ocr_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        pan_match = _PAN_NUMBER_RE.search(full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _extract_pan_name(lines),
            }

        aadhaar_match = _AADHAAR_NUMBER_RE.search(full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _extract_gender(lines),
                "name": _extract_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not identify document as PAN or Aadhaar."
        }

    except Exception as e:
        # Boundary of the module: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}