SMART_KYC_OCR

Sleeping

File size: 3,888 Bytes

ae2e698
c70099c
 
f7a759e
5ebcb93
ae2e698
b07dfbb
 
 
f7a759e
b07dfbb
 
f7a759e
b07dfbb
 
 
 
 
 
 
a8683a1
ae2e698
 
8324e53
 
 
 
 
 
ae2e698
8324e53
ae2e698
a726fb2
 
 
 
 
 
 
b07dfbb
f7a759e
a726fb2
 
 
2c3e33d
 
b07dfbb
2c3e33d
 
 
 
655e2d6
2c3e33d
 
e2cb16a
b07dfbb
e2cb16a
b07dfbb
2c3e33d
b07dfbb
2c3e33d
 
 
 
f7a759e
a726fb2
b07dfbb
a726fb2
 
 
349558e
b07dfbb
f7a759e
b07dfbb
f7a759e
b07dfbb
f7a759e
a726fb2
 
 
 
655e2d6
b07dfbb
e2cb16a
b07dfbb
 
 
e2cb16a
b07dfbb
e2cb16a
 
 
b07dfbb
e2cb16a
b07dfbb
 
 
e2cb16a
b07dfbb
e2cb16a
a726fb2
 
 
a8683a1
 
8324e53

from paddleocr import PaddleOCR
import re

# Shared module-level OCR engine: constructed once when this module is
# imported and reused by extract_kyc_fields() for every document.
# lang='en' selects the English model; use_angle_cls=True presumably turns on
# PaddleOCR's text-angle classifier (matches the cls=True passed at call
# time) -- confirm against the PaddleOCR documentation.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def _extract_dob(lines):
    """Return the first date-of-birth-like string found in *lines*.

    The search runs in three passes, each over every line in order, and the
    first hit wins:

    1. A ``DD?MM?YYYY`` date whose separators are ``.``, ``/`` or ``-``.
    2. An ISO-style ``YYYY-MM-DD`` date.
    3. A bare four-digit year starting with 19 or 20, accepted only when the
       same line (upper-cased) also contains "YOB", "YEAR" or "BIRTH".

    Returns the matched text, or the string "Not found" when no pass matches.
    """
    full_date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    )
    # An earlier pattern beats a later one even if the later pattern would
    # match on an earlier line, so the pattern loop is the outer loop.
    for pattern in full_date_patterns:
        for text in lines:
            hit = re.search(pattern, text)
            if hit is not None:
                return hit.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH")
    for text in lines:
        year_hit = re.search(r'\b(19|20)\d{2}\b', text)
        if year_hit is None:
            continue
        # Only the first year on the line is considered, and only if the line
        # carries a birth-related label.
        shouted = text.upper()
        if any(label in shouted for label in birth_labels):
            return year_hit.group(0)

    return "Not found"

def _collect_text_lines(result):
    """Flatten an OCR result into a list of non-empty, stripped text strings."""
    lines = []
    for block in result or []:
        # PaddleOCR can hand back None (instead of a list) for an image on
        # which no text was detected; treat that as "no lines" rather than
        # letting the iteration raise TypeError.
        if not block:
            continue
        for entry in block:
            # The recognised text is read from entry[1][0].
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_pan_name(lines):
    """Return the holder name following the PAN card header, or "Not found"."""
    for idx, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in text.upper():
            continue
        for following in lines[idx + 1:]:
            possible = following.strip()
            # The pattern admits only capitals, whitespace and dots, so a
            # separate "contains a digit" test is unnecessary.
            if (
                re.match(r'^[A-Z\s.]+$', possible)
                and not any(x in possible.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
                and len(possible) >= 3
            ):
                return possible
        # Only the first header occurrence is considered.
        return "Not found"
    return "Not found"


def _find_gender(lines):
    """Return "TRANSGENDER", "FEMALE" or "MALE" from the first line naming one."""
    for text in lines:
        shouted = text.upper()
        # Order matters: "FEMALE" contains "MALE", so it must be tested first.
        for label in ("TRANSGENDER", "FEMALE", "MALE"):
            if label in shouted:
                return label
    return "Not found"


def _is_plausible_aadhaar_name(candidate):
    """Return True if *candidate* is digit-free, has 2+ words and no label word."""
    if re.search(r'\d', candidate) or len(candidate.split()) < 2:
        return False
    # Match label words as whole words only: a plain substring test rejects
    # real names such as "Kamalesh" (contains "MALE") or "Dobariya" ("DOB").
    return not re.search(
        r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b', candidate.upper()
    )


def _find_aadhaar_name(lines):
    """Return the holder name from Aadhaar text lines, or "Not found"."""
    # Preferred: the line directly above a line containing a DD?MM?YYYY date.
    for idx, text in enumerate(lines):
        if idx > 0 and re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', text):
            candidate = lines[idx - 1].strip()
            if _is_plausible_aadhaar_name(candidate):
                return candidate
    # Fallback: the first line anywhere that looks like a name.
    for text in lines:
        candidate = text.strip()
        if _is_plausible_aadhaar_name(candidate):
            return candidate
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """Run OCR on a document image and extract PAN or Aadhaar fields.

    The file is passed to the module-level ``ocr`` engine, the recognised
    text lines are collected, and the document is classified by the first
    identifier pattern found: a PAN number (five capitals, four digits, one
    capital) takes precedence over a 12-digit Aadhaar number (optionally
    grouped in fours by whitespace or hyphens).

    Args:
        file_path: Path of the image handed to ``ocr.ocr``.

    Returns:
        One of the following dicts:

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``.
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``.
        * Unrecognised document (including an image with no detected text):
          ``card_type`` set to ``"UNKNOWN"`` plus an ``error`` message.
        * Any exception during processing: a dict with only ``error``.

        Fields that cannot be located hold the string ``"Not found"``.
    """
    # Deliberately broad: this function is the boundary that turns every
    # failure into an error dict, so it never raises to its caller.
    try:
        result = ocr.ocr(file_path, cls=True)
        lines = _collect_text_lines(result)
        full_text = "\n".join(lines)

        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)

        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _find_pan_name(lines),
            }

        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}

    except Exception as e:
        return {"error": f"OCR processing failed: {str(e)}"}