from paddleocr import PaddleOCR
import re

# Shared OCR engine, built once at import time and reused by
# extract_kyc_fields() for every document it processes.
# Configured for English text (lang='en') with use_angle_cls=True
# (presumably enables PaddleOCR's text-orientation classifier -- confirm
# against the installed PaddleOCR version).
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def _extract_dob(lines):
    for line in lines:
        m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
        if m: return m.group(0)
    for line in lines:
        m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
        if m: return m.group(0)
    for line in lines:
        m = re.search(r'\b(19|20)\d{2}\b', line)
        if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
            return m.group(0)
    return "Not found"

# Substrings that disqualify a line from being a PAN holder name.
_PAN_NAME_STOPWORDS = ("INDIA", "GOVT", "DEPARTMENT")

# Substrings that disqualify a line from being an Aadhaar holder name.
_AADHAAR_NAME_STOPWORDS = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")


def _collect_text_lines(result):
    """Flatten a PaddleOCR result into a list of stripped, non-empty strings."""
    lines = []
    for block in result or []:
        # PaddleOCR 2.x can report a page with no detected text as None;
        # skip it so an empty image is classified as UNKNOWN rather than
        # surfacing as an "OCR processing failed" TypeError.
        if not block:
            continue
        for entry in block:
            # The code reads the recognised string from entry[1][0].
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_pan_name(lines):
    """Return the first name-like line after the PAN card header, or "Not found"."""
    for idx, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for following in lines[idx + 1:]:
            candidate = following.strip()
            # Only upper-case letters, whitespace and dots are accepted, so
            # the pattern already rules out digits.
            if (
                re.match(r'^[A-Z\s.]+$', candidate)
                and len(candidate) >= 3
                and not any(word in candidate.upper() for word in _PAN_NAME_STOPWORDS)
            ):
                return candidate
        # Only the first header occurrence is considered.
        break
    return "Not found"


def _find_gender(lines):
    """Return the gender keyword from the first line mentioning one, or "Not found"."""
    for line in lines:
        upper_line = line.upper()
        # Order matters: "MALE" is a substring of "FEMALE", so the longer
        # keywords must be tested first.
        for keyword in ("TRANSGENDER", "FEMALE", "MALE"):
            if keyword in upper_line:
                return keyword
    return "Not found"


def _is_aadhaar_name(candidate):
    """Return True when *candidate* has 2+ words, no digits and no stopwords."""
    if re.search(r'\d', candidate):
        return False
    if len(candidate.split()) < 2:
        return False
    upper_candidate = candidate.upper()
    return not any(word in upper_candidate for word in _AADHAAR_NAME_STOPWORDS)


def _find_aadhaar_name(lines):
    """Return the most likely Aadhaar holder name, or "Not found".

    The line directly above a ``DD<sep>MM<sep>YYYY`` date is preferred;
    failing that, the first line anywhere that looks like a name is used.
    """
    for idx, line in enumerate(lines):
        if idx > 0 and re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line):
            candidate = lines[idx - 1].strip()
            if _is_aadhaar_name(candidate):
                return candidate
    for line in lines:
        candidate = line.strip()
        if _is_aadhaar_name(candidate):
            return candidate
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """Run OCR on a document image and extract PAN or Aadhaar KYC fields.

    The document is classified by the first identifier pattern found in the
    recognised text. A PAN number (five capitals, four digits, one capital)
    takes precedence over an Aadhaar number (twelve digits, optionally
    grouped in fours by a space or hyphen).

    Args:
        file_path: Location of the image, passed straight to ``ocr.ocr``.

    Returns:
        One of the following dicts. Any field that cannot be located holds
        the string ``"Not found"``.

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``
        * Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``
        * Unrecognised document, including an image with no detected text:
          ``{"card_type": "UNKNOWN", "error": ...}``
        * Any exception during OCR or parsing: ``{"error": ...}``

    This function does not raise; failures are reported through the
    ``error`` key.
    """
    try:
        lines = _collect_text_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _find_pan_name(lines),
            }

        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),
                "gender": _find_gender(lines),
                "name": _find_aadhaar_name(lines),
            }

        return {"card_type": "UNKNOWN", "error": "Could not identify document as PAN or Aadhaar."}

    except Exception as e:
        # Deliberate catch-all boundary: callers get an error dict, never an
        # exception.
        return {"error": f"OCR processing failed: {e}"}