from paddleocr import PaddleOCR
import re

# Module-level OCR engine: built once when this module is imported and reused by
# extract_kyc_fields() below, so the engine is not re-created on every request.
# Per the original author's note, the model files are downloaded the first time
# the engine is used.
# lang='en' selects the English model. To support another language, pass that
# language's PaddleOCR code instead (e.g. 'hi' or 'mr'); check the PaddleOCR
# documentation for the list of supported codes.
# NOTE(review): use_angle_cls=True presumably loads the text-orientation
# classifier that the cls=True argument at the call site relies on -- confirm
# against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Date patterns used by _extract_dob; compiled once instead of on every call.
_DOB_DMY_RE = re.compile(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b')   # dd/mm/yyyy, dd-mm-yyyy, dd.mm.yyyy
_DOB_YMD_RE = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')           # yyyy-mm-dd
_DOB_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')               # bare 19xx / 20xx year
# Labels that tie a line to the holder's birth date (checked on upper-cased text).
_DOB_BIRTH_LABELS = ("DOB", "D.O.B", "YOB", "BIRTH")
# Labels that make a bare year acceptable in the unlabelled fallback pass.
_DOB_YEAR_LABELS = ("YOB", "YEAR", "BIRTH")


def _extract_dob(lines):
    """
    Return the date (or year) of birth found in OCR text lines.

    Search order:
      1. Lines carrying a birth label (DOB / D.O.B / YOB / BIRTH). On such a
         line a dd{sep}mm{sep}yyyy date is preferred, then yyyy-mm-dd, then a
         bare 19xx/20xx year. This pass runs first so that an unrelated date
         printed earlier on the card (e.g. an issue date) is not mistaken for
         the date of birth.
      2. Any line with dd/mm/yyyy | dd-mm-yyyy | dd.mm.yyyy.
      3. Any line with yyyy-mm-dd.
      4. A bare year on a line labelled YOB / YEAR / BIRTH.

    Args:
        lines: iterable of text strings, one per OCR line.

    Returns:
        The matched text exactly as it appears in the line, or the string
        "Not found" when nothing matches.
    """
    # Materialise once: the input is walked several times below.
    lines = list(lines)

    # Pass 1: trust explicitly labelled lines before anything else.
    for line in lines:
        upper = line.upper()
        if not any(label in upper for label in _DOB_BIRTH_LABELS):
            continue
        for pattern in (_DOB_DMY_RE, _DOB_YMD_RE, _DOB_YEAR_RE):
            m = pattern.search(line)
            if m:
                return m.group(0)

    # Pass 2/3: no labelled hit -- first full date anywhere, dd-mm-yyyy style
    # taking precedence over yyyy-mm-dd across the whole document.
    for pattern in (_DOB_DMY_RE, _DOB_YMD_RE):
        for line in lines:
            m = pattern.search(line)
            if m:
                return m.group(0)

    # Pass 4: a bare year only counts when the line says what the year is.
    for line in lines:
        m = _DOB_YEAR_RE.search(line)
        if m and any(label in line.upper() for label in _DOB_YEAR_LABELS):
            return m.group(0)

    return "Not found"

# Gender keywords, tested in this order on upper-cased text. The look-arounds
# require that the keyword is not glued to other ASCII letters, so a name such
# as "KAMALESH" is not read as "MALE" and "FEMALE" is not read as "MALE".
_KYC_GENDER_PATTERNS = (
    ("TRANSGENDER", re.compile(r'(?<![A-Z])TRANSGENDER(?![A-Z])')),
    ("FEMALE", re.compile(r'(?<![A-Z])FEMALE(?![A-Z])')),
    ("MALE", re.compile(r'(?<![A-Z])MALE(?![A-Z])')),
)
# Upper-cased text matching this cannot be the holder's name on an Aadhaar card.
# DOB/MALE/FEMALE must stand alone as words (names like "DOBRIYAL" or
# "KAMALESH" stay eligible); INDIA/GOVERNMENT are matched anywhere so header
# text is rejected even when OCR glues words together.
_KYC_AADHAAR_NON_NAME_RE = re.compile(
    r'(?<![A-Z])(?:DOB|MALE|FEMALE)(?![A-Z])|INDIA|GOVERNMENT'
)
_KYC_DATE_LINE_RE = re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}')
_KYC_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')


def _kyc_text_lines(result):
    """Flatten a PaddleOCR result into a list of non-empty, stripped text lines."""
    lines = []
    for block in result or []:
        # PaddleOCR 2.x is known to return [None] for an image in which no text
        # was detected; skip such blocks instead of failing on iteration.
        if not block:
            continue
        for entry in block:
            # Each entry is indexed as entry[1][0] to get the recognised text.
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _kyc_pan_name(lines):
    """Return the first name-like line after the "INCOME TAX DEPARTMENT" header."""
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[i + 1:]:
            candidate = candidate.strip()
            # The pattern admits only A-Z, whitespace and dots, so no separate
            # digit or upper-casing step is needed before the keyword filter.
            if (
                len(candidate) >= 3
                and _KYC_PAN_NAME_RE.match(candidate)
                and not any(x in candidate for x in ("INDIA", "GOVT", "DEPARTMENT"))
            ):
                return candidate
        # Only the first header occurrence is considered.
        break
    return "Not found"


def _kyc_is_aadhaar_name(candidate):
    """Tell whether a line could be a holder name: no digits, 2+ words, no card keywords."""
    return (
        not re.search(r'\d', candidate)
        and len(candidate.split()) >= 2
        and not _KYC_AADHAAR_NON_NAME_RE.search(candidate.upper())
    )


def _kyc_aadhaar_name(lines):
    """Return the Aadhaar holder name: the line before a date line, else the first plausible line."""
    # Preferred: the line printed immediately above a dd-mm-yyyy style date.
    for i, line in enumerate(lines):
        if i > 0 and _KYC_DATE_LINE_RE.search(line):
            candidate = lines[i - 1].strip()
            if _kyc_is_aadhaar_name(candidate):
                return candidate
    # Fallback: first line anywhere that looks like a name.
    for line in lines:
        candidate = line.strip()
        if _kyc_is_aadhaar_name(candidate):
            return candidate
    return "Not found"


def _kyc_gender(lines):
    """Return TRANSGENDER / FEMALE / MALE from the first line naming one, else "Not found"."""
    for line in lines:
        upper = line.upper()
        for label, pattern in _KYC_GENDER_PATTERNS:
            if pattern.search(upper):
                return label
    return "Not found"


def extract_kyc_fields(file_path: str) -> dict:
    """
    Run OCR on a document image and extract PAN or Aadhaar fields.

    The card type is decided from the recognised text: a PAN-number pattern
    (5 letters, 4 digits, 1 letter) wins over a 12-digit Aadhaar pattern.

    Args:
        file_path: path of the image handed to the module-level OCR engine.

    Returns:
        One of the following dicts (fields that cannot be located hold the
        string "Not found"):
          - PAN:     {"card_type": "PAN", "pan_number", "dob", "name"}
          - Aadhaar: {"card_type": "AADHAAR", "aadhaar_number", "dob",
                      "gender", "name"}
          - neither pattern found (including an image with no readable text):
                     {"card_type": "UNKNOWN", "error": ...}
          - any exception during processing:
                     {"error": "OCR processing failed: <message>"}
        Exceptions derived from Exception are never propagated to the caller.
    """
    try:
        # OCR text extraction, flattened to plain text lines.
        lines = _kyc_text_lines(ocr.ocr(file_path, cls=True))
        full_text = "\n".join(lines)

        # ===================== PAN CARD =====================
        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        if pan_match:
            return {
                "card_type": "PAN",
                "pan_number": pan_match.group(0),
                "dob": _extract_dob(lines),
                "name": _kyc_pan_name(lines),
            }

        # ===================== AADHAAR CARD =====================
        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
        if aadhaar_match:
            return {
                "card_type": "AADHAAR",
                "aadhaar_number": aadhaar_match.group(0),
                "dob": _extract_dob(lines),  # may be a bare year (YOB)
                "gender": _kyc_gender(lines),
                "name": _kyc_aadhaar_name(lines),
            }

        return {
            "card_type": "UNKNOWN",
            "error": "Could not identify document as PAN or Aadhaar."
        }

    except Exception as e:
        # Top-level boundary: callers receive an error dict rather than an exception.
        return {"error": f"OCR processing failed: {e}"}