from __future__ import annotations
import os
import re
import subprocess
import zipfile
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ===============================
# Load Model & Tokenizer
# ===============================
# Hugging Face Hub identifier of the token-classification checkpoint used
# for resume parsing.
MODEL_ID = "sravya-abburi/ResumeParserBERT"

# NOTE: these loads run at import time, so importing this module already
# requires the checkpoint to be available.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Device index handed to the pipeline: 0 when torch reports CUDA as
# available, otherwise -1.
_pipeline_device = 0 if torch.cuda.is_available() else -1

# aggregation_strategy="simple" is what lets parse_with_bert read an
# "entity_group" / "word" pair from every returned entity.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=_pipeline_device,
    aggregation_strategy="simple",
)

# ===============================
# Text Extraction
# ===============================
def _docx_xml_to_text(xml_text: str) -> str:
    """Turn raw ``word/document.xml`` markup into single-spaced plain text.

    Only the contents of ``<w:t>`` (run text) elements are kept. Runs inside
    one paragraph are concatenated without a separator, because Word
    routinely splits a single word across several runs; paragraphs, tabs and
    explicit breaks are rendered as a single space.
    """
    import html  # local import: only needed for entity decoding here

    # Tabs and breaks are empty sibling elements of <w:t>; rewrite them as a
    # text run holding one space so they still separate the words around them.
    xml_text = re.sub(r"<w:(?:tab|br|cr)\b[^>]*>", "<w:t> </w:t>", xml_text)

    paragraphs = []
    for paragraph_xml in xml_text.split("</w:p>"):
        # `(?:\s[^>]*)?` keeps <w:tab>, <w:tbl>, <w:tc>, ... from matching,
        # and `(?<!/)` skips self-closing (empty) <w:t .../> elements.
        runs = re.findall(
            r"<w:t(?:\s[^>]*)?(?<!/)>(.*?)</w:t>", paragraph_xml, flags=re.S
        )
        paragraphs.append("".join(runs))
    text = " ".join(paragraphs)

    if not text.strip():
        # No <w:t> runs found (e.g. a producer using another namespace
        # prefix): fall back to stripping every tag.
        text = re.sub(r"<[^>]+>", " ", xml_text)

    # Decode entities only after the markup is gone, so an escaped "&lt;"
    # can never be mistaken for a real tag.
    return re.sub(r"\s+", " ", html.unescape(text)).strip()


def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX file.

    Args:
        file_path: Path to the document. The format is chosen from the
            file extension, compared case-insensitively.

    Returns:
        For ``.pdf``: the standard output of the external ``pdftotext``
        command (run with ``-layout``), decoded as UTF-8 with undecodable
        bytes dropped. For ``.docx``: the document text on a single line
        with whitespace collapsed and XML entities decoded. An empty string
        is returned when the path is empty, is not an existing file, has any
        other extension, or when extraction raises.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # check=False: a failing pdftotext run simply yields whatever
            # (possibly empty) stdout it produced.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")

        if lower_name.endswith(".docx"):
            # A .docx is a zip archive; the main body is word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                xml_bytes = zf.read("word/document.xml")
            return _docx_xml_to_text(xml_bytes.decode("utf-8", errors="ignore"))

        return ""
    except Exception:
        # Deliberately best-effort: a missing pdftotext binary, a corrupt
        # archive or a missing document part all mean "no text available".
        return ""

# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Run the NER pipeline on *text* and group the entities into fields.

    Each entity's ``entity_group`` is upper-cased and matched against NAME,
    SKILL, EDUCATION and EXPERIENCE; entities with any other label are
    ignored. Within a label, the stripped ``word`` values are de-duplicated
    while keeping first-seen order.

    Returns:
        A dict with the keys ``name`` (words joined by a space) and
        ``skills``, ``education``, ``experience`` (words joined by ``", "``).
        A field with no matching entity is an empty string.
    """
    # One insertion-ordered dict per label: its keys serve as an ordered set.
    collected = {"NAME": {}, "SKILL": {}, "EDUCATION": {}, "EXPERIENCE": {}}

    for entity in ner_pipeline(text):
        group = entity["entity_group"].upper()
        surface = entity["word"].strip()
        bucket = collected.get(group)
        if bucket is not None:
            bucket.setdefault(surface, None)

    return {
        "name": " ".join(collected["NAME"]),
        "skills": ", ".join(collected["SKILL"]),
        "education": ", ".join(collected["EDUCATION"]),
        "experience": ", ".join(collected["EXPERIENCE"]),
    }

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse the resume stored at *file_path* into structured fields.

    Args:
        file_path: Location of the document on disk, passed to
            ``extract_text``.
        filename: File name used to derive a candidate name when the model
            finds none.

    Returns:
        A dict with the keys ``name``, ``skills``, ``education`` and
        ``experience``. All four are empty strings when no text could be
        extracted.
    """
    resume_text = extract_text(file_path)
    if not resume_text:
        return dict.fromkeys(("name", "skills", "education", "experience"), "")

    fields = parse_with_bert(resume_text)
    if fields["name"]:
        return fields

    # No NAME entity found: derive the name from the file name by dropping
    # the extension, turning separators into spaces and title-casing.
    stem = re.sub(r"\.(pdf|docx|doc)$", "", os.path.basename(filename), flags=re.I)
    fields["name"] = re.sub(r"[\._-]+", " ", stem).title().strip()
    return fields