Spaces:

husseinelsaadi
/

Codingo

Paused

File size: 2,391 Bytes

efffc2e
864c2ae
b336194
a511250
 
864c2ae
 
a511250
 
 
efffc2e
864c2ae
af02e64
a511250
864c2ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a511250
 
864c2ae
efffc2e
d4b2339
864c2ae
a511250
efffc2e
a511250
efffc2e
a511250
 
 
 
 
864c2ae
 
 
 
 
 
 
 
efffc2e
 
a511250
 
 
 
efffc2e

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import subprocess, zipfile, re, os

# === Load pretrained HF model ===
# Alternative checkpoint: "Kiet/autotrain-resume_parser-1159242747"
MODEL_NAME = "sravya-abburi/ResumeParserBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Inference is pinned to the CPU (device=-1) for stability, so this pipeline
# does not run into GPU memory issues caused by other parts of the app.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=-1,
    aggregation_strategy="simple",
)

# === Extract text from PDF/DOCX ===
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDF files are converted by running the external ``pdftotext`` command
    with ``-layout`` and reading its standard output. DOCX files are read by
    opening the archive, taking ``word/document.xml``, turning each paragraph
    start tag into a newline, replacing every remaining tag with a space and
    finally decoding XML character entities (``&amp;`` -> ``&``).

    Args:
        file_path: Path to the resume. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, or ``""`` when the extension is neither ``.pdf``
        nor ``.docx``. For PDFs the exit status of ``pdftotext`` is not
        checked, so a failed conversion yields whatever reached stdout.

    Errors raised while launching ``pdftotext`` or while opening the archive
    and its ``word/document.xml`` member are not caught here and propagate to
    the caller.
    """
    # Local import: only the DOCX branch needs it.
    from html import unescape

    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        result = subprocess.run(
            ["pdftotext", "-layout", file_path, "-"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False,
        )
        return result.stdout.decode("utf-8", errors="ignore")

    if lowered.endswith(".docx"):
        with zipfile.ZipFile(file_path) as zf:
            with zf.open("word/document.xml") as docx_xml:
                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
        # The word boundary keeps this to real paragraph tags (<w:p>, <w:p ...>);
        # without it <w:pPr>, <w:pStyle .../> etc. would each add a line break.
        xml_text = re.sub(r"<w:p\b[^>]*>", "\n", xml_text, flags=re.I)
        # Strip the markup first, then decode entities, so an escaped "&lt;"
        # in the document text is never mistaken for the start of a tag.
        return unescape(re.sub(r"<[^>]+>", " ", xml_text))

    return ""

# === Parse resume with NER ===
def parse_resume(file_path: str, filename: str = None) -> dict:
    """Parse a resume and collect its name, skills, education and experience.

    The text produced by ``extract_text`` is run through ``ner_pipeline`` and
    every returned entity is sorted into one of four buckets according to its
    upper-cased ``entity_group``:

    * ``NAME`` -> name
    * ``SKILL`` -> skills
    * ``EDUCATION`` / ``DEGREE`` -> education
    * ``EXPERIENCE`` / ``JOB`` / ``ROLE`` -> experience

    Entities with any other label are ignored.

    Args:
        file_path: Path of the resume file handed to ``extract_text``.
        filename: Accepted but not used by this function.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Each value is a string of the unique entity words in
        first-seen order; name parts are joined with a space, the other
        fields with ``", "``. All four values are empty strings when no text
        could be extracted from the file.
    """
    text = extract_text(file_path)

    name, skills, education, experience = [], [], [], []
    bucket_for_label = {
        "NAME": name,
        "SKILL": skills,
        "EDUCATION": education,
        "DEGREE": education,
        "EXPERIENCE": experience,
        "JOB": experience,
        "ROLE": experience,
    }

    # extract_text returns "" for unsupported files; with nothing to tag there
    # is no point in running the model, so go straight to the empty result.
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        word = ent["word"].strip()
        label = ent["entity_group"].upper()

        # Skip empty or placeholder tokens
        if not word or word.startswith("LABEL_"):
            continue

        bucket = bucket_for_label.get(label)
        if bucket is not None:
            bucket.append(word)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return {
        "name": " ".join(dict.fromkeys(name)),
        "skills": ", ".join(dict.fromkeys(skills)),
        "education": ", ".join(dict.fromkeys(education)),
        "experience": ", ".join(dict.fromkeys(experience)),
    }