from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import zipfile

# ===============================
# Load Model & Tokenizer
# ===============================
MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Change to Kiet/autotrain-resume_parser-1159242747 if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
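# aggregation_strategy="simple" merges word-piece tokens back into whole-word
# entities, so results carry "entity_group" labels instead of raw sub-tokens.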
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ===============================
# Extract Text (PDF & DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX without external dependencies."""
    file_path_lower = file_path.lower()

    # PDF reading using PyMuPDF (list "pymupdf" in requirements.txt on Spaces)
    if file_path_lower.endswith(".pdf"):
        import fitz  # PyMuPDF
        text = ""
        with fitz.open(file_path) as pdf_doc:
            for page in pdf_doc:
                text += page.get_text()
        return text

    # DOCX reading by extracting XML content
    elif file_path_lower.endswith(".docx"):
        with zipfile.ZipFile(file_path) as zf:
            with zf.open("word/document.xml") as docx_xml:
                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
                xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
                return re.sub(r"<[^>]+>", " ", xml_text)

    return ""

# ===============================
# Parse Resume
# ===============================
def parse_resume(file_path: str, filename: str | None = None) -> dict:
    """Parse a resume and extract structured information.

    `filename` is accepted for caller compatibility but is currently unused.
    """
    text = extract_text(file_path)

    # BERT caps inputs at 512 tokens; feed the pipeline rough character
    # chunks so long resumes don't overflow the position embeddings.
    chunk_size = 1500
    entities = []
    for start in range(0, len(text), chunk_size):
        entities.extend(ner_pipeline(text[start:start + chunk_size]))

    name, skills, education, experience = [], [], [], []
    # Label names below follow this model's tag set; adjust if you swap models.
    for ent in entities:
        label = ent["entity_group"].upper()
        word = ent["word"].strip()
        if label == "NAME":
            name.append(word)
        elif label == "SKILL":
            skills.append(word)
        elif label in ["EDUCATION", "DEGREE"]:
            education.append(word)
        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
            experience.append(word)

    # dict.fromkeys() de-duplicates each list while preserving first-seen order
    return {
        "name": " ".join(dict.fromkeys(name)),
        "skills": ", ".join(dict.fromkeys(skills)),
        "education": ", ".join(dict.fromkeys(education)),
        "experience": ", ".join(dict.fromkeys(experience))
    }
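
# ===============================
# Example Usage
# ===============================
# Minimal smoke test; "sample_resume.pdf" is an illustrative path, not part
# of the original app, and this assumes the model download succeeds.
if __name__ == "__main__":
    parsed = parse_resume("sample_resume.pdf")
    for field, value in parsed.items():
        print(f"{field}: {value}")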