# Spaces: Paused  (page header captured along with the code; not part of the program)
import json | |
from pathlib import Path | |
from typing import Dict | |
from pdfminer.high_level import extract_text as pdf_extract_text | |
from docx import Document | |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
# -------------------- | |
# Load PyTorch Resume NER Model | |
# -------------------- | |
# Identifier of the pretrained token-classification checkpoint used for resume NER.
MODEL_NAME = "manishiitg/resume-ner"

# Weights and tokenizer are loaded once, at import time, and shared by every
# call that goes through ``ner_pipeline``.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The "simple" aggregation strategy is what the downstream code relies on:
# each result is read through its "entity_group" and "word" keys.
ner_pipeline = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",
)
# -------------------- | |
# Extract Text from PDF/DOCX | |
# -------------------- | |
def extract_text(file_path: str) -> str:
    """Return the plain text of a PDF or DOCX file.

    The format is selected from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file.

    Returns:
        For a PDF, the value returned by ``pdf_extract_text``. For a DOCX,
        the text of every entry in ``Document(...).paragraphs`` joined with
        newlines.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``. The
            message names the rejected extension.
    """
    # Normalise the extension once so both comparisons share it.
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return pdf_extract_text(file_path)

    if suffix == ".docx":
        # NOTE(review): only ``paragraphs`` is read here; python-docx exposes
        # tables separately (``Document.tables``), so text laid out in tables
        # is presumably not included -- confirm for table-based resumes.
        document = Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    raise ValueError(
        f"Unsupported file format: {suffix!r} (expected '.pdf' or '.docx')"
    )
# -------------------- | |
# Parse Resume | |
# -------------------- | |
# Maps an upper-cased NER label to the output field that collects its values.
# Labels that are not listed here are ignored.
_LABEL_TO_FIELD = {
    "NAME": "name",
    "SKILL": "skills",
    "EDUCATION": "education",
    "DEGREE": "education",
    "EXPERIENCE": "experience",
    "JOB": "experience",
    "ROLE": "experience",
    "POSITION": "experience",
}


def parse_resume(file_path: str) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The file is converted to text with ``extract_text``, tagged with
    ``ner_pipeline``, and the recognised entities are grouped by label.

    Args:
        file_path: Path to the resume; passed unchanged to ``extract_text``.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Each value is the de-duplicated entity texts in
        first-seen order, joined with ``" "`` for the name and ``", "`` for
        the other fields, or ``"Not Found"`` when nothing was collected.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported formats.
    """
    collected: Dict[str, list] = {
        "name": [],
        "skills": [],
        "education": [],
        "experience": [],
    }

    text = extract_text(file_path)

    # Skip the model call when there is nothing to tag (empty or
    # whitespace-only text); every field then falls back to "Not Found".
    # NOTE(review): the whole document is sent in a single call -- confirm how
    # the pipeline behaves for text longer than the model's maximum sequence
    # length.
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        field = _LABEL_TO_FIELD.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Drop unknown labels, and drop values that are empty after
        # stripping so they cannot produce stray separators when joined.
        if field is not None and value:
            collected[field].append(value)

    def _joined(field: str, separator: str) -> str:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return separator.join(dict.fromkeys(collected[field])) or "Not Found"

    return {
        "name": _joined("name", " "),
        "skills": _joined("skills", ", "),
        "education": _joined("education", ", "),
        "experience": _joined("experience", ", "),
    }