Spaces:
Paused
Paused
File size: 2,124 Bytes
288175b 33fa314 288175b 33fa314 288175b 6248af7 288175b 6248af7 288175b 6248af7 288175b 6248af7 288175b 6248af7 288175b b336194 288175b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import json
from pathlib import Path
from typing import Dict
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# --------------------
# Load PyTorch Resume NER Model
# --------------------
# Hugging Face model id of the resume NER checkpoint loaded below.
MODEL_NAME = "manishiitg/resume-ner" # Works with PyTorch on Hugging Face Spaces
# NOTE: the tokenizer, model and pipeline are built at import time, so simply
# importing this module runs both from_pretrained() calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# Shared token-classification pipeline used by parse_resume(), which reads the
# "entity_group" and "word" fields of each result produced with
# aggregation_strategy="simple".
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# --------------------
# Extract Text from PDF/DOCX
# --------------------
def extract_text(file_path: str) -> str:
    """Return the plain text of a PDF or DOCX file.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to the document to read.

    Returns:
        The extracted text. For DOCX files this is the text of every
        paragraph in ``Document(file_path).paragraphs`` joined with newlines.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    # Normalise the extension once so both comparisons share the same value.
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return pdf_extract_text(file_path)

    if suffix == ".docx":
        # NOTE(review): only doc.paragraphs is read; confirm whether text
        # held elsewhere in the document (e.g. tables) also needs extracting.
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    # Name the offending extension so callers can tell what was rejected.
    raise ValueError(
        f"Unsupported file format: {suffix!r} (expected .pdf or .docx)"
    )
# --------------------
# Parse Resume (returns only: full name, skills, education, experience)
# --------------------
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The file's text is obtained with ``extract_text`` and passed to the
    module-level ``ner_pipeline``. Each returned entity is routed to one of
    four fields according to its upper-cased ``entity_group`` label;
    entities with any other label are ignored.

    Args:
        file_path: Path to the resume (a format ``extract_text`` accepts).
        filename: Unused by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Name parts are joined with spaces and the other
        fields with ``", "``; duplicates are dropped while keeping
        first-seen order. A field with no entities is ``"Not Found"``.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported files.
    """
    not_found = "Not Found"

    # Maps each recognised entity label to the output field it feeds.
    label_to_field = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    collected = {"name": [], "skills": [], "education": [], "experience": []}

    text = extract_text(file_path)

    # Nothing to analyse (e.g. a document with no extractable text): skip
    # model inference and fall through to the "Not Found" defaults.
    if text and text.strip():
        for ent in ner_pipeline(text):
            field = label_to_field.get(ent["entity_group"].upper())
            value = ent["word"].strip()
            # Drop unknown labels and empty values; an empty value would
            # otherwise show up as a stray separator in the joined output.
            if field is not None and value:
                collected[field].append(value)

    def _joined(field: str, separator: str) -> str:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return separator.join(dict.fromkeys(collected[field])) or not_found

    return {
        "name": _joined("name", " "),
        "skills": _joined("skills", ", "),
        "education": _joined("education", ", "),
        "experience": _joined("experience", ", "),
    }
|