Spaces:
Paused
Paused
import json
import logging
from pathlib import Path
from typing import Dict, Optional

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Identifier of the pretrained checkpoint handed to both from_pretrained() calls.
MODEL_NAME = "manishiitg/resume-ner"

# Tokenizer and model are created once, at import time, and shared by the
# pipeline below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Token-classification pipeline used by parse_resume(), which reads the
# "entity_group" and "word" keys of each result.
# NOTE(review): long resumes may exceed the model's maximum input length;
# confirm how the pipeline handles over-long text (truncation vs. `stride`).
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def extract_text(file_path: str) -> str:
    """Return the text of a PDF or DOCX file as one whitespace-normalized line.

    Args:
        file_path: Path to the document. The format is selected from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == ".pdf":
        raw_text = pdf_extract_text(file_path)
    elif suffix == ".docx":
        # Only the document's paragraphs are read, one per line.
        raw_text = "\n".join(p.text for p in Document(file_path).paragraphs)
    else:
        raise ValueError(f"Unsupported file format: {suffix!r}")

    # split() with no argument breaks on any whitespace, so besides newlines
    # and carriage returns this also removes tabs, form feeds and runs of
    # spaces that a plain "\n"/"\r" replacement would leave in the text.
    return " ".join(raw_text.split())
# Maps an upper-cased entity label to the output field it contributes to.
_RESUME_LABEL_FIELDS = {
    "NAME": "name",
    "PERSON": "name",
    "SKILL": "skills",
    "SKILLS": "skills",
    "EDUCATION": "education",
    "DEGREE": "education",
    "QUALIFICATION": "education",
    "EXPERIENCE": "experience",
    "JOB": "experience",
    "ROLE": "experience",
    "POSITION": "experience",
}


def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The file's text is run through ``ner_pipeline`` and each detected entity
    is sorted into one of four fields according to its label. Labels that map
    to no field are ignored.

    Args:
        file_path: Path to the resume, passed to ``extract_text``.
        filename: Not read by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Name parts are joined with spaces, the other fields
        with ``", "``. Duplicate values are dropped, keeping first-seen order.
        A field with no values is ``"Not Found"``.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported formats.
    """
    text = extract_text(file_path)
    # An empty document has nothing to tag, so the model call is skipped.
    entities = ner_pipeline(text) if text else []

    # Entities can contain personal data, so they are emitted at DEBUG level
    # through logging rather than printed unconditionally to stdout.
    log = logging.getLogger(__name__)
    for ent in entities:
        log.debug("entity %s => %s", ent["entity_group"], ent["word"])

    collected = {"name": [], "skills": [], "education": [], "experience": []}
    for ent in entities:
        field = _RESUME_LABEL_FIELDS.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Blank values are skipped so the joined output has no empty items.
        if field is not None and value:
            collected[field].append(value)

    def summarize(values, separator):
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return separator.join(dict.fromkeys(values)) or "Not Found"

    return {
        "name": summarize(collected["name"], " "),
        "skills": summarize(collected["skills"], ", "),
        "education": summarize(collected["education"], ", "),
        "experience": summarize(collected["experience"], ", "),
    }