"""Parse resumes (.pdf / .docx) into structured fields with a HuggingFace NER model."""

import json
import logging
from pathlib import Path
from typing import Dict, Optional

from docx import Document
from pdfminer.high_level import extract_text as pdf_extract_text
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

logger = logging.getLogger(__name__)

MODEL_NAME = "manishiitg/resume-ner"

# NOTE: the model is loaded eagerly at import time — importing this module is
# slow and needs network/disk access for the weights on first use.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


def extract_text(file_path: str) -> str:
    """Extract plain text from a .pdf or .docx file.

    Args:
        file_path: path to the resume file.

    Returns:
        The file's text with CR/LF collapsed to spaces and stripped, so the
        NER pipeline receives one continuous string.

    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    suffix = Path(file_path).suffix.lower()  # compute once, not per branch
    if suffix == ".pdf":
        text = pdf_extract_text(file_path)
    elif suffix == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    else:
        raise ValueError("Unsupported file format")
    # Clean text: flatten line breaks to spaces for the downstream model.
    return text.replace("\n", " ").replace("\r", " ").strip()


def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Run NER over a resume file and bucket detected entities into fields.

    Args:
        file_path: path to a .pdf or .docx resume.
        filename: original filename; currently unused, kept for backward
            compatibility with existing callers.

    Returns:
        Dict with keys ``name``, ``skills``, ``education``, ``experience``.
        Each value is an order-preserving, de-duplicated join of the detected
        entity words, or ``"Not Found"`` when nothing was detected.
    """
    text = extract_text(file_path)
    entities = ner_pipeline(text)

    # Detected entities go through logging, not stdout (was a debug print).
    for ent in entities:
        logger.debug("entity %s => %s", ent["entity_group"], ent["word"])

    name_parts, skills, education, experience = [], [], [], []
    for ent in entities:
        label = ent["entity_group"].upper()
        value = ent["word"].strip()
        if label in ("NAME", "PERSON"):
            name_parts.append(value)
        elif label in ("SKILL", "SKILLS"):
            skills.append(value)
        elif label in ("EDUCATION", "DEGREE", "QUALIFICATION"):
            education.append(value)
        elif label in ("EXPERIENCE", "JOB", "ROLE", "POSITION"):
            experience.append(value)

    def _dedup_join(parts, sep: str = ", ") -> str:
        # dict.fromkeys keeps first-seen order while removing duplicates.
        return sep.join(dict.fromkeys(parts)) or "Not Found"

    return {
        "name": _dedup_join(name_parts, sep=" "),
        "skills": _dedup_join(skills),
        "education": _dedup_join(education),
        "experience": _dedup_join(experience),
    }