import json
from pathlib import Path
from typing import Dict

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# --------------------
# Load Resume NER Model
# --------------------
MODEL_NAME = "Ioana23/bert-finetuned-resumes-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" merges word-piece tokens into whole-word entities,
# so each result carries an "entity_group" label and a "word" span.
# Note: BERT-style models generally cap input at 512 tokens, so very long
# resumes may need to be chunked before being passed to the pipeline.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# --------------------
# Extract Text from PDF/DOCX
# --------------------
def extract_text(file_path: str) -> str:
    path = Path(file_path)
    if path.suffix.lower() == ".pdf":
        return pdf_extract_text(file_path)
    elif path.suffix.lower() == ".docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        raise ValueError(f"Unsupported file format: {path.suffix}")

# --------------------
# Parse Resume
# --------------------
def parse_resume(file_path: str) -> Dict[str, str]:
    text = extract_text(file_path)
    entities = ner_pipeline(text)

    name = []
    skills = []
    education = []
    experience = []

    # Bucket entities by label. The label names checked here assume the tag set
    # used by the fine-tuned resume model (NAME, SKILL, EDUCATION/DEGREE, etc.).
    for ent in entities:
        label = ent["entity_group"].upper()
        value = ent["word"].strip()

        if label == "NAME":
            name.append(value)
        elif label == "SKILL":
            skills.append(value)
        elif label in ["EDUCATION", "DEGREE"]:
            education.append(value)
        elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
            experience.append(value)

    # dict.fromkeys() deduplicates while preserving the order of first appearance.
    return {
        "name": " ".join(dict.fromkeys(name)) or "Not Found",
        "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
        "education": ", ".join(dict.fromkeys(education)) or "Not Found",
        "experience": ", ".join(dict.fromkeys(experience)) or "Not Found",
    }
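
# --------------------
# Example Usage (illustrative sketch)
# --------------------
# A minimal sketch of calling parse_resume and printing the result as JSON.
# The path "sample_resume.pdf" is a hypothetical placeholder, not part of the
# original script; substitute any local .pdf or .docx file.
if __name__ == "__main__":
    parsed = parse_resume("sample_resume.pdf")
    print(json.dumps(parsed, indent=2))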