from __future__ import annotations import os import re import subprocess import zipfile from typing import List, Dict import torch from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline # =============================== # Load Model & Tokenizer # =============================== MODEL_ID = "sravya-abburi/ResumeParserBERT" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForTokenClassification.from_pretrained(MODEL_ID) ner_pipeline = pipeline( "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1 ) # =============================== # Text Extraction # =============================== def extract_text(file_path: str) -> str: """Extract raw text from PDF or DOCX.""" if not file_path or not os.path.isfile(file_path): return "" lower_name = file_path.lower() try: if lower_name.endswith(".pdf"): result = subprocess.run( ["pdftotext", "-layout", file_path, "-"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False ) return result.stdout.decode("utf-8", errors="ignore") elif lower_name.endswith(".docx"): with zipfile.ZipFile(file_path) as zf: with zf.open("word/document.xml") as docx_xml: xml_bytes = docx_xml.read() xml_text = xml_bytes.decode("utf-8", errors="ignore") xml_text = re.sub(r"]*>", "\n", xml_text, flags=re.I) text = re.sub(r"<[^>]+>", " ", xml_text) return re.sub(r"\s+", " ", text) else: return "" except Exception: return "" # =============================== # Parse Resume using BERT NER # =============================== def parse_with_bert(text: str) -> Dict[str, str]: """Parse resume text into structured fields using BERT NER.""" entities = ner_pipeline(text) name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], [] for ent in entities: label = ent["entity_group"].upper() word = ent["word"].strip() if label == "NAME" and word not in name_tokens: name_tokens.append(word) elif label == "SKILL" and word not in skill_tokens: skill_tokens.append(word) elif label == "EDUCATION" and word not in edu_tokens: edu_tokens.append(word) elif label == "EXPERIENCE" and word not in exp_tokens: exp_tokens.append(word) return { "name": " ".join(name_tokens), "skills": ", ".join(skill_tokens), "education": ", ".join(edu_tokens), "experience": ", ".join(exp_tokens) } # =============================== # Main Parse Function # =============================== def parse_resume(file_path: str, filename: str) -> dict: """Main function for resume parsing.""" text = extract_text(file_path) if not text: return {"name": "", "skills": "", "education": "", "experience": ""} ents = parse_with_bert(text) # Fallback: use filename for name if model doesn't find one if not ents["name"]: base = os.path.basename(filename) base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I) ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip() return ents