from __future__ import annotations

import os
import re
import subprocess
import zipfile
from typing import Dict

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ===============================
# Load Model & Tokenizer
# ===============================
MODEL_ID = "sravya-abburi/ResumeParserBERT"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)
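
# The label names matched in parse_with_bert below (NAME, SKILL, EDUCATION,
# EXPERIENCE) are assumptions about this checkpoint's label set. To confirm
# what the model actually emits, inspect its config (uncomment locally):
# print(model.config.id2label)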

# ===============================
# Text Extraction
# ===============================
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX file."""
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # Shell out to pdftotext (poppler-utils); "-" writes to stdout.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")
        elif lower_name.endswith(".docx"):
            # A .docx file is a ZIP archive; the body text lives in word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_bytes = docx_xml.read()
            xml_text = xml_bytes.decode("utf-8", errors="ignore")
            # Turn paragraph openings into newlines, then strip the remaining tags.
            xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
            text = re.sub(r"<[^>]+>", " ", xml_text)
            # Collapse runs of spaces/tabs, but keep the newlines inserted above
            # (collapsing all whitespace would erase the paragraph breaks).
            text = re.sub(r"[ \t]+", " ", text)
            return re.sub(r"\n\s*\n+", "\n", text).strip()
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Parse resume text into structured fields using BERT NER.

    Note: BERT-style encoders are typically capped at 512 tokens, so very
    long resumes may be truncated (or rejected) by the pipeline.
    """
    entities = ner_pipeline(text)
    name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], []
    for ent in entities:
        label = ent["entity_group"].upper()
        word = ent["word"].strip()
        if label == "NAME" and word not in name_tokens:
            name_tokens.append(word)
        elif label == "SKILL" and word not in skill_tokens:
            skill_tokens.append(word)
        elif label == "EDUCATION" and word not in edu_tokens:
            edu_tokens.append(word)
        elif label == "EXPERIENCE" and word not in exp_tokens:
            exp_tokens.append(word)
    return {
        "name": " ".join(name_tokens),
        "skills": ", ".join(skill_tokens),
        "education": ", ".join(edu_tokens),
        "experience": ", ".join(exp_tokens),
    }
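
# For inputs longer than the encoder's window, one workable approach (a sketch,
# not part of the original pipeline) is to run NER over overlapping character
# chunks and merge the entity lists before grouping them by label:
def _ner_chunked(text: str, chunk_chars: int = 1500, overlap: int = 200) -> list:
    """Hypothetical helper: run ner_pipeline over overlapping character windows."""
    entities = []
    start = 0
    while start < len(text):
        entities.extend(ner_pipeline(text[start:start + chunk_chars]))
        start += chunk_chars - overlap
    return entities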

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main entry point for resume parsing."""
    text = extract_text(file_path)
    if not text:
        return {"name": "", "skills": "", "education": "", "experience": ""}
    ents = parse_with_bert(text)
    # Fallback: derive a name from the filename if the model doesn't find one
    if not ents["name"]:
        base = os.path.basename(filename)
        base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
        ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip()
    return ents
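
# Minimal local smoke test — a usage sketch; the file path is hypothetical
# and not shipped with this repo:
if __name__ == "__main__":
    sample = "sample_resume.pdf"
    print(parse_resume(sample, sample))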