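# Codingo / backend / services / resume_parser.py
# Page-header text captured from the web file viewer along with the source
# (kept as comments so the module stays importable):
#   husseinelsaadi's picture | updated | efffc2e | raw | history blame | 3.45 kB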
from __future__ import annotations
import os
import re
import subprocess
import zipfile
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# ===============================
# Load Model & Tokenizer
# ===============================
# Identifier of the pretrained token-classification checkpoint to load.
MODEL_ID = "sravya-abburi/ResumeParserBERT"

# Tokenizer and model are created once, when this module is imported.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Shared NER pipeline. "simple" aggregation yields grouped entities (the
# parser below reads their "entity_group" and "word" keys). Device 0 is used
# when CUDA is available, otherwise -1.
ner_pipeline = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",
    device=-1 if not torch.cuda.is_available() else 0,
)
# ===============================
# Text Extraction
# ===============================
# Matches, in document order, either the text payload of a ``<w:t>`` run or an
# element that separates text visually (paragraph end, tab, line break).
_DOCX_TOKEN_RE = re.compile(
    r"<w:t(?:\s[^>]*)?>(?P<text>[^<]*)</w:t>"
    r"|<(?:/w:p|w:tab|w:br|w:cr)(?:\s[^>]*)?/?>"
)


def _docx_xml_to_text(xml_text: str) -> str:
    """Return the text of a ``word/document.xml`` payload as one spaced line."""
    import html  # local import so this block does not rely on file-level imports

    pieces = []
    found_run = False
    for token in _DOCX_TOKEN_RE.finditer(xml_text):
        run_text = token.group("text")
        if run_text is None:
            # Paragraph end / tab / break: keep the neighbours apart.
            pieces.append(" ")
        else:
            # Adjacent runs are joined directly: Word routinely splits a
            # single word across several runs, so inserting a space here
            # would break words apart.
            found_run = True
            pieces.append(run_text)
    if not found_run:
        # No ``w:``-prefixed text runs were recognised (e.g. an unusual
        # namespace prefix): fall back to stripping every tag.
        pieces = [re.sub(r"<[^>]+>", " ", xml_text)]
    # Decode XML entities (``&amp;`` -> ``&``) before normalising whitespace.
    return re.sub(r"\s+", " ", html.unescape("".join(pieces)))


def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX file.

    PDFs are converted by running the external ``pdftotext -layout`` command
    and decoding its standard output as UTF-8 (undecodable bytes are dropped).
    DOCX files are read by pulling ``word/document.xml`` out of the archive
    and reducing it to whitespace-normalised text.

    Args:
        file_path: Path of the file to read. The extension (case-insensitive)
            selects the extraction method.

    Returns:
        The extracted text, or ``""`` when the path is empty, is not an
        existing file, has an unsupported extension, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # check=False: a failing conversion simply yields whatever was
            # written to stdout (usually nothing) instead of raising.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")
        if lower_name.endswith(".docx"):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_bytes = docx_xml.read()
            return _docx_xml_to_text(xml_bytes.decode("utf-8", errors="ignore"))
    except Exception:
        # Deliberate best-effort contract: a missing pdftotext binary, a
        # corrupt archive or a missing document.xml all mean "no text".
        return ""
    return ""
# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Turn resume text into structured fields via the module's NER pipeline.

    Every entity returned by ``ner_pipeline`` is routed by its upper-cased
    ``entity_group`` into one of four buckets (NAME, SKILL, EDUCATION,
    EXPERIENCE). Entities with any other label are ignored, and a word that
    is already present in its bucket is not added again, so each bucket keeps
    first-seen order without duplicates.

    Args:
        text: Resume text to analyse.

    Returns:
        A dict with the keys ``name`` (words joined by a space) and
        ``skills``, ``education``, ``experience`` (words joined by ", ").
    """
    buckets = {"NAME": [], "SKILL": [], "EDUCATION": [], "EXPERIENCE": []}

    for entity in ner_pipeline(text):
        group = entity["entity_group"].upper()
        token = entity["word"].strip()
        bucket = buckets.get(group)
        if bucket is not None and token not in bucket:
            bucket.append(token)

    parsed = {"name": " ".join(buckets["NAME"])}
    parsed["skills"] = ", ".join(buckets["SKILL"])
    parsed["education"] = ", ".join(buckets["EDUCATION"])
    parsed["experience"] = ", ".join(buckets["EXPERIENCE"])
    return parsed
# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into name, skills, education and experience.

    Args:
        file_path: Path handed to ``extract_text`` to obtain the resume text.
        filename: File name used only to derive a fallback name when the
            model reports none.

    Returns:
        A dict with the keys ``name``, ``skills``, ``education`` and
        ``experience``. All four are empty strings when no text could be
        extracted from the file.
    """
    raw_text = extract_text(file_path)
    if not raw_text:
        return dict.fromkeys(("name", "skills", "education", "experience"), "")

    fields = parse_with_bert(raw_text)
    if fields["name"]:
        return fields

    # No name entity found: build one from the file name by dropping a known
    # extension and turning runs of dots, underscores and dashes into spaces.
    stem = re.sub(r"\.(pdf|docx|doc)$", "", os.path.basename(filename), flags=re.I)
    fields["name"] = re.sub(r"[\._-]+", " ", stem).title().strip()
    return fields