# NOTE(review): the lines that originally appeared here ("Spaces:", "Paused",
# file size, git blame hashes, and a line-number gutter) were residue scraped
# from the Hugging Face Spaces file-viewer page, not Python code. They have
# been commented out so the module can be imported.
from __future__ import annotations
import os
import re
import subprocess
import zipfile
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# ===============================
# Load Model & Tokenizer
# ===============================
# Hugging Face model ID for a token-classification (NER) model. Label set is
# assumed to include NAME/SKILL/EDUCATION/EXPERIENCE based on usage below —
# confirm against the model card.
MODEL_ID = "sravya-abburi/ResumeParserBERT"
# NOTE: these run at import time, so importing this module triggers a model
# download on first use and loads weights into memory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    # "simple" merges sub-word tokens into whole-word entity groups.
    aggregation_strategy="simple",
    # device 0 = first CUDA GPU; -1 = CPU fallback.
    device=0 if torch.cuda.is_available() else -1
)
# ===============================
# Text Extraction
# ===============================
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX file.

    Args:
        file_path: Path to the resume file on disk.

    Returns:
        The extracted plain text. For DOCX input, paragraphs and line
        breaks are separated by newlines. Returns "" when the path is
        missing, the extension is unsupported, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # "-" sends the text to stdout; -layout preserves column layout.
            # check=False: best-effort — a partial extraction is still useful.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")
        elif lower_name.endswith(".docx"):
            # A .docx file is a zip archive; body text lives in word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")
            # Mark paragraph starts and explicit line breaks with newlines
            # BEFORE stripping tags, so line structure survives.
            xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
            xml_text = re.sub(r"<w:br\s*/?>", "\n", xml_text, flags=re.I)
            text = re.sub(r"<[^>]+>", " ", xml_text)
            # BUGFIX: the original collapsed ALL whitespace (including the
            # newlines just inserted) to single spaces, so paragraph
            # boundaries were lost. Collapse only within each line and drop
            # empty lines instead.
            lines = (re.sub(r"\s+", " ", line).strip() for line in text.splitlines())
            return "\n".join(line for line in lines if line)
        else:
            # Unsupported extension (including legacy .doc).
            return ""
    except Exception:
        # Best-effort extraction: corrupt/unreadable files yield empty text.
        return ""
# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Parse resume text into structured fields using BERT NER.

    Runs the module-level NER pipeline over *text* and collects the
    recognized entities into four buckets, deduplicating within each
    bucket while preserving first-seen order.
    """
    buckets: Dict[str, List[str]] = {
        "NAME": [],
        "SKILL": [],
        "EDUCATION": [],
        "EXPERIENCE": [],
    }
    for entity in ner_pipeline(text):
        group = entity["entity_group"].upper()
        token = entity["word"].strip()
        bucket = buckets.get(group)
        # Unknown entity groups are ignored; duplicates are kept once.
        if bucket is not None and token not in bucket:
            bucket.append(token)
    return {
        "name": " ".join(buckets["NAME"]),
        "skills": ", ".join(buckets["SKILL"]),
        "education": ", ".join(buckets["EDUCATION"]),
        "experience": ", ".join(buckets["EXPERIENCE"]),
    }
# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main function for resume parsing.

    Extracts text from *file_path*, runs NER parsing, and falls back to
    deriving a candidate name from *filename* when the model finds none.
    """
    resume_text = extract_text(file_path)
    if not resume_text:
        # Nothing to parse: return the empty field structure.
        return {"name": "", "skills": "", "education": "", "experience": ""}
    fields = parse_with_bert(resume_text)
    if not fields["name"]:
        # Fallback: derive a display name from the uploaded filename —
        # strip the extension, turn separators into spaces, title-case.
        stem = re.sub(r"\.(pdf|docx|doc)$", "", os.path.basename(filename), flags=re.I)
        fields["name"] = re.sub(r"[\._-]+", " ", stem).title().strip()
    return fields