Spaces:
Paused
Paused
File size: 2,584 Bytes
682910e efffc2e b336194 682910e 864c2ae 682910e 6d286f1 a511250 682910e af02e64 682910e 864c2ae 682910e 6d286f1 682910e a511250 682910e 864c2ae efffc2e d4b2339 864c2ae efffc2e 6d286f1 682910e 864c2ae efffc2e a511250 efffc2e 682910e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import re
import subprocess
import zipfile
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# --------------------
# Load Model
# --------------------
# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL_NAME = "sravya-abburi/ResumeParserBERT"

# The tokenizer and the token-classification model are loaded once, when this
# module is imported, and shared by every call that uses `ner_pipeline`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# NER pipeline built on the objects above, using the "simple" aggregation
# strategy.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
# --------------------
# Extract Text
# --------------------
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDF files are converted by running the external ``pdftotext`` program
    with ``-layout`` and reading its standard output. DOCX files are read
    directly from the ``word/document.xml`` member of the archive: every
    paragraph start tag becomes a newline, every remaining tag becomes a
    space, and XML character entities (``&amp;``, ``&lt;``, ...) are decoded.

    Extraction is best-effort: failures are reported as an empty string
    instead of an exception.

    Args:
        file_path: Path to the resume. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, or ``""`` when the extension is neither ``.pdf``
        nor ``.docx`` or when extraction fails.
    """
    import html  # local import: only the DOCX branch needs it

    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        try:
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except (OSError, ValueError):
            # pdftotext is missing / not executable, or the path is unusable.
            return ""
        return result.stdout.decode("utf-8", errors="ignore")

    if lowered.endswith(".docx"):
        try:
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")
        except Exception:
            # Best-effort on arbitrary uploads: a missing file, a non-zip
            # file, a missing document.xml member or corrupt compressed data
            # all mean "no text". Unlike a bare `except:`, this still lets
            # KeyboardInterrupt and SystemExit propagate.
            return ""

        # Match only the paragraph element itself (<w:p>, <w:p attr="...">,
        # <w:p/>). A looser `<w:p[^>]*>` would also hit <w:pPr>, <w:pStyle>,
        # ... and scatter spurious newlines through the text.
        xml_text = re.sub(r"<w:p(?:\s[^>]*)?/?>", "\n", xml_text, flags=re.I)
        stripped = re.sub(r"<[^>]+>", " ", xml_text)
        # Decode entities only after the tags are gone, so a literal "&lt;"
        # in the document cannot be mistaken for markup.
        return html.unescape(stripped)

    return ""
# --------------------
# Parse Resume
# --------------------
def parse_resume(file_path: str, filename: str = None) -> dict:
    """Extract name, skills, education and experience from a resume file.

    The file's text is obtained with ``extract_text`` and passed to the
    module-level ``ner_pipeline``. Each returned entity is routed to an
    output field by its upper-cased ``entity_group`` label; entities with
    any other label are ignored. Within a field, duplicates are dropped
    while first-seen order is kept.

    Args:
        file_path: Path to the resume file.
        filename: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the string keys ``"name"`` (words joined by a space) and
        ``"skills"``, ``"education"``, ``"experience"`` (each joined by
        ``", "``). Every value is ``""`` when nothing was found, including
        when no text could be extracted.
    """
    # Entity label -> output field it contributes to.
    field_for_label = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
    }
    found = {"name": [], "skills": [], "education": [], "experience": []}

    text = extract_text(file_path)
    # extract_text returns "" on failure; skip model inference when there is
    # nothing to tag.
    # NOTE(review): the whole text goes to the pipeline in a single call;
    # confirm how inputs longer than the model's maximum length are handled.
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        field = field_for_label.get(ent["entity_group"].upper())
        word = ent["word"].strip()
        # Blank words would only add stray separators to the joined output.
        if field is not None and word:
            found[field].append(word)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return {
        "name": " ".join(dict.fromkeys(found["name"])),
        "skills": ", ".join(dict.fromkeys(found["skills"])),
        "education": ", ".join(dict.fromkeys(found["education"])),
        "experience": ", ".join(dict.fromkeys(found["experience"])),
    }
# --------------------
# Example
# --------------------
if __name__ == "__main__":
    # Example run: parse one resume and pretty-print the extracted fields.
    # Change the path below to point at a test file.
    resume_path = "resume.pdf"
    result = parse_resume(resume_path)
    as_json = json.dumps(result, indent=2)
    print(as_json)
|