from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import zipfile, re, os
from PyPDF2 import PdfReader # Lightweight & already in Spaces
# ===============================
# Load Model & Tokenizer
# ===============================
MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Swap to Kiet model if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
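# With aggregation_strategy="simple", the pipeline merges sub-word tokens and
# returns one dict per entity span. Illustrative shape (values are made up):
#   [{"entity_group": "NAME", "score": 0.99, "word": "Jane Doe", "start": 0, "end": 8}, ...]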
# ===============================
# Extract Text (PDF & DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX without external dependencies."""
    file_path_lower = file_path.lower()

    # ✅ PDF reading using PyPDF2 (no fitz, no installs needed)
    if file_path_lower.endswith(".pdf"):
        text = ""
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text

    # ✅ DOCX reading by extracting XML content
    elif file_path_lower.endswith(".docx"):
        with zipfile.ZipFile(file_path) as zf:
            with zf.open("word/document.xml") as docx_xml:
                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
        # Turn paragraph tags into newlines, then strip the remaining XML tags.
        xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
        return re.sub(r"<[^>]+>", " ", xml_text)

    return ""
# ===============================
# Parse Resume
# ===============================
def parse_resume(file_path: str, filename: str | None = None) -> dict:
    """Parse a resume and extract structured information.

    `filename` is accepted for callers that pass the original upload name,
    but it is not used during extraction.
    """
    text = extract_text(file_path)
    entities = ner_pipeline(text)

    name, skills, education, experience = [], [], [], []
    for ent in entities:
        label = ent["entity_group"].upper()
        word = ent["word"].strip()
        if label == "NAME":
            name.append(word)
        elif label == "SKILL":
            skills.append(word)
        elif label in ("EDUCATION", "DEGREE"):
            education.append(word)
        elif label in ("EXPERIENCE", "JOB", "ROLE"):
            experience.append(word)

    # dict.fromkeys() deduplicates while preserving first-seen order.
    return {
        "name": " ".join(dict.fromkeys(name)),
        "skills": ", ".join(dict.fromkeys(skills)),
        "education": ", ".join(dict.fromkeys(education)),
        "experience": ", ".join(dict.fromkeys(experience)),
    }
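# ===============================
# Example Usage (illustrative)
# ===============================
# A minimal local sketch, assuming a resume file sits next to this script;
# "resume.pdf" is a hypothetical placeholder path, not a file shipped with
# this Space.
if __name__ == "__main__":
    sample_path = "resume.pdf"  # hypothetical file, adjust as needed
    if os.path.exists(sample_path):
        print(parse_resume(sample_path))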