# --- resume_embedding_utils.py ---
import re
from collections import defaultdict
from pathlib import Path

import pdfplumber
import spacy
from sentence_transformers import SentenceTransformer, util

# --- Setup ---
nlp = spacy.load("en_core_web_sm")               # NER pipeline, used for name extraction
sbert = SentenceTransformer("all-MiniLM-L6-v2")  # sentence-embedding model

# --- Templates for fallback classification ---
RESUME_TEMPLATES = {
    "name": ["My name is", "Resume of", "Name:"],
    "skills": ["Skills: Python, Java", "Proficient in C++ and ML"],
    "experience": ["Worked at Google", "Software Engineer at Amazon"],
    "education": ["Bachelor of Technology from IIT", "Master's in Data Science"],
    "certifications": ["AWS Certified", "Completed PMP Certification"],
    "projects": ["Built an AI chatbot", "Project: Deep Learning"],
    "tech_stack": ["Tech Stack: Python, TensorFlow", "Languages: Java, C++"],
}
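
# Embed each section's template sentences once at import time so that
# classify_line() only has to encode the incoming line.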
TEMPLATE_EMBEDDINGS = {
    k: sbert.encode(v, convert_to_tensor=True)
    for k, v in RESUME_TEMPLATES.items()
}
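
# Literal header aliases, checked before falling back to embedding similarity.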
COMMON_HEADERS = {
    "skills": ["skills", "technical skills"],
    "experience": ["experience", "work experience", "employment"],
    "education": ["education", "academics"],
    "certifications": ["certifications"],
    "projects": ["projects", "achievements"],
    "tech_stack": ["tech stack", "languages", "tools"],
    "name": ["name", "profile"],
}


def normalize_header(text):
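    """Map a line to a canonical section key if it starts with a known header alias."""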
    lower = text.lower().strip().strip(":")
    for section, aliases in COMMON_HEADERS.items():
        if any(lower.startswith(alias) for alias in aliases):
            return section
    return None


def classify_line(line):
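    """Fallback classifier: return the template section most similar to the line,
    or None if the best cosine similarity is below 0.4."""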
    emb = sbert.encode(line, convert_to_tensor=True)
    scores = {
        k: float(util.cos_sim(emb, TEMPLATE_EMBEDDINGS[k]).max())
        for k in TEMPLATE_EMBEDDINGS
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0.4 else None


def extract_name(text):
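    """Return the first PERSON entity spaCy finds, scanning line by line."""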
    for line in text.splitlines():
        doc = nlp(line.strip())
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text.strip()
    return None


def pdf_to_text(pdf_path):
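    """Extract plain text from every page of a PDF; pages with no text become empty strings."""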
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def extract_resume_sections(text):
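    """Split raw resume text into sections keyed by the canonical names in
    COMMON_HEADERS, trying header matching, keyword heuristics, and the
    embedding fallback, in that order."""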
    merged_lines = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        # PDF extraction often splits sentences across lines: a line that starts
        # lowercase or with a connective is treated as a continuation of the
        # previous one.
        if merged_lines and (line[0].islower() or line.startswith(("-", ","))):
            merged_lines[-1] += " " + line
        else:
            merged_lines.append(line)
    sections = defaultdict(list)
    current_section = None
    name_found = extract_name(text)
    for line in merged_lines:
        # 1) Exact header match.
        normalized = normalize_header(line)
        if normalized:
            current_section = normalized
            continue
        # 2) Keyword heuristics.
        lower = line.lower()
        if any(w in lower for w in ["bachelor", "ph.d", "master", "diploma", "msc", "b.tech", "mba"]):
            current_section = "education"
        elif "tech stack" in lower or "languages" in lower or "tools" in lower:
            current_section = "tech_stack"
        elif "achievements" in lower or line.startswith(("Built", "Developed")) or "project" in lower:
            current_section = "projects"
        elif "work experience" in lower or re.search(r"(intern|engineer|manager|scientist|developer)", lower):
            current_section = "experience"
        # 3) Embedding fallback.
        if not current_section:
            current_section = classify_line(line)
        if current_section:
            # Merge continuation lines into the previous entry for sections
            # where multi-line entries are common.
            if current_section in ["education", "experience", "certifications"] and sections[current_section]:
                if line[0].islower() or re.match(r"^(Concentrated|Focused|Research|Worked|Led|Responsible|Published|with|and|using|or|to)\b", line):
                    sections[current_section][-1] += " " + line
                    continue
            sections[current_section].append(line)
    if name_found and name_found not in sections.get("name", []):
        sections["name"].insert(0, name_found)
    return dict(sections)


def generate_resume_embedding(parsed_resume):
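    """Encode the concatenated section texts into a single vector; falls back to
    a placeholder embedding when no sections were parsed."""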
    combined = " ".join(
        parsed_resume.get("skills", []) +
        parsed_resume.get("experience", []) +
        parsed_resume.get("education", []) +
        parsed_resume.get("certifications", []) +
        parsed_resume.get("projects", []) +
        parsed_resume.get("tech_stack", [])
    )
    if not combined.strip():
        return sbert.encode("generic resume", convert_to_numpy=True)
    return sbert.encode(combined, convert_to_numpy=True)


def generate_embeddings_for_all_resumes(pdf_paths):
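    """Parse each PDF, print a per-section debug summary, and return a mapping
    from file name to {"embedding": per-section vectors (or None), "parsed": sections}."""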
    results = {}
    print("\n🧪 DEBUGGING RESUME PARSING:\n")
    for pdf_path in pdf_paths:
        file_name = Path(pdf_path).name
        text = pdf_to_text(pdf_path)
        parsed = extract_resume_sections(text)
        print(f"\n📄 Resume: {file_name}")
        for section in ["name", "skills", "experience", "education",
                        "certifications", "projects", "tech_stack"]:
            lines = parsed.get(section)
            if lines:
                print(f" ✅ {section.title()}: {len(lines)} line(s)")
            else:
                print(f" ❌ {section.title()}: Not found")
        embedding = generate_resume_embedding(parsed)
        print(f" 🔢 Embedding shape: {embedding.shape}")
        # One embedding per section, None when the section was not found.
        embed_sections = ["skills", "experience", "education",
                          "certifications", "projects", "tech_stack"]
        results[file_name] = {
            "embedding": {
                s: sbert.encode(" ".join(parsed[s]), convert_to_numpy=True) if parsed.get(s) else None
                for s in embed_sections
            },
            "parsed": parsed,
        }
    return results
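

if __name__ == "__main__":
    # Minimal usage sketch. The paths below are hypothetical placeholders;
    # point them at real resume PDFs before running.
    sample_paths = ["resumes/alice.pdf", "resumes/bob.pdf"]
    all_results = generate_embeddings_for_all_resumes(sample_paths)
    for name, entry in all_results.items():
        available = [s for s, vec in entry["embedding"].items() if vec is not None]
        print(f"{name}: section embeddings available for {available}")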