# --- resume_embedding_utils.py ---
import re
from collections import defaultdict
from pathlib import Path

import pdfplumber
import spacy
from sentence_transformers import SentenceTransformer, util

# --- Setup ---
# spaCy's small English model provides PERSON entities for name extraction;
# the MiniLM sentence transformer supplies embeddings for both section
# classification and resume encoding.
nlp = spacy.load("en_core_web_sm")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
# --- Templates for fallback classification ---
# Each section gets a handful of prototype phrases; classify_line assigns a
# line to whichever section's prototypes it is most similar to.
RESUME_TEMPLATES = {
    "name": ["My name is", "Resume of", "Name:"],
    "skills": ["Skills: Python, Java", "Proficient in C++ and ML"],
    "experience": ["Worked at Google", "Software Engineer at Amazon"],
    "education": ["Bachelor of Technology from IIT", "Master's in Data Science"],
    "certifications": ["AWS Certified", "Completed PMP Certification"],
    "projects": ["Built an AI chatbot", "Project: Deep Learning"],
    "tech_stack": ["Tech Stack: Python, TensorFlow", "Languages: Java, C++"],
}

# Pre-compute template embeddings once at import time.
TEMPLATE_EMBEDDINGS = {
    k: sbert.encode(v, convert_to_tensor=True)
    for k, v in RESUME_TEMPLATES.items()
}
# Literal header aliases, checked before any embedding-based fallback.
COMMON_HEADERS = {
    "skills": ["skills", "technical skills"],
    "experience": ["experience", "work experience", "employment"],
    "education": ["education", "academics"],
    "certifications": ["certifications"],
    "projects": ["projects", "achievements"],
    "tech_stack": ["tech stack", "languages", "tools"],
    "name": ["name", "profile"],
}

def normalize_header(text):
    """Map a line to a canonical section name if it starts with a known header alias."""
    lower = text.lower().strip().strip(":")
    for section, aliases in COMMON_HEADERS.items():
        if any(lower.startswith(alias) for alias in aliases):
            return section
    return None
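
# Illustrative calls (the inputs are made-up examples):
#   normalize_header("Technical Skills:")  -> "skills"
#   normalize_header("Summary")            -> None  (no alias match)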

def classify_line(line):
    """Fallback classifier: embed the line, compare it against every
    section's template embeddings, and return the best-scoring section
    if its cosine similarity clears the 0.4 threshold, else None."""
    emb = sbert.encode(line, convert_to_tensor=True)
    scores = {
        k: float(util.cos_sim(emb, TEMPLATE_EMBEDDINGS[k]).max())
        for k in TEMPLATE_EMBEDDINGS
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0.4 else None
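
# Illustrative call: a line matching a template almost verbatim scores near 1.0,
# e.g. classify_line("Proficient in C++ and ML") should return "skills".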

def extract_name(text):
    """Return the first PERSON entity spaCy finds, scanning line by line."""
    for line in text.splitlines():
        doc = nlp(line.strip())
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text.strip()
    return None

def pdf_to_text(pdf_path):
    """Concatenate the text of every page; pages with no extractable text yield ''."""
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

def extract_resume_sections(text):
    """Split raw resume text into labeled sections.

    Pipeline: merge wrapped lines, match literal headers, apply keyword
    heuristics, then fall back to embedding similarity.
    """
    lines = text.splitlines()

    # Merge continuation lines (lowercase starts or connective words) back
    # into the previous line so wrapped sentences stay together.
    merged_lines = []
    prev_line = ""
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if prev_line and (line[0].islower() or line.startswith(("and", "which", "-", "or", ",", "of", "to"))):
            merged_lines[-1] += " " + line
        else:
            merged_lines.append(line)
        prev_line = line

    sections = defaultdict(list)
    current_section = None
    name_found = extract_name(text)

    for line in merged_lines:
        # 1) A literal header match switches the current section.
        normalized = normalize_header(line)
        if normalized:
            current_section = normalized
            continue

        # 2) Keyword heuristics for lines that imply a section without a header.
        lower = line.lower()
        if any(w in lower for w in ["bachelor", "ph.d", "master", "diploma", "msc", "b.tech", "mba"]):
            current_section = "education"
        elif "tech stack" in lower or "languages" in lower or "tools" in lower:
            current_section = "tech_stack"
        elif "achievements" in lower or line.startswith(("Built", "Developed")) or "project" in lower:
            current_section = "projects"
        elif "work experience" in lower or re.search(r"(intern|engineer|manager|scientist|developer)", lower):
            current_section = "experience"

        # 3) Embedding-based fallback when no section is active yet.
        if not current_section:
            current_section = classify_line(line)

        if current_section:
            # Glue continuation lines onto the previous entry of multi-line sections.
            if current_section in ["education", "experience", "certifications"] and sections[current_section]:
                if line[0].islower() or re.match(r"^(Concentrated|Focused|Research|Worked|Led|Responsible|Published|with|and|using|or|to)\b", line):
                    sections[current_section][-1] += " " + line
                    continue
            sections[current_section].append(line)

    if name_found and name_found not in sections.get("name", []):
        sections["name"].insert(0, name_found)
    return dict(sections)
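
# Sketch of the expected output shape (keys depend on what the parser finds;
# the values here are invented examples):
#   {"name": ["Jane Doe"],
#    "skills": ["Skills: Python, SQL"],
#    "experience": ["Software Engineer at Acme Corp"], ...}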

def generate_resume_embedding(parsed_resume):
    """Encode the concatenated content of all substantive sections into one vector."""
    combined = " ".join(
        parsed_resume.get("skills", []) +
        parsed_resume.get("experience", []) +
        parsed_resume.get("education", []) +
        parsed_resume.get("certifications", []) +
        parsed_resume.get("projects", []) +
        parsed_resume.get("tech_stack", [])
    )
    if not combined.strip():
        # Nothing was parsed; fall back to a neutral placeholder embedding.
        return sbert.encode("generic resume", convert_to_numpy=True)
    return sbert.encode(combined, convert_to_numpy=True)

SECTION_NAMES = ["name", "skills", "experience", "education", "certifications", "projects", "tech_stack"]


def generate_embeddings_for_all_resumes(pdf_paths):
    """Parse each PDF, report which sections were found, and return per-section embeddings."""
    results = {}
    print("\n🧪 DEBUGGING RESUME PARSING:\n")
    for pdf_path in pdf_paths:
        file_name = Path(pdf_path).name
        text = pdf_to_text(pdf_path)
        parsed = extract_resume_sections(text)

        print(f"\n📄 Resume: {file_name}")
        for section in SECTION_NAMES:
            lines = parsed.get(section)
            if lines:
                print(f"  ✅ {section.title()}: {len(lines)} line(s)")
            else:
                print(f"  ❌ {section.title()}: Not found")

        # The combined embedding is computed here only for the debug shape printout.
        embedding = generate_resume_embedding(parsed)
        print(f"  🔢 Embedding shape: {embedding.shape}")

        # One embedding per substantive section ("name" excluded);
        # None when the section is missing.
        results[file_name] = {
            "embedding": {
                section: (
                    sbert.encode(" ".join(parsed[section]), convert_to_numpy=True)
                    if parsed.get(section) else None
                )
                for section in SECTION_NAMES if section != "name"
            },
            "parsed": parsed,
        }
    return results
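

if __name__ == "__main__":
    # Minimal usage sketch: the "resumes" directory below is a placeholder;
    # point it at a folder of real resume PDFs before running.
    sample_paths = sorted(Path("resumes").glob("*.pdf"))
    all_results = generate_embeddings_for_all_resumes(sample_paths)
    for name, data in all_results.items():
        found = [s for s, emb in data["embedding"].items() if emb is not None]
        print(f"{name}: embedded sections -> {found}")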