# --- resume_embedding_utils.py ---
import re
import nltk
import spacy
import pdfplumber
import numpy as np
from nltk import sent_tokenize
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
# --- Setup ---
nltk.download("punkt")
nlp = spacy.load("en_core_web_sm")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
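# Note: nltk.download("punkt") and SentenceTransformer("all-MiniLM-L6-v2") fetch
# the tokenizer data and the model weights on first use, so importing this module
# needs network access (or a pre-populated local cache) the first time it runs.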
# --- Templates for fallback classification ---
RESUME_TEMPLATES = {
    "name": ["My name is", "Resume of", "Name:"],
    "skills": ["Skills: Python, Java", "Proficient in C++ and ML"],
    "experience": ["Worked at Google", "Software Engineer at Amazon"],
    "education": ["Bachelor of Technology from IIT", "Master's in Data Science"],
    "certifications": ["AWS Certified", "Completed PMP Certification"],
    "projects": ["Built an AI chatbot", "Project: Deep Learning"],
    "tech_stack": ["Tech Stack: Python, TensorFlow", "Languages: Java, C++"]
}
TEMPLATE_EMBEDDINGS = {
    k: sbert.encode(v, convert_to_tensor=True)
    for k, v in RESUME_TEMPLATES.items()
}
COMMON_HEADERS = {
    "skills": ["skills", "technical skills"],
    "experience": ["experience", "work experience", "employment"],
    "education": ["education", "academics"],
    "certifications": ["certifications"],
    "projects": ["projects", "achievements"],
    "tech_stack": ["tech stack", "languages", "tools"],
    "name": ["name", "profile"]
}
def normalize_header(text):
    """Map a line to a canonical section name if it looks like a known header."""
    lower = text.lower().strip().strip(":")
    for section, aliases in COMMON_HEADERS.items():
        if any(lower.startswith(alias) for alias in aliases):
            return section
    return None
def classify_line(line):
    """Fallback: classify a line by SBERT similarity to the section templates."""
    emb = sbert.encode(line, convert_to_tensor=True)
    scores = {
        k: float(util.cos_sim(emb, TEMPLATE_EMBEDDINGS[k]).max())
        for k in TEMPLATE_EMBEDDINGS
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0.4 else None
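# Illustrative example (assumed, not measured): a line such as
# "Proficient in Python and SQL" should score highest against the "skills"
# templates; a line whose best similarity is 0.4 or below returns None and is
# handled by the header/keyword heuristics in extract_resume_sections() instead.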
def extract_name(text):
    """Return the first PERSON entity spaCy finds in the text, or None."""
    for line in text.splitlines():
        doc = nlp(line.strip())
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text.strip()
    return None
def pdf_to_text(pdf_path):
    """Extract plain text from every page of a PDF, joined with newlines."""
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join([page.extract_text() or "" for page in pdf.pages])
def extract_resume_sections(text):
    """Split raw resume text into sections keyed by canonical section names."""
    lines = text.splitlines()

    # Merge wrapped lines: a line that starts lowercase or with a connective
    # word is treated as a continuation of the previous line.
    merged_lines = []
    prev_line = ""
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if prev_line and (line[0].islower() or line.startswith(("and", "which", "-", "or", ",", "of", "to"))):
            merged_lines[-1] += " " + line
        else:
            merged_lines.append(line)
        prev_line = line

    sections = defaultdict(list)
    current_section = None
    name_found = extract_name(text)

    for line in merged_lines:
        # 1. Explicit headers ("Skills", "Work Experience", ...) switch sections.
        normalized = normalize_header(line)
        if normalized:
            current_section = normalized
            continue

        # 2. Keyword heuristics for lines carrying section-specific vocabulary.
        lower = line.lower()
        if any(w in lower for w in ["bachelor", "ph.d", "master", "diploma", "msc", "b.tech", "mba"]):
            current_section = "education"
        elif "tech stack" in lower or "languages" in lower or "tools" in lower:
            current_section = "tech_stack"
        elif "achievements" in lower or line.startswith(("Built", "Developed")) or "project" in lower:
            current_section = "projects"
        elif "work experience" in lower or re.search(r"(intern|engineer|manager|scientist|developer)", lower):
            current_section = "experience"

        # 3. Last resort: embedding-based classification against the templates.
        if not current_section:
            current_section = classify_line(line)

        if current_section:
            # Fold continuation lines into the previous entry of narrative sections.
            if current_section in ["education", "experience", "certifications"] and sections[current_section]:
                if line[0].islower() or re.match(r"^(Concentrated|Focused|Research|Worked|Led|Responsible|Published|with|and|using|or|to)\b", line):
                    sections[current_section][-1] += " " + line
                    continue
            sections[current_section].append(line)

    if name_found and name_found not in sections.get("name", []):
        sections["name"].insert(0, name_found)

    return dict(sections)
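# Illustrative output shape (values are made up): extract_resume_sections() returns
# a plain dict mapping section names to lists of merged lines, e.g.
#   {
#       "name": ["Jane Doe"],
#       "skills": ["Skills: Python, SQL, TensorFlow"],
#       "experience": ["Software Engineer at Example Corp"],
#       "education": ["Bachelor of Technology from Example University"],
#   }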
def generate_resume_embedding(parsed_resume):
    """Encode the concatenated content sections into a single SBERT vector."""
    combined = " ".join(
        parsed_resume.get("skills", []) +
        parsed_resume.get("experience", []) +
        parsed_resume.get("education", []) +
        parsed_resume.get("certifications", []) +
        parsed_resume.get("projects", []) +
        parsed_resume.get("tech_stack", [])
    )
    if not combined.strip():
        # Nothing usable was parsed; fall back to a neutral placeholder embedding.
        return sbert.encode("generic resume", convert_to_numpy=True)
    return sbert.encode(combined, convert_to_numpy=True)
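# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, so the array
# returned above has shape (384,) regardless of resume length.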
def generate_embeddings_for_all_resumes(pdf_paths):
    """Parse each PDF, report what was found, and return per-section embeddings."""
    results = {}
    print("\n🧪 DEBUGGING RESUME PARSING:\n")
    for pdf_path in pdf_paths:
        file_name = Path(pdf_path).name
        text = pdf_to_text(pdf_path)
        parsed = extract_resume_sections(text)

        print(f"\n📄 Resume: {file_name}")
        for section in ["name", "skills", "experience", "education", "certifications", "projects", "tech_stack"]:
            lines = parsed.get(section)
            if lines:
                print(f" ✅ {section.title()}: {len(lines)} line(s)")
            else:
                print(f" ❌ {section.title()}: Not found")

        embedding = generate_resume_embedding(parsed)
        print(f" 🔢 Embedding shape: {embedding.shape}")

        results[file_name] = {
            "embedding": {
                "skills": sbert.encode(" ".join(parsed.get("skills", [])), convert_to_numpy=True) if parsed.get("skills") else None,
                "experience": sbert.encode(" ".join(parsed.get("experience", [])), convert_to_numpy=True) if parsed.get("experience") else None,
                "education": sbert.encode(" ".join(parsed.get("education", [])), convert_to_numpy=True) if parsed.get("education") else None,
                "certifications": sbert.encode(" ".join(parsed.get("certifications", [])), convert_to_numpy=True) if parsed.get("certifications") else None,
                "projects": sbert.encode(" ".join(parsed.get("projects", [])), convert_to_numpy=True) if parsed.get("projects") else None,
                "tech_stack": sbert.encode(" ".join(parsed.get("tech_stack", [])), convert_to_numpy=True) if parsed.get("tech_stack") else None,
            },
            "parsed": parsed
        }
    return results
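# --- Usage sketch (illustrative; the paths and job description below are hypothetical) ---
# Shows how the helpers above can be combined to rank resumes against a job
# description by cosine similarity of the combined-section embedding.
if __name__ == "__main__":
    resume_pdfs = ["resumes/candidate_a.pdf", "resumes/candidate_b.pdf"]  # hypothetical paths
    all_resumes = generate_embeddings_for_all_resumes(resume_pdfs)

    job_description = "Machine learning engineer with Python, TensorFlow and AWS experience."
    jd_embedding = sbert.encode(job_description, convert_to_numpy=True)

    for file_name, data in all_resumes.items():
        resume_embedding = generate_resume_embedding(data["parsed"])
        score = float(util.cos_sim(resume_embedding, jd_embedding))
        print(f"{file_name}: similarity to job description = {score:.3f}")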