import re
from collections import defaultdict

from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model used for both line classification and section embeddings
sbert = SentenceTransformer("all-MiniLM-L6-v2")
# Template phrases per section, used to classify free-form lines by semantic similarity
TEMPLATES = {
    "job_title": ["We're hiring a Backend Developer", "Job Title: Cloud Engineer", "Looking for a Product Manager"],
    "responsibilities": ["You will collaborate with teams", "Expected to deliver high performance"],
    "qualifications": ["Bachelor's or Master's in CS", "Degree in engineering or related field"],
}
# Pre-compute template embeddings once so classify_line only has to embed the input line
TEMPLATE_EMBEDDINGS = {k: sbert.encode(v, convert_to_tensor=True) for k, v in TEMPLATES.items()}
def classify_line(line):
    """Return the best-matching template category, or None if similarity stays below 0.4."""
    line_embedding = sbert.encode(line, convert_to_tensor=True)
    scores = {k: float(util.cos_sim(line_embedding, TEMPLATE_EMBEDDINGS[k]).max()) for k in TEMPLATE_EMBEDDINGS}
    best_match = max(scores, key=scores.get)
    return best_match if scores[best_match] > 0.4 else None
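# Example (hypothetical input): classify_line("Degree in engineering preferred")
# should score highest against the "qualifications" templates and return that key.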
def extract_job_title(text):
    # Regex-based extraction; character classes avoid \s so a title cannot run across lines
    patterns = [
        r"We(?:'re| are) (?:seeking|looking for|hiring)(?: an?)? (?P<title>[A-Z][A-Za-z \-]+)",
        r"Job Title[:\-]?\s*(?P<title>[A-Z][\w \-]+)",
    ]
    for pat in patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            title = match.group("title").strip()
            # Trim trailing filler clauses ("... Developer to join our team" -> "... Developer")
            for stop_word in [" to ", " who ", " that ", " and ", " for ", " with "]:
                if stop_word in title:
                    title = title.split(stop_word)[0].strip()
                    break
            if title.lower() not in ["responsibilities", "description", "qualifications"]:
                return title
    # Fallback 1: an explicit "Job Title:" line
    for line in text.splitlines():
        if "job title" in line.lower():
            return line.split(":")[-1].strip()
    # Fallback 2: first short capitalized line that isn't a section header
    for line in text.splitlines():
        line = line.strip()
        if not line or line.lower().startswith(("description", "responsibilities", "qualifications")):
            continue
        if len(line.split()) <= 7 and line[0].isupper():
            return line
    return "Unknown"
def extract_sections(text):
    lines = text.splitlines()
    results = defaultdict(list)
    results["job_title"] = extract_job_title(text)
    current_section = None
    normalized_headers = {
        "responsibilities": "responsibilities",
        "qualifications": "qualifications",
    }
    for line in lines:
        raw_line = line.strip()
        if not raw_line:
            continue
        # A bare header line ("Responsibilities:") switches the current section
        lower_line = raw_line.lower().strip(":").strip()
        if lower_line in normalized_headers:
            current_section = normalized_headers[lower_line]
            continue
        if current_section:
            results[current_section].append(raw_line)
        else:
            # Before any header is seen, fall back to semantic classification
            category = classify_line(raw_line)
            if category and category != "job_title":
                results[category].append(raw_line)
    print("🔍 JD Section Classification Results (final):")
    for section, content in results.items():
        if section != "job_title":
            print(f"  {section}: {len(content)} lines")
    return dict(results)
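# Example result shape (hypothetical JD):
# {"job_title": "Backend Developer",
#  "responsibilities": ["You will collaborate with cross-functional teams"],
#  "qualifications": ["Bachelor's or Master's in CS"]}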
def generate_jd_embedding(jd_text):
    """Return (job_title, {section: vector or None}) for a raw job-description string."""
    parsed = extract_sections(jd_text)
    title = parsed.get("job_title", "Unknown")
    embeddings_by_section = {}
    for section in ["responsibilities", "qualifications"]:
        lines = parsed.get(section, [])
        if lines:
            combined = " ".join(lines)
            emb = sbert.encode(combined, convert_to_numpy=True)
            embeddings_by_section[section] = emb
            print(f"✅ Embedded section '{section}': shape = {emb.shape}")
        else:
            print(f"❌ No content found for section '{section}'")
            embeddings_by_section[section] = None
    return title, embeddings_by_section
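# Minimal usage sketch; the sample JD below is illustrative, not from the source.
if __name__ == "__main__":
    sample_jd = (
        "Job Title: Backend Developer\n"
        "Responsibilities:\n"
        "You will collaborate with cross-functional teams\n"
        "Qualifications:\n"
        "Bachelor's or Master's in CS\n"
    )
    title, section_embeddings = generate_jd_embedding(sample_jd)
    print(f"Extracted title: {title}")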