File size: 4,399 Bytes
da06e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
import nltk
from nltk import sent_tokenize
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
import spacy
import numpy as np

# Ensure nltk data is available
# One-time check for the punkt sentence tokenizer; downloads it on first run.
# NOTE(review): nltk.download performs network I/O at import time — acceptable
# for a script, but worth confirming for library use.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Load models
# Sentence-BERT model used for template-similarity classification below.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
# spaCy pipeline; not referenced in the visible code — presumably used by
# callers elsewhere in the file/project. TODO confirm before removing.
nlp = spacy.load("en_core_web_sm")

# Relevant templates
# Example phrases per JD section; classify_line scores an input line against
# each group and picks the best-matching section key.
TEMPLATES = {
    "job_title": ["We're hiring a Backend Developer", "Job Title: Cloud Engineer", "Looking for a Product Manager"],
    "responsibilities": ["You will collaborate with teams", "Expected to deliver high performance"],
    "qualifications": ["Bachelor's or Master's in CS", "Degree in engineering or related field"]
}

# Pre-computed SBERT embeddings for the templates above (one tensor batch per
# section), so classification does not re-encode templates per call.
TEMPLATE_EMBEDDINGS = {k: sbert.encode(v, convert_to_tensor=True) for k, v in TEMPLATES.items()}

# Section header names recognized in JDs; not referenced in the visible code
# (extract_sections defines its own header map) — TODO confirm external use.
COMMON_HEADERS = ['responsibilities', 'qualifications']

def clean_line(line):
    """Return *line* with leading and trailing whitespace removed."""
    cleaned = line.strip()
    return cleaned

def classify_line(line):
    """Classify a JD line into the best-matching template section.

    Encodes *line* with SBERT, scores it against each section's template
    embeddings by max cosine similarity, and returns the winning section
    key — or None when the best score does not exceed 0.4.
    """
    encoded = sbert.encode(line, convert_to_tensor=True)
    best_key = None
    best_score = float("-inf")
    for key, template_embs in TEMPLATE_EMBEDDINGS.items():
        score = float(util.cos_sim(encoded, template_embs).max())
        if score > best_score:
            best_key, best_score = key, score
    return best_key if best_score > 0.4 else None

def extract_job_title(text):
    """Extract a job title from raw job-description text.

    Tries, in order:
      1. Regex patterns for hiring phrases ("We are hiring ...",
         "We're looking for ...") and explicit "Job Title:" labels.
      2. Any line containing "job title", taking the text after the colon.
      3. The first short (<= 7 words) capitalized line that is not a
         section header.

    Returns the string "Unknown" when nothing matches (including empty input).
    """
    patterns = [
        # BUG FIX: also accept the contraction "We're" — real JDs (and this
        # module's own TEMPLATES, e.g. "We're hiring a Backend Developer")
        # use it, but the old pattern only matched the literal "We are".
        # The redundant "( an?| a)?" alternative is collapsed to "(?: an?)?".
        r"We(?:'re| are) (?:seeking|looking for|hiring)(?: an?)? (?P<title>[A-Z][a-zA-Z\s\-]+)",
        r"Job Title[:\-]?\s*(?P<title>[A-Z][\w\s\-]+)",
    ]
    for pat in patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            title = match.group("title").strip()

            # The greedy title group can swallow trailing clause text
            # ("... Developer to build APIs"); cut at the first filler word.
            for stop_word in (" to ", " who ", " that ", " and ", " for ", " with "):
                if stop_word in title:
                    title = title.split(stop_word)[0].strip()
                    break

            # Guard against accidentally capturing a section header.
            if title.lower() not in ("responsibilities", "description", "qualifications"):
                return title

    # Fallback 1: an explicit "job title" line; take the value after the colon.
    for line in text.splitlines():
        if "job title" in line.lower():
            return line.split(":")[-1].strip()

    # Fallback 2: first short, capitalized line that isn't a section header.
    for line in text.splitlines():
        line = line.strip()
        if not line or line.lower().startswith(("description", "responsibilities", "qualifications")):
            continue
        if len(line.split()) <= 7 and line[0].isupper():
            return line

    return "Unknown"

def extract_sections(text):
    """Split JD text into labeled sections.

    Recognizes explicit "responsibilities"/"qualifications" headers and
    assigns subsequent lines to the active section; lines seen before any
    header are classified individually via classify_line. The job title is
    extracted separately and stored under "job_title" (a string, not a list).

    Returns a plain dict mapping section name -> list of lines (plus the
    "job_title" string entry).
    """
    header_map = {
        'responsibilities': 'responsibilities',
        'qualifications': 'qualifications'
    }

    results = defaultdict(list)
    results["job_title"] = extract_job_title(text)
    active_section = None

    for raw in text.splitlines():
        stripped = raw.strip()
        if not stripped:
            continue

        # Normalize potential headers: lowercase, drop surrounding colons.
        header_key = stripped.lower().strip(":").strip()
        if header_key in header_map:
            active_section = header_map[header_key]
        elif active_section is not None:
            results[active_section].append(stripped)
        else:
            # No header seen yet — fall back to similarity classification.
            label = classify_line(stripped)
            if label and label != "job_title":
                results[label].append(stripped)

    print("🔍 JD Section Classification Results (final):")
    for name, content in results.items():
        if name != "job_title":
            print(f"  {name}: {len(content)} lines")

    return dict(results)

def generate_jd_embedding(jd_text):
    """Parse a JD and embed its key sections with SBERT.

    Returns a (title, embeddings) pair where embeddings maps
    "responsibilities" and "qualifications" to a numpy vector of the
    section's joined lines, or None when the section is empty.
    """
    parsed = extract_sections(jd_text)

    section_vectors = {}
    for name in ("responsibilities", "qualifications"):
        content = parsed.get(name, [])
        if not content:
            print(f"❌ No content found for section '{name}'")
            section_vectors[name] = None
            continue
        # Encode the whole section as one document rather than per-line.
        vector = sbert.encode(" ".join(content), convert_to_numpy=True)
        print(f"✅ Embedded section '{name}': shape = {vector.shape}")
        section_vectors[name] = vector

    return parsed.get("job_title", "Unknown"), section_vectors