# --- resume_embedding_utils.py ---
import re
import spacy
import pdfplumber
from collections import defaultdict
from pathlib import Path
from sentence_transformers import SentenceTransformer, util

# --- Setup ---
nlp = spacy.load("en_core_web_sm")               # spaCy NER model, used to pull out PERSON names
sbert = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embedder behind all similarity scoring

# --- Templates for fallback classification ---
RESUME_TEMPLATES = {
    "name": ["My name is", "Resume of", "Name:"],
    "skills": ["Skills: Python, Java", "Proficient in C++ and ML"],
    "experience": ["Worked at Google", "Software Engineer at Amazon"],
    "education": ["Bachelor of Technology from IIT", "Master's in Data Science"],
    "certifications": ["AWS Certified", "Completed PMP Certification"],
    "projects": ["Built an AI chatbot", "Project: Deep Learning"],
    "tech_stack": ["Tech Stack: Python, TensorFlow", "Languages: Java, C++"]
}

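# Template embeddings are computed once at import time so classify_line only
# needs to embed the incoming line at call time.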
TEMPLATE_EMBEDDINGS = {
    k: sbert.encode(v, convert_to_tensor=True)
    for k, v in RESUME_TEMPLATES.items()
}

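# Exact header aliases, tried before any embedding fallback; matching is
# prefix-based and case-insensitive (see normalize_header below).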
COMMON_HEADERS = {
    "skills": ["skills", "technical skills"],
    "experience": ["experience", "work experience", "employment"],
    "education": ["education", "academics"],
    "certifications": ["certifications"],
    "projects": ["projects", "achievements"],
    "tech_stack": ["tech stack", "languages", "tools"],
    "name": ["name", "profile"]
}

def normalize_header(text):
    """Map a line to a canonical section name if it reads like a section header."""
    lower = text.lower().strip().strip(":")
    for section, aliases in COMMON_HEADERS.items():
        if any(lower.startswith(alias) for alias in aliases):
            return section
    return None

def classify_line(line):
    """Fallback classifier: embed the line, compare it against every section's
    template embeddings, and return the best section if its cosine similarity
    clears a 0.4 threshold."""
    emb = sbert.encode(line, convert_to_tensor=True)
    scores = {
        k: float(util.cos_sim(emb, TEMPLATE_EMBEDDINGS[k]).max())
        for k in TEMPLATE_EMBEDDINGS
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0.4 else None

def extract_name(text):
    """Return the first PERSON entity spaCy finds, scanning the text line by line."""
    for line in text.splitlines():
        doc = nlp(line.strip())
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text.strip()
    return None

def pdf_to_text(pdf_path):
    """Concatenate the text of every PDF page; pages with no extractable text become empty strings."""
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

def extract_resume_sections(text):
    """Split raw resume text into labelled sections.

    Strategy: explicit header matching first, then keyword heuristics,
    then the embedding classifier as a last resort.
    """
    lines = text.splitlines()
    merged_lines = []
    prev_line = ""

    # A line that starts lowercase or with connective punctuation is treated
    # as a wrapped continuation of the previous line and folded back in.
    # (islower() already covers lowercase connectives such as "and" or "which".)
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if prev_line and (line[0].islower() or line.startswith(("-", ","))):
            merged_lines[-1] += " " + line
        else:
            merged_lines.append(line)
            prev_line = line

    sections = defaultdict(list)
    current_section = None
    name_found = extract_name(text)

    for line in merged_lines:
        normalized = normalize_header(line)
        if normalized:
            current_section = normalized
            continue

        lower = line.lower()
        if any(w in lower for w in ["bachelor", "ph.d", "master", "diploma", "msc", "b.tech", "mba"]):
            current_section = "education"
        elif "tech stack" in lower or "languages" in lower or "tools" in lower:
            current_section = "tech_stack"
        elif "achievements" in lower or line.startswith(("Built", "Developed")) or "project" in lower:
            current_section = "projects"
        elif "work experience" in lower or re.search(r"(intern|engineer|manager|scientist|developer)", lower):
            current_section = "experience"

        if not current_section:
            current_section = classify_line(line)

        if current_section:
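            # Continuation heuristic: for multi-line entries, glue sentence
            # fragments onto the previous entry instead of starting a new one.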
            if current_section in ["education", "experience", "certifications"] and sections[current_section]:
                if line[0].islower() or re.match(r"^(Concentrated|Focused|Research|Worked|Led|Responsible|Published|with|and|using|or|to)\b", line):
                    sections[current_section][-1] += " " + line
                    continue
            sections[current_section].append(line)

    if name_found and name_found not in sections.get("name", []):
        sections["name"].insert(0, name_found)

    return dict(sections)

def generate_resume_embedding(parsed_resume):
    """Embed the concatenation of all content sections into a single vector."""
    content_sections = ["skills", "experience", "education",
                        "certifications", "projects", "tech_stack"]
    combined = " ".join(
        line for section in content_sections
        for line in parsed_resume.get(section, [])
    )
    if not combined.strip():
        # Nothing was parsed: embed a neutral placeholder so callers still
        # get a vector of the expected shape.
        return sbert.encode("generic resume", convert_to_numpy=True)
    return sbert.encode(combined, convert_to_numpy=True)

def generate_embeddings_for_all_resumes(pdf_paths):
    results = {}

    print("\n🧪 DEBUGGING RESUME PARSING:\n")

    for pdf_path in pdf_paths:
        file_name = Path(pdf_path).name
        text = pdf_to_text(pdf_path)
        parsed = extract_resume_sections(text)

        print(f"\n📄 Resume: {file_name}")
        for section in ["name", "skills", "experience", "education", "certifications", "projects", "tech_stack"]:
            lines = parsed.get(section)
            if lines:
                print(f"  ✅ {section.title()}: {len(lines)} line(s)")
            else:
                print(f"  ❌ {section.title()}: Not found")

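        # The combined embedding is computed only for the debug printout;
        # the per-section embeddings stored below are what callers use.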
        embedding = generate_resume_embedding(parsed)
        print(f"  🔢 Embedding shape: {embedding.shape}")

        # One embedding per section; missing sections stay None so downstream
        # code can tell "absent" from "present".
        content_sections = ["skills", "experience", "education",
                            "certifications", "projects", "tech_stack"]
        results[file_name] = {
            "embedding": {
                section: (
                    sbert.encode(" ".join(parsed[section]), convert_to_numpy=True)
                    if parsed.get(section) else None
                )
                for section in content_sections
            },
            "parsed": parsed,
        }

    return results
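
# --- Example usage (illustrative sketch; paths and job text are hypothetical) ---
if __name__ == "__main__":
    resume_pdfs = ["resumes/alice.pdf", "resumes/bob.pdf"]  # hypothetical files
    all_results = generate_embeddings_for_all_resumes(resume_pdfs)

    # Hypothetical downstream use: rank parsed resumes against a job posting
    # using the same sentence embedder and cosine similarity.
    job_vec = sbert.encode(
        "Machine learning engineer with Python and TensorFlow",
        convert_to_numpy=True,
    )
    for name, data in all_results.items():
        resume_vec = generate_resume_embedding(data["parsed"])
        score = float(util.cos_sim(job_vec, resume_vec))
        print(f"{name}: job similarity {score:.3f}")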