panchadip committed · Commit da06e55 · verified · 1 parent: ba9ad5a

Upload 10 files

Files changed (10)
  1. agent_framework.py +394 -0
  2. app.py +322 -0
  3. database.py +19 -0
  4. email_utils.py +45 -0
  5. jd_embedding_utils.py +125 -0
  6. matcher.py +94 -0
  7. models.py +91 -0
  8. requirements.txt +14 -0
  9. resume_embedding_utils.py +167 -0
  10. schemas.py +16 -0
agent_framework.py ADDED
@@ -0,0 +1,394 @@
+import logging
+import random
+from datetime import datetime, timedelta
+from typing import List, Dict, Any
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+class Agent:
+    """Base class for all agents in the system"""
+
+    def __init__(self, name: str):
+        self.name = name
+        self.logger = logging.getLogger(f"Agent:{name}")
+        self.logger.info(f"Agent {name} initialized")
+
+    def log_action(self, action: str, details: Any = None):
+        """Log an action taken by this agent"""
+        self.logger.info(f"Action: {action} - Details: {details}")
+
+    def __str__(self):
+        return f"Agent({self.name})"
+
+class JDAnalyzerAgent(Agent):
+    """Agent responsible for analyzing job descriptions"""
+
+    def __init__(self):
+        super().__init__("JDAnalyzer")
+
+    def analyze_jd(self, jd_text: str) -> Dict:
+        """Analyze a job description to extract key information"""
+        from jd_embedding_utils import generate_jd_embedding, extract_sections
+
+        self.log_action("Analyzing job description", {"length": len(jd_text)})
+
+        # Extract sections and generate embeddings
+        title, embedding = generate_jd_embedding(jd_text)
+        sections = extract_sections(jd_text)
+
+        # Generate a human-readable summary
+        summary = self.generate_summary(sections)
+
+        result = {
+            "title": title,
+            "embedding": embedding,
+            "sections": sections,
+            "summary": summary
+        }
+
+        self.log_action("Analysis complete", {"title": title})
+        return result
+
+    def generate_summary(self, sections: Dict) -> str:
+        """Generate a human-readable summary of the job description"""
+        title = sections.get("job_title", "Unknown Position")
+
+        # Get responsibilities and qualifications
+        responsibilities = sections.get("responsibilities", [])
+        qualifications = sections.get("qualifications", [])
+
+        # Generate summary text
+        summary = f"Position: {title}\n\n"
+
+        if responsibilities:
+            summary += "Key Responsibilities:\n"
+            # Limit to the top 5 responsibilities for brevity
+            for resp in responsibilities[:5]:
+                summary += f"- {resp}\n"
+            if len(responsibilities) > 5:
+                summary += f"- Plus {len(responsibilities) - 5} more responsibilities\n"
+
+        summary += "\n"
+
+        if qualifications:
+            summary += "Required Qualifications:\n"
+            # Limit to the top 5 qualifications for brevity
+            for qual in qualifications[:5]:
+                summary += f"- {qual}\n"
+            if len(qualifications) > 5:
+                summary += f"- Plus {len(qualifications) - 5} more qualifications\n"
+
+        return summary
+
+class CVAnalyzerAgent(Agent):
+    """Agent responsible for analyzing candidate CVs"""
+
+    def __init__(self):
+        super().__init__("CVAnalyzer")
+
+    def process_cv(self, file_path: str, filename: str) -> Dict:
+        """Process a CV to extract key information"""
+        from resume_embedding_utils import pdf_to_text, extract_resume_sections
+        from sentence_transformers import SentenceTransformer
+
+        self.log_action("Processing CV", {"filename": filename})
+
+        # Extract text from the PDF
+        text = pdf_to_text(file_path)
+
+        # Parse CV sections
+        parsed_sections = extract_resume_sections(text)
+
+        # Generate section-specific embeddings; load the model once, not once per section
+        model = SentenceTransformer("all-MiniLM-L6-v2")
+        section_embeddings = {}
+        for section in ["experience", "education", "skills", "projects", "certifications", "tech_stack"]:
+            if section in parsed_sections and parsed_sections[section]:
+                section_text = " ".join(parsed_sections[section])
+                if section_text.strip():
+                    section_embeddings[section] = model.encode(section_text, convert_to_numpy=True)
+
+        # Generate a human-readable summary
+        summary = self.generate_summary(parsed_sections)
+
+        result = {
+            "parsed": parsed_sections,
+            "embedding": section_embeddings,
+            "text": text,
+            "summary": summary
+        }
+
+        self.log_action("CV processing complete", {
+            "sections_found": list(parsed_sections.keys())
+        })
+
+        return result
+
+    def generate_summary(self, sections: Dict) -> str:
+        """Generate a human-readable summary of the CV"""
+        name = sections.get("name", ["Unknown Candidate"])[0] if sections.get("name") else "Unknown Candidate"
+
+        # Extract key information
+        skills = sections.get("skills", [])
+        experience = sections.get("experience", [])
+        education = sections.get("education", [])
+
+        # Generate summary text
+        summary = f"Candidate: {name}\n\n"
+
+        if skills:
+            summary += "Key Skills:\n"
+            # Limit to the top 5 skills for brevity
+            for skill in skills[:5]:
+                summary += f"- {skill}\n"
+            if len(skills) > 5:
+                summary += f"- Plus {len(skills) - 5} more skills\n"
+
+        summary += "\n"
+
+        if experience:
+            summary += "Experience:\n"
+            # Limit to the top 3 experiences for brevity
+            for exp in experience[:3]:
+                summary += f"- {exp}\n"
+            if len(experience) > 3:
+                summary += f"- Plus {len(experience) - 3} more experiences\n"
+
+        summary += "\n"
+
+        if education:
+            summary += "Education:\n"
+            # Limit to the top 2 education entries for brevity
+            for edu in education[:2]:
+                summary += f"- {edu}\n"
+
+        return summary
+
+class MatchingAgent(Agent):
+    """Agent responsible for matching CVs against job descriptions"""
+
+    def __init__(self, threshold: float = 0.7):
+        super().__init__("Matcher")
+        self.threshold = threshold
+
+    def match_cvs_to_jd(self, jd_data: Dict, cv_data: Dict[str, Dict]) -> Dict:
+        """Match multiple CVs against a job description"""
+        from matcher import calculate_match_score
+
+        self.log_action("Starting matching process", {
+            "jd_title": jd_data.get("title", "Unknown"),
+            "cv_count": len(cv_data)
+        })
+
+        jd_embeddings = jd_data.get("embedding", {})
+
+        # Match each CV against the JD
+        matches = []
+        for filename, resume_data in cv_data.items():
+            parsed = resume_data["parsed"]
+            embedding = resume_data.get("embedding", {})
+
+            # Extract name from the parsed CV or fall back to the filename
+            name = self._extract_name(parsed, filename)
+
+            score, reasoning = calculate_match_score(jd_embeddings, embedding)
+
+            match_data = {
+                "name": name,
+                "filename": filename,
+                "score": score,
+                "reasoning": reasoning,
+                "isMatch": score >= self.threshold  # Use threshold for matching
+            }
+
+            self.log_action("CV matched", {
+                "name": name,
+                "score": score,
+                "is_match": match_data["isMatch"]
+            })
+
+            matches.append(match_data)
+
+        # Sort matches by score in descending order
+        matches.sort(key=lambda x: x["score"], reverse=True)
+
+        result = {"matches": matches}
+        self.log_action("Matching complete", {
+            "total_matches": len(matches),
+            "qualified_matches": sum(1 for m in matches if m["isMatch"])
+        })
+
+        return result
+
+    def _extract_name(self, parsed, fallback):
+        """Extract name from parsed CV or use fallback"""
+        from pathlib import Path
+        if "name" in parsed and parsed["name"] and len(parsed["name"]) > 0:
+            return parsed["name"][0]
+        return Path(fallback).stem
+
+class SchedulerAgent(Agent):
+    """Agent responsible for scheduling interviews with matched candidates"""
+
+    def __init__(self):
+        super().__init__("Scheduler")
+
+    def generate_interview_slots(self, days_ahead: int = 10, slots_per_day: int = 3) -> List[Dict]:
+        """Generate available interview time slots for the next N days"""
+        slots = []
+        start_date = datetime.now() + timedelta(days=1)  # Start from tomorrow
+
+        for day in range(days_ahead):
+            current_date = start_date + timedelta(days=day)
+
+            # Skip weekends
+            if current_date.weekday() >= 5:  # 5 = Saturday, 6 = Sunday
+                continue
+
+            # Generate time slots for this day
+            possible_hours = [9, 10, 11, 13, 14, 15, 16]  # 9 AM to 5 PM with a lunch break
+            selected_hours = random.sample(possible_hours, min(slots_per_day, len(possible_hours)))
+            selected_hours.sort()
+
+            for hour in selected_hours:
+                slot_time = current_date.replace(hour=hour, minute=0, second=0, microsecond=0)
+                slots.append({
+                    "date": slot_time.strftime("%Y-%m-%d"),
+                    "time": slot_time.strftime("%H:%M"),
+                    "datetime": slot_time,
+                    "formatted": slot_time.strftime("%A, %B %d at %I:%M %p")
+                })
+
+        return slots
+
+    def prepare_email_for_candidate(self, candidate: Dict, job_title: str) -> Dict:
+        """Prepare an email for a shortlisted candidate with interview slots"""
+        self.log_action("Preparing email", {"candidate": candidate["name"]})
+
+        # Generate interview slots
+        interview_slots = self.generate_interview_slots(days_ahead=7, slots_per_day=2)
+
+        candidate_name = candidate["name"]
+
+        # Create the email content with interview slots
+        subject = f"Interview Invitation: {job_title} Position"
+
+        body = f"""Dear {candidate_name},
+
+We are pleased to inform you that your profile has been shortlisted for the {job_title} position. Your qualifications and experience align well with what we're looking for.
+
+We would like to invite you for an interview. Please select one of the following time slots that works best for you:
+
+"""
+
+        # Add the first 3 available slots
+        for i, slot in enumerate(interview_slots[:3]):
+            body += f"Option {i + 1}: {slot['formatted']}\n"
+
+        body += """
+Please reply to this email with your preferred time slot, or suggest an alternative if none of these work for you.
+
+The interview will be conducted via video call, and the details will be sent once you confirm your availability.
+
+We look forward to speaking with you!
+
+Best regards,
+Recruitment Team"""
+
+        return {
+            "to": candidate_name,
+            "email": self._generate_email_address(candidate_name),
+            "subject": subject,
+            "body": body,
+            "slots": interview_slots[:3]
+        }
+
+    def _generate_email_address(self, name: str) -> str:
+        """Generate a placeholder email address from a name"""
+        # Convert to lowercase, replace spaces with dots, add domain
+        email = name.lower().replace(" ", ".")
+        return f"{email}@example.com"
+
+    def send_interview_email(self, email_data: Dict) -> Dict:
+        """Send an interview invitation email to a candidate"""
+        from email_utils import send_email
+
+        self.log_action("Sending interview email", {
+            "to": email_data["to"],
+            "email": email_data["email"]
+        })
+
+        # Call the email utility to send the email
+        result = send_email(
+            to_email=email_data["email"],
+            subject=email_data["subject"],
+            body=email_data["body"].replace("\n", "<br>")
+        )
+
+        self.log_action("Email sent", {"success": result["success"]})
+        return result
+
+class AgentCoordinator:
+    """Coordinates the activities of all agents in the system"""
+
+    def __init__(self):
+        self.jd_agent = JDAnalyzerAgent()
+        self.cv_agent = CVAnalyzerAgent()
+        self.matching_agent = MatchingAgent()
+        self.scheduler_agent = SchedulerAgent()
+        self.logger = logging.getLogger("AgentCoordinator")
+
+    def process_job_description(self, jd_text: str) -> Dict:
+        """Process a job description using the JD agent"""
+        self.logger.info("Starting job description processing")
+        return self.jd_agent.analyze_jd(jd_text)
+
+    def process_resumes(self, file_paths: List[tuple]) -> Dict[str, Dict]:
+        """Process multiple resumes using the CV agent"""
+        self.logger.info(f"Starting resume processing for {len(file_paths)} files")
+
+        results = {}
+        for filename, file_path in file_paths:
+            results[filename] = self.cv_agent.process_cv(file_path, filename)
+
+        return results
+
+    def match_candidates(self, jd_data: Dict, cv_data: Dict[str, Dict]) -> Dict:
+        """Match candidates with the job description"""
+        self.logger.info("Starting candidate matching")
+        return self.matching_agent.match_cvs_to_jd(jd_data, cv_data)
+
+    def schedule_interviews(self, matches: List[Dict], job_title: str) -> List[Dict]:
+        """Schedule interviews for matched candidates"""
+        self.logger.info(f"Scheduling interviews for {len(matches)} candidates")
+
+        email_data = []
+        for candidate in matches:
+            if candidate["isMatch"]:
+                email_info = self.scheduler_agent.prepare_email_for_candidate(candidate, job_title)
+                email_data.append(email_info)
+
+        return email_data
+
+    def execute_full_workflow(self, jd_text: str, resume_files: List[tuple]) -> Dict:
+        """Execute the complete workflow from JD analysis to interview scheduling"""
+        self.logger.info("Starting full recruitment workflow")
+
+        # Step 1: Process the job description
+        jd_result = self.process_job_description(jd_text)
+
+        # Step 2: Process all resumes
+        resume_results = self.process_resumes(resume_files)
+
+        # Step 3: Match candidates with the job
+        match_results = self.match_candidates(jd_result, resume_results)
+
+        # Step 4: Schedule interviews for matched candidates
+        email_data = self.schedule_interviews(match_results["matches"], jd_result["title"])
+
+        return {
+            "jd": jd_result,
+            "resumes": resume_results,
+            "matches": match_results,
+            "emails": email_data
+        }
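For reference, a minimal sketch of driving the coordinator end to end. The JD text and PDF path are hypothetical placeholders; the (filename, path) tuple convention and execute_full_workflow come from the class above.

from agent_framework import AgentCoordinator

coordinator = AgentCoordinator()
jd_text = open("jd.txt").read()                    # hypothetical JD file
resume_files = [("alice.pdf", "/tmp/alice.pdf")]   # (filename, path) tuples, as process_resumes expects
outcome = coordinator.execute_full_workflow(jd_text, resume_files)

# Matches come back sorted by score, descending
best = outcome["matches"]["matches"][0]
print(best["name"], best["score"], best["isMatch"])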
app.py ADDED
@@ -0,0 +1,322 @@
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+import tempfile
+import os
+import shutil
+from typing import List, Dict, Any
+import json
+import numpy as np
+from pathlib import Path
+import asyncio
+from sentence_transformers import SentenceTransformer
+import sqlite3
+
+from jd_embedding_utils import generate_jd_embedding, extract_sections
+from matcher import calculate_match_score, match_all_resumes
+from email_utils import send_email
+from agent_framework import AgentCoordinator
+
+app = FastAPI()
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Initialize SQLite database
+def init_db():
+    conn = sqlite3.connect("recruitly.db")
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS job_descriptions (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            title TEXT,
+            embedding TEXT,
+            sections TEXT,
+            summary TEXT
+        )
+    """)
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS resumes (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            filename TEXT,
+            embedding TEXT,
+            parsed TEXT,
+            summary TEXT
+        )
+    """)
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS matches (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            resume_id INTEGER,
+            jd_id INTEGER,
+            score REAL,
+            reasoning TEXT,
+            FOREIGN KEY (resume_id) REFERENCES resumes (id),
+            FOREIGN KEY (jd_id) REFERENCES job_descriptions (id)
+        )
+    """)
+    conn.commit()
+    conn.close()
+
+# Call init_db on startup
+init_db()
+
+# Classes for request/response models
+class JDRequest(BaseModel):
+    text: str
+
+class MatchRequest(BaseModel):
+    jd_sections: Dict[str, List[str]]
+    resume_data: Dict[str, Dict[str, Any]]
+
+class EmailRequest(BaseModel):
+    email: str
+    name: str
+    subject: str
+    body: str
+
+class ScheduleRequest(BaseModel):
+    candidate_id: str
+    name: str
+    email: str
+
+class NumpyEncoder(json.JSONEncoder):
+    """JSON encoder that converts NumPy arrays to plain lists"""
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+
+# Store the processed JD, resumes, and match results in memory for the session
+current_session = {
+    "jd": None,
+    "resumes": {},
+    "agent_coordinator": AgentCoordinator()
+}
+
+# Create a single model instance for reuse
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+@app.post("/embed")
+def get_embedding(request: JDRequest):
+    """Process a job description and generate its embedding"""
+    coordinator = current_session["agent_coordinator"]
+    result = coordinator.process_job_description(request.text)
+
+    # Store in the current session
+    current_session["jd"] = result
+
+    # Convert the embedding dictionary properly for the JSON response
+    serializable_embedding = json.loads(
+        json.dumps(result["embedding"], cls=NumpyEncoder)
+    )
+
+    response_data = {
+        "title": result["title"],
+        "embedding": serializable_embedding,
+        "sections": result["sections"],
+        "summary": result.get("summary", "")
+    }
+
+    return response_data
+
+@app.post("/upload-resumes")
+async def upload_resumes(files: List[UploadFile] = File(...)):
+    """Process multiple resume PDFs and generate embeddings for each"""
+    if not files:
+        raise HTTPException(status_code=400, detail="No files provided")
+
+    # Create a temp directory for the uploaded files
+    with tempfile.TemporaryDirectory() as temp_dir:
+        resume_results = {}
+
+        # First save all files to disk to avoid keeping file handles open too long
+        file_paths = []
+        for file in files:
+            file_path = os.path.join(temp_dir, file.filename)
+            with open(file_path, "wb") as buffer:
+                shutil.copyfileobj(file.file, buffer)
+            file_paths.append((file.filename, file_path))
+
+        # Process files in batches to avoid memory issues
+        batch_size = 3
+        for i in range(0, len(file_paths), batch_size):
+            batch = file_paths[i:i + batch_size]
+            batch_tasks = []
+
+            for filename, file_path in batch:
+                batch_tasks.append(process_resume(filename, file_path))
+
+            # Process each batch concurrently
+            batch_results = await asyncio.gather(*batch_tasks)
+
+            # Combine results
+            for filename, result in batch_results:
+                resume_results[filename] = result
+                # Add successfully parsed resumes to the current session
+                if "error" not in result:
+                    current_session["resumes"][filename] = result
+
+        # Convert NumPy arrays to lists for the JSON response
+        serializable_results = json.loads(
+            json.dumps(resume_results, cls=NumpyEncoder)
+        )
+
+        return JSONResponse(content=serializable_results)
+
+async def process_resume(filename, file_path):
+    """Process a single resume PDF file"""
+    try:
+        coordinator = current_session["agent_coordinator"]
+        result = coordinator.cv_agent.process_cv(file_path, filename)
+        return filename, result
+
+    except Exception as e:
+        print(f"Error processing {filename}: {str(e)}")
+        return filename, {"error": str(e)}
+
+@app.post("/match")
+def match_resumes():
+    """Match the current JD with all processed resumes"""
+    jd = current_session["jd"]
+    resumes = current_session["resumes"]
+
+    if not jd or not resumes:
+        raise HTTPException(status_code=400, detail="Job description or resumes missing")
+
+    jd_title = jd["title"]
+    jd_embeddings = jd["embedding"]
+
+    # Match all resumes
+    all_candidates = match_all_resumes(jd_title, jd_embeddings, resumes, threshold=0.8)
+
+    # Keep the results in the session so /prepare-interview-email can look candidates up
+    current_session["matches"] = {"matches": all_candidates}
+
+    # Save all candidates to the database
+    # (resume_id and jd_id may be NULL here, since session data carries no DB ids)
+    conn = sqlite3.connect("recruitly.db")
+    cursor = conn.cursor()
+    for candidate in all_candidates:
+        cursor.execute("""
+            INSERT INTO matches (resume_id, jd_id, score, reasoning)
+            VALUES (?, ?, ?, ?)
+        """, (candidate.get("resume_id"), jd.get("id"), candidate["score"], json.dumps(candidate["reasoning"])))
+    conn.commit()
+    conn.close()
+
+    # Include all candidates in the response
+    return {"candidates": all_candidates}
+
+@app.post("/generate-interview-slots")
+def generate_interview_slots():
+    """Generate potential interview time slots"""
+    if not current_session["agent_coordinator"]:
+        raise HTTPException(status_code=400, detail="Agent coordinator not initialized")
+
+    slots = current_session["agent_coordinator"].scheduler_agent.generate_interview_slots()
+
+    return {"slots": slots}
+
+@app.post("/prepare-interview-email/{candidate_id}")
+def prepare_interview_email(candidate_id: str):
+    """Prepare an interview email for a specific candidate"""
+    if not current_session["jd"]:
+        raise HTTPException(status_code=400, detail="No job description processed")
+
+    # Find the candidate in the stored matches
+    matched_candidates = []
+    if "matches" in current_session:
+        matched_candidates = current_session["matches"]["matches"]
+
+    candidate = None
+    for match in matched_candidates:
+        if match["name"] == candidate_id or str(match.get("id", "")) == candidate_id:
+            candidate = match
+            break
+
+    if not candidate:
+        raise HTTPException(status_code=404, detail=f"Candidate {candidate_id} not found")
+
+    # Generate email content
+    email_data = current_session["agent_coordinator"].scheduler_agent.prepare_email_for_candidate(
+        candidate,
+        current_session["jd"]["title"]
+    )
+
+    return email_data
+
+@app.post("/send-email")
+def send_candidate_email(request: EmailRequest):
+    """Send an email to a candidate"""
+    try:
+        result = send_email(
+            to_email=request.email,
+            subject=request.subject,
+            body=request.body
+        )
+
+        if result["success"]:
+            return {"success": True, "message": f"Email sent to {request.name}"}
+        else:
+            raise HTTPException(status_code=500, detail=result["message"])
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/suggest-interview-times/{candidate_id}")
+def suggest_interview_times(candidate_id: str):
+    """Suggest available interview time slots for a candidate"""
+    coordinator = current_session["agent_coordinator"]
+    slots = coordinator.scheduler_agent.generate_interview_slots(days_ahead=7, slots_per_day=3)
+
+    return {"candidate_id": candidate_id, "slots": slots}
+
+# Helper function to extract a candidate name from a parsed resume
+def _extract_name(parsed, fallback):
+    if "name" in parsed and parsed["name"] and len(parsed["name"]) > 0:
+        return parsed["name"][0]
+    return Path(fallback).stem
+
+@app.get("/clear-session")
+def clear_session():
+    """Clear the current session data"""
+    current_session["jd"] = None
+    current_session["resumes"] = {}
+    current_session.pop("matches", None)
+    return {"message": "Session cleared"}
+
+@app.get("/test-match")
+def test_match():
+    """Test endpoint to diagnose matching issues"""
+    test_jd = """We are seeking an innovative and strategic Product Manager to lead the development and execution of new products. The ideal candidate will collaborate with cross-functional teams to define product roadmaps, analyze market trends, and ensure successful product launches. Responsibilities: Define product vision and strategy based on market research and customer needs. Work closely with engineering, design, and marketing teams to develop and launch products. Prioritize features, create roadmaps, and manage product lifecycle. Analyze user feedback and data to optimize product performance. Ensure alignment between business goals and product development. Qualifications: Bachelor's degree in Business, Computer Science, or a related field. Experience in product management, agile methodologies, and market research. Strong analytical, leadership, and communication skills. Familiarity with project management tools and data-driven decision-making."""
+
+    # Process the test JD
+    title, jd_embeddings = generate_jd_embedding(test_jd)
+    sections = extract_sections(test_jd)
+
+    # Create a simple test resume with matching sections
+    test_resume = {
+        "skills": ["Product management", "Agile methodologies", "Leadership"],
+        "experience": ["5 years experience in product management", "Led cross-functional teams"],
+        "education": ["Bachelor's degree in Computer Science"],
+        "qualifications": ["Strong analytical skills", "Communication skills"]
+    }
+
+    # calculate_match_score expects embeddings on both sides, so embed the test resume
+    # sections with the shared model instance
+    test_resume_embeddings = {
+        section: model.encode(" ".join(lines), convert_to_numpy=True)
+        for section, lines in test_resume.items()
+    }
+
+    # Run the matcher with debug output
+    score, reasoning = calculate_match_score(jd_embeddings, test_resume_embeddings)
+
+    return {
+        "jd_sections": sections,
+        "resume_sections": test_resume,
+        "score": score,
+        "reasoning": reasoning
+    }
+
+# Run the app locally
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="127.0.0.1", port=8000)
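A minimal sketch of a client driving these endpoints, assuming the server is running locally on port 8000 as in the __main__ block; the file names are placeholders, and the requests library is an extra dependency not listed in requirements.txt.

import requests

BASE = "http://127.0.0.1:8000"

# 1. Embed a job description
jd = requests.post(f"{BASE}/embed", json={"text": open("jd.txt").read()}).json()
print(jd["title"])

# 2. Upload one or more resume PDFs
with open("alice.pdf", "rb") as f:
    requests.post(f"{BASE}/upload-resumes", files=[("files", ("alice.pdf", f, "application/pdf"))])

# 3. Match and inspect scores (threshold 0.8, per the /match endpoint)
for c in requests.post(f"{BASE}/match").json()["candidates"]:
    print(c["name"], c["score"], c["is_match"])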
database.py ADDED
@@ -0,0 +1,19 @@
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+
+SQLALCHEMY_DATABASE_URL = "sqlite:///./application_db.sqlite"
+
+engine = create_engine(
+    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
+)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+Base = declarative_base()
+
+def get_db():
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
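get_db is the usual FastAPI session-per-request dependency. A standalone sketch of an endpoint consuming it (the route and demo app are illustrative, not part of this commit):

from fastapi import Depends, FastAPI
from sqlalchemy.orm import Session
from database import get_db

demo_app = FastAPI()  # separate demo app; app.py does not wire database.py in

@demo_app.get("/health/db")  # hypothetical route
def db_health(db: Session = Depends(get_db)):
    # The session is opened per request and closed by get_db's finally block
    return {"ok": db.is_active}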
email_utils.py ADDED
@@ -0,0 +1,45 @@
+import os
+import smtplib
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+def send_email(to_email, subject, body):
+    """Send an email using the configured SMTP server"""
+    try:
+        # Get email configuration from environment variables
+        email_service = os.getenv("EMAIL_SERVICE")
+        email_user = os.getenv("EMAIL_USER")
+        email_password = os.getenv("EMAIL_PASS")
+        email_from = os.getenv("EMAIL_FROM")
+
+        # Create the message
+        msg = MIMEMultipart()
+        msg['From'] = email_from
+        msg['To'] = to_email
+        msg['Subject'] = subject
+
+        # Add the body to the email
+        msg.attach(MIMEText(body, 'html'))
+
+        # Set up the SMTP server
+        if email_service == 'gmail':
+            server = smtplib.SMTP('smtp.gmail.com', 587)
+        else:
+            raise ValueError(f"Unsupported email service: {email_service}")
+
+        server.starttls()
+        server.login(email_user, email_password)
+
+        # Send the email
+        text = msg.as_string()
+        server.sendmail(email_from, to_email, text)
+        server.quit()
+
+        return {"success": True, "message": "Email sent successfully"}
+    except Exception as e:
+        print(f"Error sending email: {str(e)}")
+        return {"success": False, "message": str(e)}
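send_email reads its configuration from four environment variables (normally supplied via a .env file picked up by load_dotenv). A sketch with placeholder values; with EMAIL_SERVICE=gmail, EMAIL_PASS would typically be a Gmail app password:

import os

# Set inline here for illustration; in practice these live in .env
os.environ.setdefault("EMAIL_SERVICE", "gmail")              # only 'gmail' is supported above
os.environ.setdefault("EMAIL_USER", "recruiter@example.com") # placeholder
os.environ.setdefault("EMAIL_PASS", "app-password-here")     # placeholder
os.environ.setdefault("EMAIL_FROM", "recruiter@example.com") # placeholder

from email_utils import send_email

result = send_email("candidate@example.com", "Hello", "<p>Interview details to follow.</p>")
print(result["success"], result["message"])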
jd_embedding_utils.py ADDED
@@ -0,0 +1,125 @@
+import re
+import nltk
+from collections import defaultdict
+from sentence_transformers import SentenceTransformer, util
+
+# Ensure nltk data is available
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+# Load the embedding model
+sbert = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Template sentences for each JD section
+TEMPLATES = {
+    "job_title": ["We're hiring a Backend Developer", "Job Title: Cloud Engineer", "Looking for a Product Manager"],
+    "responsibilities": ["You will collaborate with teams", "Expected to deliver high performance"],
+    "qualifications": ["Bachelor's or Master's in CS", "Degree in engineering or related field"]
+}
+
+TEMPLATE_EMBEDDINGS = {k: sbert.encode(v, convert_to_tensor=True) for k, v in TEMPLATES.items()}
+
+COMMON_HEADERS = ['responsibilities', 'qualifications']
+
+def clean_line(line):
+    return line.strip()
+
+def classify_line(line):
+    """Assign a line to the JD section whose templates it most resembles"""
+    line_embedding = sbert.encode(line, convert_to_tensor=True)
+    scores = {k: float(util.cos_sim(line_embedding, TEMPLATE_EMBEDDINGS[k]).max()) for k in TEMPLATE_EMBEDDINGS}
+    best_match = max(scores, key=scores.get)
+    return best_match if scores[best_match] > 0.4 else None
+
+def extract_job_title(text):
+    # Regex-based extraction
+    patterns = [
+        r"We are (seeking|looking for|hiring)( an?| a)? (?P<title>[A-Z][a-zA-Z\s\-]+)",
+        r"Job Title[:\-]?\s*(?P<title>[A-Z][\w\s\-]+)"
+    ]
+    for pat in patterns:
+        match = re.search(pat, text, re.IGNORECASE)
+        if match:
+            title = match.group("title").strip()
+
+            # Trim any trailing filler words
+            for stop_word in [" to ", " who ", " that ", " and ", " for ", " with "]:
+                if stop_word in title:
+                    title = title.split(stop_word)[0].strip()
+                    break
+
+            if title.lower() not in ["responsibilities", "description", "qualifications"]:
+                return title
+
+    # Manual fallback: check for a "job title" line
+    for line in text.splitlines():
+        if "job title" in line.lower():
+            return line.split(":")[-1].strip()
+
+    # Final fallback: first short line that isn't a section header
+    for line in text.splitlines():
+        line = line.strip()
+        if not line or line.lower().startswith(("description", "responsibilities", "qualifications")):
+            continue
+        if len(line.split()) <= 7 and line[0].isupper():
+            return line.strip()
+
+    return "Unknown"
+
+def extract_sections(text):
+    lines = text.splitlines()
+    results = defaultdict(list)
+    results["job_title"] = extract_job_title(text)
+
+    current_section = None
+    normalized_headers = {
+        'responsibilities': 'responsibilities',
+        'qualifications': 'qualifications'
+    }
+
+    for line in lines:
+        raw_line = line.strip()
+        if not raw_line:
+            continue
+
+        lower_line = raw_line.lower().strip(":").strip()
+        if lower_line in normalized_headers:
+            current_section = normalized_headers[lower_line]
+            continue
+
+        if current_section:
+            results[current_section].append(raw_line)
+        else:
+            # No header seen yet: fall back to embedding-based classification
+            category = classify_line(raw_line)
+            if category and category != "job_title":
+                results[category].append(raw_line)
+
+    print("🔍 JD Section Classification Results (final):")
+    for section, content in results.items():
+        if section != "job_title":
+            print(f"   {section}: {len(content)} lines")
+
+    return dict(results)
+
+def generate_jd_embedding(jd_text):
+    parsed = extract_sections(jd_text)
+    title = parsed.get("job_title", "Unknown")
+
+    embeddings_by_section = {}
+    for section in ["responsibilities", "qualifications"]:
+        lines = parsed.get(section, [])
+        if lines:
+            combined = " ".join(lines)
+            emb = sbert.encode(combined, convert_to_numpy=True)
+            embeddings_by_section[section] = emb
+            print(f"✅ Embedded section '{section}': shape = {emb.shape}")
+        else:
+            print(f"❌ No content found for section '{section}'")
+            embeddings_by_section[section] = None
+
+    return title, embeddings_by_section
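A quick sketch of these utilities in isolation; the JD string is a stand-in:

from jd_embedding_utils import generate_jd_embedding

jd = """We are seeking a Backend Developer to join our team.
Responsibilities:
Design and maintain REST APIs.
Qualifications:
Bachelor's degree in Computer Science."""

title, embs = generate_jd_embedding(jd)
print(title)                            # "Backend Developer" (regex path; " to ..." is trimmed)
print(embs["responsibilities"].shape)   # (384,) — the all-MiniLM-L6-v2 embedding size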
matcher.py ADDED
@@ -0,0 +1,94 @@
+from sentence_transformers import util
+import numpy as np
+
+# Weights for each aligned JD section.
+# Note: these sum to 1.4, so the combined score ranges from 0 to 1.4 rather than 0 to 1.
+weights = {
+    "responsibilities": 0.7,
+    "qualifications": 0.7
+}
+
+# Compute cosine similarity, falling back to 0 when either side is missing
+def safe_cos_sim(vec1, vec2):
+    if vec1 is None or vec2 is None:
+        return 0.0
+    return float(util.cos_sim(vec1, vec2).item())
+
+# Enhanced explanation with match levels
+def interpret_match(label, score):
+    if score >= 0.75:
+        return f"✅ Strong alignment in {label}: {round(score * 100, 1)}%"
+    elif score >= 0.5:
+        return f"⚠️ Partial alignment in {label}: {round(score * 100, 1)}%"
+    else:
+        return f"❌ Weak alignment in {label}: {round(score * 100, 1)}%"
+
+# Matching logic
+def calculate_match_score(jd_embeddings, resume_embeddings):
+    explanation = []
+    total_score = 0.0
+
+    # Responsibilities: compared against experience + projects
+    jd_resp = jd_embeddings.get("responsibilities")
+    resume_resp = _combine_embeddings([
+        resume_embeddings.get("experience"),
+        resume_embeddings.get("projects")
+    ])
+    sim_resp = safe_cos_sim(jd_resp, resume_resp)
+    total_score += sim_resp * weights["responsibilities"]
+    explanation.append(interpret_match("Responsibilities", sim_resp))
+
+    # Qualifications: compared against education + certifications + skills
+    jd_qual = jd_embeddings.get("qualifications")
+    resume_qual = _combine_embeddings([
+        resume_embeddings.get("education"),
+        resume_embeddings.get("certifications"),
+        resume_embeddings.get("skills")
+    ])
+    sim_qual = safe_cos_sim(jd_qual, resume_qual)
+    total_score += sim_qual * weights["qualifications"]
+    explanation.append(interpret_match("Qualifications", sim_qual))
+
+    return round(total_score, 3), explanation
+
+# Combine multiple numpy vectors into one by averaging
+def _combine_embeddings(embeddings_list):
+    valid = [vec for vec in embeddings_list if vec is not None]
+    if not valid:
+        return None
+    return np.mean(valid, axis=0)
+
+# Main matcher
+def match_all_resumes(jd_title, jd_embeddings, resume_data, threshold=0.8):
+    all_candidates = []
+
+    print(f"\n📌 Matching resumes against JD: **{jd_title}**\n")
+
+    for filename, data in resume_data.items():
+        parsed = data.get("parsed", {})
+        embeddings = data.get("embedding", {})
+
+        name = _extract_name(parsed, fallback=filename)
+        score, explanation = calculate_match_score(jd_embeddings, embeddings)
+
+        print(f"🔍 {name} — Score: {round(score * 100, 1)}%")
+        for line in explanation:
+            print(" •", line)
+        print("✅ Shortlisted\n" if score >= threshold else "❌ Not shortlisted\n")
+
+        all_candidates.append({
+            "name": name,
+            "score": score,
+            "reasoning": explanation,
+            "resume_id": data.get("id"),  # resume ID, if one was stored with the data
+            "is_match": score >= threshold  # flag for passing the threshold
+        })
+
+    return all_candidates
+
+# Name extractor fallback
+def _extract_name(parsed, fallback="Unknown"):
+    name_lines = parsed.get("name", [])
+    for line in name_lines:
+        if line and any(c.isalpha() for c in line):
+            return line.strip()
+    return fallback
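A toy example of the scoring path, with random vectors standing in for real sentence embeddings:

import numpy as np
from matcher import calculate_match_score

rng = np.random.default_rng(0)
dim = 384  # all-MiniLM-L6-v2 embedding size

jd_embs = {"responsibilities": rng.standard_normal(dim), "qualifications": rng.standard_normal(dim)}
cv_embs = {"experience": rng.standard_normal(dim)}  # no education/certifications/skills vectors

score, reasoning = calculate_match_score(jd_embs, cv_embs)
# The qualifications side has no valid vectors, so safe_cos_sim contributes 0 there
print(score, reasoning)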
models.py ADDED
@@ -0,0 +1,91 @@
+from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Float, Text, LargeBinary, DateTime, JSON
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship
+from sqlalchemy.sql import func
+
+Base = declarative_base()
+
+class User(Base):
+    __tablename__ = "users"
+
+    id = Column(Integer, primary_key=True, index=True)
+    email = Column(String, unique=True, index=True)
+    password = Column(String)
+    name = Column(String)
+    created_at = Column(DateTime, server_default=func.now())
+
+    job_descriptions = relationship("JobDescription", back_populates="user")
+    resumes = relationship("Resume", back_populates="user")
+    memories = relationship("ApplicationMemory", back_populates="user")
+
+class JobDescription(Base):
+    __tablename__ = "job_descriptions"
+
+    id = Column(Integer, primary_key=True, index=True)
+    user_id = Column(Integer, ForeignKey("users.id"))
+    title = Column(String)
+    description = Column(Text)
+    embedding_data = Column(JSON)
+    sections = Column(JSON)
+    created_at = Column(DateTime, server_default=func.now())
+
+    user = relationship("User", back_populates="job_descriptions")
+    matches = relationship("Match", back_populates="job_description")
+
+class Resume(Base):
+    __tablename__ = "resumes"
+
+    id = Column(Integer, primary_key=True, index=True)
+    user_id = Column(Integer, ForeignKey("users.id"))
+    filename = Column(String)
+    candidate_name = Column(String, nullable=True)
+    file_content = Column(LargeBinary)
+    parsed_data = Column(JSON)
+    embedding_data = Column(JSON)
+    summary = Column(Text)
+    created_at = Column(DateTime, server_default=func.now())
+
+    user = relationship("User", back_populates="resumes")
+    matches = relationship("Match", back_populates="resume")
+
+class Match(Base):
+    __tablename__ = "matches"
+
+    id = Column(Integer, primary_key=True, index=True)
+    job_description_id = Column(Integer, ForeignKey("job_descriptions.id"))
+    resume_id = Column(Integer, ForeignKey("resumes.id"))
+    score = Column(Float)
+    is_match = Column(Boolean)
+    reasoning = Column(JSON)
+    created_at = Column(DateTime, server_default=func.now())
+
+    job_description = relationship("JobDescription", back_populates="matches")
+    resume = relationship("Resume", back_populates="matches")
+    interviews = relationship("Interview", back_populates="match")
+
+class Interview(Base):
+    __tablename__ = "interviews"
+
+    id = Column(Integer, primary_key=True, index=True)
+    match_id = Column(Integer, ForeignKey("matches.id"))
+    scheduled_time = Column(DateTime)
+    email_sent = Column(Boolean, default=False)
+    status = Column(String, default="scheduled")  # scheduled, completed, cancelled
+    notes = Column(Text, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())
+
+    match = relationship("Match", back_populates="interviews")
+
+class ApplicationMemory(Base):
+    """Long-term memory storage for the application"""
+    __tablename__ = "application_memories"
+
+    id = Column(Integer, primary_key=True, index=True)
+    user_id = Column(Integer, ForeignKey("users.id"))
+    type = Column(String, index=True)  # job_description, resume, match, interview, etc.
+    reference_id = Column(Integer)  # ID of the referenced entity
+    data = Column(JSON)  # Main data to store
+    # "metadata" is a reserved attribute name in SQLAlchemy's declarative API, so the
+    # column keeps its name while the Python attribute is exposed as meta_data
+    meta_data = Column("metadata", JSON, nullable=True)  # Additional metadata
+    created_at = Column(DateTime, server_default=func.now())
+
+    user = relationship("User", back_populates="memories")
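Note that models.py declares its own Base, separate from the one in database.py, so table creation has to go through this module's metadata. A sketch with illustrative values:

from database import engine, SessionLocal
from models import Base, User

Base.metadata.create_all(bind=engine)  # create the tables declared above

db = SessionLocal()
db.add(User(email="hr@example.com", password="hashed-password-here", name="HR Admin"))
db.commit()
db.close()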
requirements.txt ADDED
@@ -0,0 +1,14 @@
+nltk
+sentence-transformers
+spacy
+numpy<2
+pybind11
+torch
+torchvision
+tf-keras
+fastapi
+uvicorn
+python-multipart
+pdfplumber
+python-dotenv
+email-validator
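Two runtime dependencies are not captured here: the spaCy model en_core_web_sm, which resume_embedding_utils.py loads at import time and which is installed separately with `python -m spacy download en_core_web_sm`, and SQLAlchemy, which database.py and models.py import but which does not appear in the list, so it would presumably need to be added. The NLTK punkt tokenizer is downloaded automatically on first import of the embedding modules.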
resume_embedding_utils.py ADDED
@@ -0,0 +1,167 @@
+# --- resume_embedding_utils.py ---
+import re
+import nltk
+import spacy
+import pdfplumber
+from collections import defaultdict
+from sentence_transformers import SentenceTransformer, util
+from pathlib import Path
+
+# --- Setup ---
+# Ensure the punkt tokenizer is available without re-downloading on every import
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+nlp = spacy.load("en_core_web_sm")
+sbert = SentenceTransformer("all-MiniLM-L6-v2")
+
+# --- Templates for fallback classification ---
+RESUME_TEMPLATES = {
+    "name": ["My name is", "Resume of", "Name:"],
+    "skills": ["Skills: Python, Java", "Proficient in C++ and ML"],
+    "experience": ["Worked at Google", "Software Engineer at Amazon"],
+    "education": ["Bachelor of Technology from IIT", "Master's in Data Science"],
+    "certifications": ["AWS Certified", "Completed PMP Certification"],
+    "projects": ["Built an AI chatbot", "Project: Deep Learning"],
+    "tech_stack": ["Tech Stack: Python, TensorFlow", "Languages: Java, C++"]
+}
+
+TEMPLATE_EMBEDDINGS = {
+    k: sbert.encode(v, convert_to_tensor=True)
+    for k, v in RESUME_TEMPLATES.items()
+}
+
+COMMON_HEADERS = {
+    "skills": ["skills", "technical skills"],
+    "experience": ["experience", "work experience", "employment"],
+    "education": ["education", "academics"],
+    "certifications": ["certifications"],
+    "projects": ["projects", "achievements"],
+    "tech_stack": ["tech stack", "languages", "tools"],
+    "name": ["name", "profile"]
+}
+
+def normalize_header(text):
+    lower = text.lower().strip().strip(":")
+    for section, aliases in COMMON_HEADERS.items():
+        if any(lower.startswith(alias) for alias in aliases):
+            return section
+    return None
+
+def classify_line(line):
+    """Assign a line to the resume section whose templates it most resembles"""
+    emb = sbert.encode(line, convert_to_tensor=True)
+    scores = {
+        k: float(util.cos_sim(emb, TEMPLATE_EMBEDDINGS[k]).max())
+        for k in TEMPLATE_EMBEDDINGS
+    }
+    best = max(scores, key=scores.get)
+    return best if scores[best] > 0.4 else None
+
+def extract_name(text):
+    for line in text.splitlines():
+        doc = nlp(line.strip())
+        for ent in doc.ents:
+            if ent.label_ == "PERSON":
+                return ent.text.strip()
+    return None
+
+def pdf_to_text(pdf_path):
+    with pdfplumber.open(pdf_path) as pdf:
+        return "\n".join([page.extract_text() or "" for page in pdf.pages])
+
+def extract_resume_sections(text):
+    lines = text.splitlines()
+    merged_lines = []
+    prev_line = ""
+
+    # Merge continuation lines into the previous line
+    for raw in lines:
+        line = raw.strip()
+        if not line:
+            continue
+        if prev_line and (line[0].islower() or line.startswith(("and", "which", "-", "or", ",", "of", "to"))):
+            merged_lines[-1] += " " + line
+        else:
+            merged_lines.append(line)
+        prev_line = line
+
+    sections = defaultdict(list)
+    current_section = None
+    name_found = extract_name(text)
+
+    for line in merged_lines:
+        normalized = normalize_header(line)
+        if normalized:
+            current_section = normalized
+            continue
+
+        # Keyword heuristics for lines that are content rather than headers
+        lower = line.lower()
+        if any(w in lower for w in ["bachelor", "ph.d", "master", "diploma", "msc", "b.tech", "mba"]):
+            current_section = "education"
+        elif "tech stack" in lower or "languages" in lower or "tools" in lower:
+            current_section = "tech_stack"
+        elif "achievements" in lower or line.startswith(("Built", "Developed")) or "project" in lower:
+            current_section = "projects"
+        elif "work experience" in lower or re.search(r"(intern|engineer|manager|scientist|developer)", lower):
+            current_section = "experience"
+
+        # Last resort: embedding-based classification
+        if not current_section:
+            current_section = classify_line(line)
+
+        if current_section:
+            # Glue obvious continuation sentences onto the previous entry
+            if current_section in ["education", "experience", "certifications"] and sections[current_section]:
+                if line[0].islower() or re.match(r"^(Concentrated|Focused|Research|Worked|Led|Responsible|Published|with|and|using|or|to)\b", line):
+                    sections[current_section][-1] += " " + line
+                    continue
+            sections[current_section].append(line)
+
+    if name_found and name_found not in sections.get("name", []):
+        sections["name"].insert(0, name_found)
+
+    return dict(sections)
+
+def generate_resume_embedding(parsed_resume):
+    combined = " ".join(
+        parsed_resume.get("skills", []) +
+        parsed_resume.get("experience", []) +
+        parsed_resume.get("education", []) +
+        parsed_resume.get("certifications", []) +
+        parsed_resume.get("projects", []) +
+        parsed_resume.get("tech_stack", [])
+    )
+    if not combined.strip():
+        return sbert.encode("generic resume", convert_to_numpy=True)
+    return sbert.encode(combined, convert_to_numpy=True)
+
+def generate_embeddings_for_all_resumes(pdf_paths):
+    results = {}
+
+    print("\n🔍 DEBUGGING RESUME PARSING:\n")
+
+    for pdf_path in pdf_paths:
+        file_name = Path(pdf_path).name
+        text = pdf_to_text(pdf_path)
+        parsed = extract_resume_sections(text)
+
+        print(f"\n📄 Resume: {file_name}")
+        for section in ["name", "skills", "experience", "education", "certifications", "projects", "tech_stack"]:
+            lines = parsed.get(section)
+            if lines:
+                print(f"   ✅ {section.title()}: {len(lines)} line(s)")
+            else:
+                print(f"   ❌ {section.title()}: Not found")
+
+        embedding = generate_resume_embedding(parsed)
+        print(f"   🔢 Embedding shape: {embedding.shape}")
+
+        results[file_name] = {
+            # One embedding per section; None when the section was not detected
+            "embedding": {
+                section: sbert.encode(" ".join(parsed[section]), convert_to_numpy=True) if parsed.get(section) else None
+                for section in ["skills", "experience", "education", "certifications", "projects", "tech_stack"]
+            },
+            "parsed": parsed
+        }
+
+    return results
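A sketch of running the resume pipeline directly; the PDF path is a placeholder:

from resume_embedding_utils import generate_embeddings_for_all_resumes

results = generate_embeddings_for_all_resumes(["resumes/alice.pdf"])  # hypothetical path
data = results["alice.pdf"]
print(data["parsed"].get("skills", []))       # raw skill lines, if any were detected
emb = data["embedding"]["skills"]
print(None if emb is None else emb.shape)     # (384,) when the section was found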
schemas.py ADDED
@@ -0,0 +1,16 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any
+from datetime import datetime
+
+# ...existing schemas...
+
+class MemoryData(BaseModel):
+    id: int
+    type: str
+    reference_id: int
+    data: Dict[str, Any]
+    metadata: Optional[Dict[str, Any]] = None
+    created_at: datetime
+
+    class Config:
+        orm_mode = True  # Pydantic v1 setting; lets MemoryData be built from ORM objects
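A small sketch of the schema in use (Pydantic v1, matching the orm_mode setting; values are illustrative):

from datetime import datetime
from schemas import MemoryData

memory = MemoryData(
    id=1, type="match", reference_id=42,
    data={"score": 0.81},
    metadata={"source": "test-run"},  # optional
    created_at=datetime.now(),
)
print(memory.json())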