husseinelsaadi committed on
Commit
9ee49ff
·
1 Parent(s): a1b807c

qdrant included

Browse files
backend/services/interview_engine.py CHANGED
@@ -8,6 +8,12 @@ import logging
8
  import tempfile
9
  import shutil
10
  import torch
 
 
 
 
 
 
11
 
12
  if torch.cuda.is_available():
13
  print("🔥 CUDA Available")
@@ -96,6 +102,10 @@ load_whisper_model()
96
 
97
  def generate_first_question(profile, job):
98
  """Generate the first interview question based on profile and job"""
 
 
 
 
99
  try:
100
  prompt = f"""
101
  You are conducting an interview for a {job.role} position at {job.company}.
@@ -104,6 +114,9 @@ def generate_first_question(profile, job):
104
  - Experience: {profile.get('experience', [])}
105
  - Education: {profile.get('education', [])}
106
 
 
 
 
107
  Generate an appropriate opening interview question that is professional and relevant.
108
  Keep it concise and clear. Respond with ONLY the question text, no additional formatting.
109
  If the interview is for a technical role, focus on technical skills. Make the question related
 
8
  import tempfile
9
  import shutil
10
  import torch
11
+ from backend.services.interview_retrieval import (
12
+ extract_all_roles_from_qdrant,
13
+ retrieve_interview_data,
14
+ random_context_chunks
15
+ )
16
+
17
 
18
  if torch.cuda.is_available():
19
  print("🔥 CUDA Available")
 
102
 
103
  def generate_first_question(profile, job):
104
  """Generate the first interview question based on profile and job"""
105
+ all_roles = extract_all_roles_from_qdrant()
106
+ retrieved_data = retrieve_interview_data(job.role.lower(), all_roles)
107
+ context_data = random_context_chunks(retrieved_data, k=4)
108
+
109
  try:
110
  prompt = f"""
111
  You are conducting an interview for a {job.role} position at {job.company}.
 
114
  - Experience: {profile.get('experience', [])}
115
  - Education: {profile.get('education', [])}
116
 
117
+ Use the following context to generate a relevant opening question:
118
+ {context_data}
119
+
120
  Generate an appropriate opening interview question that is professional and relevant.
121
  Keep it concise and clear. Respond with ONLY the question text, no additional formatting.
122
  If the interview is for a technical role, focus on technical skills. Make the question related
backend/services/interview_retrieval.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helper functions for retrieving interview questions and answers from an
3
+ existing Qdrant vector collection. These functions encapsulate the
4
+ logic for extracting available job roles, fetching all Q&A pairs for a
5
+ given role, finding similar roles when an exact match is not present,
6
+ and assembling a randomised context from retrieved data. They rely on
7
+ the ``qdrant-client`` library for interacting with the remote
8
+ collection, ``sentence-transformers`` for computing embeddings, and
9
+ scikit-learn's cosine similarity implementation.
10
+
11
+ The collection is expected to exist prior to use and to be
12
+ configured with vectors generated by the all-MiniLM-L6-v2 model. Do
13
+ not modify the connection details, vector size or distance metric.
14
+
15
+ Usage example::
16
+
17
+ from backend.services.interview_retrieval import (
18
+ extract_all_roles_from_qdrant, retrieve_interview_data,
19
+ random_context_chunks
20
+ )
21
+
22
+ all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions")
23
+ retrieved = retrieve_interview_data("data scientist", all_roles)
24
+ context = random_context_chunks(retrieved, k=4)
25
+
26
+ The above snippet fetches all stored roles, retrieves Q&A pairs for
27
+ the specified role (falling back to similar roles if necessary), and
28
+ builds a randomised context of four question/answer items.
29
+
30
+ These helpers are designed to be drop‑in compatible with the existing
31
+ interview system. They deliberately avoid using Qdrant's ``search``
32
+ API, instead relying on ``scroll`` to iterate through all records.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import logging
38
+ import random
39
+ from typing import Dict, List, Sequence, Tuple
40
+
41
+ try:
42
+ # Attempt to import Qdrant client classes. In environments where
43
+ # qdrant-client is not installed (e.g. during local testing without
44
+ # vector storage), these imports will fail. We handle that by
45
+ # assigning ``None`` to the client and conditionally disabling
46
+ # functions that depend on it.
47
+ from qdrant_client import QdrantClient # type: ignore
48
+ from qdrant_client.http.models import Filter, FieldCondition, MatchValue # type: ignore
49
+ except Exception:
50
+ QdrantClient = None # type: ignore
51
+ Filter = None # type: ignore
52
+ FieldCondition = None # type: ignore
53
+ MatchValue = None # type: ignore
54
+ from sklearn.metrics.pairwise import cosine_similarity
55
+ import numpy as np
56
+
57
+ # ``sentence_transformers`` is an optional dependency. To avoid
58
+ # import‑time errors in environments where it is absent (e.g. during
59
+ # lightweight testing or static analysis), we avoid importing it at
60
+ # module level. Instead, ``LocalEmbeddings`` will attempt to import
61
+ # SentenceTransformer when instantiated. If the import fails, a
62
+ # RuntimeError is raised from within the constructor, signalling that
63
+ # embedding functionality is unavailable.
64
+ SentenceTransformer = None # type: ignore
65
+
66
+
67
# ---------------------------------------------------------------------------
# Qdrant configuration
#
# Connection settings for the existing Qdrant instance that holds the
# interview questions and answers. They are read from the environment so
# that no credential lives in source control:
#
#   QDRANT_URL      optional; defaults to the existing cluster endpoint.
#   QDRANT_API_KEY  required; when it is missing the client is left as
#                   ``None`` and the retrieval helpers return empty results.
#
# SECURITY NOTE(review): an API key used to be hard-coded at this spot and
# was committed with this file, so it should be treated as leaked and
# rotated.
import os  # needed only for the environment lookups in this section

_DEFAULT_QDRANT_URL = (
    "https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333"
)
_qdrant_url = os.environ.get("QDRANT_URL", _DEFAULT_QDRANT_URL)
_qdrant_api_key = os.environ.get("QDRANT_API_KEY")

if QdrantClient is not None and _qdrant_api_key:
    qdrant_client: QdrantClient | None = QdrantClient(
        url=_qdrant_url,
        api_key=_qdrant_api_key,
    )
else:
    if QdrantClient is not None:
        # The library is installed but no credential was supplied; say so
        # explicitly instead of failing later with an opaque auth error.
        logging.error(
            "QDRANT_API_KEY is not set; Qdrant client is disabled and "
            "interview retrieval will return no data."
        )
    qdrant_client = None
80
+
81
+ # Name of the Qdrant collection containing interview Q&A pairs. Do not
82
+ # modify this value; the collection already exists and is populated.
83
+ COLLECTION_NAME: str = "interview_questions"
84
+
85
+
86
class LocalEmbeddings:
    """
    Thin adapter over a SentenceTransformer model.

    Exposes one method for embedding a single query string and one for
    embedding a batch of documents. The default model name is
    ``all-MiniLM-L6-v2``.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        # The module-level ``SentenceTransformer`` placeholder is filled in
        # on first use so that importing this module does not require the
        # optional dependency.
        global SentenceTransformer
        if SentenceTransformer is None:
            try:
                from sentence_transformers import SentenceTransformer as _ST  # type: ignore
            except Exception as exc:
                # Surface a clear error; the caller decides how to degrade.
                raise RuntimeError(
                    "sentence-transformers is required to compute embeddings. Please install the package."
                ) from exc
            SentenceTransformer = _ST  # type: ignore
        self.model = SentenceTransformer(model_name)  # type: ignore

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of one query string as a list of floats."""
        vector = self.model.encode(text)
        return vector.tolist()

    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per input document."""
        batch = list(documents)
        matrix = self.model.encode(batch)
        return matrix.tolist()
115
+
116
+
117
+ # Instantiate the embeddings once. This avoids repeatedly loading
118
+ # model weights on each function call. If sentence-transformers is
119
+ # unavailable, ``embeddings`` will be set to ``None`` and similarity
120
+ # searches will be disabled. Consumers should check for ``None``
121
+ # where appropriate.
122
# Start from the "disabled" state and upgrade to a real embedder only when
# construction succeeds; any failure is logged and leaves ``None`` in place.
embeddings: LocalEmbeddings | None = None
try:
    embeddings = LocalEmbeddings()
except Exception as exc:
    logging.warning(
        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. "
        f"Error: {exc}"
    )
130
+
131
+
132
def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
    """
    Collect every distinct ``job_role`` stored in a Qdrant collection.

    The collection is paged through with the client's ``scroll`` call in
    batches of 256 points. Each payload's ``job_role`` value, when it is a
    non-blank string, is stripped and lowercased before being recorded.
    If a scroll call raises, the error is logged and whatever has been
    gathered so far is returned.

    Parameters
    ----------
    collection_name : str, optional
        Collection to read from. Defaults to ``COLLECTION_NAME``.

    Returns
    -------
    List[str]
        Sorted list of unique, lowercased role names; empty when the
        Qdrant client is unavailable.
    """
    # Without a client there is nothing to query.
    if qdrant_client is None:
        logging.error(
            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
        )
        return []

    batch_size = 256  # keeps each page small enough to hold in memory
    roles_seen: set[str] = set()
    cursor = None
    exhausted = False

    while not exhausted:
        try:
            # ``scroll`` yields (points, next_offset).
            batch, cursor = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=None,
                offset=cursor,
                limit=batch_size,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}")
            break

        for record in batch:
            payload = getattr(record, "payload", {}) or {}
            raw_role = payload.get("job_role")
            if not isinstance(raw_role, str):
                continue
            cleaned = raw_role.strip()
            if cleaned:
                roles_seen.add(cleaned.lower())

        # A ``None`` cursor marks the final page.
        exhausted = cursor is None

    return sorted(roles_seen)
190
+
191
+
192
def get_role_questions(
    job_role: str,
    collection_name: str = COLLECTION_NAME,
) -> List[Dict[str, str]]:
    """
    Retrieve all interview questions and answers for a specific job role.

    Pages through the collection with Qdrant's ``scroll`` API using a
    ``Filter`` that matches the ``job_role`` payload field against the
    stripped, lowercased form of ``job_role``. If a scroll call raises,
    the error is logged and the entries gathered so far are returned.

    Parameters
    ----------
    job_role : str
        The job role to look up. It is stripped and lowercased before the
        filter is built.
    collection_name : str, optional
        Collection to read from. Defaults to ``COLLECTION_NAME`` so existing
        callers are unaffected; mirrors the parameter accepted by
        ``extract_all_roles_from_qdrant``.

    Returns
    -------
    List[Dict[str, str]]
        One dictionary per matching point with ``question``, ``answer`` and
        ``job_role`` keys. Points whose three fields are not all strings
        are skipped. Empty when the input is blank, when the Qdrant client
        or filter classes are unavailable, or when nothing matches.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []

    if qdrant_client is None or Filter is None or FieldCondition is None or MatchValue is None:
        logging.error(
            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
        )
        return []

    role_lower = job_role.strip().lower()

    # The payload is flat, so a single top-level key is enough.
    # NOTE(review): only the *input* is normalised here. This presumably
    # relies on stored ``job_role`` values already being lowercase and
    # trimmed; confirm against the ingestion pipeline.
    match_filter = Filter(
        must=[
            FieldCondition(
                key="job_role",
                match=MatchValue(value=role_lower),
            )
        ]
    )

    results: List[Dict[str, str]] = []
    offset = None
    limit = 256

    while True:
        try:
            points, offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=match_filter,
                offset=offset,
                limit=limit,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error("Error retrieving questions for role '%s': %s", job_role, exc)
            break

        for point in points:
            payload = getattr(point, "payload", {}) or {}
            question = payload.get("question")
            answer = payload.get("answer")
            payload_role = payload.get("job_role")
            if (
                isinstance(question, str)
                and isinstance(answer, str)
                and isinstance(payload_role, str)
            ):
                results.append(
                    {
                        "question": question,
                        "answer": answer,
                        "job_role": payload_role,
                    }
                )

        # ``None`` means the last page has been consumed.
        if offset is None:
            break

    return results
270
+
271
+
272
def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
    """
    Rank known roles by embedding similarity to ``user_role``.

    The stripped, lowercased ``user_role`` and every candidate role are
    embedded with the module-level ``embeddings`` object, compared by
    cosine similarity, and the best ``top_k`` candidates are returned.
    Roles equal to ``user_role`` (ignoring case) are not candidates.

    Parameters
    ----------
    user_role : str
        Role text supplied by the user.
    all_roles : Sequence[str]
        Role names to rank against.
    top_k : int, optional
        Upper bound on the number of roles returned. Defaults to 3;
        negative values are treated as 0.

    Returns
    -------
    List[str]
        Candidate roles ordered from most to least similar. Empty when the
        input is blank or not a string, when there are no candidates, when
        embeddings are unavailable, or when the computation raises.
    """
    if not (isinstance(user_role, str) and user_role.strip() and all_roles):
        return []

    target = user_role.strip().lower()

    # Anything spelled the same as the query (case aside) is not "similar".
    others = [name for name in all_roles if name.lower() != target]
    if not others:
        return []

    if embeddings is None:
        logging.warning(
            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
        )
        return []

    try:
        query_matrix = np.array([embeddings.embed_query(target)])
        candidate_matrix = np.array(embeddings.embed_documents(others))

        # Row 0 holds the query's similarity to each candidate.
        scores = cosine_similarity(query_matrix, candidate_matrix)[0]

        # Stable sort, highest score first.
        ranked = sorted(zip(others, scores), key=lambda pair: pair[1], reverse=True)

        cutoff = max(0, top_k)
        return [name for name, _score in ranked[:cutoff]]
    except Exception as exc:
        logging.error(f"Error finding similar roles for '{user_role}': {exc}")
        return []
333
+
334
+
335
def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
    """
    Fetch Q&A entries for a job role, falling back to similar roles.

    Steps:
    1. Ask ``get_role_questions`` for entries stored under ``job_role``.
    2. If that yields nothing, ask ``find_similar_roles`` for up to three
       similar roles and gather the entries of each.
    3. Drop entries whose question text has already been seen, keeping
       the first occurrence.

    Parameters
    ----------
    job_role : str
        The role requested by the user.
    all_roles : Sequence[str]
        Known roles, supplied by the caller and passed through to
        ``find_similar_roles``.

    Returns
    -------
    List[Dict[str, str]]
        Entries with unique question text, in retrieval order. Empty when
        ``job_role`` is blank or not a string, or when nothing is found.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []

    # Exact role first.
    matches: List[Dict[str, str]] = get_role_questions(job_role)

    # Nothing stored under that name: widen to the nearest roles.
    if not matches:
        for neighbour in find_similar_roles(job_role, all_roles, top_k=3):
            matches.extend(get_role_questions(neighbour))

    # Insertion-ordered dict keeps the first entry seen for each question.
    first_by_question: Dict[str, Dict[str, str]] = {}
    for entry in matches:
        text = entry.get("question")
        if isinstance(text, str):
            first_by_question.setdefault(text, entry)

    return list(first_by_question.values())
386
+
387
+
388
def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
    """
    Build a context string by sampling Q&A pairs from retrieved data.

    Entries are first reduced to those whose ``question`` and ``answer``
    are both non-blank strings; up to ``k`` of those are then chosen at
    random without replacement. Filtering before sampling means unusable
    entries never take up one of the ``k`` slots, and a missing or
    ``None`` field is skipped instead of raising.

    Each chosen pair is rendered as a ``Q: <question>`` line followed by
    an ``A: <answer>`` line, and pairs are separated by a blank line.

    Parameters
    ----------
    retrieved_data : Sequence[Dict[str, str]]
        Q&A dictionaries, e.g. as returned by ``retrieve_interview_data``.
    k : int, optional
        Maximum number of pairs to include. Defaults to 3. When ``k``
        exceeds the number of usable pairs, all of them are used (in
        random order). Values of zero or below yield an empty string.

    Returns
    -------
    str
        The formatted context, or an empty string when there is no usable
        entry or ``k`` is not positive.
    """
    if not retrieved_data or k <= 0:
        return ""

    # Keep only entries that can actually be rendered, with whitespace
    # trimmed once up front.
    usable: List[tuple] = []
    for item in retrieved_data:
        question = item.get("question")
        answer = item.get("answer")
        if not isinstance(question, str) or not isinstance(answer, str):
            continue
        question = question.strip()
        answer = answer.strip()
        if question and answer:
            usable.append((question, answer))

    if not usable:
        return ""

    # ``random.sample`` raises ValueError when asked for more items than
    # exist, hence the cap.
    chosen = random.sample(usable, min(k, len(usable)))

    return "\n\n".join(f"Q: {question}\nA: {answer}" for question, answer in chosen)
434
+
435
+
436
# Names exported by ``from ... import *``.
__all__ = [
    # Retrieval helpers
    "extract_all_roles_from_qdrant",
    "get_role_questions",
    "find_similar_roles",
    "retrieve_interview_data",
    "random_context_chunks",
    # Shared module-level objects (either may be ``None``)
    "embeddings",
    "qdrant_client",
]