"""
Helper functions for retrieving interview questions and answers from an
existing Qdrant vector collection.

These functions encapsulate the logic for extracting available job
roles, fetching all Q&A pairs for a given role, finding similar roles
when an exact match is not present, and assembling a randomised context
from retrieved data.  They rely on the ``qdrant-client`` library for
interacting with the remote collection, ``sentence-transformers`` for
computing embeddings, and scikit-learn's cosine similarity
implementation.

The collection is expected to exist prior to use and to be configured
with vectors generated by the all-MiniLM-L6-v2 model.  Do not modify
the connection details, vector size or distance metric.

Usage example::

    from backend.services.interview_retrieval import (
        extract_all_roles_from_qdrant, retrieve_interview_data,
        random_context_chunks
    )

    all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions")
    retrieved = retrieve_interview_data("data scientist", all_roles)
    context = random_context_chunks(retrieved, k=4)

The above snippet fetches all stored roles, retrieves Q&A pairs for
the specified role (falling back to similar roles if necessary), and
builds a randomised context of four question/answer items.

These helpers are designed to be drop-in compatible with the existing
interview system.  They deliberately avoid using Qdrant's ``search``
API, instead relying on ``scroll`` to iterate through all records.
"""
from __future__ import annotations

import logging
import random
from typing import Dict, List, Sequence, Tuple

try:
    # qdrant-client is optional.  In environments without vector storage
    # (e.g. local testing or static analysis) the import fails; the
    # dependent helpers detect the ``None`` sentinels and degrade
    # gracefully by returning empty results instead of raising.
    from qdrant_client import QdrantClient  # type: ignore
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue  # type: ignore
except Exception:  # pragma: no cover - depends on the runtime environment
    QdrantClient = None  # type: ignore
    Filter = None  # type: ignore
    FieldCondition = None  # type: ignore
    MatchValue = None  # type: ignore

import numpy as np

try:
    # scikit-learn is only needed for similarity ranking in
    # ``find_similar_roles``; treat it as optional like the other
    # heavyweight dependencies.  The only call site wraps its use in a
    # try/except, so a ``None`` sentinel degrades to an empty result.
    from sklearn.metrics.pairwise import cosine_similarity  # type: ignore
except Exception:  # pragma: no cover - depends on the runtime environment
    cosine_similarity = None  # type: ignore

# ``sentence_transformers`` is imported lazily inside
# ``LocalEmbeddings.__init__`` to avoid import-time errors where it is
# absent.  This module-level name acts as a cache once the import
# succeeds; ``LocalEmbeddings`` raises RuntimeError if it never does.
SentenceTransformer = None  # type: ignore
# ---------------------------------------------------------------------------
# Qdrant configuration
#
# These connection details must not be altered.  They point to the
# existing Qdrant instance containing interview questions and answers.
#
# NOTE(review): the URL and API key are hard-coded in source.  Consider
# loading them from environment variables or a secrets manager so the
# credential is not committed to version control.
if QdrantClient is not None:
    qdrant_client: QdrantClient | None = QdrantClient(
        url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333",
        api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4",
    )
else:
    # qdrant-client could not be imported; retrieval helpers check for
    # ``None`` and return empty results instead of raising.
    qdrant_client = None

# Name of the Qdrant collection containing interview Q&A pairs.  Do not
# modify this value; the collection already exists and is populated.
COLLECTION_NAME: str = "interview_questions"
class LocalEmbeddings:
    """
    Lightweight wrapper around a SentenceTransformer model.

    Provides convenience methods for embedding a single query string or
    a list of documents.  The default model name matches the one used
    during data ingestion (all-MiniLM-L6-v2) so computed vectors remain
    comparable with those already stored in Qdrant.

    Raises
    ------
    RuntimeError
        From the constructor when ``sentence-transformers`` cannot be
        imported.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        # Import lazily and cache in the module-level name so repeated
        # instantiations skip the (slow) import machinery.
        global SentenceTransformer
        if SentenceTransformer is None:
            try:
                from sentence_transformers import SentenceTransformer as _ST  # type: ignore

                SentenceTransformer = _ST  # type: ignore
            except Exception as exc:
                # Fail loudly when embeddings cannot be loaded; the caller
                # must ensure ``sentence-transformers`` is installed.
                raise RuntimeError(
                    "sentence-transformers is required to compute embeddings. Please install the package."
                ) from exc
        self.model = SentenceTransformer(model_name)  # type: ignore

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return a list of floats."""
        return self.model.encode(text).tolist()

    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
        """Embed a sequence of documents and return a list of vectors."""
        return self.model.encode(list(documents)).tolist()
# Instantiate the embeddings once so model weights are not reloaded on
# every function call.  If sentence-transformers is unavailable,
# ``embeddings`` is ``None`` and similarity searches are disabled;
# consumers must check for ``None`` where appropriate.
try:
    embeddings: LocalEmbeddings | None = LocalEmbeddings()
except Exception as exc:
    # Lazy %-style arguments avoid formatting when the level is filtered.
    logging.warning(
        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. Error: %s",
        exc,
    )
    embeddings = None
def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
    """
    Extract all unique job roles from the specified Qdrant collection.

    Iterates through every point in the collection using Qdrant's
    ``scroll`` API and collects the ``job_role`` field from payloads.
    Roles are normalised to lowercase before deduplication; callers
    should likewise normalise input when performing comparisons.

    Parameters
    ----------
    collection_name : str, optional
        Name of the Qdrant collection.  Defaults to ``COLLECTION_NAME``.

    Returns
    -------
    List[str]
        A sorted list of unique job roles present in the collection.
        Empty when the client is unavailable or scrolling fails.
    """
    # Without a client there is nothing to scroll; degrade gracefully.
    if qdrant_client is None:
        logging.error(
            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
        )
        return []

    unique_roles: set[str] = set()
    offset = None  # opaque paging cursor returned by ``scroll``
    limit = 256  # reasonable batch size to avoid heavy memory usage

    while True:
        try:
            # ``scroll`` returns (points, next_offset); ``next_offset``
            # is ``None`` once the collection is exhausted.
            points, offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=None,
                offset=offset,
                limit=limit,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error("Error scrolling Qdrant collection '%s': %s", collection_name, exc)
            break
        for point in points:
            payload = getattr(point, "payload", {}) or {}
            role = payload.get("job_role")
            # Skip missing, non-string or blank roles.
            if isinstance(role, str) and role.strip():
                unique_roles.add(role.strip().lower())
        if offset is None:
            break
    return sorted(unique_roles)
def get_role_questions(job_role: str) -> List[Dict[str, str]]:
    """
    Retrieve all interview questions and answers for a specific job role.

    Uses Qdrant's ``scroll`` API with a ``Filter`` that matches the
    ``job_role`` payload field exactly.  Matching is case-insensitive:
    the provided role is normalised to lowercase before filtering.

    Parameters
    ----------
    job_role : str
        The job role to match against the ``job_role`` field in payloads.

    Returns
    -------
    List[Dict[str, str]]
        Dictionaries with ``question``, ``answer`` and ``job_role`` keys.
        Empty when the input is blank/non-string, the client or filter
        classes are unavailable, or no entries match.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    role_lower = job_role.strip().lower()
    if qdrant_client is None or Filter is None or FieldCondition is None or MatchValue is None:
        logging.error(
            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
        )
        return []
    # The payload is flat, so a single top-level field condition suffices.
    match_filter = Filter(
        must=[
            FieldCondition(
                key="job_role",
                match=MatchValue(value=role_lower),
            )
        ]
    )

    results: List[Dict[str, str]] = []
    offset = None  # opaque paging cursor returned by ``scroll``
    limit = 256

    while True:
        try:
            points, offset = qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=match_filter,
                offset=offset,
                limit=limit,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error("Error retrieving questions for role '%s': %s", job_role, exc)
            break
        for point in points:
            payload = getattr(point, "payload", {}) or {}
            question = payload.get("question")
            answer = payload.get("answer")
            payload_role = payload.get("job_role")
            # Keep only fully-formed entries; skip records with missing
            # or non-string fields.
            if all(isinstance(item, str) for item in (question, answer, payload_role)):
                results.append({
                    "question": question,
                    "answer": answer,
                    "job_role": payload_role,
                })
        if offset is None:
            break
    return results
def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
    """
    Find the job roles most similar to the provided role string.

    Intended for when an exact match for ``user_role`` is not present in
    the collection: the user's input and all known roles are embedded and
    ranked by cosine similarity.  Roles identical to ``user_role``
    (case-insensitively) are excluded from the candidates.

    Parameters
    ----------
    user_role : str
        The role provided by the user.
    all_roles : Sequence[str]
        All role names available in the collection.  Assumed to have
        been normalised to lowercase.
    top_k : int, optional
        Maximum number of similar roles to return.  Defaults to 3.

    Returns
    -------
    List[str]
        The most similar roles, ordered by decreasing similarity.  May
        be shorter than ``top_k`` (or empty) when few roles are
        available, embeddings are disabled, or computation fails.
    """
    # Guard clauses: non-string/blank input or no known roles.
    if not isinstance(user_role, str) or not user_role.strip() or not all_roles:
        return []
    user_role_norm = user_role.strip().lower()
    # Exclude exact (case-insensitive) matches; those are handled by the
    # exact-match path upstream.
    candidate_roles = [role for role in all_roles if role.lower() != user_role_norm]
    if not candidate_roles:
        return []
    if embeddings is None:
        logging.warning(
            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
        )
        return []
    try:
        # Embed the query and every candidate, then rank by cosine
        # similarity (higher means more similar).
        query_vec = np.array([embeddings.embed_query(user_role_norm)])
        role_vecs = np.array(embeddings.embed_documents(candidate_roles))
        sims = cosine_similarity(query_vec, role_vecs)[0]
        ranked = sorted(zip(candidate_roles, sims), key=lambda pair: pair[1], reverse=True)
        # Slicing gracefully handles top_k larger than the candidate count.
        return [role for role, _ in ranked[: max(0, top_k)]]
    except Exception as exc:
        logging.error("Error finding similar roles for '%s': %s", user_role, exc)
        return []
def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
    """
    Retrieve interview questions and answers for a job role with fallback.

    The retrieval proceeds in three steps:

    1. Attempt an exact match via ``get_role_questions``.
    2. If nothing is found, fetch questions for the three most similar
       roles reported by ``find_similar_roles``.
    3. Deduplicate by question text so combining multiple roles does not
       repeat entries.

    Parameters
    ----------
    job_role : str
        The desired job role provided by the user.
    all_roles : Sequence[str]
        The complete list of roles available in the collection, passed in
        to avoid re-fetching roles multiple times.

    Returns
    -------
    List[Dict[str, str]]
        Deduplicated question/answer dictionaries; each item's
        ``job_role`` reflects the role it was retrieved from.  Empty when
        neither an exact nor a similar role yields results.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    # Step 1: exact match.
    results: List[Dict[str, str]] = get_role_questions(job_role)
    # Step 2: fall back to similar roles when nothing matched.
    if not results:
        for role in find_similar_roles(job_role, all_roles, top_k=3):
            results.extend(get_role_questions(role))
    # Step 3: deduplicate by question text, keeping first occurrences.
    seen_questions: set[str] = set()
    deduped: List[Dict[str, str]] = []
    for item in results:
        question = item.get("question")
        if isinstance(question, str) and question not in seen_questions:
            seen_questions.add(question)
            deduped.append(item)
    return deduped
def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
    """
    Build a context string by sampling Q&A pairs from retrieved data.

    Randomly selects up to ``k`` items from the provided collection and
    formats each as ``"Q: [question]\\nA: [answer]"``, separated by blank
    lines, suitable for inclusion in an LLM prompt.

    Parameters
    ----------
    retrieved_data : Sequence[Dict[str, str]]
        Q&A dictionaries as returned by ``retrieve_interview_data``.
    k : int, optional
        Number of entries to sample.  Defaults to 3.  Values larger than
        ``len(retrieved_data)`` use all items; values <= 0 yield ``""``.

    Returns
    -------
    str
        The concatenated context, or an empty string when
        ``retrieved_data`` is empty or no sampled entry has both a
        non-blank question and answer.
    """
    if not retrieved_data:
        return ""
    # Cap the sample size so ``random.sample`` can never raise ValueError.
    num_samples = max(0, min(k, len(retrieved_data)))
    sampled = random.sample(list(retrieved_data), num_samples)

    parts: List[str] = []
    for item in sampled:
        # Tolerate missing or non-string values (e.g. an explicit None
        # stored under "question"); the original crashed on None.strip().
        q = item.get("question")
        a = item.get("answer")
        q = q.strip() if isinstance(q, str) else ""
        a = a.strip() if isinstance(a, str) else ""
        if q and a:
            parts.append(f"Q: {q}\nA: {a}")
    return "\n\n".join(parts)
# Public API of this module.
__all__ = [
    "extract_all_roles_from_qdrant",
    "get_role_questions",
    "find_similar_roles",
    "retrieve_interview_data",
    "random_context_chunks",
    "embeddings",
    "qdrant_client",
]