""" Helper functions for retrieving interview questions and answers from an existing Qdrant vector collection. These functions encapsulate the logic for extracting available job roles, fetching all Q&A pairs for a given role, finding similar roles when an exact match is not present, and assembling a randomised context from retrieved data. They rely on the ``qdrant-client`` library for interacting with the remote collection, ``sentence-transformers`` for computing embeddings, and scikit-learn's cosine similarity implementation. The collection is expected to exist prior to use and to be configured with vectors generated by the all-MiniLM-L6-v2 model. Do not modify the connection details, vector size or distance metric. Usage example:: from backend.services.interview_retrieval import ( extract_all_roles_from_qdrant, retrieve_interview_data, random_context_chunks ) all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions") retrieved = retrieve_interview_data("data scientist", all_roles) context = random_context_chunks(retrieved, k=4) The above snippet fetches all stored roles, retrieves Q&A pairs for the specified role (falling back to similar roles if necessary), and builds a randomised context of four question/answer items. These helpers are designed to be drop‑in compatible with the existing interview system. They deliberately avoid using Qdrant's ``search`` API, instead relying on ``scroll`` to iterate through all records. """ from __future__ import annotations import logging import random from typing import Dict, List, Sequence, Tuple try: # Attempt to import Qdrant client classes. In environments where # qdrant-client is not installed (e.g. during local testing without # vector storage), these imports will fail. We handle that by # assigning ``None`` to the client and conditionally disabling # functions that depend on it. from qdrant_client import QdrantClient # type: ignore from qdrant_client.http.models import Filter, FieldCondition, MatchValue # type: ignore except Exception: QdrantClient = None # type: ignore Filter = None # type: ignore FieldCondition = None # type: ignore MatchValue = None # type: ignore from sklearn.metrics.pairwise import cosine_similarity import numpy as np # ``sentence_transformers`` is an optional dependency. To avoid # import‑time errors in environments where it is absent (e.g. during # lightweight testing or static analysis), we avoid importing it at # module level. Instead, ``LocalEmbeddings`` will attempt to import # SentenceTransformer when instantiated. If the import fails, a # RuntimeError is raised from within the constructor, signalling that # embedding functionality is unavailable. SentenceTransformer = None # type: ignore # --------------------------------------------------------------------------- # Qdrant configuration # # These connection details must not be altered. They point to the # existing Qdrant instance containing interview questions and answers. if QdrantClient is not None: qdrant_client: QdrantClient | None = QdrantClient( url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333", api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4", ) else: qdrant_client = None # Name of the Qdrant collection containing interview Q&A pairs. Do not # modify this value; the collection already exists and is populated. COLLECTION_NAME: str = "interview_questions" class LocalEmbeddings: """ Lightweight wrapper around a SentenceTransformer model. Provides convenience methods for embedding a single query string or a list of documents. The model name is fixed to the one used during data ingestion (all‑MiniLM‑L6‑v2). """ def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None: global SentenceTransformer # use global to update when imported if SentenceTransformer is None: try: from sentence_transformers import SentenceTransformer as _ST # type: ignore SentenceTransformer = _ST # type: ignore except Exception as exc: # Fail loudly when embeddings cannot be loaded. The caller # should ensure that ``sentence-transformers`` is installed. raise RuntimeError( "sentence-transformers is required to compute embeddings. Please install the package." ) from exc self.model = SentenceTransformer(model_name) # type: ignore def embed_query(self, text: str) -> List[float]: """Embed a single query string and return a list of floats.""" return self.model.encode(text).tolist() def embed_documents(self, documents: Sequence[str]) -> List[List[float]]: """Embed a sequence of documents and return a list of vectors.""" return self.model.encode(list(documents)).tolist() # Instantiate the embeddings once. This avoids repeatedly loading # model weights on each function call. If sentence-transformers is # unavailable, ``embeddings`` will be set to ``None`` and similarity # searches will be disabled. Consumers should check for ``None`` # where appropriate. try: embeddings: LocalEmbeddings | None = LocalEmbeddings() except Exception as exc: logging.warning( "Failed to initialise LocalEmbeddings. Similarity search will be disabled. " f"Error: {exc}" ) embeddings = None def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]: """ Extract all unique job roles from the specified Qdrant collection. This function iterates through every point in the collection using Qdrant's ``scroll`` API and collects the ``job_role`` field from payloads. It returns a sorted list of unique roles. Roles in the underlying data are expected to be stored in lowercase; however, callers should not rely on this and should normalise input when performing comparisons. Parameters ---------- collection_name : str, optional Name of the Qdrant collection. Defaults to ``COLLECTION_NAME``. Returns ------- List[str] A list of unique job roles present in the collection. """ unique_roles: set[str] = set() offset: Tuple[str, int] | None = None limit: int = 256 # reasonable batch size to avoid heavy memory usage # If the Qdrant client failed to initialise, return an empty list. if qdrant_client is None: logging.error( "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed." ) return [] while True: try: # ``scroll`` returns a tuple: (list of points, next offset) points, offset = qdrant_client.scroll( collection_name=collection_name, scroll_filter=None, offset=offset, limit=limit, with_payload=True, with_vectors=False, ) except Exception as exc: logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}") break for point in points: payload = getattr(point, "payload", {}) or {} role = payload.get("job_role") if isinstance(role, str) and role.strip(): unique_roles.add(role.strip().lower()) # When ``offset`` is None, we have reached the end of the collection. if offset is None: break return sorted(unique_roles) def get_role_questions(job_role: str) -> List[Dict[str, str]]: """ Retrieve all interview questions and answers for a specific job role. This helper uses Qdrant's ``scroll`` API with a ``Filter`` that matches the ``job_role`` payload field exactly. All matching entries are returned, regardless of the number of stored vectors. Parameters ---------- job_role : str The job role to match against the ``job_role`` field in payloads. Matching is case‑insensitive; the provided role is normalised internally to lowercase. Returns ------- List[Dict[str, str]] A list of dictionaries, each containing ``question``, ``answer`` and ``job_role`` keys. If no entries are found, an empty list is returned. """ if not isinstance(job_role, str) or not job_role.strip(): return [] role_lower = job_role.strip().lower() # Build a filter to match the exact job_role value. We avoid # constructing nested field paths because the payload is flat. if qdrant_client is None or Filter is None or FieldCondition is None or MatchValue is None: logging.error( "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles." ) return [] match_filter = Filter( must=[ FieldCondition( key="job_role", match=MatchValue(value=role_lower), ) ] ) results: List[Dict[str, str]] = [] offset: Tuple[str, int] | None = None limit: int = 256 while True: try: points, offset = qdrant_client.scroll( collection_name=COLLECTION_NAME, scroll_filter=match_filter, offset=offset, limit=limit, with_payload=True, with_vectors=False, ) except Exception as exc: logging.error(f"Error retrieving questions for role '{job_role}': {exc}") break for point in points: payload = getattr(point, "payload", {}) or {} question = payload.get("question") answer = payload.get("answer") payload_role = payload.get("job_role") if all(isinstance(item, str) for item in (question, answer, payload_role)): results.append({ "question": question, "answer": answer, "job_role": payload_role, }) if offset is None: break return results def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]: """ Find the most similar job roles to the provided role string. When an exact match for ``user_role`` is not found in the collection, this helper computes embeddings for the user's input and all known roles, then ranks them by cosine similarity. It returns up to ``top_k`` roles with the highest similarity scores, excluding any roles that exactly match ``user_role`` (case‑insensitively). Parameters ---------- user_role : str The role provided by the user. This value is embedded and compared against all known roles. all_roles : Sequence[str] A sequence of all role names available in the collection. It is assumed that these have been normalised to lowercase. top_k : int, optional The maximum number of similar roles to return. Defaults to 3. Returns ------- List[str] A list of the most similar roles, ordered by decreasing similarity. If fewer than ``top_k`` roles are available or embedding computation fails, a shorter list may be returned. """ if not isinstance(user_role, str) or not user_role.strip() or not all_roles: return [] user_role_norm = user_role.strip().lower() # Filter out any roles identical to the user input (case‑insensitive) candidate_roles = [role for role in all_roles if role.lower() != user_role_norm] if not candidate_roles: return [] if embeddings is None: logging.warning( "Embeddings are unavailable; cannot compute similar roles. Returning empty list." ) return [] try: # Compute embeddings for the query and candidate roles query_vec = np.array([embeddings.embed_query(user_role_norm)]) role_vecs = np.array(embeddings.embed_documents(candidate_roles)) # Compute cosine similarity (higher values indicate greater similarity) sims: np.ndarray = cosine_similarity(query_vec, role_vecs)[0] # Pair each role with its similarity and sort descending paired: List[Tuple[str, float]] = list(zip(candidate_roles, sims)) paired.sort(key=lambda x: x[1], reverse=True) # Extract the top_k roles (handles case where top_k > number of roles) top_roles = [role for role, _ in paired[:max(0, top_k)]] return top_roles except Exception as exc: logging.error(f"Error finding similar roles for '{user_role}': {exc}") return [] def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]: """ Retrieve interview questions and answers for a job role with fallback. The retrieval process follows these steps: 1. Attempt an exact match by fetching all questions associated with ``job_role`` via ``get_role_questions``. 2. If no questions are returned, compute the ``top_k`` most similar roles using ``find_similar_roles`` and retrieve questions for each. 3. Deduplicate results based on the question text to avoid repetition when combining multiple roles. Parameters ---------- job_role : str The desired job role provided by the user. all_roles : Sequence[str] The complete list of roles available in the collection. Passed in to avoid re‑fetching roles multiple times. Returns ------- List[Dict[str, str]] A deduplicated list of question/answer dictionaries. The ``job_role`` field in each item reflects the role it was retrieved from. If neither an exact nor a similar role yields results, an empty list is returned. """ if not isinstance(job_role, str) or not job_role.strip(): return [] # First try exact match results: List[Dict[str, str]] = get_role_questions(job_role) # If no results, find similar roles and aggregate their questions if not results: similar_roles = find_similar_roles(job_role, all_roles, top_k=3) for role in similar_roles: role_questions = get_role_questions(role) results.extend(role_questions) # Deduplicate by question text to avoid repetition seen_questions: set[str] = set() deduped: List[Dict[str, str]] = [] for item in results: question = item.get("question") if isinstance(question, str) and question not in seen_questions: deduped.append(item) seen_questions.add(question) return deduped def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str: """ Build a context string by sampling Q&A pairs from retrieved data. This helper randomly selects up to ``k`` items from the provided collection of question/answer pairs and formats them as a context string suitable for inclusion in an LLM prompt. Each entry is formatted as ``"Q: [question]\nA: [answer]"`` and separated by a blank line. If ``retrieved_data`` is empty, an empty string is returned. Parameters ---------- retrieved_data : Sequence[Dict[str, str]] The list of Q&A dictionaries returned by ``retrieve_interview_data``. k : int, optional The number of entries to sample. Defaults to 3. If ``k`` is greater than the length of ``retrieved_data``, all items are used. Returns ------- str A concatenated context string with each Q&A pair on its own lines, separated by blank lines. Returns an empty string if ``retrieved_data`` is empty. """ if not retrieved_data: return "" # Determine the number of samples to draw. ``random.sample`` will # raise ValueError if k > len(retrieved_data), so we cap it. num_samples = max(0, min(k, len(retrieved_data))) try: sampled = random.sample(list(retrieved_data), num_samples) except ValueError: sampled = list(retrieved_data) # Build the context string parts: List[str] = [] for item in sampled: q = item.get("question", "").strip() a = item.get("answer", "").strip() if q and a: parts.append(f"Q: {q}\nA: {a}") return "\n\n".join(parts) __all__ = [ "extract_all_roles_from_qdrant", "get_role_questions", "find_similar_roles", "retrieve_interview_data", "random_context_chunks", "embeddings", "qdrant_client", ]