Spaces:
Paused
Paused
| """ | |
| Helper functions for retrieving interview questions and answers from an | |
| existing Qdrant vector collection. These functions encapsulate the | |
| logic for extracting available job roles, fetching all Q&A pairs for a | |
| given role, finding similar roles when an exact match is not present, | |
| and assembling a randomised context from retrieved data. They rely on | |
| the ``qdrant-client`` library for interacting with the remote | |
| collection, ``sentence-transformers`` for computing embeddings, and | |
| scikit-learn's cosine similarity implementation. | |
| The collection is expected to exist prior to use and to be | |
| configured with vectors generated by the all-MiniLM-L6-v2 model. Do | |
| not modify the connection details, vector size or distance metric. | |
| Usage example:: | |
| from backend.services.interview_retrieval import ( | |
| extract_all_roles_from_qdrant, retrieve_interview_data, | |
| random_context_chunks | |
| ) | |
| all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions") | |
| retrieved = retrieve_interview_data("data scientist", all_roles) | |
| context = random_context_chunks(retrieved, k=4) | |
| The above snippet fetches all stored roles, retrieves Q&A pairs for | |
| the specified role (falling back to similar roles if necessary), and | |
| builds a randomised context of four question/answer items. | |
| These helpers are designed to be drop‑in compatible with the existing | |
| interview system. They deliberately avoid using Qdrant's ``search`` | |
| API, instead relying on ``scroll`` to iterate through all records. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import random | |
| from typing import Dict, List, Sequence, Tuple | |
| try: | |
| # Attempt to import Qdrant client classes. In environments where | |
| # qdrant-client is not installed (e.g. during local testing without | |
| # vector storage), these imports will fail. We handle that by | |
| # assigning ``None`` to the client and conditionally disabling | |
| # functions that depend on it. | |
| from qdrant_client import QdrantClient # type: ignore | |
| from qdrant_client.http.models import Filter, FieldCondition, MatchValue # type: ignore | |
| except Exception: | |
| QdrantClient = None # type: ignore | |
| Filter = None # type: ignore | |
| FieldCondition = None # type: ignore | |
| MatchValue = None # type: ignore | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| # ``sentence_transformers`` is an optional dependency. To avoid | |
| # import‑time errors in environments where it is absent (e.g. during | |
| # lightweight testing or static analysis), we avoid importing it at | |
| # module level. Instead, ``LocalEmbeddings`` will attempt to import | |
| # SentenceTransformer when instantiated. If the import fails, a | |
| # RuntimeError is raised from within the constructor, signalling that | |
| # embedding functionality is unavailable. | |
| SentenceTransformer = None # type: ignore | |
# ---------------------------------------------------------------------------
# Qdrant configuration
#
# These connection details must not be altered. They point to the
# existing Qdrant instance containing interview questions and answers.
#
# NOTE(review): the URL and API key are hardcoded in source. This is a
# credential-leak risk if the file is ever published — consider moving
# them to environment variables or a secrets manager and rotating the
# key; confirm with the team before changing, since the docstring says
# connection details must stay as-is.
if QdrantClient is not None:
    # Instantiated once at import time; remains None when qdrant-client
    # failed to import above, and callers must check for that.
    qdrant_client: QdrantClient | None = QdrantClient(
        url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333",
        api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4",
    )
else:
    qdrant_client = None
# Name of the Qdrant collection containing interview Q&A pairs. Do not
# modify this value; the collection already exists and is populated.
COLLECTION_NAME: str = "interview_questions"
class LocalEmbeddings:
    """
    Thin convenience wrapper over a SentenceTransformer model.

    Exposes ``embed_query`` for a single string and ``embed_documents``
    for a batch of strings. The model name defaults to the one used
    during data ingestion (all-MiniLM-L6-v2).
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        # The sentence-transformers import is deferred until first
        # instantiation so that merely importing this module never
        # requires the package; the imported class is cached in the
        # module-level ``SentenceTransformer`` slot for reuse.
        global SentenceTransformer
        if SentenceTransformer is None:
            try:
                from sentence_transformers import SentenceTransformer as _ST  # type: ignore
            except Exception as exc:
                # Surface a clear, actionable error instead of a bare
                # ImportError from deep inside the dependency chain.
                raise RuntimeError(
                    "sentence-transformers is required to compute embeddings. Please install the package."
                ) from exc
            SentenceTransformer = _ST  # type: ignore
        self.model = SentenceTransformer(model_name)  # type: ignore

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of ``text`` as a plain list of floats."""
        return self.model.encode(text).tolist()

    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
        """Return one embedding vector per entry in ``documents``."""
        return self.model.encode(list(documents)).tolist()
# Instantiate the embeddings exactly once at import time so that model
# weights are not reloaded on every function call. If
# sentence-transformers is unavailable (LocalEmbeddings raises
# RuntimeError from its constructor), ``embeddings`` is set to ``None``
# and similarity searches are disabled; consumers must check for
# ``None`` where appropriate.
try:
    embeddings: LocalEmbeddings | None = LocalEmbeddings()
except Exception as exc:
    # Degrade gracefully instead of failing the whole module import.
    logging.warning(
        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. "
        f"Error: {exc}"
    )
    embeddings = None
def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
    """
    Return the sorted list of unique job roles stored in a collection.

    Walks every point in ``collection_name`` with Qdrant's ``scroll``
    API (no vector search involved) and gathers the ``job_role`` payload
    field, normalised to lowercase. If a scroll request fails part-way,
    the roles collected so far are still returned.

    Parameters
    ----------
    collection_name : str, optional
        Name of the Qdrant collection. Defaults to ``COLLECTION_NAME``.

    Returns
    -------
    List[str]
        Sorted unique job roles present in the collection; empty when
        the Qdrant client is unavailable.
    """
    # Without a working client there is nothing to scroll.
    if qdrant_client is None:
        logging.error(
            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
        )
        return []
    roles: set[str] = set()
    batch_size = 256  # keep per-request memory bounded
    cursor = None
    while True:
        try:
            # ``scroll`` yields (points, next_offset); a None offset
            # signals that the whole collection has been visited.
            batch, cursor = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=None,
                offset=cursor,
                limit=batch_size,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}")
            break
        for record in batch:
            payload = getattr(record, "payload", {}) or {}
            value = payload.get("job_role")
            if isinstance(value, str) and value.strip():
                roles.add(value.strip().lower())
        if cursor is None:
            break
    return sorted(roles)
def get_role_questions(job_role: str) -> List[Dict[str, str]]:
    """
    Fetch every stored Q&A pair whose payload matches a job role.

    Uses Qdrant's ``scroll`` API with an exact-match ``Filter`` on the
    flat ``job_role`` payload field, so all matching entries are
    returned regardless of count. Matching is case-insensitive: the
    input is lowercased before filtering.

    Parameters
    ----------
    job_role : str
        The role to look up. Blank or non-string input yields ``[]``.

    Returns
    -------
    List[Dict[str, str]]
        Dictionaries with ``question``, ``answer`` and ``job_role``
        keys; empty when nothing matches or the client is unavailable.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    # Bail out early when the optional qdrant-client dependency (or any
    # of its filter model classes) failed to import.
    if qdrant_client is None or Filter is None or FieldCondition is None or MatchValue is None:
        logging.error(
            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
        )
        return []
    # Exact-match filter on the flat payload field; no nested paths.
    role_filter = Filter(
        must=[FieldCondition(key="job_role", match=MatchValue(value=job_role.strip().lower()))]
    )
    collected: List[Dict[str, str]] = []
    cursor = None
    while True:
        try:
            batch, cursor = qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=role_filter,
                offset=cursor,
                limit=256,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error retrieving questions for role '{job_role}': {exc}")
            break
        for record in batch:
            payload = getattr(record, "payload", {}) or {}
            entry = {
                "question": payload.get("question"),
                "answer": payload.get("answer"),
                "job_role": payload.get("job_role"),
            }
            # Keep only complete, well-typed records.
            if all(isinstance(v, str) for v in entry.values()):
                collected.append(entry)
        if cursor is None:
            break
    return collected
def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
    """
    Rank known roles by embedding similarity to the user's input.

    Embeds ``user_role`` and every candidate role with the module-level
    ``embeddings`` model, scores the candidates with cosine similarity,
    and returns up to ``top_k`` of the best-scoring roles. Roles equal
    to the input (case-insensitively) are excluded beforehand.

    Parameters
    ----------
    user_role : str
        Role text supplied by the user.
    all_roles : Sequence[str]
        All role names known to the collection (assumed lowercase).
    top_k : int, optional
        Maximum number of roles to return. Defaults to 3.

    Returns
    -------
    List[str]
        Similar roles in decreasing similarity order; may be shorter
        than ``top_k``, and is empty on bad input, when no candidates
        remain, or when embeddings are unavailable or fail.
    """
    if not isinstance(user_role, str) or not user_role.strip() or not all_roles:
        return []
    query = user_role.strip().lower()
    # Drop candidates identical to the query (case-insensitive).
    candidates = [role for role in all_roles if role.lower() != query]
    if not candidates:
        return []
    if embeddings is None:
        logging.warning(
            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
        )
        return []
    try:
        query_matrix = np.array([embeddings.embed_query(query)])
        candidate_matrix = np.array(embeddings.embed_documents(candidates))
        # Higher cosine similarity means a closer role.
        scores = cosine_similarity(query_matrix, candidate_matrix)[0]
        # sorted() with reverse=True is stable, so ties keep the
        # candidates' original relative order.
        ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
        # Slicing handles top_k larger than the candidate count.
        return [name for name, _ in ranked[:max(0, top_k)]]
    except Exception as exc:
        logging.error(f"Error finding similar roles for '{user_role}': {exc}")
        return []
def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
    """
    Fetch Q&A pairs for a job role, falling back to similar roles.

    Strategy:

    1. Try an exact (case-insensitive) match via ``get_role_questions``.
    2. If that yields nothing, gather questions for up to three roles
       ranked closest by ``find_similar_roles``.
    3. Deduplicate the combined results on the question text so that
       merging several roles does not repeat questions.

    Parameters
    ----------
    job_role : str
        Role requested by the user.
    all_roles : Sequence[str]
        Every role known to the collection, passed in so roles are not
        re-fetched on each call.

    Returns
    -------
    List[Dict[str, str]]
        Deduplicated Q&A dictionaries; each item's ``job_role`` field
        names the role it was actually retrieved from. Empty when
        neither an exact nor a similar role yields results.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    gathered: List[Dict[str, str]] = get_role_questions(job_role)
    if not gathered:
        # No exact hit: pull questions from the closest-matching roles.
        for candidate in find_similar_roles(job_role, all_roles, top_k=3):
            gathered.extend(get_role_questions(candidate))
    # Keep the first occurrence of each distinct question text.
    unique: List[Dict[str, str]] = []
    seen: set[str] = set()
    for entry in gathered:
        text = entry.get("question")
        if isinstance(text, str) and text not in seen:
            seen.add(text)
            unique.append(entry)
    return unique
def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
    """
    Build an LLM prompt context by sampling Q&A pairs from retrieved data.

    Randomly selects up to ``k`` items from ``retrieved_data`` and
    formats each as ``"Q: [question]\\nA: [answer]"``, joined by blank
    lines. Items whose ``question`` or ``answer`` is missing, blank or
    not a string are skipped rather than crashing on malformed payloads.

    Parameters
    ----------
    retrieved_data : Sequence[Dict[str, str]]
        Q&A dictionaries as returned by ``retrieve_interview_data``.
    k : int, optional
        Number of entries to sample. Defaults to 3. Values larger than
        ``len(retrieved_data)`` use all items; values <= 0 yield "".

    Returns
    -------
    str
        The concatenated context string, or an empty string when
        ``retrieved_data`` is empty or nothing usable was sampled.
    """
    if not retrieved_data:
        return ""
    # Cap the sample size so random.sample never raises ValueError
    # (it requires 0 <= k <= population size). With the cap in place
    # the former try/except around the call was dead code.
    num_samples = max(0, min(k, len(retrieved_data)))
    sampled = random.sample(list(retrieved_data), num_samples)
    parts: List[str] = []
    for item in sampled:
        question = item.get("question", "")
        answer = item.get("answer", "")
        # Guard against None / non-string payload values before strip().
        if isinstance(question, str) and isinstance(answer, str):
            q, a = question.strip(), answer.strip()
            if q and a:
                parts.append(f"Q: {q}\nA: {a}")
    return "\n\n".join(parts)
# Public API of this module. The ``embeddings`` and ``qdrant_client``
# singletons are exported deliberately so callers can detect the
# degraded (None) state before invoking similarity or retrieval
# helpers.
__all__ = [
    "extract_all_roles_from_qdrant",
    "get_role_questions",
    "find_similar_roles",
    "retrieve_interview_data",
    "random_context_chunks",
    "embeddings",
    "qdrant_client",
]