Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed on Aug 2

Commit

9ee49ff

1 Parent(s): a1b807c

qdrant included

Browse files

Files changed (2) hide show

backend/services/interview_engine.py +13 -0
backend/services/interview_retrieval.py +444 -0

backend/services/interview_engine.py CHANGED Viewed

@@ -8,6 +8,12 @@ import logging
 import tempfile
 import shutil
 import torch
 if torch.cuda.is_available():
     print("🔥 CUDA Available")
@@ -96,6 +102,10 @@ load_whisper_model()
 def generate_first_question(profile, job):
     """Generate the first interview question based on profile and job"""
     try:
         prompt = f"""
         You are conducting an interview for a {job.role} position at {job.company}.
@@ -104,6 +114,9 @@ def generate_first_question(profile, job):
         - Experience: {profile.get('experience', [])}
         - Education: {profile.get('education', [])}
         Generate an appropriate opening interview question that is professional and relevant.
         Keep it concise and clear. Respond with ONLY the question text, no additional formatting.
         If the interview is for a technical role, focus on technical skills. Make the question related

 import tempfile
 import shutil
 import torch
+from backend.services.interview_retrieval import (
+    extract_all_roles_from_qdrant,
+    retrieve_interview_data,
+    random_context_chunks
+)
 if torch.cuda.is_available():
     print("🔥 CUDA Available")
 def generate_first_question(profile, job):
     """Generate the first interview question based on profile and job"""
+    all_roles = extract_all_roles_from_qdrant()
+    retrieved_data = retrieve_interview_data(job.role.lower(), all_roles)
+    context_data = random_context_chunks(retrieved_data, k=4)
     try:
         prompt = f"""
         You are conducting an interview for a {job.role} position at {job.company}.
         - Experience: {profile.get('experience', [])}
         - Education: {profile.get('education', [])}
+        Use the following context to generate a relevant opening question:
+        {context_data}
         Generate an appropriate opening interview question that is professional and relevant.
         Keep it concise and clear. Respond with ONLY the question text, no additional formatting.
         If the interview is for a technical role, focus on technical skills. Make the question related

backend/services/interview_retrieval.py ADDED Viewed

	@@ -0,0 +1,444 @@

+"""
+Helper functions for retrieving interview questions and answers from an
+existing Qdrant vector collection.  These functions encapsulate the
+logic for extracting available job roles, fetching all Q&A pairs for a
+given role, finding similar roles when an exact match is not present,
+and assembling a randomised context from retrieved data.  They rely on
+the ``qdrant-client`` library for interacting with the remote
+collection, ``sentence-transformers`` for computing embeddings, and
+scikit-learn's cosine similarity implementation.
+The collection is expected to exist prior to use and to be
+configured with vectors generated by the all-MiniLM-L6-v2 model.  Do
+not modify the connection details, vector size or distance metric.
+Usage example::
+    from backend.services.interview_retrieval import (
+        extract_all_roles_from_qdrant, retrieve_interview_data,
+        random_context_chunks
+    )
+    all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions")
+    retrieved = retrieve_interview_data("data scientist", all_roles)
+    context = random_context_chunks(retrieved, k=4)
+The above snippet fetches all stored roles, retrieves Q&A pairs for
+the specified role (falling back to similar roles if necessary), and
+builds a randomised context of four question/answer items.
+These helpers are designed to be drop‑in compatible with the existing
+interview system.  They deliberately avoid using Qdrant's ``search``
+API, instead relying on ``scroll`` to iterate through all records.
+"""
+from __future__ import annotations
+import logging
+import random
+from typing import Dict, List, Sequence, Tuple
+try:
+    # Attempt to import Qdrant client classes.  In environments where
+    # qdrant-client is not installed (e.g. during local testing without
+    # vector storage), these imports will fail.  We handle that by
+    # assigning ``None`` to the client and conditionally disabling
+    # functions that depend on it.
+    from qdrant_client import QdrantClient  # type: ignore
+    from qdrant_client.http.models import Filter, FieldCondition, MatchValue  # type: ignore
+except Exception:
+    QdrantClient = None  # type: ignore
+    Filter = None  # type: ignore
+    FieldCondition = None  # type: ignore
+    MatchValue = None  # type: ignore
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+# ``sentence_transformers`` is an optional dependency.  To avoid
+# import‑time errors in environments where it is absent (e.g. during
+# lightweight testing or static analysis), we avoid importing it at
+# module level.  Instead, ``LocalEmbeddings`` will attempt to import
+# SentenceTransformer when instantiated.  If the import fails, a
+# RuntimeError is raised from within the constructor, signalling that
+# embedding functionality is unavailable.
+SentenceTransformer = None  # type: ignore
+# ---------------------------------------------------------------------------
+# Qdrant configuration
+#
+# These connection details must not be altered.  They point to the
+# existing Qdrant instance containing interview questions and answers.
+#
+# The client is constructed at import time, and only when the guarded
+# ``qdrant_client`` import above succeeded.  Otherwise ``qdrant_client``
+# is ``None`` and the retrieval helpers log an error and return empty
+# results.
+#
+# NOTE(review): the ``api_key`` below is a literal committed to source
+# control, so anyone who can read this file can use it.  Consider
+# rotating the key and loading it from an environment variable or a
+# secret store instead of hard-coding it.
+if QdrantClient is not None:
+    qdrant_client: QdrantClient | None = QdrantClient(
+        url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333",
+        api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4",
+    )
+else:
+    qdrant_client = None
+# Name of the Qdrant collection containing interview Q&A pairs.  Do not
+# modify this value; the collection already exists and is populated.
+# It is the default ``collection_name`` of ``extract_all_roles_from_qdrant``
+# and the collection ``get_role_questions`` always reads from.
+COLLECTION_NAME: str = "interview_questions"
+class LocalEmbeddings:
+    """
+    Thin adapter over a SentenceTransformer model.
+    Offers one method for embedding a single query string and one for
+    embedding a batch of documents; both convert the model output to
+    plain Python lists via ``tolist()``.  The default model name is
+    all-MiniLM-L6-v2.
+    """
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
+        """
+        Load the model, importing sentence-transformers on first use.
+        Raises
+        ------
+        RuntimeError
+            If ``sentence_transformers`` cannot be imported.
+        """
+        global SentenceTransformer  # module-level slot caching the imported class
+        if SentenceTransformer is None:
+            try:
+                from sentence_transformers import SentenceTransformer as _ST  # type: ignore
+            except Exception as exc:
+                # Replace the raw import failure with a clear, actionable error.
+                raise RuntimeError(
+                    "sentence-transformers is required to compute embeddings. Please install the package."
+                ) from exc
+            SentenceTransformer = _ST  # type: ignore
+        self.model = SentenceTransformer(model_name)  # type: ignore
+    def embed_query(self, text: str) -> List[float]:
+        """Return the embedding of one query string as a list of floats."""
+        vector = self.model.encode(text)
+        return vector.tolist()
+    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
+        """Return one embedding vector per document, in input order."""
+        batch = list(documents)
+        matrix = self.model.encode(batch)
+        return matrix.tolist()
+# Build one shared embedder at import time so the model weights are
+# loaded once rather than on every call.  If construction fails for
+# any reason, a warning is logged and ``embeddings`` stays ``None``;
+# ``find_similar_roles`` checks for that and returns an empty list.
+embeddings: LocalEmbeddings | None = None
+try:
+    embeddings = LocalEmbeddings()
+except Exception as exc:
+    logging.warning(
+        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. "
+        f"Error: {exc}"
+    )
+def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
+    """
+    Collect the distinct ``job_role`` values stored in a Qdrant collection.
+    The collection is paged through with ``scroll`` (payloads only, no
+    vectors).  Every ``job_role`` payload value that is a string and
+    non-empty after stripping is trimmed, lowercased and added to a
+    set.  If a ``scroll`` call raises, the error is logged and the
+    roles gathered so far are returned.
+    Parameters
+    ----------
+    collection_name : str, optional
+        Collection to read.  Defaults to ``COLLECTION_NAME``.
+    Returns
+    -------
+    List[str]
+        Sorted, de-duplicated, lowercased role names.  Empty when the
+        Qdrant client is unavailable.
+    """
+    # Without a client there is nothing to scroll; report and bail out.
+    if qdrant_client is None:
+        logging.error(
+            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
+        )
+        return []
+    page_size: int = 256  # batch size per scroll request
+    cursor = None  # scroll offset; None requests the first page
+    roles: set[str] = set()
+    while True:
+        try:
+            # ``scroll`` yields (points, next_offset).
+            batch, cursor = qdrant_client.scroll(
+                collection_name=collection_name,
+                scroll_filter=None,
+                offset=cursor,
+                limit=page_size,
+                with_payload=True,
+                with_vectors=False,
+            )
+        except Exception as exc:
+            logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}")
+            break
+        for record in batch:
+            raw_role = (getattr(record, "payload", {}) or {}).get("job_role")
+            if not isinstance(raw_role, str):
+                continue
+            cleaned = raw_role.strip()
+            if cleaned:
+                roles.add(cleaned.lower())
+        # A ``None`` next-offset means the last page has been consumed.
+        if cursor is None:
+            break
+    return sorted(roles)
+def get_role_questions(job_role: str) -> List[Dict[str, str]]:
+    """
+    Fetch every stored question/answer pair for one job role.
+    The input is stripped and lowercased, then used in a Qdrant
+    ``Filter`` that requires the ``job_role`` payload field to equal
+    that value.  Matching pages are read with ``scroll`` until the
+    returned offset is ``None``.  Because the comparison value is
+    always lowercase, only points whose stored ``job_role`` equals the
+    lowercased input are returned.
+    Parameters
+    ----------
+    job_role : str
+        Role to look up.  Non-string or blank input yields ``[]``.
+    Returns
+    -------
+    List[Dict[str, str]]
+        Dictionaries with ``question``, ``answer`` and ``job_role``
+        keys.  Points where any of the three is not a string are
+        dropped.  Empty when nothing matches or when the Qdrant client
+        or its filter classes are unavailable.  If a ``scroll`` call
+        raises, the error is logged and the partial results are
+        returned.
+    """
+    if not (isinstance(job_role, str) and job_role.strip()):
+        return []
+    normalised_role = job_role.strip().lower()
+    # All four names are ``None`` when the qdrant-client import failed.
+    if any(dep is None for dep in (qdrant_client, Filter, FieldCondition, MatchValue)):
+        logging.error(
+            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
+        )
+        return []
+    # The payload is addressed by the flat key ``job_role``.
+    role_condition = FieldCondition(
+        key="job_role",
+        match=MatchValue(value=normalised_role),
+    )
+    role_filter = Filter(must=[role_condition])
+    collected: List[Dict[str, str]] = []
+    cursor = None  # scroll offset; None requests the first page
+    page_size: int = 256
+    while True:
+        try:
+            batch, cursor = qdrant_client.scroll(
+                collection_name=COLLECTION_NAME,
+                scroll_filter=role_filter,
+                offset=cursor,
+                limit=page_size,
+                with_payload=True,
+                with_vectors=False,
+            )
+        except Exception as exc:
+            logging.error(f"Error retrieving questions for role '{job_role}': {exc}")
+            break
+        for record in batch:
+            data = getattr(record, "payload", {}) or {}
+            entry = {field: data.get(field) for field in ("question", "answer", "job_role")}
+            # Keep only fully populated, string-valued records.
+            if all(isinstance(value, str) for value in entry.values()):
+                collected.append(entry)
+        if cursor is None:
+            break
+    return collected
+def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
+    """
+    Rank known roles by embedding similarity to a user-supplied role.
+    The stripped, lowercased ``user_role`` and every candidate role are
+    embedded with the shared ``embeddings`` instance and compared by
+    cosine similarity.  Roles equal to ``user_role`` ignoring case are
+    removed from the candidates before ranking.
+    Parameters
+    ----------
+    user_role : str
+        Role text supplied by the caller.
+    all_roles : Sequence[str]
+        Role names to rank against.
+    top_k : int, optional
+        Maximum number of roles to return.  Defaults to 3; negative
+        values are treated as 0.
+    Returns
+    -------
+    List[str]
+        Up to ``top_k`` roles in order of decreasing similarity.  Empty
+        when the input is blank or not a string, when no candidates
+        remain, when embeddings are unavailable, or when the embedding
+        or similarity computation raises (the error is logged).
+    """
+    if not isinstance(user_role, str):
+        return []
+    target = user_role.strip().lower()
+    if not target or not all_roles:
+        return []
+    # An identical role is not a "similar" role, so leave it out.
+    others = [name for name in all_roles if name.lower() != target]
+    if not others:
+        return []
+    if embeddings is None:
+        logging.warning(
+            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
+        )
+        return []
+    try:
+        query_matrix = np.array([embeddings.embed_query(target)])
+        role_matrix = np.array(embeddings.embed_documents(others))
+        # Row 0 holds the query's similarity to each candidate.
+        scores: np.ndarray = cosine_similarity(query_matrix, role_matrix)[0]
+        ranked: List[Tuple[str, float]] = sorted(
+            zip(others, scores), key=lambda pair: pair[1], reverse=True
+        )
+        keep = max(0, top_k)  # slicing tolerates keep > len(ranked)
+        return [name for name, _score in ranked[:keep]]
+    except Exception as exc:
+        logging.error(f"Error finding similar roles for '{user_role}': {exc}")
+        return []
+def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
+    """
+    Fetch Q&A pairs for a role, falling back to similar roles.
+    Steps:
+    1. Query ``get_role_questions`` for ``job_role`` itself.
+    2. If that returns nothing, ask ``find_similar_roles`` for up to
+       three similar roles and pool the questions of each.
+    3. Drop repeated question texts, keeping the first occurrence and
+       the original order.
+    Parameters
+    ----------
+    job_role : str
+        Role requested by the caller.  Non-string or blank input
+        yields ``[]``.
+    all_roles : Sequence[str]
+        Known role names, passed in so they are not re-fetched here.
+    Returns
+    -------
+    List[Dict[str, str]]
+        De-duplicated question/answer dictionaries.  Each item keeps
+        the ``job_role`` it was retrieved under.  Empty when neither
+        the exact role nor any similar role yields results.
+    """
+    if not (isinstance(job_role, str) and job_role.strip()):
+        return []
+    matches: List[Dict[str, str]] = get_role_questions(job_role)
+    if not matches:
+        # No exact hit: pool the questions of the nearest roles instead.
+        for neighbour in find_similar_roles(job_role, all_roles, top_k=3):
+            matches.extend(get_role_questions(neighbour))
+    # ``setdefault`` keeps the first item per question text, and dicts
+    # preserve insertion order, so the original ordering survives.
+    first_by_question: Dict[str, Dict[str, str]] = {}
+    for entry in matches:
+        text = entry.get("question")
+        if isinstance(text, str):
+            first_by_question.setdefault(text, entry)
+    return list(first_by_question.values())
+def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
+    """
+    Build an LLM-prompt context string from randomly sampled Q&A pairs.
+    Up to ``k`` items are drawn without replacement using
+    ``random.sample``.  Each sampled item whose ``question`` and
+    ``answer`` are both non-blank strings is rendered as a "Q: ..."
+    line followed by an "A: ..." line, and rendered pairs are separated
+    by one blank line.  Items whose question or answer is missing,
+    not a string, or blank are skipped instead of raising.
+    Parameters
+    ----------
+    retrieved_data : Sequence[Dict[str, str]]
+        Q&A dictionaries, e.g. the result of ``retrieve_interview_data``.
+    k : int, optional
+        Number of entries to sample.  Defaults to 3.  Values larger
+        than ``len(retrieved_data)`` use every item; values below zero
+        are treated as zero.
+    Returns
+    -------
+    str
+        The joined context string, or an empty string when
+        ``retrieved_data`` is empty or no sampled item is usable.
+    """
+    if not retrieved_data:
+        return ""
+    pool = list(retrieved_data)
+    # Clamp to [0, len(pool)] so ``random.sample`` can never raise ValueError.
+    sample_size = max(0, min(k, len(pool)))
+    parts: List[str] = []
+    for item in random.sample(pool, sample_size):
+        question = item.get("question")
+        answer = item.get("answer")
+        # A value may be None or another non-string type; skip such
+        # items rather than failing on ``.strip()``.
+        if not isinstance(question, str) or not isinstance(answer, str):
+            continue
+        question, answer = question.strip(), answer.strip()
+        if question and answer:
+            parts.append(f"Q: {question}\nA: {answer}")
+    return "\n\n".join(parts)
+# Public API of this module.  Besides the helper functions, the shared
+# ``embeddings`` and ``qdrant_client`` instances are exported; either
+# may be ``None`` when its optional dependency failed to load.
+__all__ = [
+    "extract_all_roles_from_qdrant",
+    "get_role_questions",
+    "find_similar_roles",
+    "retrieve_interview_data",
+    "random_context_chunks",
+    "embeddings",
+    "qdrant_client",
+]