Spaces:

husseinelsaadi
/

Codingo

Paused

File size: 16,834 Bytes

9ee49ff

"""
Helper functions for retrieving interview questions and answers from an
existing Qdrant vector collection.  These functions encapsulate the
logic for extracting available job roles, fetching all Q&A pairs for a
given role, finding similar roles when an exact match is not present,
and assembling a randomised context from retrieved data.  They rely on
the ``qdrant-client`` library for interacting with the remote
collection, ``sentence-transformers`` for computing embeddings, and
scikit-learn's cosine similarity implementation.

The collection is expected to exist prior to use and to be
configured with vectors generated by the all-MiniLM-L6-v2 model.  Do
not modify the connection details, vector size or distance metric.

Usage example::

    from backend.services.interview_retrieval import (
        extract_all_roles_from_qdrant, retrieve_interview_data,
        random_context_chunks
    )

    all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions")
    retrieved = retrieve_interview_data("data scientist", all_roles)
    context = random_context_chunks(retrieved, k=4)

The above snippet fetches all stored roles, retrieves Q&A pairs for
the specified role (falling back to similar roles if necessary), and
builds a randomised context of four question/answer items.

These helpers are designed to be drop‑in compatible with the existing
interview system.  They deliberately avoid using Qdrant's ``search``
API, instead relying on ``scroll`` to iterate through all records.
"""

from __future__ import annotations

import logging
import random
from typing import Dict, List, Sequence, Tuple

try:
    # Attempt to import Qdrant client classes.  In environments where
    # qdrant-client is not installed (e.g. during local testing without
    # vector storage), these imports will fail.  We handle that by
    # assigning ``None`` to the client and conditionally disabling
    # functions that depend on it.
    from qdrant_client import QdrantClient  # type: ignore
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue  # type: ignore
except Exception:
    QdrantClient = None  # type: ignore
    Filter = None  # type: ignore
    FieldCondition = None  # type: ignore
    MatchValue = None  # type: ignore
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ``sentence_transformers`` is an optional dependency.  To avoid
# import‑time errors in environments where it is absent (e.g. during
# lightweight testing or static analysis), we avoid importing it at
# module level.  Instead, ``LocalEmbeddings`` will attempt to import
# SentenceTransformer when instantiated.  If the import fails, a
# RuntimeError is raised from within the constructor, signalling that
# embedding functionality is unavailable.
SentenceTransformer = None  # type: ignore


# ---------------------------------------------------------------------------
# Qdrant configuration
#
# These connection details must not be altered.  They point to the
# existing Qdrant instance containing interview questions and answers.

# Shared, module-level Qdrant client.  It is only constructed when the
# ``qdrant_client`` package import at the top of this module succeeded;
# otherwise the name is bound to ``None`` and the retrieval helpers in
# this module log an error and return empty results.
#
# NOTE(review): the endpoint URL and the API key below are embedded
# directly in source.  Anyone with read access to this file can use
# them against the cluster -- confirm this is intended, and consider
# rotating the key if the file has been published.
if QdrantClient is not None:
    qdrant_client: QdrantClient | None = QdrantClient(
        url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333",
        api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4",
    )
else:
    # qdrant-client is not installed: leave the client unset so callers
    # can detect the situation with a simple ``is None`` check.
    qdrant_client = None

# Name of the Qdrant collection containing interview Q&A pairs.  Do not
# modify this value; the collection already exists and is populated.
# It is used as the default for ``extract_all_roles_from_qdrant`` and is
# the collection queried by ``get_role_questions``.
COLLECTION_NAME: str = "interview_questions"


class LocalEmbeddings:
    """
    Thin adapter over a ``SentenceTransformer`` model.

    The class exposes two helpers: one that embeds a single string and
    one that embeds a batch of strings.  Both return plain Python lists
    rather than the array objects produced by the underlying model.
    The default model name is ``all-MiniLM-L6-v2``.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        """
        Load the model called ``model_name``.

        ``sentence_transformers`` is imported lazily on first use and the
        imported class is cached in the module-level
        ``SentenceTransformer`` name so later instances skip the import.

        Raises
        ------
        RuntimeError
            If ``sentence_transformers`` cannot be imported.
        """
        global SentenceTransformer  # cache the lazily imported class module-wide
        if SentenceTransformer is None:
            try:
                from sentence_transformers import SentenceTransformer as _ST  # type: ignore
            except Exception as exc:
                # Surface a clear, actionable error instead of the raw
                # import failure; the original cause stays chained.
                raise RuntimeError(
                    "sentence-transformers is required to compute embeddings. Please install the package."
                ) from exc
            else:
                SentenceTransformer = _ST  # type: ignore
        self.model = SentenceTransformer(model_name)  # type: ignore

    def embed_query(self, text: str) -> List[float]:
        """Encode one string and return its vector as a list of floats."""
        vector = self.model.encode(text)
        return vector.tolist()

    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
        """Encode several strings and return one vector (list) per string."""
        # The sequence is materialised as a list before being handed to
        # the model.
        batch = list(documents)
        vectors = self.model.encode(batch)
        return vectors.tolist()


# Instantiate the embeddings once.  This avoids repeatedly loading
# model weights on each function call.  If sentence-transformers is
# unavailable, ``embeddings`` will be set to ``None`` and similarity
# searches will be disabled.  Consumers should check for ``None``
# where appropriate.
try:
    # Build the shared embedder once at import time so the model is not
    # reloaded on every call.
    embeddings: LocalEmbeddings | None = LocalEmbeddings()
except Exception as exc:
    # Any failure while constructing the embedder is tolerated: the
    # module still imports, and ``find_similar_roles`` checks for
    # ``None`` before attempting a similarity search.
    embeddings = None
    logging.warning(
        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. "
        f"Error: {exc}"
    )


def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
    """
    Collect every distinct ``job_role`` stored in a Qdrant collection.

    The collection is paged through with the client's ``scroll`` call
    (payloads only, no vectors).  Each string ``job_role`` value is
    stripped and lower-cased before being recorded; blank and
    non-string values are ignored.

    Parameters
    ----------
    collection_name : str, optional
        Collection to read.  Defaults to ``COLLECTION_NAME``.

    Returns
    -------
    List[str]
        The distinct normalised roles in sorted order.  An empty list is
        returned when the Qdrant client is unavailable.  If a ``scroll``
        call fails part-way through, the error is logged and the roles
        gathered so far are returned.
    """
    # Without a client there is nothing to read from.
    if qdrant_client is None:
        logging.error(
            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
        )
        return []

    roles_seen: set[str] = set()
    next_offset = None  # ``None`` asks ``scroll`` to start from the beginning
    batch_size = 256  # reasonable page size to avoid heavy memory usage
    finished = False

    while not finished:
        try:
            # ``scroll`` yields a pair: (points in this page, offset of the next page)
            batch, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=None,
                offset=next_offset,
                limit=batch_size,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}")
            break

        for record in batch:
            role = (getattr(record, "payload", {}) or {}).get("job_role")
            if not isinstance(role, str):
                continue
            cleaned = role.strip()
            if cleaned:
                roles_seen.add(cleaned.lower())

        # A ``None`` offset signals that the last page has been consumed.
        finished = next_offset is None

    return sorted(roles_seen)


def get_role_questions(job_role: str) -> List[Dict[str, str]]:
    """
    Fetch every stored question/answer pair for one job role.

    The role is stripped and lower-cased, then used in an exact-match
    ``Filter`` on the ``job_role`` payload field while paging through
    ``COLLECTION_NAME`` with ``scroll``.

    Parameters
    ----------
    job_role : str
        Role to look up.  Surrounding whitespace and letter case of the
        argument are ignored.

    Returns
    -------
    List[Dict[str, str]]
        One dictionary per matching point, with ``question``, ``answer``
        and ``job_role`` keys.  Points where any of the three values is
        not a string are skipped.  An empty list is returned for a blank
        or non-string ``job_role``, or when the Qdrant client or its
        filter classes are unavailable.  If a ``scroll`` call fails, the
        error is logged and the entries gathered so far are returned.
    """
    if not (isinstance(job_role, str) and job_role.strip()):
        return []

    role_lower = job_role.strip().lower()

    # All four names are ``None`` when the qdrant-client import failed.
    if any(dep is None for dep in (qdrant_client, Filter, FieldCondition, MatchValue)):
        logging.error(
            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
        )
        return []

    # The payload is flat, so a single top-level field condition suffices.
    role_condition = FieldCondition(key="job_role", match=MatchValue(value=role_lower))
    match_filter = Filter(must=[role_condition])

    collected: List[Dict[str, str]] = []
    cursor = None  # ``None`` asks ``scroll`` to start from the beginning
    page_size = 256

    while True:
        try:
            records, cursor = qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=match_filter,
                offset=cursor,
                limit=page_size,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error retrieving questions for role '{job_role}': {exc}")
            break

        for record in records:
            payload = getattr(record, "payload", {}) or {}
            entry = {
                "question": payload.get("question"),
                "answer": payload.get("answer"),
                "job_role": payload.get("job_role"),
            }
            # Keep only fully populated, string-valued entries.
            if all(isinstance(value, str) for value in entry.values()):
                collected.append(entry)

        # A ``None`` cursor means the final page has been read.
        if cursor is None:
            break

    return collected


def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
    """
    Rank known roles by embedding similarity to a user-supplied role.

    The user's role (stripped and lower-cased) and every candidate role
    are embedded with the module-level ``embeddings`` object, and the
    candidates are ordered by cosine similarity to the query.  Roles
    that equal the user's role, ignoring case, are never candidates.

    Parameters
    ----------
    user_role : str
        Role text entered by the user.
    all_roles : Sequence[str]
        Known role names to rank.
    top_k : int, optional
        Maximum number of roles to return.  Defaults to 3.

    Returns
    -------
    List[str]
        Up to ``top_k`` candidate roles, most similar first.  An empty
        list is returned when ``user_role`` is blank or not a string,
        when there are no candidates, when embeddings are unavailable,
        or when the similarity computation raises.
    """
    if not isinstance(user_role, str):
        return []

    user_role_norm = user_role.strip().lower()
    if not user_role_norm or not all_roles:
        return []

    # Drop roles identical to the query (case-insensitive comparison).
    candidates = [name for name in all_roles if name.lower() != user_role_norm]
    if not candidates:
        return []

    if embeddings is None:
        logging.warning(
            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
        )
        return []

    try:
        # One row for the query, one row per candidate role.
        query_matrix = np.array([embeddings.embed_query(user_role_norm)])
        role_matrix = np.array(embeddings.embed_documents(candidates))

        # Row 0 holds the similarity of the query to each candidate.
        scores: np.ndarray = cosine_similarity(query_matrix, role_matrix)[0]

        # Highest similarity first; the sort is stable, so ties keep
        # their original relative order.
        ranked: List[Tuple[str, float]] = sorted(
            zip(candidates, scores), key=lambda pair: pair[1], reverse=True
        )

        # Slicing copes with ``top_k`` larger than the candidate count;
        # ``max`` turns a negative ``top_k`` into an empty slice.
        return [name for name, _score in ranked[:max(0, top_k)]]
    except Exception as exc:
        logging.error(f"Error finding similar roles for '{user_role}': {exc}")
        return []


def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
    """
    Fetch Q&A pairs for a role, falling back to similar roles.

    Steps:
    1. Ask ``get_role_questions`` for entries stored under ``job_role``.
    2. If that yields nothing, ask ``find_similar_roles`` for up to
       three similar roles and gather the entries of each of them.
    3. Drop entries whose question text has already been seen, keeping
       the first occurrence.

    Parameters
    ----------
    job_role : str
        Role requested by the user.
    all_roles : Sequence[str]
        Roles known to exist in the collection; supplied by the caller
        so they are not fetched again here.

    Returns
    -------
    List[Dict[str, str]]
        Question/answer dictionaries with unique question text, in
        retrieval order.  Empty when ``job_role`` is blank or not a
        string, or when neither the exact nor the similar roles yield
        any entries.
    """
    if not (isinstance(job_role, str) and job_role.strip()):
        return []

    # Exact role first.
    matches: List[Dict[str, str]] = get_role_questions(job_role)

    # Nothing stored under that role: widen the search to similar roles.
    if not matches:
        for neighbour in find_similar_roles(job_role, all_roles, top_k=3):
            matches.extend(get_role_questions(neighbour))

    # Keep the first entry for each distinct question text.
    unique: List[Dict[str, str]] = []
    seen: set[str] = set()
    for entry in matches:
        text = entry.get("question")
        if not isinstance(text, str) or text in seen:
            continue
        seen.add(text)
        unique.append(entry)

    return unique


def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
    """
    Build a context string by sampling Q&A pairs from retrieved data.

    Entries are first reduced to the usable ones: both ``question`` and
    ``answer`` must be strings that are non-empty after stripping.  Up
    to ``k`` of those usable entries are then chosen at random and
    rendered as a ``Q: <question>`` line followed by an ``A: <answer>``
    line, with a blank line between entries.

    Filtering before sampling means unusable entries never take up one
    of the ``k`` slots, and entries whose values are ``None`` or
    otherwise not strings are skipped instead of raising.

    Parameters
    ----------
    retrieved_data : Sequence[Dict[str, str]]
        Q&A dictionaries, e.g. the output of ``retrieve_interview_data``.
        Each item is expected to provide a ``get`` method.
    k : int, optional
        Maximum number of entries to include.  Defaults to 3.  Values
        larger than the number of usable entries include all of them;
        zero or negative values produce an empty string.

    Returns
    -------
    str
        The formatted context, or an empty string when there is nothing
        usable to sample.
    """
    if not retrieved_data:
        return ""

    # Normalise and validate up front so sampling only sees usable pairs.
    usable: List[Tuple[str, str]] = []
    for item in retrieved_data:
        question = item.get("question")
        answer = item.get("answer")
        if not (isinstance(question, str) and isinstance(answer, str)):
            continue
        question, answer = question.strip(), answer.strip()
        if question and answer:
            usable.append((question, answer))

    # ``random.sample`` raises ValueError when asked for more items than
    # exist (or for a negative count), so the count is clamped first.
    num_samples = min(k, len(usable))
    if num_samples <= 0:
        return ""

    sampled = random.sample(usable, num_samples)
    return "\n\n".join(f"Q: {question}\nA: {answer}" for question, answer in sampled)


__all__ = [
    "extract_all_roles_from_qdrant",
    "get_role_questions",
    "find_similar_roles",
    "retrieve_interview_data",
    "random_context_chunks",
    "embeddings",
    "qdrant_client",
]