Spaces:
Paused
Paused
| """ | |
| Helper functions for retrieving interview questions and answers from an | |
| existing Qdrant vector collection. These functions encapsulate the | |
| logic for extracting available job roles, fetching all Q&A pairs for a | |
| given role, finding similar roles when an exact match is not present, | |
| and assembling a randomised context from retrieved data. They rely on | |
| the ``qdrant-client`` library for interacting with the remote | |
| collection, ``sentence-transformers`` for computing embeddings, and | |
| scikit-learn's cosine similarity implementation. | |
| The collection is expected to exist prior to use and to be | |
| configured with vectors generated by the all-MiniLM-L6-v2 model. Do | |
| not modify the connection details, vector size or distance metric. | |
| Usage example:: | |
| from backend.services.interview_retrieval import ( | |
| extract_all_roles_from_qdrant, retrieve_interview_data, | |
| random_context_chunks | |
| ) | |
| all_roles = extract_all_roles_from_qdrant(collection_name="interview_questions") | |
| retrieved = retrieve_interview_data("data scientist", all_roles) | |
| context = random_context_chunks(retrieved, k=4) | |
| The above snippet fetches all stored roles, retrieves Q&A pairs for | |
| the specified role (falling back to similar roles if necessary), and | |
| builds a randomised context of four question/answer items. | |
| These helpers are designed to be drop‑in compatible with the existing | |
| interview system. They deliberately avoid using Qdrant's ``search`` | |
| API, instead relying on ``scroll`` to iterate through all records. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import random | |
| from typing import Dict, List, Sequence, Tuple | |
| try: | |
| # Attempt to import Qdrant client classes. In environments where | |
| # qdrant-client is not installed (e.g. during local testing without | |
| # vector storage), these imports will fail. We handle that by | |
| # assigning ``None`` to the client and conditionally disabling | |
| # functions that depend on it. | |
| from qdrant_client import QdrantClient # type: ignore | |
| from qdrant_client.http.models import Filter, FieldCondition, MatchValue # type: ignore | |
| except Exception: | |
| QdrantClient = None # type: ignore | |
| Filter = None # type: ignore | |
| FieldCondition = None # type: ignore | |
| MatchValue = None # type: ignore | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| # ``sentence_transformers`` is an optional dependency. To avoid | |
| # import‑time errors in environments where it is absent (e.g. during | |
| # lightweight testing or static analysis), we avoid importing it at | |
| # module level. Instead, ``LocalEmbeddings`` will attempt to import | |
| # SentenceTransformer when instantiated. If the import fails, a | |
| # RuntimeError is raised from within the constructor, signalling that | |
| # embedding functionality is unavailable. | |
| SentenceTransformer = None # type: ignore | |
# ---------------------------------------------------------------------------
# Qdrant configuration
#
# These connection details must not be altered. They point to the
# existing Qdrant instance containing interview questions and answers.
#
# NOTE(review): the URL and API key are hardcoded in source. This is a
# credential-leak risk if the file is ever published — consider moving
# them to environment variables or a secrets manager and rotating the
# key; confirm with the team before changing, since the docstring says
# connection details must stay as-is.
if QdrantClient is not None:
    # Instantiated once at import time; remains None when qdrant-client
    # failed to import above, and callers must check for that.
    qdrant_client: QdrantClient | None = QdrantClient(
        url="https://313b1ceb-057f-4b7b-89f5-7b19a213fe65.us-east-1-0.aws.cloud.qdrant.io:6333",
        api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.w13SPZbljbSvt9Ch_0r034QhMFlmEr4ctXqLo2zhxm4",
    )
else:
    qdrant_client = None
# Name of the Qdrant collection containing interview Q&A pairs. Do not
# modify this value; the collection already exists and is populated.
COLLECTION_NAME: str = "interview_questions"
class LocalEmbeddings:
    """
    Thin convenience wrapper over a SentenceTransformer model.

    Exposes ``embed_query`` for a single string and ``embed_documents``
    for a batch of strings. The model name defaults to the one used
    during data ingestion (all-MiniLM-L6-v2).
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        # The sentence-transformers import is deferred until first
        # instantiation so that merely importing this module never
        # requires the package; the imported class is cached in the
        # module-level ``SentenceTransformer`` slot for reuse.
        global SentenceTransformer
        if SentenceTransformer is None:
            try:
                from sentence_transformers import SentenceTransformer as _ST  # type: ignore
            except Exception as exc:
                # Surface a clear, actionable error instead of a bare
                # ImportError from deep inside the dependency chain.
                raise RuntimeError(
                    "sentence-transformers is required to compute embeddings. Please install the package."
                ) from exc
            SentenceTransformer = _ST  # type: ignore
        self.model = SentenceTransformer(model_name)  # type: ignore

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of ``text`` as a plain list of floats."""
        return self.model.encode(text).tolist()

    def embed_documents(self, documents: Sequence[str]) -> List[List[float]]:
        """Return one embedding vector per entry in ``documents``."""
        return self.model.encode(list(documents)).tolist()
# Instantiate the embeddings exactly once at import time so that model
# weights are not reloaded on every function call. If
# sentence-transformers is unavailable (LocalEmbeddings raises
# RuntimeError from its constructor), ``embeddings`` is set to ``None``
# and similarity searches are disabled; consumers must check for
# ``None`` where appropriate.
try:
    embeddings: LocalEmbeddings | None = LocalEmbeddings()
except Exception as exc:
    # Degrade gracefully instead of failing the whole module import.
    logging.warning(
        "Failed to initialise LocalEmbeddings. Similarity search will be disabled. "
        f"Error: {exc}"
    )
    embeddings = None
def extract_all_roles_from_qdrant(collection_name: str = COLLECTION_NAME) -> List[str]:
    """
    Return the sorted list of unique job roles stored in a collection.

    Walks every point in ``collection_name`` with Qdrant's ``scroll``
    API (no vector search involved) and gathers the ``job_role`` payload
    field, normalised to lowercase. If a scroll request fails part-way,
    the roles collected so far are still returned.

    Parameters
    ----------
    collection_name : str, optional
        Name of the Qdrant collection. Defaults to ``COLLECTION_NAME``.

    Returns
    -------
    List[str]
        Sorted unique job roles present in the collection; empty when
        the Qdrant client is unavailable.
    """
    # Without a working client there is nothing to scroll.
    if qdrant_client is None:
        logging.error(
            "Qdrant client is unavailable; cannot extract roles. Ensure qdrant-client is installed."
        )
        return []
    roles: set[str] = set()
    batch_size = 256  # keep per-request memory bounded
    cursor = None
    while True:
        try:
            # ``scroll`` yields (points, next_offset); a None offset
            # signals that the whole collection has been visited.
            batch, cursor = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=None,
                offset=cursor,
                limit=batch_size,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error scrolling Qdrant collection '{collection_name}': {exc}")
            break
        for record in batch:
            payload = getattr(record, "payload", {}) or {}
            value = payload.get("job_role")
            if isinstance(value, str) and value.strip():
                roles.add(value.strip().lower())
        if cursor is None:
            break
    return sorted(roles)
def get_role_questions(job_role: str) -> List[Dict[str, str]]:
    """
    Fetch every stored Q&A pair whose payload matches a job role.

    Uses Qdrant's ``scroll`` API with an exact-match ``Filter`` on the
    flat ``job_role`` payload field, so all matching entries are
    returned regardless of count. Matching is case-insensitive: the
    input is lowercased before filtering.

    Parameters
    ----------
    job_role : str
        The role to look up. Blank or non-string input yields ``[]``.

    Returns
    -------
    List[Dict[str, str]]
        Dictionaries with ``question``, ``answer`` and ``job_role``
        keys; empty when nothing matches or the client is unavailable.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    # Bail out early when the optional qdrant-client dependency (or any
    # of its filter model classes) failed to import.
    if qdrant_client is None or Filter is None or FieldCondition is None or MatchValue is None:
        logging.error(
            "Qdrant client or filter classes are unavailable; cannot retrieve questions for roles."
        )
        return []
    # Exact-match filter on the flat payload field; no nested paths.
    role_filter = Filter(
        must=[FieldCondition(key="job_role", match=MatchValue(value=job_role.strip().lower()))]
    )
    collected: List[Dict[str, str]] = []
    cursor = None
    while True:
        try:
            batch, cursor = qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=role_filter,
                offset=cursor,
                limit=256,
                with_payload=True,
                with_vectors=False,
            )
        except Exception as exc:
            logging.error(f"Error retrieving questions for role '{job_role}': {exc}")
            break
        for record in batch:
            payload = getattr(record, "payload", {}) or {}
            entry = {
                "question": payload.get("question"),
                "answer": payload.get("answer"),
                "job_role": payload.get("job_role"),
            }
            # Keep only complete, well-typed records.
            if all(isinstance(v, str) for v in entry.values()):
                collected.append(entry)
        if cursor is None:
            break
    return collected
def find_similar_roles(user_role: str, all_roles: Sequence[str], top_k: int = 3) -> List[str]:
    """
    Rank known roles by embedding similarity to the user's input.

    Embeds ``user_role`` and every candidate role with the module-level
    ``embeddings`` model, scores the candidates with cosine similarity,
    and returns up to ``top_k`` of the best-scoring roles. Roles equal
    to the input (case-insensitively) are excluded beforehand.

    Parameters
    ----------
    user_role : str
        Role text supplied by the user.
    all_roles : Sequence[str]
        All role names known to the collection (assumed lowercase).
    top_k : int, optional
        Maximum number of roles to return. Defaults to 3.

    Returns
    -------
    List[str]
        Similar roles in decreasing similarity order; may be shorter
        than ``top_k``, and is empty on bad input, when no candidates
        remain, or when embeddings are unavailable or fail.
    """
    if not isinstance(user_role, str) or not user_role.strip() or not all_roles:
        return []
    query = user_role.strip().lower()
    # Drop candidates identical to the query (case-insensitive).
    candidates = [role for role in all_roles if role.lower() != query]
    if not candidates:
        return []
    if embeddings is None:
        logging.warning(
            "Embeddings are unavailable; cannot compute similar roles. Returning empty list."
        )
        return []
    try:
        query_matrix = np.array([embeddings.embed_query(query)])
        candidate_matrix = np.array(embeddings.embed_documents(candidates))
        # Higher cosine similarity means a closer role.
        scores = cosine_similarity(query_matrix, candidate_matrix)[0]
        # sorted() with reverse=True is stable, so ties keep the
        # candidates' original relative order.
        ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
        # Slicing handles top_k larger than the candidate count.
        return [name for name, _ in ranked[:max(0, top_k)]]
    except Exception as exc:
        logging.error(f"Error finding similar roles for '{user_role}': {exc}")
        return []
def retrieve_interview_data(job_role: str, all_roles: Sequence[str]) -> List[Dict[str, str]]:
    """
    Fetch Q&A pairs for a job role, falling back to similar roles.

    Strategy:

    1. Try an exact (case-insensitive) match via ``get_role_questions``.
    2. If that yields nothing, gather questions for up to three roles
       ranked closest by ``find_similar_roles``.
    3. Deduplicate the combined results on the question text so that
       merging several roles does not repeat questions.

    Parameters
    ----------
    job_role : str
        Role requested by the user.
    all_roles : Sequence[str]
        Every role known to the collection, passed in so roles are not
        re-fetched on each call.

    Returns
    -------
    List[Dict[str, str]]
        Deduplicated Q&A dictionaries; each item's ``job_role`` field
        names the role it was actually retrieved from. Empty when
        neither an exact nor a similar role yields results.
    """
    if not isinstance(job_role, str) or not job_role.strip():
        return []
    gathered: List[Dict[str, str]] = get_role_questions(job_role)
    if not gathered:
        # No exact hit: pull questions from the closest-matching roles.
        for candidate in find_similar_roles(job_role, all_roles, top_k=3):
            gathered.extend(get_role_questions(candidate))
    # Keep the first occurrence of each distinct question text.
    unique: List[Dict[str, str]] = []
    seen: set[str] = set()
    for entry in gathered:
        text = entry.get("question")
        if isinstance(text, str) and text not in seen:
            seen.add(text)
            unique.append(entry)
    return unique
def random_context_chunks(retrieved_data: Sequence[Dict[str, str]], k: int = 3) -> str:
    """
    Build an LLM prompt context by sampling Q&A pairs from retrieved data.

    Randomly selects up to ``k`` items from ``retrieved_data`` and
    formats each as ``"Q: [question]\\nA: [answer]"``, joined by blank
    lines. Items whose ``question`` or ``answer`` is missing, blank or
    not a string are skipped rather than crashing on malformed payloads.

    Parameters
    ----------
    retrieved_data : Sequence[Dict[str, str]]
        Q&A dictionaries as returned by ``retrieve_interview_data``.
    k : int, optional
        Number of entries to sample. Defaults to 3. Values larger than
        ``len(retrieved_data)`` use all items; values <= 0 yield "".

    Returns
    -------
    str
        The concatenated context string, or an empty string when
        ``retrieved_data`` is empty or nothing usable was sampled.
    """
    if not retrieved_data:
        return ""
    # Cap the sample size so random.sample never raises ValueError
    # (it requires 0 <= k <= population size). With the cap in place
    # the former try/except around the call was dead code.
    num_samples = max(0, min(k, len(retrieved_data)))
    sampled = random.sample(list(retrieved_data), num_samples)
    parts: List[str] = []
    for item in sampled:
        question = item.get("question", "")
        answer = item.get("answer", "")
        # Guard against None / non-string payload values before strip().
        if isinstance(question, str) and isinstance(answer, str):
            q, a = question.strip(), answer.strip()
            if q and a:
                parts.append(f"Q: {q}\nA: {a}")
    return "\n\n".join(parts)
# Public API of this module. The ``embeddings`` and ``qdrant_client``
# singletons are exported deliberately so callers can detect the
# degraded (None) state before invoking similarity or retrieval
# helpers.
__all__ = [
    "extract_all_roles_from_qdrant",
    "get_role_questions",
    "find_similar_roles",
    "retrieve_interview_data",
    "random_context_chunks",
    "embeddings",
    "qdrant_client",
]