"""
codingo_chatbot.py
===================
This module encapsulates the logic for Codingo's website chatbot. It
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
database using Chroma and SentenceTransformers, and uses a local LLM
powered by ``llama‑cpp‑python`` to generate answers constrained to the
retrieved context. The code is written to initialise all heavy
resources lazily on first use and to cache them for subsequent
requests. This prevents repeated model downloads and avoids
recomputing embeddings for every chat query.

The underlying LLM is the TinyLlama 1.1B chat model distributed via
Hugging Face in GGUF format. When the model file is not present
locally it is downloaded automatically using ``huggingface_hub``.
The model runs on GPU when CUDA is available and falls back to CPU
otherwise; see the ``init_llm`` function for details.

Note: This module deliberately contains no references to OpenAI. It
relies solely on open‑source libraries available on PyPI (such as
``llama‑cpp‑python`` and ``chromadb``) so that it can be used on
Hugging Face Spaces without requiring proprietary API keys.
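
Example usage (a minimal sketch; the import path assumes the
``backend/services`` package layout shown in the repository):

    from backend.services.codingo_chatbot import get_response

    print(get_response("What is Codingo?"))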
"""
from __future__ import annotations
import os
import threading
from typing import List
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from huggingface_hub import hf_hub_download
try:
from llama_cpp import Llama # type: ignore
except Exception as exc: # pragma: no cover - import may fail until dependency installed
# Provide a helpful error if llama_cpp isn't installed.
raise ImportError(
"llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
"to your requirements.txt"
) from exc
# ---------------------------------------------------------------------------
# Configuration
#
# Compute the absolute path to the chatbot knowledge base. We derive this
# relative to this file so that the module works regardless of the working
# directory. The project structure places ``chatbot.txt`` at
# ``Codingo12/chatbot/chatbot.txt``.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
# Directory where Chroma will persist its database. This location is
# writable on both local machines and Hugging Face Spaces. It is
# intentionally distinct from the web app instance path to avoid
# permission issues.
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")
# Settings for the TinyLlama model. These can be overridden via
# environment variables if desired (for example to switch to a
# different quantisation or to test with a smaller model). See
# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
# available filenames.
LLAMA_REPO = os.getenv(
"LLAMA_REPO",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
)
LLAMA_FILE = os.getenv(
"LLAMA_FILE",
"tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)
# Local directory where the GGUF model file will be stored. Using
# ``/tmp`` avoids writing into the read‑only repository filesystem on
# Hugging Face Spaces. The directory will be created as needed.
LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")
# Generation parameters. These values mirror those used in the
# provided Jupyter notebook. They can be tweaked via environment
# variables if necessary to trade off quality against speed.
MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))
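# Example override for a CPU-only Space (illustrative values, not tuned
# recommendations), set in the environment before the app starts:
#   export LLAMA_MAX_TOKENS=128
#   export LLAMA_TEMPERATURE=0.3
#   export LLAMA_N_CTX=1024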
# Thread lock to guard lazy initialisation in multi‑threaded Flask
# environments. Without this lock multiple concurrent requests may
# attempt to download the model or populate the database at the same
# time, leading to redundant work or race conditions.
_init_lock = threading.Lock()
# Global singletons for embedder, vector collection and LLM. These
# variables are populated on first use and reused thereafter.
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
_llm: Llama | None = None
def _load_chatbot_text() -> str:
"""Read the chatbot knowledge base from disk.
If the file is missing, a small default description of Codingo is
returned. This ensures the chatbot still provides a sensible
answer rather than crashing.
"""
try:
with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
return f.read()
except FileNotFoundError:
# Fallback content if the knowledge base file is missing
return (
"Codingo is an AI‑powered recruitment platform designed to "
"streamline job applications, candidate screening and hiring. "
"We make hiring smarter, faster and fairer through automation "
"and intelligent recommendations."
)
def init_embedder_and_db() -> None:
"""Initialise the SentenceTransformer embedder and Chroma vector DB.
This function is idempotent: if the embedder and collection are
already initialised it returns immediately. Otherwise it reads
``chatbot.txt``, splits it into overlapping chunks, computes
embeddings and persists them to a Chroma collection. The
resulting ``SentenceTransformer`` and collection objects are saved
in global variables for later reuse.
"""
global _embedder, _collection
if _embedder is not None and _collection is not None:
return
with _init_lock:
if _embedder is not None and _collection is not None:
return
# Ensure persistence directory exists
os.makedirs(CHROMA_DB_DIR, exist_ok=True)
# Read knowledge base
text = _load_chatbot_text()
# Split into chunks; use double newlines to prefer splitting on
# paragraph boundaries. Overlap helps the model maintain
# context across neighbouring chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=100,
            # Fall back to line and word boundaries when a paragraph
            # exceeds the chunk size.
            separators=["\n\n", "\n", " ", ""],
        )
docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]
        # Initialise the embedder (MiniLM). SentenceTransformer moves the
        # model to GPU automatically when CUDA is available.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)
# Initialise Chroma client
client = chromadb.Client(
Settings(
persist_directory=CHROMA_DB_DIR,
anonymized_telemetry=False,
is_persistent=True,
)
)
# Create or get collection. This returns an existing collection if
# already present on disk.
collection = client.get_or_create_collection("codingo_chatbot")
        # Populate the collection only if it is empty. A naive call to
        # ``collection.get(limit=1)`` may raise if the collection does
        # not exist yet, so we catch any exception and treat it as an
        # empty DB. The collection uses Chroma's default distance
        # metric (squared L2), since no ``hnsw:space`` is configured,
        # so larger query distances mean lower similarity.
need_populate = False
try:
existing = collection.get(limit=1)
if not existing or not existing.get("documents"):
need_populate = True
except Exception:
need_populate = True
if need_populate:
ids = [f"doc_{i}" for i in range(len(docs))]
collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
_embedder = embedder
_collection = collection
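# Illustrative retrieval against the populated collection (the same steps
# that get_response() performs further below); the sample question is
# hypothetical:
#   init_embedder_and_db()
#   vec = _embedder.encode(["How does Codingo screen candidates?"])[0]
#   hits = _collection.query(query_embeddings=[vec.tolist()], n_results=3)
#   print(hits["documents"][0])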
def init_llm() -> None:
"""Initialise the llama‑cpp model for response generation.
This function lazily downloads the GGUF model from Hugging Face if
necessary and instantiates a ``llama_cpp.Llama`` object. The
resulting instance is stored in the global ``_llm`` variable. To
control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
variable or override ``LLAMA_N_GPU_LAYERS``. By default we use one
GPU layer when CUDA is available, otherwise the model runs on CPU.
"""
global _llm
if _llm is not None:
return
with _init_lock:
if _llm is not None:
return
# Ensure the model directory exists
os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
# Download model if not already present
local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
if not os.path.exists(local_path):
# The file will be downloaded to LLAMA_LOCAL_DIR. Use
# ``local_dir_use_symlinks=False`` to avoid creating
# symlinks that may break on certain filesystems.
local_path = hf_hub_download(
repo_id=LLAMA_REPO,
filename=LLAMA_FILE,
local_dir=LLAMA_LOCAL_DIR,
local_dir_use_symlinks=False,
)
# Determine GPU usage. We default to one GPU layer if CUDA
# appears available. Users can override via LLAMA_N_GPU_LAYERS.
try:
import torch # type: ignore
use_cuda = torch.cuda.is_available()
except Exception:
use_cuda = False
n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
if n_gpu_layers_env:
try:
n_gpu_layers = int(n_gpu_layers_env)
except ValueError:
n_gpu_layers = 0
else:
n_gpu_layers = 1 if use_cuda else 0
# Construct the Llama instance. The context window is set
# generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
# Use half the available CPU cores for inference threads to
# balance responsiveness and resource use.
try:
n_threads = max(1, os.cpu_count() // 2)
except Exception:
n_threads = 2
_llm = Llama(
model_path=local_path,
n_ctx=n_ctx,
n_threads=n_threads,
n_gpu_layers=n_gpu_layers,
)
def _build_prompt(query: str, context: str) -> str:
"""Construct the full prompt for the TinyLlama chat model.
The prompt format follows the conventions used by the model as
illustrated in the provided notebook. We include a system message
instructing the model to answer only using the given context and to
politely decline if the information is unavailable.
"""
    system_prompt = (
        "You are the official chatbot of Codingo. "
        "Answer ONLY using the provided CONTEXT. "
        "If the answer is not in the context, politely say that you "
        "don't have that information."
    )
prompt = (
f"<|system|>\n{system_prompt}</s>\n"
f"<|user|>\n{query}\n\nCONTEXTE:\n{context}</s>\n"
f"<|assistant|>\n"
)
return prompt
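# For reference, _build_prompt("What is Codingo?", "Codingo is an AI
# recruitment platform.") renders roughly as:
#   <|system|>
#   You are the official chatbot of Codingo. ...</s>
#   <|user|>
#   What is Codingo?
#
#   CONTEXT:
#   Codingo is an AI recruitment platform.</s>
#   <|assistant|>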
def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
"""Return a chatbot response for the given query.
This function performs the following steps:
1. Ensures the embedder, vector database and LLM are initialised.
2. Embeds the user's query and retrieves the top ``k`` most
similar documents from the Chroma collection.
    3. Filters out documents whose distance exceeds ``score_threshold``
       (Chroma's default metric is squared L2, so larger distances
       indicate less similarity).
4. Builds a prompt containing the user query and the concatenated
relevant context.
5. Feeds the prompt to the TinyLlama model and returns its
response, trimming trailing whitespace.
If no relevant context is found, a fallback message is returned.
"""
if not query or not query.strip():
return "Please type a question about the Codingo platform."
init_embedder_and_db()
init_llm()
assert _embedder is not None and _collection is not None and _llm is not None
# Embed query and search collection
query_vector = _embedder.encode([query])[0]
results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
docs = results.get("documents", [[]])[0] if results else []
distances = results.get("distances", [[]])[0] if results else []
# Filter by score
relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
if not relevant:
return "Sorry, I don't have enough information to answer that question."
context = "\n\n".join(relevant)
prompt = _build_prompt(query, context)
# Generate completion
output = _llm(
prompt,
max_tokens=MAX_TOKENS,
temperature=TEMPERATURE,
top_p=TOP_P,
repeat_penalty=REPEAT_PENALTY,
stop=["</s>"]
)
text = output["choices"][0]["text"].strip()
return text or "I'm here to answer your questions about Codingo. What would you like to know?"
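

if __name__ == "__main__":
    # Minimal manual smoke test; assumes the knowledge base file and the
    # GGUF model can be loaded or downloaded in the current environment.
    # With the repository layout above it can be run as, e.g.:
    #   python -m backend.services.codingo_chatbot
    print(get_response("What is Codingo?"))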