"""
codingo_chatbot.py
===================
This module encapsulates the logic for Codingo's website chatbot. It
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
database using Chroma and SentenceTransformers, and uses a local LLM
powered by ``llama‑cpp‑python`` to generate answers constrained to the
retrieved context. The code is written to initialise all heavy
resources lazily on first use and to cache them for subsequent
requests. This prevents repeated model downloads and avoids
recomputing embeddings for every chat query.

The underlying LLM is the TinyLlama 1.1B chat model distributed via
Hugging Face in GGUF format. When the model file is not present
locally it is downloaded automatically using ``huggingface_hub``.
The model runs on GPU when CUDA is available and falls back to CPU
otherwise; see the ``init_llm`` function for details.

Note: This module deliberately contains no references to OpenAI. It
relies solely on open‑source libraries available on PyPI (such as
``llama‑cpp‑python`` and ``chromadb``) so that it can be used on
Hugging Face Spaces without requiring proprietary API keys.
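
Example usage (a minimal sketch; the import path assumes the
``backend/services`` package layout shown in the repository):

    from backend.services.codingo_chatbot import get_response

    print(get_response("What is Codingo?"))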
"""
from __future__ import annotations
import os
import threading
from typing import List
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from huggingface_hub import hf_hub_download
try:
from llama_cpp import Llama # type: ignore
except Exception as exc: # pragma: no cover - import may fail until dependency installed
# Provide a helpful error if llama_cpp isn't installed.
raise ImportError(
"llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
"to your requirements.txt"
) from exc
# ---------------------------------------------------------------------------
# Configuration
#
# Compute the absolute path to the chatbot knowledge base. We derive this
# relative to this file so that the module works regardless of the working
# directory. The project structure places ``chatbot.txt`` at
# ``Codingo12/chatbot/chatbot.txt``.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
# Directory where Chroma will persist its database. This location is
# writable on both local machines and Hugging Face Spaces. It is
# intentionally distinct from the web app instance path to avoid
# permission issues.
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")
# Settings for the TinyLlama model. These can be overridden via
# environment variables if desired (for example to switch to a
# different quantisation or to test with a smaller model). See
# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
# available filenames.
LLAMA_REPO = os.getenv(
"LLAMA_REPO",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
)
LLAMA_FILE = os.getenv(
"LLAMA_FILE",
"tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)
# Local directory where the GGUF model file will be stored. Using
# ``/tmp`` avoids writing into the read‑only repository filesystem on
# Hugging Face Spaces. The directory will be created as needed.
LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")
# Generation parameters. These values mirror those used in the
# provided Jupyter notebook. They can be tweaked via environment
# variables if necessary to trade off quality against speed.
MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))
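# Example override for a CPU-only Space (illustrative values, not tuned
# recommendations), set in the environment before the app starts:
#   export LLAMA_MAX_TOKENS=128
#   export LLAMA_TEMPERATURE=0.3
#   export LLAMA_N_CTX=1024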
# Thread lock to guard lazy initialisation in multi‑threaded Flask
# environments. Without this lock multiple concurrent requests may
# attempt to download the model or populate the database at the same
# time, leading to redundant work or race conditions.
_init_lock = threading.Lock()
# Global singletons for embedder, vector collection and LLM. These
# variables are populated on first use and reused thereafter.
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
_llm: Llama | None = None
def _load_chatbot_text() -> str:
"""Read the chatbot knowledge base from disk.
If the file is missing, a small default description of Codingo is
returned. This ensures the chatbot still provides a sensible
answer rather than crashing.
"""
try:
with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
return f.read()
except FileNotFoundError:
# Fallback content if the knowledge base file is missing
return (
"Codingo is an AI‑powered recruitment platform designed to "
"streamline job applications, candidate screening and hiring. "
"We make hiring smarter, faster and fairer through automation "
"and intelligent recommendations."
)
def init_embedder_and_db() -> None:
"""Initialise the SentenceTransformer embedder and Chroma vector DB.
This function is idempotent: if the embedder and collection are
already initialised it returns immediately. Otherwise it reads
``chatbot.txt``, splits it into overlapping chunks, computes
embeddings and persists them to a Chroma collection. The
resulting ``SentenceTransformer`` and collection objects are saved
in global variables for later reuse.
"""
global _embedder, _collection
if _embedder is not None and _collection is not None:
return
with _init_lock:
if _embedder is not None and _collection is not None:
return
# Ensure persistence directory exists
os.makedirs(CHROMA_DB_DIR, exist_ok=True)
# Read knowledge base
text = _load_chatbot_text()
# Split into chunks; use double newlines to prefer splitting on
# paragraph boundaries. Overlap helps the model maintain
# context across neighbouring chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=100,
            # Fall back to line and word boundaries when a paragraph
            # exceeds the chunk size.
            separators=["\n\n", "\n", " ", ""],
        )
docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]
        # Initialise the embedder (MiniLM). SentenceTransformer moves the
        # model to GPU automatically when CUDA is available.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)
# Initialise Chroma client
client = chromadb.Client(
Settings(
persist_directory=CHROMA_DB_DIR,
anonymized_telemetry=False,
is_persistent=True,
)
)
# Create or get collection. This returns an existing collection if
# already present on disk.
collection = client.get_or_create_collection("codingo_chatbot")
        # Populate the collection only if it is empty. A naive call to
        # ``collection.get(limit=1)`` may raise if the collection does
        # not exist yet, so we catch any exception and treat it as an
        # empty DB. The collection uses Chroma's default distance
        # metric (squared L2), since no ``hnsw:space`` is configured,
        # so larger query distances mean lower similarity.
need_populate = False
try:
existing = collection.get(limit=1)
if not existing or not existing.get("documents"):
need_populate = True
except Exception:
need_populate = True
if need_populate:
ids = [f"doc_{i}" for i in range(len(docs))]
collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
_embedder = embedder
_collection = collection
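# Illustrative retrieval against the populated collection (the same steps
# that get_response() performs further below); the sample question is
# hypothetical:
#   init_embedder_and_db()
#   vec = _embedder.encode(["How does Codingo screen candidates?"])[0]
#   hits = _collection.query(query_embeddings=[vec.tolist()], n_results=3)
#   print(hits["documents"][0])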
def init_llm() -> None:
"""Initialise the llama‑cpp model for response generation.
This function lazily downloads the GGUF model from Hugging Face if
necessary and instantiates a ``llama_cpp.Llama`` object. The
resulting instance is stored in the global ``_llm`` variable. To
control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
variable or override ``LLAMA_N_GPU_LAYERS``. By default we use one
GPU layer when CUDA is available, otherwise the model runs on CPU.
"""
global _llm
if _llm is not None:
return
with _init_lock:
if _llm is not None:
return
# Ensure the model directory exists
os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
# Download model if not already present
local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
if not os.path.exists(local_path):
# The file will be downloaded to LLAMA_LOCAL_DIR. Use
# ``local_dir_use_symlinks=False`` to avoid creating
# symlinks that may break on certain filesystems.
local_path = hf_hub_download(
repo_id=LLAMA_REPO,
filename=LLAMA_FILE,
local_dir=LLAMA_LOCAL_DIR,
local_dir_use_symlinks=False,
)
# Determine GPU usage. We default to one GPU layer if CUDA
# appears available. Users can override via LLAMA_N_GPU_LAYERS.
try:
import torch # type: ignore
use_cuda = torch.cuda.is_available()
except Exception:
use_cuda = False
n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
if n_gpu_layers_env:
try:
n_gpu_layers = int(n_gpu_layers_env)
except ValueError:
n_gpu_layers = 0
else:
n_gpu_layers = 1 if use_cuda else 0
# Construct the Llama instance. The context window is set
# generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
# Use half the available CPU cores for inference threads to
# balance responsiveness and resource use.
try:
n_threads = max(1, os.cpu_count() // 2)
except Exception:
n_threads = 2
_llm = Llama(
model_path=local_path,
n_ctx=n_ctx,
n_threads=n_threads,
n_gpu_layers=n_gpu_layers,
)
def _build_prompt(query: str, context: str) -> str:
"""Construct the full prompt for the TinyLlama chat model.
The prompt format follows the conventions used by the model as
illustrated in the provided notebook. We include a system message
instructing the model to answer only using the given context and to
politely decline if the information is unavailable.
"""
    system_prompt = (
        "You are the official chatbot of Codingo. "
        "Answer ONLY using the provided CONTEXT. "
        "If the answer is not in the context, politely say that you "
        "don't have that information."
    )
prompt = (
f"<|system|>\n{system_prompt}</s>\n"
f"<|user|>\n{query}\n\nCONTEXTE:\n{context}</s>\n"
f"<|assistant|>\n"
)
return prompt
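# For reference, _build_prompt("What is Codingo?", "Codingo is an AI
# recruitment platform.") renders roughly as:
#   <|system|>
#   You are the official chatbot of Codingo. ...</s>
#   <|user|>
#   What is Codingo?
#
#   CONTEXT:
#   Codingo is an AI recruitment platform.</s>
#   <|assistant|>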
def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
"""Return a chatbot response for the given query.
This function performs the following steps:
1. Ensures the embedder, vector database and LLM are initialised.
2. Embeds the user's query and retrieves the top ``k`` most
similar documents from the Chroma collection.
    3. Filters out documents whose distance exceeds ``score_threshold``
       (Chroma's default metric is squared L2, so larger distances
       indicate less similarity).
4. Builds a prompt containing the user query and the concatenated
relevant context.
5. Feeds the prompt to the TinyLlama model and returns its
response, trimming trailing whitespace.
If no relevant context is found, a fallback message is returned.
"""
if not query or not query.strip():
return "Please type a question about the Codingo platform."
init_embedder_and_db()
init_llm()
assert _embedder is not None and _collection is not None and _llm is not None
# Embed query and search collection
query_vector = _embedder.encode([query])[0]
results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
docs = results.get("documents", [[]])[0] if results else []
distances = results.get("distances", [[]])[0] if results else []
# Filter by score
relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
if not relevant:
return "Sorry, I don't have enough information to answer that question."
context = "\n\n".join(relevant)
prompt = _build_prompt(query, context)
# Generate completion
output = _llm(
prompt,
max_tokens=MAX_TOKENS,
temperature=TEMPERATURE,
top_p=TOP_P,
repeat_penalty=REPEAT_PENALTY,
stop=["</s>"]
)
text = output["choices"][0]["text"].strip()
return text or "I'm here to answer your questions about Codingo. What would you like to know?"
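

if __name__ == "__main__":
    # Minimal manual smoke test; assumes the knowledge base file and the
    # GGUF model can be loaded or downloaded in the current environment.
    # With the repository layout above it can be run as, e.g.:
    #   python -m backend.services.codingo_chatbot
    print(get_response("What is Codingo?"))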