""" | |
codingo_chatbot.py | |
=================== | |
This module encapsulates the logic for Codingo's website chatbot. It | |
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector | |
database using Chroma and SentenceTransformers, and uses a local LLM | |
powered by ``llama‑cpp‑python`` to generate answers constrained to the | |
retrieved context. The code is written to initialise all heavy | |
resources lazily on first use and to cache them for subsequent | |
requests. This prevents repeated model downloads and avoids | |
recomputing embeddings for every chat query. | |
The underlying LLM is the TinyLlama 1.1B chat model distributed via | |
Hugging Face in GGUF format. When the model file is not present | |
locally it is downloaded automatically using ``huggingface_hub``. | |
Depending on the environment the model will run on GPU if CUDA is | |
available or fall back to CPU otherwise. See the ``init_llm`` | |
function for details. | |
Note: This module deliberately contains no references to OpenAI. It | |
relies solely on open‑source libraries available on PyPI (such as | |
``llama‑cpp‑python`` and ``chromadb``) so that it can be used on | |
Hugging Face Spaces without requiring proprietary API keys. | |
""" | |
from __future__ import annotations

import os
import threading
from typing import List

import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from huggingface_hub import hf_hub_download

try:
    from llama_cpp import Llama  # type: ignore
except Exception as exc:  # pragma: no cover - import may fail until dependency installed
    # Provide a helpful error if llama_cpp isn't installed.
    raise ImportError(
        "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
        "to your requirements.txt"
    ) from exc
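
# The third-party imports above map onto the following PyPI packages. The
# list is indicative only; pin versions to match your own deployment, and
# note that newer LangChain releases also ship the splitter as the separate
# ``langchain-text-splitters`` package.
#
#     llama-cpp-python
#     chromadb
#     sentence-transformers
#     langchain
#     huggingface_hub
#     numpy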
# ---------------------------------------------------------------------------
# Configuration
#
# Compute the absolute path to the chatbot knowledge base. We derive this
# relative to this file so that the module works regardless of the working
# directory. The project structure places ``chatbot.txt`` at
# ``Codingo12/chatbot/chatbot.txt``.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")

# Directory where Chroma will persist its database. This location is
# writable on both local machines and Hugging Face Spaces. It is
# intentionally distinct from the web app instance path to avoid
# permission issues.
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")

# Settings for the TinyLlama model. These can be overridden via
# environment variables if desired (for example to switch to a
# different quantisation or to test with a smaller model). See
# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
# available filenames.
LLAMA_REPO = os.getenv(
    "LLAMA_REPO",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
)
LLAMA_FILE = os.getenv(
    "LLAMA_FILE",
    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)

# Local directory where the GGUF model file will be stored. Using
# ``/tmp`` avoids writing into the read-only repository filesystem on
# Hugging Face Spaces. The directory will be created as needed.
LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")

# Generation parameters. These values mirror those used in the
# provided Jupyter notebook. They can be tweaked via environment
# variables if necessary to trade off quality against speed.
MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))

# Thread lock to guard lazy initialisation in multi-threaded Flask
# environments. Without this lock multiple concurrent requests may
# attempt to download the model or populate the database at the same
# time, leading to redundant work or race conditions.
_init_lock = threading.Lock()

# Global singletons for embedder, vector collection and LLM. These
# variables are populated on first use and reused thereafter.
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
_llm: Llama | None = None


def _load_chatbot_text() -> str:
    """Read the chatbot knowledge base from disk.

    If the file is missing, a small default description of Codingo is
    returned. This ensures the chatbot still provides a sensible
    answer rather than crashing.
    """
    try:
        with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # Fallback content if the knowledge base file is missing
        return (
            "Codingo is an AI-powered recruitment platform designed to "
            "streamline job applications, candidate screening and hiring. "
            "We make hiring smarter, faster and fairer through automation "
            "and intelligent recommendations."
        )


def init_embedder_and_db() -> None:
    """Initialise the SentenceTransformer embedder and Chroma vector DB.

    This function is idempotent: if the embedder and collection are
    already initialised it returns immediately. Otherwise it reads
    ``chatbot.txt``, splits it into overlapping chunks, computes
    embeddings and persists them to a Chroma collection. The
    resulting ``SentenceTransformer`` and collection objects are saved
    in global variables for later reuse.
    """
    global _embedder, _collection
    if _embedder is not None and _collection is not None:
        return
    with _init_lock:
        if _embedder is not None and _collection is not None:
            return
        # Ensure the persistence directory exists
        os.makedirs(CHROMA_DB_DIR, exist_ok=True)
        # Read the knowledge base
        text = _load_chatbot_text()
        # Split into chunks, preferring paragraph (double-newline)
        # boundaries. Overlap helps the model maintain context across
        # neighbouring chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=100,
            separators=["\n\n"],
        )
        docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]
        # Initialise the embedder (MiniLM); it picks its device automatically.
        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)
        # Initialise the Chroma client
        client = chromadb.Client(
            Settings(
                persist_directory=CHROMA_DB_DIR,
                anonymized_telemetry=False,
                is_persistent=True,
            )
        )
        # Create or get the collection. This returns an existing collection
        # if one is already present on disk.
        collection = client.get_or_create_collection("codingo_chatbot")
        # Populate the collection only if it is empty. The call to
        # ``collection.get(limit=1)`` may still raise (for example against an
        # incompatible on-disk database), so any exception is treated as an
        # empty DB. Query distances use Chroma's configured metric (L2 by
        # default), where smaller values mean more similar documents.
        need_populate = False
        try:
            existing = collection.get(limit=1)
            if not existing or not existing.get("documents"):
                need_populate = True
        except Exception:
            need_populate = True
        if need_populate:
            ids = [f"doc_{i}" for i in range(len(docs))]
            collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
        _embedder = embedder
        _collection = collection
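
# Illustrative only: after ``init_embedder_and_db()`` has run, the populated
# collection can be queried directly, mirroring what ``get_response`` does
# internally. A minimal sketch (not part of the module's public surface):
#
#     init_embedder_and_db()
#     vec = _embedder.encode(["What is Codingo?"])[0]
#     hits = _collection.query(query_embeddings=[vec.tolist()], n_results=3)
#     print(hits["documents"][0], hits["distances"][0])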


def init_llm() -> None:
    """Initialise the llama-cpp model for response generation.

    This function lazily downloads the GGUF model from Hugging Face if
    necessary and instantiates a ``llama_cpp.Llama`` object. The
    resulting instance is stored in the global ``_llm`` variable. To
    control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
    variable or override ``LLAMA_N_GPU_LAYERS``. By default we use one
    GPU layer when CUDA is available, otherwise the model runs on CPU.
    """
    global _llm
    if _llm is not None:
        return
    with _init_lock:
        if _llm is not None:
            return
        # Ensure the model directory exists
        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
        # Download the model if it is not already present
        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
        if not os.path.exists(local_path):
            # The file will be downloaded to LLAMA_LOCAL_DIR. Use
            # ``local_dir_use_symlinks=False`` to avoid creating
            # symlinks that may break on certain filesystems.
            local_path = hf_hub_download(
                repo_id=LLAMA_REPO,
                filename=LLAMA_FILE,
                local_dir=LLAMA_LOCAL_DIR,
                local_dir_use_symlinks=False,
            )
        # Determine GPU usage. We default to one GPU layer if CUDA
        # appears available. Users can override via LLAMA_N_GPU_LAYERS.
        try:
            import torch  # type: ignore
            use_cuda = torch.cuda.is_available()
        except Exception:
            use_cuda = False
        n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
        if n_gpu_layers_env:
            try:
                n_gpu_layers = int(n_gpu_layers_env)
            except ValueError:
                n_gpu_layers = 0
        else:
            n_gpu_layers = 1 if use_cuda else 0
        # Construct the Llama instance. The context window is set
        # generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
        # Use half the available CPU cores for inference threads to
        # balance responsiveness and resource use.
        try:
            n_threads = max(1, os.cpu_count() // 2)
        except Exception:
            n_threads = 2
        _llm = Llama(
            model_path=local_path,
            n_ctx=n_ctx,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
        )
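
# Illustrative environment overrides (example values; any ``LLAMA_FILE`` you
# substitute must exist in the configured Hugging Face repository):
#
#     export LLAMA_N_GPU_LAYERS=0   # force CPU-only inference
#     export LLAMA_N_CTX=1024       # shrink the context window
#     export LLAMA_MAX_TOKENS=128   # cap the response length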


def _build_prompt(query: str, context: str) -> str:
    """Construct the full prompt for the TinyLlama chat model.

    The prompt format follows the conventions used by the model as
    illustrated in the provided notebook. We include a system message
    instructing the model to answer only using the given context and to
    politely decline if the information is unavailable.
    """
    system_prompt = (
        "You are the official chatbot of Codingo. "
        "Answer ONLY by using the CONTEXT. "
        "If the information is not available in the context, say so politely."
    )
    prompt = (
        f"<|system|>\n{system_prompt}</s>\n"
        f"<|user|>\n{query}\n\nCONTEXT:\n{context}</s>\n"
        f"<|assistant|>\n"
    )
    return prompt
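
# For reference, ``_build_prompt("What does Codingo do?", "Codingo is ...")``
# produces a prompt shaped like this (content abbreviated):
#
#     <|system|>
#     You are the official chatbot of Codingo. ...</s>
#     <|user|>
#     What does Codingo do?
#
#     CONTEXT:
#     Codingo is ...</s>
#     <|assistant|>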


def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
    """Return a chatbot response for the given query.

    This function performs the following steps:

    1. Ensures the embedder, vector database and LLM are initialised.
    2. Embeds the user's query and retrieves the top ``k`` most
       similar documents from the Chroma collection.
    3. Filters out documents whose distance exceeds
       ``score_threshold`` (larger distances indicate less similarity).
    4. Builds a prompt containing the user query and the concatenated
       relevant context.
    5. Feeds the prompt to the TinyLlama model and returns its
       response, trimming surrounding whitespace.

    If no relevant context is found, a fallback message is returned.
    """
    if not query or not query.strip():
        return "Please type a question about the Codingo platform."
    init_embedder_and_db()
    init_llm()
    assert _embedder is not None and _collection is not None and _llm is not None
    # Embed the query and search the collection
    query_vector = _embedder.encode([query])[0]
    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
    docs = results.get("documents", [[]])[0] if results else []
    distances = results.get("distances", [[]])[0] if results else []
    # Keep only documents whose distance is below the threshold
    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
    if not relevant:
        return "Sorry, I don't have enough information to answer that question."
    context = "\n\n".join(relevant)
    prompt = _build_prompt(query, context)
    # Generate the completion
    output = _llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        repeat_penalty=REPEAT_PENALTY,
        stop=["</s>"],
    )
    text = output["choices"][0]["text"].strip()
    return text or "I'm here to answer your questions about Codingo. What would you like to know?"
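

if __name__ == "__main__":  # pragma: no cover
    # Minimal command-line smoke test; illustrative only and not used by the
    # web app. The first run triggers the model download and embedding build,
    # so expect a slow cold start.
    print(get_response("What is Codingo?"))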