""" codingo_chatbot.py =================== This module encapsulates the logic for Codingo's website chatbot. It loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector database using Chroma and SentenceTransformers, and uses a local LLM powered by ``llama‑cpp‑python`` to generate answers constrained to the retrieved context. The code is written to initialise all heavy resources lazily on first use and to cache them for subsequent requests. This prevents repeated model downloads and avoids recomputing embeddings for every chat query. The underlying LLM is the TinyLlama 1.1B chat model distributed via Hugging Face in GGUF format. When the model file is not present locally it is downloaded automatically using ``huggingface_hub``. Depending on the environment the model will run on GPU if CUDA is available or fall back to CPU otherwise. See the ``init_llm`` function for details. Note: This module deliberately contains no references to OpenAI. It relies solely on open‑source libraries available on PyPI (such as ``llama‑cpp‑python`` and ``chromadb``) so that it can be used on Hugging Face Spaces without requiring proprietary API keys. """ from __future__ import annotations import os import threading from typing import List import numpy as np from langchain.text_splitter import RecursiveCharacterTextSplitter from sentence_transformers import SentenceTransformer import chromadb from chromadb.config import Settings from huggingface_hub import hf_hub_download try: from llama_cpp import Llama # type: ignore except Exception as exc: # pragma: no cover - import may fail until dependency installed # Provide a helpful error if llama_cpp isn't installed. raise ImportError( "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' " "to your requirements.txt" ) from exc # --------------------------------------------------------------------------- # Configuration # # Compute the absolute path to the chatbot knowledge base. We derive this # relative to this file so that the module works regardless of the working # directory. The project structure places ``chatbot.txt`` at # ``Codingo12/chatbot/chatbot.txt``. PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt") # Directory where Chroma will persist its database. This location is # writable on both local machines and Hugging Face Spaces. It is # intentionally distinct from the web app instance path to avoid # permission issues. CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma") # Settings for the TinyLlama model. These can be overridden via # environment variables if desired (for example to switch to a # different quantisation or to test with a smaller model). See # https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for # available filenames. LLAMA_REPO = os.getenv( "LLAMA_REPO", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", ) LLAMA_FILE = os.getenv( "LLAMA_FILE", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", ) # Local directory where the GGUF model file will be stored. Using # ``/tmp`` avoids writing into the read‑only repository filesystem on # Hugging Face Spaces. The directory will be created as needed. LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models") # Generation parameters. These values mirror those used in the # provided Jupyter notebook. They can be tweaked via environment # variables if necessary to trade off quality against speed. 
MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256")) TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7")) TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9")) REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15")) # Thread lock to guard lazy initialisation in multi‑threaded Flask # environments. Without this lock multiple concurrent requests may # attempt to download the model or populate the database at the same # time, leading to redundant work or race conditions. _init_lock = threading.Lock() # Global singletons for embedder, vector collection and LLM. These # variables are populated on first use and reused thereafter. _embedder: SentenceTransformer | None = None _collection: chromadb.Collection | None = None _llm: Llama | None = None def _load_chatbot_text() -> str: """Read the chatbot knowledge base from disk. If the file is missing, a small default description of Codingo is returned. This ensures the chatbot still provides a sensible answer rather than crashing. """ try: with open(CHATBOT_TXT_PATH, encoding="utf-8") as f: return f.read() except FileNotFoundError: # Fallback content if the knowledge base file is missing return ( "Codingo is an AI‑powered recruitment platform designed to " "streamline job applications, candidate screening and hiring. " "We make hiring smarter, faster and fairer through automation " "and intelligent recommendations." ) def init_embedder_and_db() -> None: """Initialise the SentenceTransformer embedder and Chroma vector DB. This function is idempotent: if the embedder and collection are already initialised it returns immediately. Otherwise it reads ``chatbot.txt``, splits it into overlapping chunks, computes embeddings and persists them to a Chroma collection. The resulting ``SentenceTransformer`` and collection objects are saved in global variables for later reuse. """ global _embedder, _collection if _embedder is not None and _collection is not None: return with _init_lock: if _embedder is not None and _collection is not None: return # Ensure persistence directory exists os.makedirs(CHROMA_DB_DIR, exist_ok=True) # Read knowledge base text = _load_chatbot_text() # Split into chunks; use double newlines to prefer splitting on # paragraph boundaries. Overlap helps the model maintain # context across neighbouring chunks. splitter = RecursiveCharacterTextSplitter( chunk_size=300, chunk_overlap=100, separators=["\n\n"], ) docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()] # Initialise embedder (MiniLM). We specify device via env. embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32) # Initialise Chroma client client = chromadb.Client( Settings( persist_directory=CHROMA_DB_DIR, anonymized_telemetry=False, is_persistent=True, ) ) # Create or get collection. This returns an existing collection if # already present on disk. collection = client.get_or_create_collection("codingo_chatbot") # Populate collection only if empty. A naive call to # ``collection.get(limit=1)`` may raise if the collection does # not exist yet, so we catch any exception and treat it as an # empty DB. Distances are stored as cosine similarity. 
        need_populate = False
        try:
            existing = collection.get(limit=1)
            if not existing or not existing.get("documents"):
                need_populate = True
        except Exception:
            need_populate = True
        if need_populate:
            ids = [f"doc_{i}" for i in range(len(docs))]
            collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
        _embedder = embedder
        _collection = collection


def init_llm() -> None:
    """Initialise the llama-cpp model for response generation.

    This function lazily downloads the GGUF model from Hugging Face if
    necessary and instantiates a ``llama_cpp.Llama`` object. The
    resulting instance is stored in the global ``_llm`` variable.

    To control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
    variable or override ``LLAMA_N_GPU_LAYERS``. By default we use one
    GPU layer when CUDA is available; otherwise the model runs on CPU.
    """
    global _llm
    if _llm is not None:
        return
    with _init_lock:
        if _llm is not None:
            return
        # Ensure the model directory exists
        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
        # Download the model if it is not already present
        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
        if not os.path.exists(local_path):
            # The file will be downloaded to LLAMA_LOCAL_DIR. Use
            # ``local_dir_use_symlinks=False`` to avoid creating
            # symlinks that may break on certain filesystems.
            local_path = hf_hub_download(
                repo_id=LLAMA_REPO,
                filename=LLAMA_FILE,
                local_dir=LLAMA_LOCAL_DIR,
                local_dir_use_symlinks=False,
            )
        # Determine GPU usage. We default to one GPU layer if CUDA
        # appears to be available. Users can override this via
        # LLAMA_N_GPU_LAYERS.
        try:
            import torch  # type: ignore
            use_cuda = torch.cuda.is_available()
        except Exception:
            use_cuda = False
        n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
        if n_gpu_layers_env:
            try:
                n_gpu_layers = int(n_gpu_layers_env)
            except ValueError:
                n_gpu_layers = 0
        else:
            n_gpu_layers = 1 if use_cuda else 0
        # Construct the Llama instance. The context window is set
        # generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
        # Use half the available CPU cores for inference threads to
        # balance responsiveness and resource use.
        try:
            n_threads = max(1, os.cpu_count() // 2)
        except Exception:
            n_threads = 2
        _llm = Llama(
            model_path=local_path,
            n_ctx=n_ctx,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
        )


def _build_prompt(query: str, context: str) -> str:
    """Construct the full prompt for the TinyLlama chat model.

    The prompt format follows the conventions used by the model as
    illustrated in the provided notebook. We include a system message
    instructing the model to answer only from the given context and to
    politely decline if the information is unavailable.
    """
    system_prompt = (
        "You are the official chatbot of Codingo. "
        "Answer ONLY by using the CONTEXT. "
        "If the information is not available to you, say so politely."
    )
    prompt = (
        f"<|system|>\n{system_prompt}\n"
        f"<|user|>\n{query}\n\nCONTEXT:\n{context}\n"
        f"<|assistant|>\n"
    )
    return prompt


def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
    """Return a chatbot response for the given query.

    This function performs the following steps:

    1. Ensures the embedder, vector database and LLM are initialised.
    2. Embeds the user's query and retrieves the top ``k`` most similar
       documents from the Chroma collection.
    3. Filters out documents whose cosine distance exceeds
       ``score_threshold`` (larger distances indicate less similarity).
    4. Builds a prompt containing the user query and the concatenated
       relevant context.
    5. Feeds the prompt to the TinyLlama model and returns its response,
       trimming trailing whitespace.

    If no relevant context is found, a fallback message is returned.
    """
    if not query or not query.strip():
        return "Please type a question about the Codingo platform."
    init_embedder_and_db()
    init_llm()
    assert _embedder is not None and _collection is not None and _llm is not None
    # Embed the query and search the collection
    query_vector = _embedder.encode([query])[0]
    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
    docs = results.get("documents", [[]])[0] if results else []
    distances = results.get("distances", [[]])[0] if results else []
    # Filter by distance score
    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
    if not relevant:
        return "Sorry, I don't have enough information to answer that question."
    context = "\n\n".join(relevant)
    prompt = _build_prompt(query, context)
    # Generate the completion. Stop at the end-of-sequence marker used by
    # TinyLlama's chat format (and at the start of a new user turn) so the
    # model does not ramble past its answer.
    output = _llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        repeat_penalty=REPEAT_PENALTY,
        stop=["</s>", "<|user|>"],
    )
    text = output["choices"][0]["text"].strip()
    return text or "I'm here to answer your questions about Codingo. What would you like to know?"
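
# ---------------------------------------------------------------------------
# Manual smoke test
#
# A minimal sketch of how the module is meant to be called from the web app;
# the question below is illustrative. Running this file directly exercises
# the lazy initialisation path end to end: it builds the Chroma collection,
# downloads the GGUF model on first run and prints a single answer.
if __name__ == "__main__":
    print(get_response("What is Codingo?"))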