"""
codingo_chatbot.py
===================

This module encapsulates the logic for Codingo's website chatbot.  It
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
database using Chroma and SentenceTransformers, and uses a local LLM
powered by ``llama‑cpp‑python`` to generate answers constrained to the
retrieved context.  The code is written to initialise all heavy
resources lazily on first use and to cache them for subsequent
requests.  This prevents repeated model downloads and avoids
recomputing embeddings for every chat query.

The underlying LLM is the TinyLlama 1.1B chat model distributed via
Hugging Face in GGUF format.  When the model file is not present
locally it is downloaded automatically using ``huggingface_hub``.
Depending on the environment the model will run on GPU if CUDA is
available or fall back to CPU otherwise.  See the ``init_llm``
function for details.

Note: This module deliberately contains no references to OpenAI.  It
relies solely on open‑source libraries available on PyPI (such as
``llama‑cpp‑python`` and ``chromadb``) so that it can be used on
Hugging Face Spaces without requiring proprietary API keys.
"""

from __future__ import annotations

import os
import threading
from typing import List

import numpy as np

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from huggingface_hub import hf_hub_download

try:
    from llama_cpp import Llama  # type: ignore
except Exception as exc:  # pragma: no cover - import may fail until dependency installed
    # Provide a helpful error if llama_cpp isn't installed.
    raise ImportError(
        "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
        "to your requirements.txt"
    ) from exc

# ---------------------------------------------------------------------------
# Configuration
#
# Compute the absolute path to the chatbot knowledge base.  We derive this
# relative to this file so that the module works regardless of the working
# directory.  The project structure places ``chatbot.txt`` at
# ``Codingo12/chatbot/chatbot.txt``.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")

# Directory where Chroma will persist its database.  This location is
# writable on both local machines and Hugging Face Spaces.  It is
# intentionally distinct from the web app instance path to avoid
# permission issues.
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")

# Settings for the TinyLlama model.  These can be overridden via
# environment variables if desired (for example to switch to a
# different quantisation or to test with a smaller model).  See
# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
# available filenames.
LLAMA_REPO = os.getenv(
    "LLAMA_REPO",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
)
LLAMA_FILE = os.getenv(
    "LLAMA_FILE",
    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)

# Local directory where the GGUF model file will be stored.  Using
# ``/tmp`` avoids writing into the read‑only repository filesystem on
# Hugging Face Spaces.  The directory will be created as needed.
LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")

# Generation parameters.  These values mirror those used in the
# provided Jupyter notebook.  They can be tweaked via environment
# variables if necessary to trade off quality against speed.
MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))
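# For example, exporting LLAMA_MAX_TOKENS=128 and LLAMA_TEMPERATURE=0.2
# before launching the app should produce shorter, more deterministic
# answers; these values are illustrative rather than tuned defaults.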

# Thread lock to guard lazy initialisation in multi‑threaded Flask
# environments.  Without this lock multiple concurrent requests may
# attempt to download the model or populate the database at the same
# time, leading to redundant work or race conditions.
_init_lock = threading.Lock()

# Global singletons for embedder, vector collection and LLM.  These
# variables are populated on first use and reused thereafter.
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
_llm: Llama | None = None


def _load_chatbot_text() -> str:
    """Read the chatbot knowledge base from disk.

    If the file is missing, a small default description of Codingo is
    returned.  This ensures the chatbot still provides a sensible
    answer rather than crashing.
    """
    try:
        with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # Fallback content if the knowledge base file is missing
        return (
            "Codingo is an AI‑powered recruitment platform designed to "
            "streamline job applications, candidate screening and hiring. "
            "We make hiring smarter, faster and fairer through automation "
            "and intelligent recommendations."
        )


def init_embedder_and_db() -> None:
    """Initialise the SentenceTransformer embedder and Chroma vector DB.

    This function is idempotent: if the embedder and collection are
    already initialised it returns immediately.  Otherwise it reads
    ``chatbot.txt``, splits it into overlapping chunks, computes
    embeddings and persists them to a Chroma collection.  The
    resulting ``SentenceTransformer`` and collection objects are saved
    in global variables for later reuse.
    """
    global _embedder, _collection
    if _embedder is not None and _collection is not None:
        return
    with _init_lock:
        if _embedder is not None and _collection is not None:
            return
        # Ensure persistence directory exists
        os.makedirs(CHROMA_DB_DIR, exist_ok=True)

        # Read knowledge base
        text = _load_chatbot_text()

        # Split into chunks; use double newlines to prefer splitting on
        # paragraph boundaries.  Overlap helps the model maintain
        # context across neighbouring chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=100,
            separators=["\n\n"],
        )
        docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]

        # Initialise embedder (all-MiniLM-L6-v2).  SentenceTransformers
        # selects CUDA automatically when available and falls back to CPU.
        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)

        # Initialise Chroma client
        client = chromadb.Client(
            Settings(
                persist_directory=CHROMA_DB_DIR,
                anonymized_telemetry=False,
                is_persistent=True,
            )
        )

        # Create or get collection.  This returns an existing collection if
        # already present on disk.
        collection = client.get_or_create_collection("codingo_chatbot")

        # Populate collection only if empty.  A naive call to
        # ``collection.get(limit=1)`` may raise if the collection does
        # not exist yet, so we catch any exception and treat it as an
        # empty DB.  Query distances use Chroma's default metric
        # (squared L2), so smaller values mean more similar documents.
        need_populate = False
        try:
            existing = collection.get(limit=1)
            if not existing or not existing.get("documents"):
                need_populate = True
        except Exception:
            need_populate = True
        if need_populate:
            ids = [f"doc_{i}" for i in range(len(docs))]
            collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
        _embedder = embedder
        _collection = collection


def init_llm() -> None:
    """Initialise the llama‑cpp model for response generation.

    This function lazily downloads the GGUF model from Hugging Face if
    necessary and instantiates a ``llama_cpp.Llama`` object.  The
    resulting instance is stored in the global ``_llm`` variable.  To
    control GPU usage set the ``CUDA_VISIBLE_DEVICES`` environment
    variable or override ``LLAMA_N_GPU_LAYERS``.  By default we use one
    GPU layer when CUDA is available, otherwise the model runs on CPU.
    """
    global _llm
    if _llm is not None:
        return
    with _init_lock:
        if _llm is not None:
            return
        # Ensure the model directory exists
        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
        # Download model if not already present
        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
        if not os.path.exists(local_path):
            # The file will be downloaded to LLAMA_LOCAL_DIR.  Use
            # ``local_dir_use_symlinks=False`` to avoid creating
            # symlinks that may break on certain filesystems.
            local_path = hf_hub_download(
                repo_id=LLAMA_REPO,
                filename=LLAMA_FILE,
                local_dir=LLAMA_LOCAL_DIR,
                local_dir_use_symlinks=False,
            )
        # Determine GPU usage.  We default to one GPU layer if CUDA
        # appears available.  Users can override via LLAMA_N_GPU_LAYERS.
        try:
            import torch  # type: ignore
            use_cuda = torch.cuda.is_available()
        except Exception:
            use_cuda = False
        n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
        if n_gpu_layers_env:
            try:
                n_gpu_layers = int(n_gpu_layers_env)
            except ValueError:
                n_gpu_layers = 0
        else:
            n_gpu_layers = 1 if use_cuda else 0
        # Construct the Llama instance.  The context window is set
        # generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
        # Use half the available CPU cores for inference threads to
        # balance responsiveness and resource use.
        try:
            n_threads = max(1, os.cpu_count() // 2)
        except Exception:
            n_threads = 2
        _llm = Llama(
            model_path=local_path,
            n_ctx=n_ctx,
            n_threads=n_threads,
            n_gpu_layers=n_gpu_layers,
        )


def _build_prompt(query: str, context: str) -> str:
    """Construct the full prompt for the TinyLlama chat model.

    The prompt format follows the conventions used by the model as
    illustrated in the provided notebook.  We include a system message
    instructing the model to answer only using the given context and to
    politely decline if the information is unavailable.
    """
    system_prompt = (
        "You are the official chatbot of Codingo. "
        "Answer ONLY by using the CONTEXT. "
        "If the information is not available for you, say it politely."
    )
    prompt = (
        f"<|system|>\n{system_prompt}</s>\n"
        f"<|user|>\n{query}\n\nCONTEXTE:\n{context}</s>\n"
        f"<|assistant|>\n"
    )
    return prompt

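# For illustration only (hypothetical query and context, not drawn from
# the knowledge base), the rendered prompt is expected to look like:
#
#   <|system|>
#   You are the official chatbot of Codingo. ...</s>
#   <|user|>
#   What is Codingo?
#
#   CONTEXT:
#   Codingo is an AI-powered recruitment platform ...</s>
#   <|assistant|>
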

def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
    """Return a chatbot response for the given query.

    This function performs the following steps:

    1. Ensures the embedder, vector database and LLM are initialised.
    2. Embeds the user's query and retrieves the top ``k`` most
       similar documents from the Chroma collection.
    3. Filters out documents whose distance (squared L2, Chroma's
       default metric) exceeds ``score_threshold``; larger distances
       indicate less similarity.
    4. Builds a prompt containing the user query and the concatenated
       relevant context.
    5. Feeds the prompt to the TinyLlama model and returns its
       response, trimming trailing whitespace.

    If no relevant context is found, a fallback message is returned.
    """
    if not query or not query.strip():
        return "Please type a question about the Codingo platform."
    init_embedder_and_db()
    init_llm()
    assert _embedder is not None and _collection is not None and _llm is not None
    # Embed query and search collection
    query_vector = _embedder.encode([query])[0]
    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
    docs = results.get("documents", [[]])[0] if results else []
    distances = results.get("distances", [[]])[0] if results else []
    # Filter by score
    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
    if not relevant:
        return "Sorry, I don't have enough information to answer that question."
    context = "\n\n".join(relevant)
    prompt = _build_prompt(query, context)
    # Generate completion
    output = _llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        repeat_penalty=REPEAT_PENALTY,
        stop=["</s>"]
    )
    text = output["choices"][0]["text"].strip()
    return text or "I'm here to answer your questions about Codingo. What would you like to know?"
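

# ---------------------------------------------------------------------------
# Manual smoke test
#
# A minimal sketch for exercising the module from the command line, assuming
# ``chatbot.txt`` is readable and the GGUF model can be downloaded in the
# current environment.  The question is illustrative only.
if __name__ == "__main__":
    print(get_response("What is Codingo?"))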