husseinelsaadi committed on
Commit fb236cf · 1 Parent(s): 7502aed

chatbot updated

Files changed (4)
  1. Dockerfile +2 -0
  2. app.py +30 -29
  3. backend/services/codingo_chatbot.py +319 -0
  4. requirements.txt +5 -2
Dockerfile CHANGED
@@ -5,6 +5,8 @@ FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
     python3 python3-pip ffmpeg git libsndfile1 \
+    # Development tools required to compile native extensions such as llama-cpp-python
+    build-essential cmake libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*

 # Set up Python environment
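Note: ``llama-cpp-python`` is typically compiled from source at install time on a base image like this, which is why the toolchain above is needed. A minimal, hypothetical sanity check (not part of this commit) to run inside the built image:

    # smoke_test_llama_build.py -- hypothetical; fails at import time if the
    # native extension did not compile against the tools installed above.
    import llama_cpp

    print("llama-cpp-python version:", llama_cpp.__version__)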
app.py CHANGED
@@ -32,27 +32,34 @@ import re
 import json

 # -----------------------------------------------------------------------------
-# Chatbot setup
+# Chatbot integration
 #
-# The chatbot uses a local vector database (Chroma) to search the
-# ``chatbot/chatbot.txt`` knowledge base. Retrieved passages are fed to
-# a lightweight conversational model from Hugging Face. To avoid the
-# expensive model and database initialisation on every request, embeddings
-# and the vector collection are loaded lazily the first time a chat query
-# is processed. Subsequent requests reuse the same global objects. All
-# chatbot logic resides in ``chatbot/chatbot.py``.
-
-# Paths for the chatbot knowledge base and persistent vector store. We
-# compute these relative to the current file so that the app can be deployed
-# anywhere without needing to change configuration. The ``chroma_db``
-# directory will be created automatically by the Chroma client if it does not
-# exist.
-# The internal chatbot logic has been extracted to ``chatbot/chatbot.py``. See
-# that module for details. We import the ``get_chatbot_response`` function
-# here so that the Flask route can delegate queries directly to it. This
-# prevents ``app.py`` from depending on the heavy ML libraries and keeps
-# the application entry point lean.
-from chatbot.chatbot import get_chatbot_response
+# We delegate all chatbot logic to the ``codingo_chatbot`` module in
+# ``backend/services``. That module handles loading the knowledge base,
+# building embeddings, initialising the TinyLlama model and generating
+# responses. It initialises its heavy resources lazily, so importing it
+# here stays cheap. See ``backend/services/codingo_chatbot.py`` for
+# implementation details.
+
+from backend.services.codingo_chatbot import get_response as _codingo_get_response, init_embedder_and_db, init_llm
+
+
+def get_chatbot_response(query: str) -> str:
+    """Proxy to the codingo_chatbot implementation.
+
+    This function preserves the original public API of
+    ``app.get_chatbot_response`` while redirecting calls to the new
+    implementation. It catches any exceptions and returns a
+    user-friendly message, ensuring the Flask route never raises.
+    """
+    try:
+        return _codingo_get_response(query)
+    except Exception as exc:
+        print(f"Chatbot error: {exc}", file=sys.stderr)
+        return (
+            "I'm having trouble processing your request. Please try again or ask "
+            "about Codingo's features, job matching, or how to use the platform."
+        )

 # Initialize Flask app
 app = Flask(
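For context, a hypothetical sketch of how a JSON chat route would sit on top of this proxy (the actual route lives elsewhere in ``app.py`` and is not part of this hunk). Since ``get_chatbot_response`` never raises, the route needs no error handling of its own:

    # Hypothetical route wiring; assumes the ``app`` object and the
    # ``get_chatbot_response`` proxy defined above.
    from flask import request, jsonify

    @app.route("/api/chatbot", methods=["POST"])
    def chatbot_endpoint():
        payload = request.get_json(silent=True) or {}
        reply = get_chatbot_response(payload.get("message", ""))
        return jsonify({"response": reply})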
 
@@ -348,17 +355,11 @@ if __name__ == '__main__':

     with app.app_context():
         db.create_all()
-        # Pre-initialize the chatbot on startup for faster first response. We
-        # deliberately trigger a dummy query here to force loading of the
-        # sentence encoder, vector store and conversational model. Any
-        # exceptions during warm-up are logged but do not stop the app from
-        # starting.
+        # Pre-initialize the chatbot on startup for a faster first response
         print("Initializing chatbot...")
         try:
-            # Import inside the block to ensure the module has been
-            # properly loaded with the current environment settings.
-            from chatbot.chatbot import get_chatbot_response
-            _ = get_chatbot_response("Hello!")
+            init_embedder_and_db()
+            init_llm()
             print("Chatbot initialized successfully")
         except Exception as e:
             print(f"Chatbot initialization warning: {e}")
backend/services/codingo_chatbot.py ADDED
@@ -0,0 +1,319 @@
+"""
+codingo_chatbot.py
+==================
+
+This module encapsulates the logic for Codingo's website chatbot. It
+loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
+database using Chroma and SentenceTransformers, and uses a local LLM
+powered by ``llama-cpp-python`` to generate answers constrained to the
+retrieved context. The code is written to initialise all heavy
+resources lazily on first use and to cache them for subsequent
+requests. This prevents repeated model downloads and avoids
+recomputing embeddings for every chat query.
+
+The underlying LLM is the TinyLlama 1.1B chat model distributed via
+Hugging Face in GGUF format. When the model file is not present
+locally it is downloaded automatically using ``huggingface_hub``.
+Depending on the environment, the model runs on the GPU if CUDA is
+available and falls back to the CPU otherwise. See the ``init_llm``
+function for details.
+
+Note: This module deliberately contains no references to OpenAI. It
+relies solely on open-source libraries available on PyPI (such as
+``llama-cpp-python`` and ``chromadb``) so that it can be used on
+Hugging Face Spaces without requiring proprietary API keys.
+"""
+
+from __future__ import annotations
+
+import os
+import threading
+from typing import List
+
+import numpy as np
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+import chromadb
+from chromadb.config import Settings
+from huggingface_hub import hf_hub_download
+
+try:
+    from llama_cpp import Llama  # type: ignore
+except Exception as exc:  # pragma: no cover - import may fail until dependency installed
+    # Provide a helpful error if llama_cpp isn't installed.
+    raise ImportError(
+        "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
+        "to your requirements.txt"
+    ) from exc
+
+# ---------------------------------------------------------------------------
+# Configuration
+#
+# Compute the absolute path to the chatbot knowledge base. We derive this
+# relative to this file so that the module works regardless of the working
+# directory. The project structure places ``chatbot.txt`` at
+# ``Codingo12/chatbot/chatbot.txt``.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
+
+# Directory where Chroma will persist its database. This location is
+# writable on both local machines and Hugging Face Spaces. It is
+# intentionally distinct from the web app instance path to avoid
+# permission issues.
+CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")
+
+# Settings for the TinyLlama model. These can be overridden via
+# environment variables if desired (for example to switch to a
+# different quantisation or to test with a smaller model). See
+# https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF for
+# available filenames.
+LLAMA_REPO = os.getenv(
+    "LLAMA_REPO",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+)
+LLAMA_FILE = os.getenv(
+    "LLAMA_FILE",
+    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+)
+
+# Local directory where the GGUF model file will be stored. Using
+# ``/tmp`` avoids writing into the read-only repository filesystem on
+# Hugging Face Spaces. The directory will be created as needed.
+LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")
+
+# Generation parameters. These values mirror those used in the
+# provided Jupyter notebook. They can be tweaked via environment
+# variables if necessary to trade off quality against speed.
+MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "256"))
+TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.7"))
+TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
+REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.15"))
+
+# Thread lock to guard lazy initialisation in multi-threaded Flask
+# environments. Without this lock, multiple concurrent requests may
+# attempt to download the model or populate the database at the same
+# time, leading to redundant work or race conditions.
+_init_lock = threading.Lock()
+
+# Global singletons for the embedder, vector collection and LLM. These
+# variables are populated on first use and reused thereafter.
+_embedder: SentenceTransformer | None = None
+_collection: chromadb.Collection | None = None
+_llm: Llama | None = None
+
+
+def _load_chatbot_text() -> str:
+    """Read the chatbot knowledge base from disk.
+
+    If the file is missing, a small default description of Codingo is
+    returned. This ensures the chatbot still provides a sensible
+    answer rather than crashing.
+    """
+    try:
+        with open(CHATBOT_TXT_PATH, encoding="utf-8") as f:
+            return f.read()
+    except FileNotFoundError:
+        # Fallback content if the knowledge base file is missing
+        return (
+            "Codingo is an AI-powered recruitment platform designed to "
+            "streamline job applications, candidate screening and hiring. "
+            "We make hiring smarter, faster and fairer through automation "
+            "and intelligent recommendations."
+        )
+
+
+def init_embedder_and_db() -> None:
+    """Initialise the SentenceTransformer embedder and Chroma vector DB.
+
+    This function is idempotent: if the embedder and collection are
+    already initialised it returns immediately. Otherwise it reads
+    ``chatbot.txt``, splits it into overlapping chunks, computes
+    embeddings and persists them to a Chroma collection. The
+    resulting ``SentenceTransformer`` and collection objects are saved
+    in global variables for later reuse.
+    """
+    global _embedder, _collection
+    if _embedder is not None and _collection is not None:
+        return
+    with _init_lock:
+        if _embedder is not None and _collection is not None:
+            return
+        # Ensure the persistence directory exists
+        os.makedirs(CHROMA_DB_DIR, exist_ok=True)
+
+        # Read the knowledge base
+        text = _load_chatbot_text()
+
+        # Split into chunks; use double newlines to prefer splitting on
+        # paragraph boundaries. Overlap helps the model maintain
+        # context across neighbouring chunks.
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=300,
+            chunk_overlap=100,
+            separators=["\n\n"],
+        )
+        docs: List[str] = [doc.strip() for doc in splitter.split_text(text) if doc.strip()]
+
+        # Initialise the embedder (MiniLM).
+        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+        embeddings = embedder.encode(docs, show_progress_bar=False, batch_size=32)
+
+        # Initialise the Chroma client
+        client = chromadb.Client(
+            Settings(
+                persist_directory=CHROMA_DB_DIR,
+                anonymized_telemetry=False,
+                is_persistent=True,
+            )
+        )
+
+        # Create or get the collection. This returns an existing collection
+        # if one is already present on disk.
+        collection = client.get_or_create_collection("codingo_chatbot")
+
+        # Populate the collection only if it is empty. A naive call to
+        # ``collection.get(limit=1)`` may raise if the collection does
+        # not exist yet, so we catch any exception and treat it as an
+        # empty DB.
+        need_populate = False
+        try:
+            existing = collection.get(limit=1)
+            if not existing or not existing.get("documents"):
+                need_populate = True
+        except Exception:
+            need_populate = True
+        if need_populate:
+            ids = [f"doc_{i}" for i in range(len(docs))]
+            collection.add(documents=docs, embeddings=embeddings.tolist(), ids=ids)
+        _embedder = embedder
+        _collection = collection
+
+
+def init_llm() -> None:
+    """Initialise the llama-cpp model for response generation.
+
+    This function lazily downloads the GGUF model from Hugging Face if
+    necessary and instantiates a ``llama_cpp.Llama`` object. The
+    resulting instance is stored in the global ``_llm`` variable. To
+    control GPU usage, set the ``CUDA_VISIBLE_DEVICES`` environment
+    variable or override ``LLAMA_N_GPU_LAYERS``. By default we use one
+    GPU layer when CUDA is available; otherwise the model runs on CPU.
+    """
+    global _llm
+    if _llm is not None:
+        return
+    with _init_lock:
+        if _llm is not None:
+            return
+        # Ensure the model directory exists
+        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
+        # Download the model if it is not already present
+        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
+        if not os.path.exists(local_path):
+            # The file will be downloaded to LLAMA_LOCAL_DIR. Use
+            # ``local_dir_use_symlinks=False`` to avoid creating
+            # symlinks that may break on certain filesystems.
+            local_path = hf_hub_download(
+                repo_id=LLAMA_REPO,
+                filename=LLAMA_FILE,
+                local_dir=LLAMA_LOCAL_DIR,
+                local_dir_use_symlinks=False,
+            )
+        # Determine GPU usage. We default to one GPU layer if CUDA
+        # appears available. Users can override via LLAMA_N_GPU_LAYERS.
+        try:
+            import torch  # type: ignore
+            use_cuda = torch.cuda.is_available()
+        except Exception:
+            use_cuda = False
+        n_gpu_layers_env = os.getenv("LLAMA_N_GPU_LAYERS")
+        if n_gpu_layers_env:
+            try:
+                n_gpu_layers = int(n_gpu_layers_env)
+            except ValueError:
+                n_gpu_layers = 0
+        else:
+            n_gpu_layers = 1 if use_cuda else 0
+        # Construct the Llama instance. The context window is set
+        # generously to 2048 tokens; adjust via LLAMA_N_CTX if needed.
+        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
+        # Use half the available CPU cores for inference threads to
+        # balance responsiveness and resource use.
+        try:
+            n_threads = max(1, os.cpu_count() // 2)
+        except Exception:
+            n_threads = 2
+        _llm = Llama(
+            model_path=local_path,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_gpu_layers=n_gpu_layers,
+        )
+
+
+def _build_prompt(query: str, context: str) -> str:
+    """Construct the full prompt for the TinyLlama chat model.
+
+    The prompt format follows the conventions used by the model as
+    illustrated in the provided notebook. We include a system message
+    instructing the model to answer only using the given context and to
+    politely decline if the information is unavailable.
+    """
+    system_prompt = (
+        "You are the official chatbot of Codingo. "
+        "Answer ONLY by using the CONTEXT. "
+        "If the information is not available, say so politely."
+    )
+    prompt = (
+        f"<|system|>\n{system_prompt}</s>\n"
+        f"<|user|>\n{query}\n\nCONTEXT:\n{context}</s>\n"
+        f"<|assistant|>\n"
+    )
+    return prompt
+
+
+def get_response(query: str, k: int = 3, score_threshold: float = 2.0) -> str:
+    """Return a chatbot response for the given query.
+
+    This function performs the following steps:
+
+    1. Ensures the embedder, vector database and LLM are initialised.
+    2. Embeds the user's query and retrieves the top ``k`` most
+       similar documents from the Chroma collection.
+    3. Filters out documents whose distance exceeds
+       ``score_threshold`` (larger distances indicate less similarity).
+    4. Builds a prompt containing the user query and the concatenated
+       relevant context.
+    5. Feeds the prompt to the TinyLlama model and returns its
+       response, trimming trailing whitespace.
+
+    If no relevant context is found, a fallback message is returned.
+    """
+    if not query or not query.strip():
+        return "Please type a question about the Codingo platform."
+    init_embedder_and_db()
+    init_llm()
+    assert _embedder is not None and _collection is not None and _llm is not None
+    # Embed the query and search the collection
+    query_vector = _embedder.encode([query])[0]
+    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
+    docs = results.get("documents", [[]])[0] if results else []
+    distances = results.get("distances", [[]])[0] if results else []
+    # Keep only documents whose distance is below the threshold
+    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
+    if not relevant:
+        return "Sorry, I don't have enough information to answer that question."
+    context = "\n\n".join(relevant)
+    prompt = _build_prompt(query, context)
+    # Generate the completion
+    output = _llm(
+        prompt,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        repeat_penalty=REPEAT_PENALTY,
+        stop=["</s>"],
+    )
+    text = output["choices"][0]["text"].strip()
+    return text or "I'm here to answer your questions about Codingo. What would you like to know?"
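Everything tunable in the module is read from environment variables, so a local smoke test needs no code changes. A hypothetical example, assuming ``chatbot/chatbot.txt`` exists and ``/tmp`` has room for the roughly 0.7 GB GGUF file:

    # Hypothetical smoke test; every variable below is one the module
    # actually reads via os.getenv.
    import os

    os.environ["LLAMA_N_GPU_LAYERS"] = "0"   # force CPU-only inference
    os.environ["LLAMA_N_CTX"] = "1024"       # smaller context window
    os.environ["LLAMA_MAX_TOKENS"] = "128"   # shorter test answers

    from backend.services.codingo_chatbot import get_response

    print(get_response("What is Codingo?"))
    print(get_response("How does candidate screening work?"))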
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+
 flask
 flask_login
 flask_sqlalchemy
@@ -55,5 +56,7 @@ pydub>=0.25.1
 requests>=2.31.0

 # Additional dependencies for improved chatbot functionality
-# Note: We're using DialoGPT which requires transformers (already included above)
-# No OpenAI dependency needed - using Hugging Face models instead
+# Note: The chatbot now uses a local Llama model via ``llama-cpp-python``.
+# We include the dependency here so that it is installed on Hugging Face
+# Spaces. The version is pinned for reproducibility and compatibility.
+llama-cpp-python==0.2.27
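Because the prompt template and ``Llama`` constructor arguments can shift between releases, the pin is load-bearing. A hypothetical fail-fast guard one could add at startup:

    # Hypothetical guard (not in the commit): refuse to start if the
    # installed llama-cpp-python drifts from the pinned version.
    import llama_cpp

    PINNED = "0.2.27"
    if llama_cpp.__version__ != PINNED:
        raise RuntimeError(
            f"llama-cpp-python {llama_cpp.__version__} installed, expected {PINNED}"
        )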