husseinelsaadi committed on
Commit 46ecbc8 · 1 Parent(s): 987f59c
Files changed (1)
  1. backend/services/codingo_chatbot.py +141 -112
backend/services/codingo_chatbot.py CHANGED
@@ -4,9 +4,12 @@ codingo_chatbot.py
 
 This module encapsulates the logic for Codingo's website chatbot. It
 loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
-database using Chroma and SentenceTransformers, and uses a local LLM
-powered by ``llama-cpp-python`` to generate answers constrained to the
-retrieved context.
+database using Chroma and SentenceTransformers, and uses the shared
+Groq language model (imported from ``backend.services.interview_engine``)
+to generate answers constrained to the retrieved context. If a Groq API
+key is not configured, a lightweight dummy model will be used as a
+fallback. TinyLlama and other local models are no longer used in this
+module.
 """
 
 from __future__ import annotations
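Note: the shared model and its fallback live in backend.services.interview_engine, which is outside this diff. The following is a minimal sketch of the pattern the new docstring describes, assuming a DummyGroq fallback class and the langchain_groq ChatGroq wrapper (the factory name, model name, and environment variable here are illustrative, not taken from the repository):

import os

class DummyGroq:
    # Hypothetical stand-in used when no Groq API key is configured.
    def invoke(self, prompt: str) -> str:
        return "Offline mode: no Groq API key configured."

def make_llm():
    # Hypothetical factory mirroring the fallback behaviour this diff relies on.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return DummyGroq()
    from langchain_groq import ChatGroq
    return ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)

groq_llm = make_llm()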
@@ -21,37 +24,42 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 import chromadb
 from chromadb.config import Settings
-from huggingface_hub import hf_hub_download
 
+# Import the shared Groq LLM instance from the interview engine. This ensures
+# that the chatbot uses the exact same language model as the interview API.
+from backend.services.interview_engine import groq_llm
+
+# The llama_cpp dependency is no longer used for the chatbot. We keep the
+# import guarded to avoid breaking environments where llama_cpp is not
+# installed, but it is no longer required for generating responses.
 try:
-    from llama_cpp import Llama  # type: ignore
-except Exception as exc:  # pragma: no cover - import may fail until dependency installed
-    raise ImportError(
-        "llama_cpp is required for the chatbot. Please add 'llama-cpp-python' "
-        "to your requirements.txt"
-    ) from exc
+    from llama_cpp import Llama  # type: ignore  # noqa: F401
+except Exception:
+    # We don't raise here because the Groq LLM will be used instead. If
+    # llama_cpp is unavailable, it won't affect chatbot functionality.
+    Llama = None  # type: ignore
 
 # Configuration
 PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
 CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
 CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")
 
-# TinyLlama model settings
-LLAMA_REPO = os.getenv("LLAMA_REPO", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
-LLAMA_FILE = os.getenv("LLAMA_FILE", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
-LLAMA_LOCAL_DIR = os.path.join("/tmp", "llama_models")
-
-# Generation parameters - adjusted for better responses
-MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "512"))
-TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.3"))
-TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
-REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.1"))
+# Generation parameters for the Groq LLM. These values can be adjusted via
+# environment variables if desired. They loosely mirror the previous TinyLlama
+# settings but are applied when constructing prompts for the Groq LLM. Note
+# that Groq models internally determine sampling behaviour; these variables
+# mainly govern how much content we include in the prompt and do not directly
+# control the sampling temperature of the Groq API.
+MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "512"))  # kept for compatibility
+TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.3"))  # unused but retained
+TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))  # unused but retained
+REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.1"))  # unused
 
 # Thread lock and globals
 _init_lock = threading.Lock()
 _embedder: SentenceTransformer | None = None
 _collection: chromadb.Collection | None = None
-_llm: Llama | None = None
+_llm = None  # This will be set to the shared Groq LLM instance
 
 
 def _load_chatbot_text() -> str:
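Because the retained LLAMA_* parameters are read with os.getenv at module import time, overrides only take effect if they are exported before the module is first imported. A small illustration (the values are arbitrary):

import os

# Must be set before backend.services.codingo_chatbot is first imported.
os.environ["LLAMA_MAX_TOKENS"] = "256"

from backend.services import codingo_chatbot

print(codingo_chatbot.MAX_TOKENS)   # 256
print(codingo_chatbot.TEMPERATURE)  # 0.3 default; retained but unused by the Groq path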
@@ -136,135 +144,156 @@ def init_embedder_and_db() -> None:
 
 
 def init_llm() -> None:
-    """Initialize the llama-cpp model for response generation."""
+    """
+    Initialize the chatbot's language model. This function now assigns
+    the globally shared Groq LLM instance imported from the interview
+    engine. If the Groq API key is unavailable, the fallback dummy
+    model defined in the interview engine will be used automatically.
+    """
     global _llm
     if _llm is not None:
         return
     with _init_lock:
         if _llm is not None:
             return
-
-        os.makedirs(LLAMA_LOCAL_DIR, exist_ok=True)
-        local_path = os.path.join(LLAMA_LOCAL_DIR, LLAMA_FILE)
-
-        if not os.path.exists(local_path):
-            local_path = hf_hub_download(
-                repo_id=LLAMA_REPO,
-                filename=LLAMA_FILE,
-                local_dir=LLAMA_LOCAL_DIR,
-                local_dir_use_symlinks=False,
-            )
-
-        # GPU configuration
-        try:
-            import torch
-            use_cuda = torch.cuda.is_available()
-        except Exception:
-            use_cuda = False
-
-        n_gpu_layers = int(os.getenv("LLAMA_N_GPU_LAYERS", "35" if use_cuda else "0"))
-        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
-        n_threads = max(1, os.cpu_count() // 2) if os.cpu_count() else 4
-
-        _llm = Llama(
-            model_path=local_path,
-            n_ctx=n_ctx,
-            n_threads=n_threads,
-            n_gpu_layers=n_gpu_layers,
-            verbose=False,  # Reduce logging
-        )
+        # Assign the shared Groq LLM instance. This may be a DummyGroq when
+        # no API key is provided. We avoid loading any local GGUF models.
+        _llm = groq_llm
 
 
 def _build_prompt(query: str, context: str) -> str:
-    """Construct a natural prompt for the TinyLlama chat model."""
-    # Use a more direct, conversational system prompt
+    """
+    Construct a prompt for the Groq LLM. The prompt instructs the model to
+    behave as LUNA, Codingo's friendly assistant. It emphasises using only
+    information from the provided context to answer the question and
+    encourages the model to admit when the answer is unknown. This plain
+    format works well with ChatGroq's ``invoke`` API.
+
+    Args:
+        query: The user's question.
+        context: Concatenated snippets from the knowledge base deemed
+            relevant to the query.
+
+    Returns:
+        A formatted string prompt ready for submission to the Groq LLM.
+    """
     system_prompt = (
-        "You are LUNA, a friendly AI assistant for the Codingo recruitment platform. "
-        "Answer questions naturally and conversationally. Use the provided information "
-        "to give helpful, direct answers. Keep responses concise and relevant."
+        "You are LUNA, the friendly AI assistant for the Codingo recruitment "
+        "platform. You only answer questions using the information provided "
+        "in the context below. If the context does not contain the answer, "
+        "respond politely that you don't know. Keep your answers concise and "
+        "helpful."
     )
-
-    # Build the prompt with context integrated naturally
+
     if context:
-        prompt = (
-            f"<|system|>\n{system_prompt}</s>\n"
-            f"<|user|>\nContext: {context}\n\n"
-            f"Question: {query}</s>\n"
-            f"<|assistant|>\n"
+        return (
+            f"{system_prompt}\n\n"
+            f"Context:\n{context}\n\n"
+            f"Question: {query}\n"
+            f"Answer:"
         )
     else:
-        prompt = (
-            f"<|system|>\n{system_prompt}</s>\n"
-            f"<|user|>\n{query}</s>\n"
-            f"<|assistant|>\n"
+        # When no context is available, still pass an empty context so the
+        # model knows there is no supporting information.
+        return (
+            f"{system_prompt}\n\n"
+            "Context:\n\n"
+            f"Question: {query}\n"
+            f"Answer:"
        )
-
-    return prompt
 
 
 def get_response(query: str, k: int = 3, score_threshold: float = 1.5) -> str:
-    """Return a chatbot response for the given query."""
+    """
+    Generate a response to the user's query using the shared Groq LLM and the
+    chatbot's knowledge base. The function retrieves relevant context
+    passages from the vector store, constructs a prompt instructing the
+    model to answer as LUNA using only that context, and returns the
+    resulting answer. If no context is available, a polite fallback
+    message is returned without calling the LLM.
+
+    Args:
+        query: The user's question or statement.
+        k: Number of nearest neighbour documents to retrieve from the
+            knowledge base (default 3).
+        score_threshold: Maximum distance for a document to be considered
+            relevant (smaller means more similar).
+
+    Returns:
+        A string response appropriate for the chatbot UI.
+    """
+    # Handle empty queries gracefully
     if not query or not query.strip():
         return "Hi! I'm LUNA, your Codingo assistant. How can I help you today?"
-
+
+    # Initialise embedder, vector DB and LLM if necessary
     init_embedder_and_db()
     init_llm()
-
-    assert _embedder is not None and _collection is not None and _llm is not None
-
-    # Handle greetings directly
+
+    # If embedder or collection or LLM didn't initialise, provide a safe fallback
+    if _embedder is None or _collection is None or _llm is None:
+        return "I'm sorry, I'm unable to process your request right now. Please try again later."
+
+    # Normalise for simple greetings
     greetings = ['hi', 'hello', 'hey', 'good morning', 'good afternoon', 'good evening']
     if query.lower().strip() in greetings:
         return "Hello! I'm LUNA, your AI assistant for Codingo. How can I help you with our recruitment platform today?"
-
-    # Embed query and search
+
+    # Embed query and search for relevant documents
    query_vector = _embedder.encode([query])[0]
     results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
-
+
     docs = results.get("documents", [[]])[0] if results else []
     distances = results.get("distances", [[]])[0] if results else []
-
-    # Filter by score (lower threshold for better matching)
+
+    # Filter by distance threshold
     relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
-
+
+    # If no relevant context is found, politely admit ignorance
     if not relevant:
-        # Provide a helpful response even without specific context
         return (
-            "I don't have specific information about that in my knowledge base. "
-            "However, I can tell you that Codingo is an AI-powered recruitment platform "
-            "that helps with job applications, candidate screening, and hiring. "
-            "Would you like to know more about our features?"
+            "I'm sorry, I don't know the answer to that question based on my knowledge. "
+            "Could you ask something else about Codingo or its services?"
         )
-
-    # Join context with better formatting
-    context = " ".join(relevant[:2])  # Use top 2 most relevant chunks
+
+    # Concatenate the most relevant passages for context (use top 2)
+    context = "\n\n".join(relevant[:2])
     prompt = _build_prompt(query, context)
-
-    # Generate response with better parameters
-    output = _llm(
-        prompt,
-        max_tokens=MAX_TOKENS,
-        temperature=TEMPERATURE,
-        top_p=TOP_P,
-        repeat_penalty=REPEAT_PENALTY,
-        stop=["</s>", "<|user|>", "<|system|>"],
-        echo=False,
-    )
-
-    # Extract and clean the response
-    text = output["choices"][0]["text"].strip()
-
-    # Remove any meta-descriptions that might have leaked through
+
+    try:
+        # Invoke the Groq LLM. The ``invoke`` method may return an object
+        # with a ``content`` attribute or a plain string, depending on the
+        # backend. We handle both cases transparently.
+        response = _llm.invoke(prompt)
+    except Exception:
+        # If invocation fails, return a generic error message
+        return "I'm sorry, I encountered an error while generating a response. Please try again later."
+
+    # Extract text from the LLM response
+    if hasattr(response, 'content'):
+        text = str(response.content).strip()
+    elif isinstance(response, dict):
+        # Some wrappers may return dicts (e.g. ChatCompletion). Try common keys.
+        text = response.get('message', '') or response.get('text', '') or str(response)
+        text = text.strip()
+    else:
+        text = str(response).strip()
+
+    # Post-process the answer: remove unwanted phrases referring to the bot
     lines = text.split('\n')
     cleaned_lines = []
     for line in lines:
-        if any(phrase in line.lower() for phrase in [
+        lower_line = line.lower()
+        if any(phrase in lower_line for phrase in [
             'the chatbot', 'this bot', 'the bot provides',
-            'in response to', 'overall,'
+            'in response to', 'overall,',
+            'as an ai language model'
         ]):
             continue
         cleaned_lines.append(line)
-
-    text = '\n'.join(cleaned_lines).strip()
-
-    return text or "I'm here to help you with Codingo. Could you please rephrase your question?"
+    cleaned_text = '\n'.join(cleaned_lines).strip()
+
+    # Ensure we return some meaningful text
+    return cleaned_text or (
+        "I'm sorry, I couldn't generate a proper response. Could you rephrase your question?"
+    )
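With these changes the flow is: retrieve from Chroma, build a plain-text prompt, call invoke on the shared Groq LLM, then normalise and clean the reply. A quick smoke test of the public entry point, assuming chatbot/chatbot.txt is present (without a GROQ_API_KEY the dummy fallback answers instead; the sample questions are illustrative):

from backend.services.codingo_chatbot import get_response

print(get_response(""))       # built-in greeting for empty input
print(get_response("hey"))    # greeting short-circuit, no LLM call
print(get_response("What is Codingo?"))  # retrieval + Groq answer
print(get_response("How does screening work?", k=5, score_threshold=1.0))  # wider search, stricter relevance cut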