Commit 46ecbc8
Parent(s): 987f59c
updated

backend/services/codingo_chatbot.py  +141 -112  CHANGED
Original version (lines removed by this commit are marked "-"; removed lines whose text was not captured are shown as bare "-"):

@@ -4,9 +4,12 @@ codingo_chatbot.py

This module encapsulates the logic for Codingo's website chatbot. It
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
-database using Chroma and SentenceTransformers, and uses
-
-retrieved context.
"""

from __future__ import annotations

@@ -21,37 +24,42 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
-from huggingface_hub import hf_hub_download

try:
-    from llama_cpp import Llama  # type: ignore
-except Exception
-    raise
-
-
-    ) from exc

# Configuration
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")

-#
-
-
-
-
-#
-MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "512"))
-TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.3"))
-TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))
-REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.1"))

# Thread lock and globals
_init_lock = threading.Lock()
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
-_llm


def _load_chatbot_text() -> str:

@@ -136,135 +144,156 @@ def init_embedder_and_db() -> None:


def init_llm() -> None:
-    """
    global _llm
    if _llm is not None:
        return
    with _init_lock:
        if _llm is not None:
            return
-
-
-
-
-        if not os.path.exists(local_path):
-            local_path = hf_hub_download(
-                repo_id=LLAMA_REPO,
-                filename=LLAMA_FILE,
-                local_dir=LLAMA_LOCAL_DIR,
-                local_dir_use_symlinks=False,
-            )
-
-        # GPU configuration
-        try:
-            import torch
-            use_cuda = torch.cuda.is_available()
-        except Exception:
-            use_cuda = False
-
-        n_gpu_layers = int(os.getenv("LLAMA_N_GPU_LAYERS", "35" if use_cuda else "0"))
-        n_ctx = int(os.getenv("LLAMA_N_CTX", "2048"))
-        n_threads = max(1, os.cpu_count() // 2) if os.cpu_count() else 4
-
-        _llm = Llama(
-            model_path=local_path,
-            n_ctx=n_ctx,
-            n_threads=n_threads,
-            n_gpu_layers=n_gpu_layers,
-            verbose=False,  # Reduce logging
-        )


def _build_prompt(query: str, context: str) -> str:
-    """
-
    system_prompt = (
-        "You are LUNA,
-        "
-        "
    )
-
-    # Build the prompt with context integrated naturally
    if context:
-
-        f"
-        f"
-        f"Question: {query}
-        f"
        )
    else:
-
-
-
-        f"
        )
-
-    return prompt


def get_response(query: str, k: int = 3, score_threshold: float = 1.5) -> str:
-    """
    if not query or not query.strip():
        return "Hi! I'm LUNA, your Codingo assistant. How can I help you today?"
-
    init_embedder_and_db()
    init_llm()
-
-
-
-
    greetings = ['hi', 'hello', 'hey', 'good morning', 'good afternoon', 'good evening']
    if query.lower().strip() in greetings:
        return "Hello! I'm LUNA, your AI assistant for Codingo. How can I help you with our recruitment platform today?"
-
-    # Embed query and search
    query_vector = _embedder.encode([query])[0]
    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
-
    docs = results.get("documents", [[]])[0] if results else []
    distances = results.get("distances", [[]])[0] if results else []
-
-    # Filter by
    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
-
    if not relevant:
-        # Provide a helpful response even without specific context
        return (
-            "I don't
-            "
-            "that helps with job applications, candidate screening, and hiring. "
-            "Would you like to know more about our features?"
        )
-
-    #
-    context = "
    prompt = _build_prompt(query, context)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
-
            'the chatbot', 'this bot', 'the bot provides',
-            'in response to', 'overall,'
        ]):
            continue
        cleaned_lines.append(line)
-
-
-
-    return
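The exact body of the removed generation step (old lines 242-257, between building the prompt and the post-processing loop) is not shown above. Purely as an illustration of what a llama_cpp completion call of that shape typically looks like, and not the original code, it would have resembled the following sketch; the stop sequences and variable names are assumptions tied to the retained MAX_TOKENS, TEMPERATURE, TOP_P and REPEAT_PENALTY settings.

# Hypothetical reconstruction -- NOT the original lines removed by this commit.
# It only illustrates the usual shape of a llama_cpp completion call.
output = _llm(
    prompt,
    max_tokens=MAX_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    repeat_penalty=REPEAT_PENALTY,
    stop=["Question:", "Context:"],  # assumed stop sequences
)
text = output["choices"][0]["text"].strip()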
Updated version (lines added by this commit are marked "+"):

@@ -4,9 +4,12 @@ codingo_chatbot.py

This module encapsulates the logic for Codingo's website chatbot. It
loads a knowledge base from ``chatbot/chatbot.txt``, builds a vector
+database using Chroma and SentenceTransformers, and uses the shared
+Groq language model (imported from ``backend.services.interview_engine``)
+to generate answers constrained to the retrieved context. If a Groq API
+key is not configured, a lightweight dummy model will be used as a
+fallback. TinyLlama and other local models are no longer used in this
+module.
"""

from __future__ import annotations

@@ -21,37 +24,42 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

+# Import the shared Groq LLM instance from the interview engine. This ensures
+# that the chatbot uses the exact same language model as the interview API.
+from backend.services.interview_engine import groq_llm
+
+# The llama_cpp dependency is no longer used for the chatbot. We keep the
+# import guarded to avoid breaking environments where llama_cpp is not
+# installed, but it is no longer required for generating responses.
try:
+    from llama_cpp import Llama  # type: ignore  # noqa: F401
+except Exception:
+    # We don't raise here because the Groq LLM will be used instead. If
+    # llama_cpp is unavailable, it won't affect chatbot functionality.
+    Llama = None  # type: ignore

# Configuration
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
CHATBOT_TXT_PATH = os.path.join(PROJECT_ROOT, "chatbot", "chatbot.txt")
CHROMA_DB_DIR = os.path.join("/tmp", "chatbot_chroma")

+# Generation parameters for the Groq LLM. These values can be adjusted via
+# environment variables if desired. They loosely mirror the previous TinyLlama
+# settings but are applied when constructing prompts for the Groq LLM. Note
+# that Groq models internally determine sampling behaviour; these variables
+# mainly govern how much content we include in the prompt and do not directly
+# control the sampling temperature of the Groq API.
+MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "512"))  # kept for compatibility
+TEMPERATURE = float(os.getenv("LLAMA_TEMPERATURE", "0.3"))  # unused but retained
+TOP_P = float(os.getenv("LLAMA_TOP_P", "0.9"))  # unused but retained
+REPEAT_PENALTY = float(os.getenv("LLAMA_REPEAT_PENALTY", "1.1"))  # unused

# Thread lock and globals
_init_lock = threading.Lock()
_embedder: SentenceTransformer | None = None
_collection: chromadb.Collection | None = None
+_llm = None  # This will be set to the shared Groq LLM instance


def _load_chatbot_text() -> str:

@@ -136,135 +144,156 @@ def init_embedder_and_db() -> None:


def init_llm() -> None:
+    """
+    Initialize the chatbot's language model. This function now assigns
+    the globally shared Groq LLM instance imported from the interview
+    engine. If the Groq API key is unavailable, the fallback dummy
+    model defined in the interview engine will be used automatically.
+    """
    global _llm
    if _llm is not None:
        return
    with _init_lock:
        if _llm is not None:
            return
+        # Assign the shared Groq LLM instance. This may be a DummyGroq when
+        # no API key is provided. We avoid loading any local GGUF models.
+        _llm = groq_llm
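``backend.services.interview_engine`` itself is not part of this diff. As a rough sketch only, the shared-model pattern that ``init_llm`` now relies on could look like the following; the ChatGroq model name and the DummyGroq fallback class are assumptions made for illustration, not code from the commit.

# Sketch of the shared-LLM pattern implied by the import above (assumptions:
# langchain_groq is installed; the model name is illustrative only).
import os

try:
    from langchain_groq import ChatGroq
except Exception:
    ChatGroq = None


class DummyGroq:
    """Minimal stand-in used when no Groq API key is configured (hypothetical)."""

    def invoke(self, prompt: str) -> str:
        return "The language model is not configured; please set GROQ_API_KEY."


if ChatGroq is not None and os.getenv("GROQ_API_KEY"):
    groq_llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0.3)
else:
    groq_llm = DummyGroq()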

def _build_prompt(query: str, context: str) -> str:
+    """
+    Construct a prompt for the Groq LLM. The prompt instructs the model to
+    behave as LUNA, Codingo's friendly assistant. It emphasises using only
+    information from the provided context to answer the question and
+    encourages the model to admit when the answer is unknown. This plain
+    format works well with ChatGroq's ``invoke`` API.
+
+    Args:
+        query: The user's question.
+        context: Concatenated snippets from the knowledge base deemed
+            relevant to the query.
+
+    Returns:
+        A formatted string prompt ready for submission to the Groq LLM.
+    """
    system_prompt = (
+        "You are LUNA, the friendly AI assistant for the Codingo recruitment "
+        "platform. You only answer questions using the information provided "
+        "in the context below. If the context does not contain the answer, "
+        "respond politely that you don't know. Keep your answers concise and "
+        "helpful."
    )
+
    if context:
+        return (
+            f"{system_prompt}\n\n"
+            f"Context:\n{context}\n\n"
+            f"Question: {query}\n"
+            f"Answer:"
        )
    else:
+        # When no context is available, still pass an empty context so the
+        # model knows there is no supporting information.
+        return (
+            f"{system_prompt}\n\n"
+            "Context:\n\n"
+            f"Question: {query}\n"
+            f"Answer:"
        )


def get_response(query: str, k: int = 3, score_threshold: float = 1.5) -> str:
+    """
+    Generate a response to the user's query using the shared Groq LLM and the
+    chatbot's knowledge base. The function retrieves relevant context
+    passages from the vector store, constructs a prompt instructing the
+    model to answer as LUNA using only that context, and returns the
+    resulting answer. If no context is available, a polite fallback
+    message is returned without calling the LLM.
+
+    Args:
+        query: The user's question or statement.
+        k: Number of nearest neighbour documents to retrieve from the
+            knowledge base (default 3).
+        score_threshold: Maximum distance for a document to be considered
+            relevant (smaller means more similar).
+
+    Returns:
+        A string response appropriate for the chatbot UI.
+    """
+    # Handle empty queries gracefully
    if not query or not query.strip():
        return "Hi! I'm LUNA, your Codingo assistant. How can I help you today?"
+
+    # Initialise embedder, vector DB and LLM if necessary
    init_embedder_and_db()
    init_llm()
+
+    # If embedder or collection or LLM didn't initialise, provide a safe fallback
+    if _embedder is None or _collection is None or _llm is None:
+        return "I'm sorry, I'm unable to process your request right now. Please try again later."
+
+    # Normalise for simple greetings
    greetings = ['hi', 'hello', 'hey', 'good morning', 'good afternoon', 'good evening']
    if query.lower().strip() in greetings:
        return "Hello! I'm LUNA, your AI assistant for Codingo. How can I help you with our recruitment platform today?"
+
+    # Embed query and search for relevant documents
    query_vector = _embedder.encode([query])[0]
    results = _collection.query(query_embeddings=[query_vector.tolist()], n_results=k)
+
    docs = results.get("documents", [[]])[0] if results else []
    distances = results.get("distances", [[]])[0] if results else []
+
+    # Filter by distance threshold
    relevant: List[str] = [d for d, s in zip(docs, distances) if s < score_threshold]
+
+    # If no relevant context is found, politely admit ignorance
    if not relevant:
        return (
+            "I'm sorry, I don't know the answer to that question based on my knowledge. "
+            "Could you ask something else about Codingo or its services?"
        )
+
+    # Concatenate the most relevant passages for context (use top 2)
+    context = "\n\n".join(relevant[:2])
    prompt = _build_prompt(query, context)
+
+    try:
+        # Invoke the Groq LLM. The ``invoke`` method may return an object
+        # with a ``content`` attribute or a plain string, depending on the
+        # backend. We handle both cases transparently.
+        response = _llm.invoke(prompt)
+    except Exception:
+        # If invocation fails, return a generic error message
+        return "I'm sorry, I encountered an error while generating a response. Please try again later."
+
+    # Extract text from the LLM response
+    if hasattr(response, 'content'):
+        text = str(response.content).strip()
+    elif isinstance(response, dict):
+        # Some wrappers may return dicts (e.g. ChatCompletion). Try common keys.
+        text = response.get('message', '') or response.get('text', '') or str(response)
+        text = text.strip()
+    else:
+        text = str(response).strip()
+
+    # Post-process the answer: remove unwanted phrases referring to the bot
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
+        lower_line = line.lower()
+        if any(phrase in lower_line for phrase in [
            'the chatbot', 'this bot', 'the bot provides',
+            'in response to', 'overall,',
+            'as an ai language model'
        ]):
            continue
        cleaned_lines.append(line)
+    cleaned_text = '\n'.join(cleaned_lines).strip()
+
+    # Ensure we return some meaningful text
+    return cleaned_text or (
+        "I'm sorry, I couldn't generate a proper response. Could you rephrase your question?"
+    )