Update app.py
app.py
CHANGED
@@ -62,11 +62,6 @@ except Exception as e:
     exit(1)
 
 # Initialize tiktoken encoder for token counting (approximate for GGUF models, but good enough)
-# For TinyLlama, we'll use a generic encoder or one that's close enough.
-# 'cl100k_base' is common for OpenAI models, but a good approximation for many others.
-# For more precise counts for GGUF, you might need to use the model's tokenizer if available
-# or rely on llama.cpp's internal tokenization (which is harder to access directly).
-# For simplicity and general estimation, cl100k_base is often used.
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
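The token-counting helpers used later in this diff (count_tokens_in_message, get_message_token_length) are defined elsewhere in app.py and are not shown here. A minimal sketch, assuming they simply wrap the cl100k_base encoding; the +4 per-message overhead is an illustrative guess, not the file's actual value:

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # approximate tokenizer for GGUF models

def count_tokens_in_message(message):
    # Token estimate for one {"role": ..., "content": ...} chat message,
    # plus a small assumed overhead for role/formatting tokens.
    return len(encoding.encode(message["content"])) + 4

def get_message_token_length(messages):
    # Total token estimate for a whole message list.
    return sum(count_tokens_in_message(m) for m in messages)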
@@ -110,10 +105,11 @@ def truncate_history(history, max_tokens):
     current_tokens = count_tokens_in_message(system_message)
 
     # Add messages from most recent, until max_tokens is reached
-
+    # Iterate from second-to-last to first user/assistant message
+    for message in reversed(history[1:]):
         message_tokens = count_tokens_in_message(message)
         if current_tokens + message_tokens <= max_tokens:
-            truncated_history.insert(1, message) # Insert after system message
+            truncated_history.insert(1, message) # Insert after system message to maintain order
             current_tokens += message_tokens
         else:
             break # Stop adding if next message exceeds limit
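With this hunk applied, the whole of truncate_history reads roughly as below. Only the lines inside the hunk are confirmed; the setup and return lines outside it are assumptions for illustration:

def truncate_history(history, max_tokens):
    # Keep the system message, then re-add messages newest-first until the budget is spent.
    system_message = history[0]               # assumed: history always starts with the system message
    truncated_history = [system_message]
    current_tokens = count_tokens_in_message(system_message)

    # Add messages from most recent, until max_tokens is reached
    for message in reversed(history[1:]):
        message_tokens = count_tokens_in_message(message)
        if current_tokens + message_tokens <= max_tokens:
            truncated_history.insert(1, message)  # insert after system message to maintain order
            current_tokens += message_tokens
        else:
            break  # stop adding if the next message would exceed the limit
    return truncated_history                  # assumed return value

Inserting at index 1 while walking newest to oldest keeps the surviving messages in chronological order: each older message lands just after the system message and pushes the newer ones to the right.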
@@ -190,20 +186,37 @@ async def generate(request: Request):
     if not prompt:
         return {"error": "Prompt cannot be empty"}, 400
 
+    # Define the system prompt with an emphasis on using context
+    system_prompt_content = (
+        "You are a helpful AI assistant for data analysis. "
+        "You are designed to provide concise and actionable suggestions based on the data provided or questions asked. "
+        "**Always refer to the information given in the current conversation context.** "
+        "Keep your responses focused on data insights and actionable steps for report generation. "
+        "Do not claim to have no memory if the information is present in the conversation history."
+    )
+
     # Generate a new session ID if not provided (for new conversations)
     if not session_id:
         session_id = str(uuid.uuid4())
         # Initialize chat history for a new session with a system message
         chat_histories[session_id] = [
-            {"role": "system", "content":
+            {"role": "system", "content": system_prompt_content}
         ]
         print(f"🆕 New session created: {session_id}")
     elif session_id not in chat_histories:
         # If a session ID is provided but not found, re-initialize it
         chat_histories[session_id] = [
-            {"role": "system", "content":
+            {"role": "system", "content": system_prompt_content}
        ]
         print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
+    else:
+        # Ensure the system message is always the most up-to-date one
+        if chat_histories[session_id][0]["role"] == "system":
+            chat_histories[session_id][0]["content"] = system_prompt_content
+        else:
+            # This case should ideally not happen if history is managed correctly
+            chat_histories[session_id].insert(0, {"role": "system", "content": system_prompt_content})
+
 
     print(f"🧾 Prompt received for session {session_id}: {prompt}")
 
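The three branches added here (new session, unknown session, existing session) can be read as one small helper. A hypothetical, condensed refactor for illustration only; get_or_create_history does not exist in app.py, and the existing-session branch skips the guard the diff keeps for a missing system message:

import uuid

chat_histories = {}  # session_id -> list of chat messages, as in app.py

def get_or_create_history(session_id, system_prompt_content):
    if not session_id:
        # New conversation: mint an id and seed it with the system message.
        session_id = str(uuid.uuid4())
        chat_histories[session_id] = [{"role": "system", "content": system_prompt_content}]
    elif session_id not in chat_histories:
        # Unknown id (e.g. after a server restart): re-initialize the history.
        chat_histories[session_id] = [{"role": "system", "content": system_prompt_content}]
    else:
        # Existing session: refresh the system message so prompt edits take effect.
        chat_histories[session_id][0] = {"role": "system", "content": system_prompt_content}
    return session_id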
@@ -213,15 +226,17 @@ async def generate(request: Request):
 
     # Truncate history if it exceeds the max context tokens
     # We subtract a buffer for the new prompt itself and the expected response
-    # A rough estimate for prompt + response: 100 tokens (prompt) +
-
-    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 200 # Buffer for response
+    # A rough estimate for prompt + response: 100 tokens (prompt) + 100 tokens (response) = 200 tokens
+    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 100 # Buffer for response
 
     if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
         print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
         chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
-        # Re-add the current user prompt after truncation
-
+        # Re-add the current user prompt after truncation if it was removed
+        # (This logic ensures the current prompt is always the last user message)
+        if not (chat_histories[session_id] and
+                chat_histories[session_id][-1]["role"] == "user" and
+                chat_histories[session_id][-1]["content"] == prompt):
            chat_histories[session_id].append({"role": "user", "content": prompt})
         print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
     else:
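For a sense of scale, the budget arithmetic works out as below, assuming TinyLlama's usual 2048-token context window for MAX_CONTEXT_TOKENS (its real value is set earlier in app.py and is not part of this diff):

MAX_CONTEXT_TOKENS = 2048   # assumed value for illustration only
prompt_tokens = 60          # e.g. count_tokens_in_message({"role": "user", "content": prompt})
response_buffer = 100       # buffer reserved for the model's reply

effective_max_history_tokens = MAX_CONTEXT_TOKENS - prompt_tokens - response_buffer
print(effective_max_history_tokens)  # 1888 tokens left for the system message and prior turns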
@@ -231,7 +246,7 @@ async def generate(request: Request):
     # Pass the (potentially truncated) chat history for context
     response = llm.create_chat_completion(
         messages=chat_histories[session_id],
-        max_tokens=
+        max_tokens=150, # Further limit response length to encourage conciseness and speed
         temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
         stop=["</s>"] # Stop sequence for TinyLlama Chat
     )
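How `response` is consumed lies outside this hunk. Assuming llama-cpp-python's OpenAI-style return shape, the follow-up step would look roughly like:

# Extract the reply and keep the history in sync for the next turn
# (structure assumed; not shown in this diff).
reply = response["choices"][0]["message"]["content"].strip()
chat_histories[session_id].append({"role": "assistant", "content": reply})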
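A hypothetical client round-trip against this handler; the /generate path, the JSON field names, and the echoed session_id are assumptions based on the code above, not confirmed by this diff:

import requests

BASE = "http://localhost:8000"  # assumed host/port

first = requests.post(f"{BASE}/generate", json={"prompt": "Summarize the sales data."}).json()
session_id = first.get("session_id")  # assumed to be returned by the API

follow_up = requests.post(
    f"{BASE}/generate",
    json={"prompt": "Which region should the report focus on?", "session_id": session_id},
).json()
print(follow_up)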