Priyanshukr-1 committed
Commit b03785b · verified · 1 Parent(s): 29e1d2a

Update app.py

Files changed (1)
app.py +30 -15
app.py CHANGED
@@ -62,11 +62,6 @@ except Exception as e:
     exit(1)
 
 # Initialize tiktoken encoder for token counting (approximate for GGUF models, but good enough)
-# For TinyLlama, we'll use a generic encoder or one that's close enough.
-# 'cl100k_base' is common for OpenAI models, but a good approximation for many others.
-# For more precise counts for GGUF, you might need to use the model's tokenizer if available
-# or rely on llama.cpp's internal tokenization (which is harder to access directly).
-# For simplicity and general estimation, cl100k_base is often used.
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
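Note: the hunk above drops the explanatory comments but keeps cl100k_base as an approximate tokenizer for the GGUF model. For reference, a minimal sketch of how that encoder can back the token counting used later in this diff; the exact shape of `count_tokens_in_message` and the fallback encoding are not shown in these hunks, so they are assumptions, not app.py's actual implementation.

```python
import tiktoken

# cl100k_base is only an approximation for TinyLlama/GGUF tokenization, as the diff notes.
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    encoding = tiktoken.get_encoding("gpt2")  # assumed fallback; app.py's real fallback is not shown here

def count_tokens_in_message(message: dict) -> int:
    """Approximate token count for one chat message (assumed helper shape)."""
    role_tokens = len(encoding.encode(message.get("role", "")))
    content_tokens = len(encoding.encode(message.get("content", "")))
    return role_tokens + content_tokens + 4  # small per-message overhead for chat formatting
```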
@@ -110,10 +105,11 @@ def truncate_history(history, max_tokens):
     current_tokens = count_tokens_in_message(system_message)
 
     # Add messages from most recent, until max_tokens is reached
-    for message in reversed(history[1:]): # Iterate from second-to-last to first user/assistant message
+    # Iterate from second-to-last to first user/assistant message
+    for message in reversed(history[1:]):
         message_tokens = count_tokens_in_message(message)
         if current_tokens + message_tokens <= max_tokens:
-            truncated_history.insert(1, message) # Insert after system message
+            truncated_history.insert(1, message) # Insert after system message to maintain order
             current_tokens += message_tokens
         else:
             break # Stop adding if next message exceeds limit
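The change above only moves the loop comment onto its own line; the truncation strategy is unchanged: walk the history newest-first and insert(1, ...) so the surviving messages stay in chronological order right after the system message. A toy, self-contained illustration of why that ordering works (the message-count budget stands in for real token counts and is purely for the demo):

```python
history = [
    {"role": "system", "content": "sys"},
    {"role": "user", "content": "old question"},
    {"role": "assistant", "content": "old answer"},
    {"role": "user", "content": "new question"},
]

budget = 2                               # pretend there is room for two history messages
kept = [history[0]]                      # system message always stays first
for message in reversed(history[1:]):    # iterate newest -> oldest
    if len(kept) - 1 < budget:
        kept.insert(1, message)          # older messages land in front of newer ones
    else:
        break

print([m["content"] for m in kept])
# ['sys', 'old answer', 'new question'] -> newest messages kept, chronological order preserved
```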
@@ -190,20 +186,37 @@ async def generate(request: Request):
     if not prompt:
         return {"error": "Prompt cannot be empty"}, 400
 
+    # Define the system prompt with an emphasis on using context
+    system_prompt_content = (
+        "You are a helpful AI assistant for data analysis. "
+        "You are designed to provide concise and actionable suggestions based on the data provided or questions asked. "
+        "**Always refer to the information given in the current conversation context.** "
+        "Keep your responses focused on data insights and actionable steps for report generation. "
+        "Do not claim to have no memory if the information is present in the conversation history."
+    )
+
     # Generate a new session ID if not provided (for new conversations)
     if not session_id:
         session_id = str(uuid.uuid4())
         # Initialize chat history for a new session with a system message
         chat_histories[session_id] = [
-            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+            {"role": "system", "content": system_prompt_content}
         ]
         print(f"🆕 New session created: {session_id}")
     elif session_id not in chat_histories:
         # If a session ID is provided but not found, re-initialize it
         chat_histories[session_id] = [
-            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+            {"role": "system", "content": system_prompt_content}
        ]
         print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
+    else:
+        # Ensure the system message is always the most up-to-date one
+        if chat_histories[session_id][0]["role"] == "system":
+            chat_histories[session_id][0]["content"] = system_prompt_content
+        else:
+            # This case should ideally not happen if history is managed correctly
+            chat_histories[session_id].insert(0, {"role": "system", "content": system_prompt_content})
+
 
     print(f"🧾 Prompt received for session {session_id}: {prompt}")
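Because the system prompt is now built per request and written back into existing histories (the new else branch), old sessions pick up the reworded prompt on their next turn. A hypothetical client-side sketch of reusing a session; the /generate route path, the JSON field names, and the response shape are assumptions inferred from the handler and variables, not confirmed by this diff:

```python
import requests  # hypothetical client for the FastAPI endpoint

BASE_URL = "http://localhost:7860"  # assumed host/port for the Space

# First turn: no session_id, so the server creates one (assumed to be echoed back in the response).
first = requests.post(f"{BASE_URL}/generate",
                      json={"prompt": "Summarize the sales data I uploaded."}).json()
session_id = first.get("session_id")

# Follow-up turn reuses the session so the server-side history (and refreshed system prompt) provide context.
follow_up = requests.post(f"{BASE_URL}/generate",
                          json={"prompt": "What should go in the report?",
                                "session_id": session_id}).json()
print(follow_up)
```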
 
@@ -213,15 +226,17 @@ async def generate(request: Request):
 
     # Truncate history if it exceeds the max context tokens
     # We subtract a buffer for the new prompt itself and the expected response
-    # A rough estimate for prompt + response: 100 tokens (prompt) + 200 tokens (response) = 300 tokens
-    # So, MAX_CONTEXT_TOKENS - 300 for the actual history
-    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 200 # Buffer for response
+    # A rough estimate for prompt + response: 100 tokens (prompt) + 100 tokens (response) = 200 tokens
+    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 100 # Buffer for response
 
     if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
         print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
         chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
-        # Re-add the current user prompt after truncation
-        if chat_histories[session_id][-1]["role"] != "user" or chat_histories[session_id][-1]["content"] != prompt:
+        # Re-add the current user prompt after truncation if it was removed
+        # (This logic ensures the current prompt is always the last user message)
+        if not (chat_histories[session_id] and
+                chat_histories[session_id][-1]["role"] == "user" and
+                chat_histories[session_id][-1]["content"] == prompt):
             chat_histories[session_id].append({"role": "user", "content": prompt})
         print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
     else:
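The buffer change shrinks the reply reservation from 200 to 100 tokens, leaving more of the context window for history. A quick worked example of the new budget, assuming MAX_CONTEXT_TOKENS is 2048 (TinyLlama's usual context window; the constant's real value is defined elsewhere in app.py and not shown in this hunk):

```python
# Worked example of the new history budget (values are illustrative assumptions).
MAX_CONTEXT_TOKENS = 2048   # assumed; defined elsewhere in app.py
prompt_tokens = 150         # e.g. count_tokens_in_message(...) for the incoming prompt
response_buffer = 100       # reduced from 200 in this commit

effective_max_history_tokens = MAX_CONTEXT_TOKENS - prompt_tokens - response_buffer
print(effective_max_history_tokens)  # 1798 tokens available for prior history
```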
@@ -231,7 +246,7 @@ async def generate(request: Request):
     # Pass the (potentially truncated) chat history for context
     response = llm.create_chat_completion(
         messages=chat_histories[session_id],
-        max_tokens=256, # Further limit response length for faster generation
+        max_tokens=150, # Further limit response length to encourage conciseness and speed
         temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
         stop=["</s>"] # Stop sequence for TinyLlama Chat
     )
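llama-cpp-python's create_chat_completion returns an OpenAI-style dict, so the lowered max_tokens=150 simply caps the assistant text read out of choices[0]. A minimal sketch of consuming the result and folding it back into the session history; the model path is a placeholder and the final append is an assumption, since this hunk does not show what app.py does with response:

```python
from llama_cpp import Llama  # assumes llama-cpp-python is installed

# Mirrors the objects app.py already builds; the model path here is a placeholder.
llm = Llama(model_path="path/to/tinyllama-1.1b-chat.Q4_K_M.gguf", n_ctx=2048)
chat_histories = {"demo-session": [{"role": "user", "content": "Suggest KPIs for a sales report."}]}
session_id = "demo-session"

response = llm.create_chat_completion(
    messages=chat_histories[session_id],
    max_tokens=150,
    temperature=0.7,
    stop=["</s>"],
)
reply = response["choices"][0]["message"]["content"]  # OpenAI-style result dict

# Assumed follow-up: keep the assistant turn so later truncation sees it too.
chat_histories[session_id].append({"role": "assistant", "content": reply})
print(reply)
```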
 