Update app.py
app.py
CHANGED
@@ -62,11 +62,6 @@ except Exception as e:
     exit(1)
 
 # Initialize tiktoken encoder for token counting (approximate for GGUF models, but good enough)
-# For TinyLlama, we'll use a generic encoder or one that's close enough.
-# 'cl100k_base' is common for OpenAI models, but a good approximation for many others.
-# For more precise counts for GGUF, you might need to use the model's tokenizer if available
-# or rely on llama.cpp's internal tokenization (which is harder to access directly).
-# For simplicity and general estimation, cl100k_base is often used.
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
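The token-counting helpers used later in this diff (count_tokens_in_message, get_message_token_length) are defined elsewhere in app.py and are not shown here. A minimal sketch, assuming they simply wrap the cl100k_base encoding; the +4 per-message overhead is an illustrative guess, not the file's actual value:

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # approximate tokenizer for GGUF models

def count_tokens_in_message(message):
    # Token estimate for one {"role": ..., "content": ...} chat message,
    # plus a small assumed overhead for role/formatting tokens.
    return len(encoding.encode(message["content"])) + 4

def get_message_token_length(messages):
    # Total token estimate for a whole message list.
    return sum(count_tokens_in_message(m) for m in messages)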
@@ -110,10 +105,11 @@ def truncate_history(history, max_tokens):
     current_tokens = count_tokens_in_message(system_message)
 
     # Add messages from most recent, until max_tokens is reached
-
+    # Iterate from second-to-last to first user/assistant message
+    for message in reversed(history[1:]):
         message_tokens = count_tokens_in_message(message)
         if current_tokens + message_tokens <= max_tokens:
-            truncated_history.insert(1, message) # Insert after system message
+            truncated_history.insert(1, message) # Insert after system message to maintain order
             current_tokens += message_tokens
         else:
             break # Stop adding if next message exceeds limit
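With this hunk applied, the whole of truncate_history reads roughly as below. Only the lines inside the hunk are confirmed; the setup and return lines outside it are assumptions for illustration:

def truncate_history(history, max_tokens):
    # Keep the system message, then re-add messages newest-first until the budget is spent.
    system_message = history[0]               # assumed: history always starts with the system message
    truncated_history = [system_message]
    current_tokens = count_tokens_in_message(system_message)

    # Add messages from most recent, until max_tokens is reached
    for message in reversed(history[1:]):
        message_tokens = count_tokens_in_message(message)
        if current_tokens + message_tokens <= max_tokens:
            truncated_history.insert(1, message)  # insert after system message to maintain order
            current_tokens += message_tokens
        else:
            break  # stop adding if the next message would exceed the limit
    return truncated_history                  # assumed return value

Inserting at index 1 while walking newest to oldest keeps the surviving messages in chronological order: each older message lands just after the system message and pushes the newer ones to the right.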
@@ -190,20 +186,37 @@ async def generate(request: Request):
     if not prompt:
         return {"error": "Prompt cannot be empty"}, 400
 
+    # Define the system prompt with an emphasis on using context
+    system_prompt_content = (
+        "You are a helpful AI assistant for data analysis. "
+        "You are designed to provide concise and actionable suggestions based on the data provided or questions asked. "
+        "**Always refer to the information given in the current conversation context.** "
+        "Keep your responses focused on data insights and actionable steps for report generation. "
+        "Do not claim to have no memory if the information is present in the conversation history."
+    )
+
     # Generate a new session ID if not provided (for new conversations)
     if not session_id:
         session_id = str(uuid.uuid4())
         # Initialize chat history for a new session with a system message
         chat_histories[session_id] = [
-            {"role": "system", "content":
+            {"role": "system", "content": system_prompt_content}
         ]
         print(f"🆕 New session created: {session_id}")
     elif session_id not in chat_histories:
         # If a session ID is provided but not found, re-initialize it
         chat_histories[session_id] = [
-            {"role": "system", "content":
+            {"role": "system", "content": system_prompt_content}
        ]
         print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
+    else:
+        # Ensure the system message is always the most up-to-date one
+        if chat_histories[session_id][0]["role"] == "system":
+            chat_histories[session_id][0]["content"] = system_prompt_content
+        else:
+            # This case should ideally not happen if history is managed correctly
+            chat_histories[session_id].insert(0, {"role": "system", "content": system_prompt_content})
+
 
     print(f"🧾 Prompt received for session {session_id}: {prompt}")
 
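The three branches added here (new session, unknown session, existing session) can be read as one small helper. A hypothetical, condensed refactor for illustration only; get_or_create_history does not exist in app.py, and the existing-session branch skips the guard the diff keeps for a missing system message:

import uuid

chat_histories = {}  # session_id -> list of chat messages, as in app.py

def get_or_create_history(session_id, system_prompt_content):
    if not session_id:
        # New conversation: mint an id and seed it with the system message.
        session_id = str(uuid.uuid4())
        chat_histories[session_id] = [{"role": "system", "content": system_prompt_content}]
    elif session_id not in chat_histories:
        # Unknown id (e.g. after a server restart): re-initialize the history.
        chat_histories[session_id] = [{"role": "system", "content": system_prompt_content}]
    else:
        # Existing session: refresh the system message so prompt edits take effect.
        chat_histories[session_id][0] = {"role": "system", "content": system_prompt_content}
    return session_id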
@@ -213,15 +226,17 @@ async def generate(request: Request):
 
     # Truncate history if it exceeds the max context tokens
     # We subtract a buffer for the new prompt itself and the expected response
-    # A rough estimate for prompt + response: 100 tokens (prompt) +
-
-    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 200 # Buffer for response
+    # A rough estimate for prompt + response: 100 tokens (prompt) + 100 tokens (response) = 200 tokens
+    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 100 # Buffer for response
 
     if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
         print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
         chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
-        # Re-add the current user prompt after truncation
-
+        # Re-add the current user prompt after truncation if it was removed
+        # (This logic ensures the current prompt is always the last user message)
+        if not (chat_histories[session_id] and
+                chat_histories[session_id][-1]["role"] == "user" and
+                chat_histories[session_id][-1]["content"] == prompt):
            chat_histories[session_id].append({"role": "user", "content": prompt})
         print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
     else:
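For a sense of scale, the budget arithmetic works out as below, assuming TinyLlama's usual 2048-token context window for MAX_CONTEXT_TOKENS (its real value is set earlier in app.py and is not part of this diff):

MAX_CONTEXT_TOKENS = 2048   # assumed value for illustration only
prompt_tokens = 60          # e.g. count_tokens_in_message({"role": "user", "content": prompt})
response_buffer = 100       # buffer reserved for the model's reply

effective_max_history_tokens = MAX_CONTEXT_TOKENS - prompt_tokens - response_buffer
print(effective_max_history_tokens)  # 1888 tokens left for the system message and prior turns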
@@ -231,7 +246,7 @@ async def generate(request: Request):
     # Pass the (potentially truncated) chat history for context
     response = llm.create_chat_completion(
         messages=chat_histories[session_id],
-        max_tokens=
+        max_tokens=150, # Further limit response length to encourage conciseness and speed
         temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
         stop=["</s>"] # Stop sequence for TinyLlama Chat
     )
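How `response` is consumed lies outside this hunk. Assuming llama-cpp-python's OpenAI-style return shape, the follow-up step would look roughly like:

# Extract the reply and keep the history in sync for the next turn
# (structure assumed; not shown in this diff).
reply = response["choices"][0]["message"]["content"].strip()
chat_histories[session_id].append({"role": "assistant", "content": reply})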
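A hypothetical client round-trip against this handler; the /generate path, the JSON field names, and the echoed session_id are assumptions based on the code above, not confirmed by this diff:

import requests

BASE = "http://localhost:8000"  # assumed host/port

first = requests.post(f"{BASE}/generate", json={"prompt": "Summarize the sales data."}).json()
session_id = first.get("session_id")  # assumed to be returned by the API

follow_up = requests.post(
    f"{BASE}/generate",
    json={"prompt": "Which region should the report focus on?", "session_id": session_id},
).json()
print(follow_up)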