Update app.py
app.py CHANGED
@@ -7,11 +7,11 @@ import psutil
 import multiprocessing
 import time
 import uuid # For generating unique session IDs
+import tiktoken # For estimating token count
 
 app = FastAPI()
 
 # === Model Config ===
-# Corrected REPO_ID to use TheBloke's GGUF version of TinyLlama
 REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
 FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # Q4_K_M is a good balance of size and quality
 MODEL_DIR = "models"
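Note: the added import tiktoken line pulls in a package that ships with neither FastAPI nor llama-cpp-python, so the environment needs it installed before this revision will start (the code later falls back to a character-count estimate if the encoding lookup fails, but the import itself must succeed). A minimal sketch, assuming dependencies are tracked in a requirements.txt that is not part of this diff:

# requirements.txt (assumed file; only the line relevant to this change is shown)
tiktoken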
@@ -38,14 +38,8 @@ else:
     model_path = MODEL_PATH
 
 # === Optimal thread usage ===
-# psutil.cpu_count(logical=True) gives the number of logical cores (threads)
-# psutil.cpu_count(logical=False) gives the number of physical cores
-# For llama.cpp, n_threads often performs best when set to the number of physical cores,
-# or slightly more, but not exceeding logical cores. Experimentation is key.
 logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
-# A common recommendation is to use physical cores or physical_cores * 2
-# Let's try physical_cores for a start, or a fixed value if physical_cores is too low.
 recommended_threads = max(1, physical_cores) # Ensure at least 1 thread
 
 print(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
@@ -55,10 +49,10 @@ print(f"Using n_threads: {recommended_threads}")
 try:
     llm = Llama(
         model_path=model_path,
-        n_ctx=1024, #
+        n_ctx=1024, # Context window size for the model
         n_threads=recommended_threads,
-        use_mlock=True, # Lock model in RAM for faster access
-        n_gpu_layers=0, # CPU only
+        use_mlock=True, # Lock model in RAM for faster access
+        n_gpu_layers=0, # CPU only
         chat_format="chatml", # TinyLlama Chat uses ChatML format
         verbose=False
     )
@@ -67,11 +61,65 @@ except Exception as e:
     print(f"❌ Error loading Llama model: {e}")
     exit(1)
 
+# Initialize tiktoken encoder for token counting (approximate for GGUF models, but good enough)
+# For TinyLlama, we'll use a generic encoder or one that's close enough.
+# 'cl100k_base' is common for OpenAI models, but a good approximation for many others.
+# For more precise counts for GGUF, you might need to use the model's tokenizer if available
+# or rely on llama.cpp's internal tokenization (which is harder to access directly).
+# For simplicity and general estimation, cl100k_base is often used.
+try:
+    encoding = tiktoken.get_encoding("cl100k_base")
+except Exception:
+    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Using basic len() for token estimation.")
+    encoding = None
+
 # === Global dictionary to store chat histories per session ===
-# In a production environment, this should be replaced with a persistent storage
-# like Redis, a database, or a dedicated session management system.
 chat_histories = {}
 
+# === Context Truncation Settings ===
+# Max tokens for the entire conversation history (input to the model)
+# This should be less than n_ctx to leave room for the new prompt and generated response.
+MAX_CONTEXT_TOKENS = 800 # Keep total input context below this, leaving 224 tokens for new prompt + response
+
+def count_tokens_in_message(message):
+    """Estimates tokens in a single message using tiktoken or simple char count."""
+    if encoding:
+        return len(encoding.encode(message.get("content", "")))
+    else:
+        # Fallback for when tiktoken isn't available or for simple estimation
+        return len(message.get("content", "")) // 4 # Rough estimate: 1 token ~ 4 characters
+
+def get_message_token_length(messages):
+    """Calculates total tokens for a list of messages."""
+    total_tokens = 0
+    for message in messages:
+        total_tokens += count_tokens_in_message(message)
+    return total_tokens
+
+def truncate_history(history, max_tokens):
+    """
+    Truncates the chat history to fit within max_tokens.
+    Keeps the system message and recent messages.
+    """
+    if not history:
+        return []
+
+    # Always keep the system message
+    system_message = history[0]
+    truncated_history = [system_message]
+    current_tokens = count_tokens_in_message(system_message)
+
+    # Add messages from most recent, until max_tokens is reached
+    for message in reversed(history[1:]): # Iterate from second-to-last to first user/assistant message
+        message_tokens = count_tokens_in_message(message)
+        if current_tokens + message_tokens <= max_tokens:
+            truncated_history.insert(1, message) # Insert after system message
+            current_tokens += message_tokens
+        else:
+            break # Stop adding if next message exceeds limit
+
+    return truncated_history
+
 @app.get("/")
 def root():
     return {"message": "✅ Data Analysis AI API is live and optimized!"}
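The helpers added above implement a sliding-window history: the system message is always kept, and earlier turns are re-added from the most recent backwards until the token budget runs out. MAX_CONTEXT_TOKENS = 800 plus the roughly 224 tokens left for the new prompt and reply adds up to the n_ctx=1024 set when loading the model. Below is a minimal standalone sketch of that behaviour, using the same 1-token-per-4-characters fallback as count_tokens_in_message; the sample messages and names are illustrative, not part of app.py.

# Simplified, self-contained sketch of the truncation strategy above.
# Uses the "1 token ~ 4 characters" fallback estimate; sample data is made up.
def estimate_tokens(message):
    return len(message.get("content", "")) // 4

def truncate(history, max_tokens):
    if not history:
        return []
    kept = [history[0]]                    # the system message is always kept
    used = estimate_tokens(history[0])
    for message in reversed(history[1:]):  # walk from the most recent turn backwards
        cost = estimate_tokens(message)
        if used + cost > max_tokens:
            break                          # stop once the budget is exhausted
        kept.insert(1, message)            # insert after the system message, preserving order
        used += cost
    return kept

history = [
    {"role": "system", "content": "You are a data analysis assistant."},
    {"role": "user", "content": "Summarise this CSV... " * 50},       # old, large turn
    {"role": "assistant", "content": "Here is the summary... " * 50},
    {"role": "user", "content": "Now plot the trend."},               # recent, small turn
]
print([m["role"] for m in truncate(history, max_tokens=60)])
# -> ['system', 'user'] : the recent small turn fits, the older large turns do not

Because the loop breaks at the first turn that no longer fits, everything older than that turn is dropped as well, even if some of it would still have fit on its own.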
@@ -159,14 +207,31 @@ async def generate(request: Request):
 
     print(f"🧾 Prompt received for session {session_id}: {prompt}")
 
-    # Add the user's new message to
-    chat_histories[session_id]
+    # Add the user's new message to a temporary list to check total length
+    current_messages = list(chat_histories[session_id]) # Create a copy
+    current_messages.append({"role": "user", "content": prompt})
+
+    # Truncate history if it exceeds the max context tokens
+    # We subtract a buffer for the new prompt itself and the expected response
+    # A rough estimate for prompt + response: 100 tokens (prompt) + 200 tokens (response) = 300 tokens
+    # So, MAX_CONTEXT_TOKENS - 300 for the actual history
+    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 200 # Buffer for response
+
+    if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
+        print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
+        chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
+        # Re-add the current user prompt after truncation
+        if chat_histories[session_id][-1]["role"] != "user" or chat_histories[session_id][-1]["content"] != prompt:
+            chat_histories[session_id].append({"role": "user", "content": prompt})
+        print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
+    else:
+        chat_histories[session_id] = current_messages # If not truncated, just update with the new message
 
     try:
-        # Pass the
+        # Pass the (potentially truncated) chat history for context
        response = llm.create_chat_completion(
             messages=chat_histories[session_id],
-            max_tokens=
+            max_tokens=256, # Further limit response length for faster generation
             temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"] # Stop sequence for TinyLlama Chat
         )
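For concreteness, a worked example of the budget computed in the handler above, with an assumed prompt of 50 tokens (the 100-token figure in the comment is only an estimate; the code measures the actual prompt):

# Illustrative numbers only; the real values depend on the incoming prompt.
MAX_CONTEXT_TOKENS = 800
prompt_tokens = 50       # what count_tokens_in_message({"role": "user", "content": prompt}) might return
response_buffer = 200    # tokens reserved for the model's reply
effective_max_history_tokens = MAX_CONTEXT_TOKENS - prompt_tokens - response_buffer
print(effective_max_history_tokens)  # 550 tokens available for the retained history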
@@ -178,7 +243,8 @@ async def generate(request: Request):
 
         return {
             "response": ai_response_content,
-            "session_id": session_id # Return the session_id so the client can use it for subsequent requests
+            "session_id": session_id, # Return the session_id so the client can use it for subsequent requests
+            "current_context_tokens": get_message_token_length(chat_histories[session_id])
         }
     except Exception as e:
         print(f"❌ Error during generation for session {session_id}: {e}")
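For reference, a possible client loop against this endpoint. The base URL, port and request-body field names are assumptions (the route decorator and request parsing are outside the hunks shown); the response fields response, session_id and current_context_tokens are the ones returned above.

# Hypothetical client; adjust the base URL and payload fields to the actual route.
import requests

BASE_URL = "http://localhost:8000"  # assumed host/port
session_id = None

for prompt in ["Summarise the dataset.", "Which column has the most missing values?"]:
    payload = {"prompt": prompt}
    if session_id:
        payload["session_id"] = session_id  # reuse the id so the server keeps the history
    reply = requests.post(f"{BASE_URL}/generate", json=payload, timeout=120).json()
    session_id = reply["session_id"]        # returned by the server on every call
    print(reply["response"])
    print("context tokens now:", reply["current_context_tokens"])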