Update app.py
app.py
CHANGED
@@ -6,7 +6,6 @@ import platform
 import psutil
 import multiprocessing
 import time
-import uuid  # For generating unique session IDs
 import tiktoken  # For estimating token count

 app = FastAPI()
@@ -49,7 +48,7 @@ print(f"Using n_threads: {recommended_threads}")
 try:
     llm = Llama(
         model_path=model_path,
-        n_ctx=1024,  # Context window size for the model
+        n_ctx=1024,  # Context window size for the model (still needed, but not fully utilized for history)
         n_threads=recommended_threads,
         use_mlock=True,  # Lock model in RAM for faster access
         n_gpu_layers=0,  # CPU only
@@ -61,64 +60,25 @@ except Exception as e:
     print(f"❌ Error loading Llama model: {e}")
     exit(1)

-# Initialize tiktoken encoder for token counting
+# Initialize tiktoken encoder for token counting
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
-    print("⚠️ Could not load tiktoken 'cl100k_base' encoding.")
+    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
     encoding = None

-
-
-
-# === Context Truncation Settings ===
-# Max tokens for the entire conversation history (input to the model)
-# This should be less than n_ctx to leave room for the new prompt and generated response.
-MAX_CONTEXT_TOKENS = 800  # Keep total input context below this, leaving 224 tokens for new prompt + response
-
-def count_tokens_in_message(message):
-    """Estimates tokens in a single message using tiktoken or simple char count."""
+def count_tokens_in_text(text):
+    """Estimates tokens in a given text using tiktoken or simple char count."""
     if encoding:
-        return len(encoding.encode(message["content"]))
+        return len(encoding.encode(text))
     else:
         # Fallback for when tiktoken isn't available or for simple estimation
-        return len(message["content"]) // 4
-
-def get_message_token_length(messages):
-    """Calculates total tokens for a list of messages."""
-    total_tokens = 0
-    for message in messages:
-        total_tokens += count_tokens_in_message(message)
-    return total_tokens
-
-def truncate_history(history, max_tokens):
-    """
-    Truncates the chat history to fit within max_tokens.
-    Keeps the system message and recent messages.
-    """
-    if not history:
-        return []
-
-    # Always keep the system message
-    system_message = history[0]
-    truncated_history = [system_message]
-    current_tokens = count_tokens_in_message(system_message)
+        return len(text) // 4  # Rough estimate: 1 token ~ 4 characters

-    # Add messages from most recent, until max_tokens is reached
-    # Iterate from second-to-last to first user/assistant message
-    for message in reversed(history[1:]):
-        message_tokens = count_tokens_in_message(message)
-        if current_tokens + message_tokens <= max_tokens:
-            truncated_history.insert(1, message)  # Insert after system message to maintain order
-            current_tokens += message_tokens
-        else:
-            break  # Stop adding if next message exceeds limit
-
-    return truncated_history

 @app.get("/")
 def root():
-    return {"message": "✅ Data Analysis AI API is live and optimized!"}
+    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}

 @app.get("/get_sys")
 def get_sys_specs():
@@ -175,95 +135,48 @@ def process_list():
 @app.post("/generate")
 async def generate(request: Request):
     """
-    Generates a response from the LLM
-    Expects a JSON body with 'prompt'
-    If 'session_id' is not provided, a new one will be generated.
+    Generates a response from the LLM without retaining chat context.
+    Expects a JSON body with 'prompt'.
     """
     data = await request.json()
     prompt = data.get("prompt", "").strip()
-    session_id = data.get("session_id")

     if not prompt:
         return {"error": "Prompt cannot be empty"}, 400

-    # Define the system prompt
+    # Define the system prompt - sent with every request
     system_prompt_content = (
         "You are a helpful AI assistant for data analysis. "
-        "
-        "
-        "Keep your responses focused on data insights and actionable steps for report generation. "
-        "Do not claim to have no memory if the information is present in the conversation history."
+        "Provide concise and actionable suggestions based on the data provided or questions asked. "
+        "Focus on data insights and actionable steps for report generation."
     )

-    #
-
-    if not session_id:
-        session_id = str(uuid.uuid4())
-        chat_histories[session_id] = [
-            {"role": "system", "content": system_prompt_content}
-        ]
-        print(f"🆕 New session created: {session_id}")
-    elif session_id not in chat_histories:
-        # If a session ID is provided but not found, re-initialize it
-        chat_histories[session_id] = [
-            {"role": "system", "content": system_prompt_content}
-        ]
-        print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
-    else:
-        # Ensure the system message is always the most up-to-date one
-        if chat_histories[session_id][0]["role"] == "system":
-            chat_histories[session_id][0]["content"] = system_prompt_content
-        else:
-            # This case should ideally not happen if history is managed correctly
-            chat_histories[session_id].insert(0, {"role": "system", "content": system_prompt_content})
-
-
-    print(f"🧾 Prompt received for session {session_id}: {prompt}")
+    # Construct messages for the current request only
+    messages_for_llm = [
+        {"role": "system", "content": system_prompt_content},
+        {"role": "user", "content": prompt}
+    ]

-    #
-    current_messages = chat_histories[session_id]
-    current_messages.append({"role": "user", "content": prompt})
+    # Calculate tokens in the user's prompt
+    prompt_tokens = count_tokens_in_text(prompt)

-
-
-    # A rough estimate for prompt + response: 100 tokens (prompt) + 100 tokens (response) = 200 tokens
-    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 100  # Buffer for response
-
-    if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
-        print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
-        chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
-        # Re-add the current user prompt after truncation if it was removed
-        # (This logic ensures the current prompt is always the last user message)
-        if not (chat_histories[session_id] and
-                chat_histories[session_id][-1]["role"] == "user" and
-                chat_histories[session_id][-1]["content"] == prompt):
-            chat_histories[session_id].append({"role": "user", "content": prompt})
-        print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
-    else:
-        chat_histories[session_id] = current_messages  # If not truncated, just update with the new message
+    print(f"🧾 Prompt received: {prompt}")
+    print(f"Tokens in prompt: {prompt_tokens}")

     try:
-        # Pass the (potentially truncated) chat history for context
         response = llm.create_chat_completion(
-            messages=chat_histories[session_id],
-            max_tokens=150,  #
+            messages=messages_for_llm,
+            max_tokens=150,  # Keep response length short for maximum speed
             temperature=0.7,  # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"]  # Stop sequence for TinyLlama Chat
         )

         ai_response_content = response["choices"][0]["message"]["content"].strip()

-        # Add the AI's response to the history for future turns
-        chat_histories[session_id].append({"role": "assistant", "content": ai_response_content})
-
         return {
             "response": ai_response_content,
-            "session_id": session_id,
-            "current_context_tokens": get_message_token_length(chat_histories[session_id])
+            "prompt_tokens": prompt_tokens  # Return tokens in the prompt
         }
     except Exception as e:
-        print(f"❌ Error during generation: {e}")
-
-        if chat_histories[session_id] and chat_histories[session_id][-1]["role"] == "user":
-            chat_histories[session_id].pop()
-        return {"error": f"Failed to generate response: {e}. Please try again.", "session_id": session_id}, 500
+        print(f"❌ Error during generation: {e}")
+        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
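For reference, a minimal standalone sketch of the token-estimation behaviour introduced above: count_tokens_in_text uses tiktoken's cl100k_base encoding when available and otherwise falls back to a rough 4-characters-per-token estimate. The sample text and the __main__ harness are illustrative additions, not part of app.py.

# Standalone sketch of the token-estimation helper (illustrative, not part of app.py)
import tiktoken

try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    encoding = None  # fall back to a character-based estimate

def count_tokens_in_text(text):
    """Estimates tokens using tiktoken when available, else ~4 characters per token."""
    if encoding:
        return len(encoding.encode(text))
    return len(text) // 4  # Rough estimate: 1 token ~ 4 characters

if __name__ == "__main__":
    sample = "Summarize monthly sales by region and flag any outliers."
    print("Estimated tokens:", count_tokens_in_text(sample))
    print("Character-based fallback:", len(sample) // 4)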
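Because the endpoint no longer retains chat history, every request is self-contained and any conversational context must be included by the caller in the prompt itself. Below is a minimal client-side sketch; the base URL/port and the use of the requests library are assumptions, while the /generate route and the response, prompt_tokens, and error fields come from the code above.

# Minimal client sketch for the now-stateless /generate endpoint
# (base URL/port are hypothetical; route and fields are taken from app.py above)
import requests

API_URL = "http://localhost:8000/generate"  # hypothetical local uvicorn address

payload = {"prompt": "Suggest three sanity checks before building a monthly sales report."}
resp = requests.post(API_URL, json=payload, timeout=120)
data = resp.json()

if isinstance(data, dict) and "response" in data:
    print("AI response:", data["response"])
    print("Prompt tokens:", data.get("prompt_tokens"))
else:
    # Error payloads now carry only an "error" message; no session_id is returned.
    print("Request failed:", data)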