Priyanshukr-1 committed
Commit 280099a · verified · 1 parent: b03785b

Update app.py

Files changed (1): app.py (+27 -114)
app.py CHANGED
@@ -6,7 +6,6 @@ import platform
 import psutil
 import multiprocessing
 import time
-import uuid # For generating unique session IDs
 import tiktoken # For estimating token count

 app = FastAPI()
@@ -49,7 +48,7 @@ print(f"Using n_threads: {recommended_threads}")
 try:
     llm = Llama(
         model_path=model_path,
-        n_ctx=1024, # Context window size for the model
+        n_ctx=1024, # Context window size for the model (still needed, but not fully utilized for history)
         n_threads=recommended_threads,
         use_mlock=True, # Lock model in RAM for faster access
         n_gpu_layers=0, # CPU only
@@ -61,64 +60,25 @@ except Exception as e:
     print(f"❌ Error loading Llama model: {e}")
     exit(1)

-# Initialize tiktoken encoder for token counting (approximate for GGUF models, but good enough)
+# Initialize tiktoken encoder for token counting
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
-    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Using basic len() for token estimation.")
+    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
     encoding = None

-# === Global dictionary to store chat histories per session ===
-chat_histories = {}
-
-# === Context Truncation Settings ===
-# Max tokens for the entire conversation history (input to the model)
-# This should be less than n_ctx to leave room for the new prompt and generated response.
-MAX_CONTEXT_TOKENS = 800 # Keep total input context below this, leaving 224 tokens for new prompt + response
-
-def count_tokens_in_message(message):
-    """Estimates tokens in a single message using tiktoken or simple char count."""
+def count_tokens_in_text(text):
+    """Estimates tokens in a given text using tiktoken or simple char count."""
     if encoding:
-        return len(encoding.encode(message.get("content", "")))
+        return len(encoding.encode(text))
     else:
         # Fallback for when tiktoken isn't available or for simple estimation
-        return len(message.get("content", "")) // 4 # Rough estimate: 1 token ~ 4 characters
+        return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
-
-def get_message_token_length(messages):
-    """Calculates total tokens for a list of messages."""
-    total_tokens = 0
-    for message in messages:
-        total_tokens += count_tokens_in_message(message)
-    return total_tokens
-
-def truncate_history(history, max_tokens):
-    """
-    Truncates the chat history to fit within max_tokens.
-    Keeps the system message and recent messages.
-    """
-    if not history:
-        return []
-
-    # Always keep the system message
-    system_message = history[0]
-    truncated_history = [system_message]
-    current_tokens = count_tokens_in_message(system_message)
-
-    # Add messages from most recent, until max_tokens is reached
-    # Iterate from second-to-last to first user/assistant message
-    for message in reversed(history[1:]):
-        message_tokens = count_tokens_in_message(message)
-        if current_tokens + message_tokens <= max_tokens:
-            truncated_history.insert(1, message) # Insert after system message to maintain order
-            current_tokens += message_tokens
-        else:
-            break # Stop adding if next message exceeds limit
-
-    return truncated_history

 @app.get("/")
 def root():
-    return {"message": "✅ Data Analysis AI API is live and optimized!"}
+    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}

 @app.get("/get_sys")
 def get_sys_specs():
@@ -175,95 +135,48 @@ def process_list():
 @app.post("/generate")
 async def generate(request: Request):
     """
-    Generates a response from the LLM, maintaining chat context.
-    Expects a JSON body with 'prompt' and optionally 'session_id'.
-    If 'session_id' is not provided, a new one will be generated.
+    Generates a response from the LLM without retaining chat context.
+    Expects a JSON body with 'prompt'.
     """
     data = await request.json()
     prompt = data.get("prompt", "").strip()
-    session_id = data.get("session_id")

     if not prompt:
         return {"error": "Prompt cannot be empty"}, 400

-    # Define the system prompt with an emphasis on using context
+    # Define the system prompt - sent with every request
     system_prompt_content = (
         "You are a helpful AI assistant for data analysis. "
-        "You are designed to provide concise and actionable suggestions based on the data provided or questions asked. "
-        "**Always refer to the information given in the current conversation context.** "
-        "Keep your responses focused on data insights and actionable steps for report generation. "
-        "Do not claim to have no memory if the information is present in the conversation history."
+        "Provide concise and actionable suggestions based on the data provided or questions asked. "
+        "Focus on data insights and actionable steps for report generation."
     )

-    # Generate a new session ID if not provided (for new conversations)
-    if not session_id:
-        session_id = str(uuid.uuid4())
-        # Initialize chat history for a new session with a system message
-        chat_histories[session_id] = [
-            {"role": "system", "content": system_prompt_content}
-        ]
-        print(f"🆕 New session created: {session_id}")
-    elif session_id not in chat_histories:
-        # If a session ID is provided but not found, re-initialize it
-        chat_histories[session_id] = [
-            {"role": "system", "content": system_prompt_content}
-        ]
-        print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
-    else:
-        # Ensure the system message is always the most up-to-date one
-        if chat_histories[session_id][0]["role"] == "system":
-            chat_histories[session_id][0]["content"] = system_prompt_content
-        else:
-            # This case should ideally not happen if history is managed correctly
-            chat_histories[session_id].insert(0, {"role": "system", "content": system_prompt_content})
-
-    print(f"🧾 Prompt received for session {session_id}: {prompt}")
-
-    # Add the user's new message to a temporary list to check total length
-    current_messages = list(chat_histories[session_id]) # Create a copy
-    current_messages.append({"role": "user", "content": prompt})
-
-    # Truncate history if it exceeds the max context tokens
-    # We subtract a buffer for the new prompt itself and the expected response
-    # A rough estimate for prompt + response: 100 tokens (prompt) + 100 tokens (response) = 200 tokens
-    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 100 # Buffer for response
-
-    if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
-        print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
-        chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
-        # Re-add the current user prompt after truncation if it was removed
-        # (This logic ensures the current prompt is always the last user message)
-        if not (chat_histories[session_id] and
-                chat_histories[session_id][-1]["role"] == "user" and
-                chat_histories[session_id][-1]["content"] == prompt):
-            chat_histories[session_id].append({"role": "user", "content": prompt})
-        print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
-    else:
-        chat_histories[session_id] = current_messages # If not truncated, just update with the new message
+    # Construct messages for the current request only
+    messages_for_llm = [
+        {"role": "system", "content": system_prompt_content},
+        {"role": "user", "content": prompt}
+    ]
+
+    # Calculate tokens in the user's prompt
+    prompt_tokens = count_tokens_in_text(prompt)
+
+    print(f"🧾 Prompt received: {prompt}")
+    print(f"Tokens in prompt: {prompt_tokens}")

     try:
-        # Pass the (potentially truncated) chat history for context
         response = llm.create_chat_completion(
-            messages=chat_histories[session_id],
-            max_tokens=150, # Further limit response length to encourage conciseness and speed
+            messages=messages_for_llm,
+            max_tokens=150, # Keep response length short for maximum speed
             temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"] # Stop sequence for TinyLlama Chat
         )

         ai_response_content = response["choices"][0]["message"]["content"].strip()

-        # Add the AI's response to the history for future turns
-        chat_histories[session_id].append({"role": "assistant", "content": ai_response_content})
-
         return {
             "response": ai_response_content,
-            "session_id": session_id, # Return the session_id so the client can use it for subsequent requests
-            "current_context_tokens": get_message_token_length(chat_histories[session_id])
+            "prompt_tokens": prompt_tokens # Return tokens in the prompt
         }
     except Exception as e:
-        print(f"❌ Error during generation for session {session_id}: {e}")
-        # Remove the last user message from history if generation failed to prevent bad state
-        if chat_histories[session_id] and chat_histories[session_id][-1]["role"] == "user":
-            chat_histories[session_id].pop()
-        return {"error": f"Failed to generate response: {e}. Please try again.", "session_id": session_id}, 500
+        print(f"❌ Error during generation: {e}")
+        return {"error": f"Failed to generate response: {e}. Please try again."}, 500