from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing
import time
import uuid # For generating unique session IDs
import tiktoken # For estimating token count
app = FastAPI()
# === Model Config ===
REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # Q4_K_M is a good balance of size and quality
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        print(f"✅ Model downloaded to: {model_path}")
    except Exception as e:
        print(f"❌ Error downloading model: {e}")
        # Exit if the model download fails; the API cannot run without it
        exit(1)
else:
    print(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH
# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # Ensure at least 1 thread even if the core count is unknown
print(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
print(f"Using n_threads: {recommended_threads}")
# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,                # Context window size for the model
        n_threads=recommended_threads,
        use_mlock=True,            # Lock model in RAM for faster access
        n_gpu_layers=0,            # CPU only
        chat_format="chatml",      # TinyLlama Chat uses ChatML format
        verbose=False
    )
    print("🚀 Llama model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading Llama model: {e}")
    exit(1)
# Initialize a tiktoken encoder for token counting. This is only an approximation for
# GGUF models: 'cl100k_base' is OpenAI's tokenizer, not TinyLlama's, but it is close
# enough for budgeting context. For exact counts you would need the model's own tokenizer
# (llama.cpp's internal tokenization), which is harder to access directly here.
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Using basic len() for token estimation.")
    encoding = None
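# Illustrative example (for orientation only, not executed): with the encoder loaded,
#   len(encoding.encode("Suggest a chart for monthly revenue"))
# gives the number of BPE tokens cl100k_base assigns to that string. TinyLlama's own
# tokenizer would produce a slightly different count, which is why this is treated as
# an estimate rather than an exact budget.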
# === Global dictionary to store chat histories per session ===
chat_histories = {}
# === Context Truncation Settings ===
# Max tokens for the entire conversation history (input to the model)
# This should be less than n_ctx to leave room for the new prompt and generated response.
MAX_CONTEXT_TOKENS = 800 # Keep total input context below this, leaving 224 tokens for new prompt + response
def count_tokens_in_message(message):
    """Estimates tokens in a single message using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(message.get("content", "")))
    else:
        # Fallback when tiktoken isn't available: rough estimate of 1 token per 4 characters
        return len(message.get("content", "")) // 4
def get_message_token_length(messages):
    """Calculates total tokens for a list of messages."""
    total_tokens = 0
    for message in messages:
        total_tokens += count_tokens_in_message(message)
    return total_tokens
def truncate_history(history, max_tokens):
    """
    Truncates the chat history to fit within max_tokens.
    Keeps the system message and the most recent messages.
    """
    if not history:
        return []
    # Always keep the system message
    system_message = history[0]
    truncated_history = [system_message]
    current_tokens = count_tokens_in_message(system_message)
    # Add messages starting from the most recent, until max_tokens is reached
    for message in reversed(history[1:]):
        message_tokens = count_tokens_in_message(message)
        if current_tokens + message_tokens <= max_tokens:
            truncated_history.insert(1, message)  # Insert after the system message to keep chronological order
            current_tokens += message_tokens
        else:
            break  # Stop adding once the next message would exceed the limit
    return truncated_history
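# Worked example (illustrative, assuming a budget of max_tokens=50): given
# history = [system, user1, assistant1, user2], truncate_history keeps the system
# message, then walks backwards adding user2, assistant1, user1 while they still fit,
# inserting each at index 1 so the returned list stays in chronological order. If
# user1 would push the total past 50 tokens, it and everything older are dropped.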
@app.get("/")
def root():
    return {"message": "✅ Data Analysis AI API is live and optimized!"}
@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage sampled over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            # Report the values configured above; this avoids relying on accessor
            # methods that llama_cpp's Llama object may not provide.
            "n_threads": recommended_threads,
            "use_mlock": True
        }
    }
@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    # Prime cpu_percent() for each process, then wait so the second call measures
    # usage over a real interval instead of returning 0.0.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    time.sleep(1)
    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            # Filter processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    # Sort by CPU usage descending
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}
@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM, maintaining chat context.
    Expects a JSON body with 'prompt' and optionally 'session_id'.
    If 'session_id' is not provided, a new one will be generated.
    """
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    session_id = data.get("session_id")
    if not prompt:
        return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})
    # Generate a new session ID if not provided (for new conversations)
    if not session_id:
        session_id = str(uuid.uuid4())
        # Initialize chat history for a new session with a system message
        chat_histories[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
        ]
        print(f"🆕 New session created: {session_id}")
    elif session_id not in chat_histories:
        # If a session ID is provided but not found, re-initialize it
        chat_histories[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
        ]
        print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
    print(f"🧾 Prompt received for session {session_id}: {prompt}")
    # Add the user's new message to a copy of the history so we can check the total length
    current_messages = list(chat_histories[session_id])
    current_messages.append({"role": "user", "content": prompt})
    # Truncate history if it exceeds the max context tokens. The history budget subtracts
    # the new prompt's tokens plus a ~200-token buffer for the generated response, so the
    # retained history fits alongside them within MAX_CONTEXT_TOKENS.
    effective_max_history_tokens = MAX_CONTEXT_TOKENS - count_tokens_in_message({"role": "user", "content": prompt}) - 200
    if get_message_token_length(current_messages) > MAX_CONTEXT_TOKENS:
        print(f"✂️ Truncating history for session {session_id}. Current tokens: {get_message_token_length(current_messages)}")
        chat_histories[session_id] = truncate_history(current_messages, effective_max_history_tokens)
        # Re-add the current user prompt if truncation dropped it
        if chat_histories[session_id][-1]["role"] != "user" or chat_histories[session_id][-1]["content"] != prompt:
            chat_histories[session_id].append({"role": "user", "content": prompt})
        print(f"✅ History truncated. New tokens: {get_message_token_length(chat_histories[session_id])}")
    else:
        chat_histories[session_id] = current_messages  # Not truncated: just update with the new message
    try:
        # Pass the (potentially truncated) chat history for context
        response = llm.create_chat_completion(
            messages=chat_histories[session_id],
            max_tokens=256,    # Limit response length for faster generation
            temperature=0.7,   # Balance creativity vs. coherence (0.0-1.0)
            stop=["</s>"]      # Stop sequence for TinyLlama Chat
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        # Add the AI's response to the history for future turns
        chat_histories[session_id].append({"role": "assistant", "content": ai_response_content})
        return {
            "response": ai_response_content,
            "session_id": session_id,  # Returned so the client can reuse it for follow-up requests
            "current_context_tokens": get_message_token_length(chat_histories[session_id])
        }
    except Exception as e:
        print(f"❌ Error during generation for session {session_id}: {e}")
        # Remove the last user message from history if generation failed, to avoid a bad state
        if chat_histories[session_id] and chat_histories[session_id][-1]["role"] == "user":
            chat_histories[session_id].pop()
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to generate response: {e}. Please try again.", "session_id": session_id}
        )