Update app.py
app.py
CHANGED
@@ -6,58 +6,86 @@ import platform
 import psutil
 import multiprocessing
 import time
+import uuid  # For generating unique session IDs

 app = FastAPI()

 # === Model Config ===
+# Switched to TinyLlama-1.1B-Chat-v1.0 for better CPU performance
+REPO_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0-GGUF"
+FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Q4_K_M is a good balance of size and quality
 MODEL_DIR = "models"
 MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

 # === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
     print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
+    try:
+        model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=FILENAME,
+            cache_dir=MODEL_DIR,
+            local_dir=MODEL_DIR,
+            local_dir_use_symlinks=False
+        )
+        print(f"✅ Model downloaded to: {model_path}")
+    except Exception as e:
+        print(f"❌ Error downloading model: {e}")
+        # Exit or handle error appropriately if model download fails
+        exit(1)
 else:
     print(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH

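Side note on the download call: newer releases of huggingface_hub deprecate the local_dir_use_symlinks argument (it no longer has an effect when local_dir is set), so a trimmed-down call behaves the same. This is only a sketch under that version assumption, not part of the committed file:

    from huggingface_hub import hf_hub_download

    # Downloads the GGUF file straight into MODEL_DIR and returns its path.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=MODEL_DIR)
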
 # === Optimal thread usage ===
+# psutil.cpu_count(logical=True) gives the number of logical cores (threads)
+# psutil.cpu_count(logical=False) gives the number of physical cores
+# For llama.cpp, n_threads often performs best when set to the number of physical cores,
+# or slightly more, but not exceeding logical cores. Experimentation is key.
 logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
+# A common recommendation is to use physical cores or physical_cores * 2
+# Let's try physical_cores for a start, or a fixed value if physical_cores is too low.
+recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
+
+print(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+print(f"Using n_threads: {recommended_threads}")

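The comment above leaves the thread count to experimentation. A rough, hypothetical benchmark of that kind is sketched below; it assumes the same TinyLlama GGUF file and llama-cpp-python, and the candidate list and test prompt are arbitrary choices, not part of the commit:

    # Compare a few n_threads values on the target machine before settling on one.
    import time
    from llama_cpp import Llama

    def benchmark_threads(model_path: str, candidates=(2, 4, 6, 8)) -> dict:
        timings = {}
        for n in candidates:
            llm_test = Llama(model_path=model_path, n_ctx=512, n_threads=n,
                             n_gpu_layers=0, chat_format="chatml", verbose=False)
            start = time.time()
            llm_test.create_chat_completion(
                messages=[{"role": "user", "content": "Name three chart types."}],
                max_tokens=64,
            )
            timings[n] = round(time.time() - start, 2)  # seconds; lower is better
            del llm_test  # free the model before loading the next configuration
        return timings
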
 # === Load the model ===
+try:
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=1024,  # Reduced context for TinyLlama, can increase if memory allows and context is critical
+        n_threads=recommended_threads,
+        use_mlock=True,  # Lock model in RAM for faster access (good for stability on CPU)
+        n_gpu_layers=0,  # CPU only, keep at 0 for Hugging Face free tier
+        chat_format="chatml",  # TinyLlama Chat uses ChatML format
+        verbose=False
+    )
+    print("🚀 Llama model loaded successfully!")
+except Exception as e:
+    print(f"❌ Error loading Llama model: {e}")
+    exit(1)
+
+# === Global dictionary to store chat histories per session ===
+# In a production environment, this should be replaced with persistent storage
+# like Redis, a database, or a dedicated session management system.
+chat_histories = {}

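As the comment notes, the in-memory chat_histories dict loses every session on restart and does not scale past a single worker process. A minimal sketch of what a Redis-backed variant could look like, assuming a reachable Redis instance and the redis-py package; the helper names load_history and append_message are hypothetical:

    import json
    import redis

    r = redis.Redis(host="localhost", port=6379, decode_responses=True)

    def load_history(session_id: str) -> list:
        # Each session is stored as a Redis list of JSON-encoded chat messages.
        return [json.loads(m) for m in r.lrange(f"chat:{session_id}", 0, -1)]

    def append_message(session_id: str, role: str, content: str) -> None:
        r.rpush(f"chat:{session_id}", json.dumps({"role": role, "content": content}))
        r.expire(f"chat:{session_id}", 3600)  # drop idle sessions after an hour
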
 @app.get("/")
 def root():
+    return {"message": "✅ Data Analysis AI API is live and optimized!"}

 @app.get("/get_sys")
 def get_sys_specs():
+    """Returns system specifications including CPU, RAM, and OS details."""
     memory = psutil.virtual_memory()
     return {
         "CPU": {
             "physical_cores": physical_cores,
             "logical_cores": logical_cores,
-            "max_freq_mhz": psutil.cpu_freq().max,
+            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
+            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage over 1 second
         },
         "RAM": {
             "total_GB": round(memory.total / (1024 ** 3), 2),
@@ -67,45 +95,94 @@ def get_sys_specs():
         "System": {
             "platform": platform.platform(),
             "architecture": platform.machine(),
+            "python_version": platform.python_version()
+        },
+        "Model_Config": {
+            "model_name": FILENAME,
+            "n_ctx": llm.n_ctx(),
+            "n_threads": llm.n_threads(),
+            "use_mlock": llm.use_mlock()
         }
     }

 @app.get("/process_list")
 def process_list():
+    """Returns a list of processes consuming significant CPU."""
+    time.sleep(1)  # Let CPU settle for accurate measurement
     processes = []
-    for proc in psutil.process_iter(['pid', 'name']):
+    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
         try:
             cpu = proc.cpu_percent()
+            mem = proc.memory_percent()
+            # Filter processes using more than 5% CPU or 2% memory
+            if cpu > 5 or mem > 2:
                 processes.append({
                     "pid": proc.pid,
                     "name": proc.name(),
-                    "cpu_percent": cpu
+                    "cpu_percent": round(cpu, 2),
+                    "memory_percent": round(mem, 2)
                 })
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
+        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
             pass
+    # Sort by CPU usage descending
+    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
     return {"heavy_processes": processes}

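For reference, the two monitoring endpoints can be smoke-tested from Python once the server is up; the localhost URL and port assume a local launch such as `uvicorn app:app --port 7860` and are illustrative only:

    import requests

    print(requests.get("http://localhost:7860/get_sys").json()["CPU"])
    print(requests.get("http://localhost:7860/process_list").json()["heavy_processes"][:3])
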
 @app.post("/generate")
 async def generate(request: Request):
+    """
+    Generates a response from the LLM, maintaining chat context.
+    Expects a JSON body with 'prompt' and optionally 'session_id'.
+    If 'session_id' is not provided, a new one will be generated.
+    """
     data = await request.json()
     prompt = data.get("prompt", "").strip()
+    session_id = data.get("session_id")

+    if not prompt:
+        return {"error": "Prompt cannot be empty"}, 400

+    # Generate a new session ID if not provided (for new conversations)
+    if not session_id:
+        session_id = str(uuid.uuid4())
+        # Initialize chat history for a new session with a system message
+        chat_histories[session_id] = [
+            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+        ]
+        print(f"🆕 New session created: {session_id}")
+    elif session_id not in chat_histories:
+        # If a session ID is provided but not found, re-initialize it
+        chat_histories[session_id] = [
+            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+        ]
+        print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
+
+    print(f"🧾 Prompt received for session {session_id}: {prompt}")
+
+    # Add the user's new message to the history for this session
+    chat_histories[session_id].append({"role": "user", "content": prompt})
+
-    response = llm.create_chat_completion(
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=2048,
-        temperature=0.7,
-        stop=["</s>"]
-    )
+    try:
+        # Pass the entire chat history for context
+        response = llm.create_chat_completion(
+            messages=chat_histories[session_id],
+            max_tokens=512,  # Limit response length for faster generation
+            temperature=0.7,  # Adjust temperature for creativity vs. coherence (0.0-1.0)
+            stop=["</s>"]  # Stop sequence for TinyLlama Chat
+        )
+
+        ai_response_content = response["choices"][0]["message"]["content"].strip()
+
+        # Add the AI's response to the history for future turns
+        chat_histories[session_id].append({"role": "assistant", "content": ai_response_content})
+
+        return {
+            "response": ai_response_content,
+            "session_id": session_id  # Return the session_id so the client can use it for subsequent requests
+        }
+    except Exception as e:
+        print(f"❌ Error during generation for session {session_id}: {e}")
+        # Remove the last user message from history if generation failed to prevent bad state
+        if chat_histories[session_id] and chat_histories[session_id][-1]["role"] == "user":
+            chat_histories[session_id].pop()
+        return {"error": f"Failed to generate response: {e}. Please try again.", "session_id": session_id}, 500
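A minimal client round trip against the new endpoint might look like the following sketch; the base URL is an assumption, and the returned session_id is passed back so the second turn keeps the first turn as context:

    import requests

    BASE = "http://localhost:7860"  # adjust to the deployed Space URL

    first = requests.post(f"{BASE}/generate",
                          json={"prompt": "What columns does a sales report need?"}).json()
    print(first["response"])

    # Reuse the session_id so the model sees the earlier turn as context.
    follow_up = requests.post(f"{BASE}/generate", json={
        "prompt": "Now suggest two charts for that data.",
        "session_id": first["session_id"],
    }).json()
    print(follow_up["response"])

One caveat on the error paths: returning `{"error": ...}, 400` (or `, 500`) from a FastAPI path function does not set the HTTP status code; the tuple is serialized as the JSON body of a 200 response. Raising fastapi.HTTPException(status_code=400, detail=...) or returning a fastapi.responses.JSONResponse(status_code=500, content=...) would give clients a real error status.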