Priyanshukr-1 committed
Commit 191a9f9 · verified · 1 Parent(s): a839b23

Update app.py

Files changed (1)
  1. app.py +120 -43
app.py CHANGED
@@ -6,58 +6,86 @@ import platform
 import psutil
 import multiprocessing
 import time
+import uuid  # For generating unique session IDs

 app = FastAPI()

 # === Model Config ===
-REPO_ID = "TheBloke/phi-2-GGUF"
-FILENAME = "phi-2.Q3_K_M.gguf"
+# Switched to TinyLlama-1.1B-Chat-v1.0 for better CPU performance
+REPO_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0-GGUF"
+FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Q4_K_M is a good balance of size and quality
 MODEL_DIR = "models"
 MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

 # === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
     print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
-    model_path = hf_hub_download(
-        repo_id=REPO_ID,
-        filename=FILENAME,
-        cache_dir=MODEL_DIR,
-        local_dir=MODEL_DIR,
-        local_dir_use_symlinks=False
-    )
+    try:
+        model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=FILENAME,
+            cache_dir=MODEL_DIR,
+            local_dir=MODEL_DIR,
+            local_dir_use_symlinks=False
+        )
+        print(f"✅ Model downloaded to: {model_path}")
+    except Exception as e:
+        print(f"❌ Error downloading model: {e}")
+        # Exit or handle error appropriately if model download fails
+        exit(1)
 else:
     print(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH

 # === Optimal thread usage ===
+# psutil.cpu_count(logical=True) gives the number of logical cores (threads)
+# psutil.cpu_count(logical=False) gives the number of physical cores
+# For llama.cpp, n_threads often performs best when set to the number of physical cores,
+# or slightly more, but not exceeding logical cores. Experimentation is key.
 logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
-recommended_threads = 4
+# A common recommendation is to use physical cores or physical_cores * 2
+# Let's try physical_cores for a start, or a fixed value if physical_cores is too low.
+recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
+
+print(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+print(f"Using n_threads: {recommended_threads}")

 # === Load the model ===
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,  # Can increase depending on memory
-    n_threads=recommended_threads,
-    use_mlock=True,  # lock model in RAM for faster access
-    n_gpu_layers=0,  # CPU only, use >0 if GPU is present
-    chat_format="chatml",  # for Hermes 2
-    verbose=False
-)
+try:
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=1024,  # Reduced context for TinyLlama, can increase if memory allows and context is critical
+        n_threads=recommended_threads,
+        use_mlock=True,  # Lock model in RAM for faster access (good for stability on CPU)
+        n_gpu_layers=0,  # CPU only, keep at 0 for Hugging Face free tier
+        chat_format="chatml",  # TinyLlama Chat uses ChatML format
+        verbose=False
+    )
+    print("🚀 Llama model loaded successfully!")
+except Exception as e:
+    print(f"❌ Error loading Llama model: {e}")
+    exit(1)
+
+# === Global dictionary to store chat histories per session ===
+# In a production environment, this should be replaced with a persistent storage
+# like Redis, a database, or a dedicated session management system.
+chat_histories = {}

 @app.get("/")
 def root():
-    return {"message": "✅ Hermes 2 Mistral API is live and optimized!"}
+    return {"message": "✅ Data Analysis AI API is live and optimized!"}

 @app.get("/get_sys")
 def get_sys_specs():
+    """Returns system specifications including CPU, RAM, and OS details."""
     memory = psutil.virtual_memory()
     return {
         "CPU": {
             "physical_cores": physical_cores,
             "logical_cores": logical_cores,
-            "max_freq_mhz": psutil.cpu_freq().max,
-            "cpu_usage": psutil.cpu_percent(interval=1)
+            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
+            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage over 1 second
         },
         "RAM": {
             "total_GB": round(memory.total / (1024 ** 3), 2),
@@ -67,45 +95,94 @@ def get_sys_specs():
         "System": {
             "platform": platform.platform(),
             "architecture": platform.machine(),
-            "python": platform.python_version()
+            "python_version": platform.python_version()
+        },
+        "Model_Config": {
+            "model_name": FILENAME,
+            "n_ctx": llm.n_ctx(),
+            "n_threads": llm.n_threads(),
+            "use_mlock": llm.use_mlock()
         }
     }

 @app.get("/process_list")
 def process_list():
-    time.sleep(1)  # Let CPU settle
+    """Returns a list of processes consuming significant CPU."""
+    time.sleep(1)  # Let CPU settle for accurate measurement
     processes = []
-    for proc in psutil.process_iter(['pid', 'name']):
+    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
         try:
             cpu = proc.cpu_percent()
-            if cpu > 10:
+            mem = proc.memory_percent()
+            # Filter processes using more than 5% CPU or 2% memory
+            if cpu > 5 or mem > 2:
                 processes.append({
                     "pid": proc.pid,
                     "name": proc.name(),
-                    "cpu_percent": cpu
+                    "cpu_percent": round(cpu, 2),
+                    "memory_percent": round(mem, 2)
                 })
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
+        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
             pass
+    # Sort by CPU usage descending
+    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
     return {"heavy_processes": processes}

 @app.post("/generate")
 async def generate(request: Request):
+    """
+    Generates a response from the LLM, maintaining chat context.
+    Expects a JSON body with 'prompt' and optionally 'session_id'.
+    If 'session_id' is not provided, a new one will be generated.
+    """
     data = await request.json()
     prompt = data.get("prompt", "").strip()
-    print("🧾 Prompt received:", prompt)
-
-    response = llm.create_chat_completion(
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=2048,
-        temperature=0.7,
-        stop=["</s>"]
-    )
+    session_id = data.get("session_id")

-    llm.reset()  # Free memory after response
+    if not prompt:
+        return {"error": "Prompt cannot be empty"}, 400

-    return {
-        "response": response["choices"][0]["message"]["content"].strip()
-    }
+    # Generate a new session ID if not provided (for new conversations)
+    if not session_id:
+        session_id = str(uuid.uuid4())
+        # Initialize chat history for a new session with a system message
+        chat_histories[session_id] = [
+            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+        ]
+        print(f"🆕 New session created: {session_id}")
+    elif session_id not in chat_histories:
+        # If a session ID is provided but not found, re-initialize it
+        chat_histories[session_id] = [
+            {"role": "system", "content": "You are a helpful AI assistant for data analysis. Provide concise and actionable suggestions based on the data provided or questions asked. Keep your responses focused on data insights and actionable steps for report generation."}
+        ]
+        print(f"⚠️ Session ID {session_id} not found, re-initializing history.")
+
+    print(f"🧾 Prompt received for session {session_id}: {prompt}")
+
+    # Add the user's new message to the history for this session
+    chat_histories[session_id].append({"role": "user", "content": prompt})
+
+    try:
+        # Pass the entire chat history for context
+        response = llm.create_chat_completion(
+            messages=chat_histories[session_id],
+            max_tokens=512,  # Limit response length for faster generation
+            temperature=0.7,  # Adjust temperature for creativity vs. coherence (0.0-1.0)
+            stop=["</s>"]  # Stop sequence for TinyLlama Chat
+        )
+
+        ai_response_content = response["choices"][0]["message"]["content"].strip()
+
+        # Add the AI's response to the history for future turns
+        chat_histories[session_id].append({"role": "assistant", "content": ai_response_content})
+
+        return {
+            "response": ai_response_content,
+            "session_id": session_id  # Return the session_id so the client can use it for subsequent requests
+        }
+    except Exception as e:
+        print(f"❌ Error during generation for session {session_id}: {e}")
+        # Remove the last user message from history if generation failed to prevent bad state
+        if chat_histories[session_id] and chat_histories[session_id][-1]["role"] == "user":
+            chat_histories[session_id].pop()
+        return {"error": f"Failed to generate response: {e}. Please try again.", "session_id": session_id}, 500