Update app.py
app.py
CHANGED
@@ -2,25 +2,22 @@ from fastapi import FastAPI, Request
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
-
 import platform
 import psutil
 import multiprocessing
 import time
 
-
-
 app = FastAPI()
 
 # === Model Config ===
-REPO_ID = "TheBloke/Mistral-7B-
-FILENAME = "mistral-7b
+REPO_ID = "TheBloke/Hermes-2-Pro-Mistral-7B-GGUF"
+FILENAME = "hermes-2-pro-mistral-7b.Q4_K_M.gguf"
 MODEL_DIR = "models"
 MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
 
-# === Download
+# === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
-    print(f"Downloading
+    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
     model_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
@@ -29,101 +26,86 @@ if not os.path.exists(MODEL_PATH):
         local_dir_use_symlinks=False
     )
 else:
-    print(f"Model already
+    print(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH
 
-# ===
+# === Optimal thread usage ===
+logical_cores = psutil.cpu_count(logical=True)
+physical_cores = psutil.cpu_count(logical=False)
+recommended_threads = min(physical_cores or 4, 8)
+
+# === Load the model ===
 llm = Llama(
     model_path=model_path,
-    n_ctx=
-    n_threads=
-    n_batch=
+    n_ctx=8192,            # Can increase depending on memory
+    n_threads=recommended_threads,
+    n_batch=64,            # adjust depending on RAM
+    use_mlock=True,        # lock model in RAM for faster access
+    n_gpu_layers=0,        # CPU only, use >0 if GPU is present
+    chat_format="chatml",  # for Hermes 2
+    verbose=False
 )
 
 @app.get("/")
 def root():
-    return {"message": "Mistral API is live!"}
+    return {"message": "✅ Hermes 2 Mistral API is live and optimized!"}
 
 @app.get("/get_sys")
 def get_sys_specs():
-    cpu_info = {
-        "physical_cores": psutil.cpu_count(logical=False),
-        "logical_cores": psutil.cpu_count(logical=True),
-        "max_frequency_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else None,
-        "cpu_usage_percent": psutil.cpu_percent(interval=1)
-    }
-
     memory = psutil.virtual_memory()
-    ram_info = {
-        "total_gb": round(memory.total / (1024 ** 3), 2),
-        "available_gb": round(memory.available / (1024 ** 3), 2),
-        "used_percent": memory.percent
-    }
-
-    system_info = {
-        "system": platform.system(),
-        "machine": platform.machine(),
-        "platform": platform.platform(),
-        "processor": platform.processor(),
-        "python_version": platform.python_version(),
-    }
-
     return {
-        "
-
-
-
+        "CPU": {
+            "physical_cores": physical_cores,
+            "logical_cores": logical_cores,
+            "max_freq_mhz": psutil.cpu_freq().max,
+            "cpu_usage": psutil.cpu_percent(interval=1)
+        },
+        "RAM": {
+            "total_GB": round(memory.total / (1024 ** 3), 2),
+            "available_GB": round(memory.available / (1024 ** 3), 2),
+            "usage_percent": memory.percent
+        },
+        "System": {
+            "platform": platform.platform(),
+            "architecture": platform.machine(),
+            "python": platform.python_version()
+        }
     }
 
 @app.get("/process_list")
 def process_list():
-
-
-    # Warm up CPU usage stats
-    for proc in psutil.process_iter():
-        try:
-            proc.cpu_percent(interval=None)  # Prime the value
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            pass
-
-    time.sleep(1)  # Let CPU usage accumulate over time
-
+    time.sleep(1)  # Let CPU settle
+    processes = []
     for proc in psutil.process_iter(['pid', 'name']):
         try:
-
-            if
-
+            cpu = proc.cpu_percent()
+            if cpu > 10:
+                processes.append({
                     "pid": proc.pid,
                     "name": proc.name(),
-                    "cpu_percent":
+                    "cpu_percent": cpu
                 })
         except (psutil.NoSuchProcess, psutil.AccessDenied):
             pass
-
-    return {
-        "process list": p_l
-    }
-
+    return {"heavy_processes": processes}
 
 @app.post("/generate")
 async def generate(request: Request):
     data = await request.json()
-    prompt = data.get("prompt", "")
-
-    print("🧾 Received prompt:", prompt)
+    prompt = data.get("prompt", "").strip()
+    print("🧾 Prompt received:", prompt)
 
     response = llm.create_chat_completion(
         messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "You are a helpful AI assistant."},
             {"role": "user", "content": prompt}
         ],
-        max_tokens=
+        max_tokens=2048,
         temperature=0.7,
+        stop=["</s>"]
     )
 
-
-
-    llm.reset()
+    llm.reset()  # Free memory after response
 
     return {
         "response": response["choices"][0]["message"]["content"].strip()
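
For reference, a minimal client-side sketch for exercising the updated endpoints. It assumes the app is served locally with uvicorn on port 8000 and that the requests library is installed; the host, port, and prompt text are illustrative assumptions, not part of this commit.

# Hypothetical smoke test for the updated API; assumes the server was started
# with: uvicorn app:app --host 0.0.0.0 --port 8000
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

print(requests.get(f"{BASE_URL}/").json())               # liveness message
print(requests.get(f"{BASE_URL}/get_sys").json())        # CPU / RAM / system info
print(requests.get(f"{BASE_URL}/process_list").json())   # processes above 10% CPU

resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Summarize what a GGUF model file is in one sentence."},
)
print(resp.json()["response"])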