# FastAPI server that serves a local GGUF model (phi-2) with llama-cpp-python
# and exposes a few lightweight system-monitoring endpoints.
from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time

app = FastAPI()

REPO_ID = "TheBloke/phi-2-GGUF"
FILENAME = "phi-2.Q3_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# Download the GGUF file once and reuse it on subsequent starts.
if not os.path.exists(MODEL_PATH):
    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=MODEL_DIR,  # place the file directly in ./models instead of the HF cache
    )
else:
    print(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# CPU information (reported by /get_sys).
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = 4  # conservative default; physical_cores is usually a good alternative

llm = Llama(
    model_path=model_path,
    n_ctx=2048,                      # context window shared by prompt and completion
    n_threads=recommended_threads,
    use_mlock=True,                  # keep model weights locked in RAM to avoid swapping
    n_gpu_layers=0,                  # CPU-only inference
    chat_format="chatml",
    verbose=False
)
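
# Optional warm-up call, a sketch that is not part of the original flow: running
# one tiny completion at startup confirms the model loads and responds before the
# API takes real traffic. Uncomment to enable.
# _warmup = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello in one word."}],
#     max_tokens=8,
# )
# print("🔥 Warm-up reply:", _warmup["choices"][0]["message"]["content"].strip())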


@app.get("/")
def root():
    return {"message": "✅ Phi-2 API is live and optimized!"}


@app.get("/get_sys")
def get_sys_specs():
    memory = psutil.virtual_memory()
    cpu_freq = psutil.cpu_freq()  # can be None on some platforms/VMs
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": cpu_freq.max if cpu_freq else None,
            "cpu_usage": psutil.cpu_percent(interval=1)
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python": platform.python_version()
        }
    }


@app.get("/process_list")
def process_list():
    # cpu_percent() returns 0.0 on its first call for a given process, so prime
    # every process first, wait a beat, then read the real usage figures.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    time.sleep(1)

    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent()
            if cpu > 10:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": cpu
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return {"heavy_processes": processes}


@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    print("🧾 Prompt received:", prompt)

    if not prompt:
        return {"error": "Empty prompt."}

    # Note: the 2048-token context window is shared between the prompt and the
    # completion, so long prompts leave fewer tokens for the response.
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=2048,
        temperature=0.7,
        stop=["</s>"]
    )

    llm.reset()  # clear the model state between independent requests

    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
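

# Minimal local entry point and example request (a sketch: it assumes uvicorn is
# installed and that this file is saved as main.py; adjust the module name to
# match your file).
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain what a GGUF file is."}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)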