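"""FastAPI service that serves a local GGUF model via llama-cpp-python.

Endpoints (as defined below):
    GET  /              - liveness check
    GET  /get_sys       - CPU, RAM and platform information
    GET  /process_list  - processes currently using more than 10% CPU
    POST /generate      - chat completion for a JSON body like {"prompt": "..."}
"""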
from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
app = FastAPI()
# === Model Config ===
REPO_ID = "TheBloke/phi-2-GGUF"
FILENAME = "phi-2.Q3_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
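# Note: Q3_K_M is a 3-bit quantised build; other quantisations published in the
# same repo should work as drop-in replacements (larger files trade RAM for quality).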
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH
# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
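# Conservative hardcoded default; for llama.cpp the physical core count
# (computed above) is usually a better upper bound on machines with more cores.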
recommended_threads = 4
# === Load the model ===
llm = Llama(
    model_path=model_path,
    n_ctx=2048,                   # context window; can be increased if memory allows
    n_threads=recommended_threads,
    use_mlock=True,               # lock model pages in RAM for faster access
    n_gpu_layers=0,               # CPU only; set >0 to offload layers if a GPU is present
    chat_format="chatml",         # prompt template used for chat completions
    verbose=False
)
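# use_mlock pins the model's pages in RAM; on Linux this needs enough free memory
# and a sufficient memlock limit (ulimit -l), otherwise locking may fail with a warning.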
@app.get("/")
def root():
return {"message": "✅ Hermes 2 Mistral API is live and optimized!"}
@app.get("/get_sys")
def get_sys_specs():
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": logical_cores,
"max_freq_mhz": psutil.cpu_freq().max,
"cpu_usage": psutil.cpu_percent(interval=1)
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python": platform.python_version()
}
}
@app.get("/process_list")
def process_list():
time.sleep(1) # Let CPU settle
processes = []
for proc in psutil.process_iter(['pid', 'name']):
try:
cpu = proc.cpu_percent()
if cpu > 10:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": cpu
})
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
return {"heavy_processes": processes}
@app.post("/generate")
async def generate(request: Request):
data = await request.json()
prompt = data.get("prompt", "").strip()
print("🧾 Prompt received:", prompt)
response = llm.create_chat_completion(
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": prompt}
],
max_tokens=2048,
temperature=0.7,
stop=["</s>"]
)
llm.reset() # Free memory after response
return {
"response": response["choices"][0]["message"]["content"].strip()
}
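# Example request, assuming this file is served with uvicorn as `main:app` on
# port 8000 (both the module name and port are illustrative):
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain what a context window is."}'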