from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/phi-2-GGUF"
FILENAME = "phi-2.Q3_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download the model if it is not already present ===
if not os.path.exists(MODEL_PATH):
    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Thread configuration ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = 4  # conservative fixed value; raise toward physical_cores on larger machines

# === Load the model ===
llm = Llama(
    model_path=model_path,
    n_ctx=2048,                  # context window; can be increased if memory allows
    n_threads=recommended_threads,
    use_mlock=True,              # lock the model in RAM for faster access
    n_gpu_layers=0,              # CPU only; set > 0 if a GPU is present
    chat_format="chatml",        # ChatML prompt template for chat completions
    verbose=False
)
@app.get("/")
def root():
return {"message": "✅ Hermes 2 Mistral API is live and optimized!"}
@app.get("/get_sys")
def get_sys_specs():
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": logical_cores,
"max_freq_mhz": psutil.cpu_freq().max,
"cpu_usage": psutil.cpu_percent(interval=1)
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python": platform.python_version()
}
}
@app.get("/process_list")
def process_list():
time.sleep(1) # Let CPU settle
processes = []
for proc in psutil.process_iter(['pid', 'name']):
try:
cpu = proc.cpu_percent()
if cpu > 10:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": cpu
})
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
return {"heavy_processes": processes}

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    print("🧾 Prompt received:", prompt)
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=2048,     # effectively capped by the remaining context window
        temperature=0.7,
        stop=["</s>"]
    )
    llm.reset()  # reset model state (KV cache) between requests
    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
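
# --- Example usage (illustrative sketch, not part of the app itself) ---
# Assuming the app is served with uvicorn on port 8000
# (e.g. `uvicorn app:app --host 0.0.0.0 --port 8000`); the host and port
# below are assumptions, adjust them to your deployment:
#
#   curl http://localhost:8000/get_sys
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain what a GGUF model file is."}'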