from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/phi-2-GGUF"
FILENAME = "phi-2.Q3_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = 4  # conservative default; raise toward physical_cores if the machine allows

# === Load the model ===
llm = Llama(
    model_path=model_path,
    n_ctx=2048,                     # can increase depending on available memory
    n_threads=recommended_threads,
    use_mlock=True,                 # lock model in RAM for faster access
    n_gpu_layers=0,                 # CPU only; set >0 to offload layers if a GPU is present
    chat_format="chatml",           # ChatML prompt template
    verbose=False
)

@app.get("/")
def root():
    return {"message": "✅ Phi-2 API is live and optimized!"}

@app.get("/get_sys")
def get_sys_specs():
    memory = psutil.virtual_memory()
    freq = psutil.cpu_freq()  # may be unavailable on some platforms
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": freq.max if freq else None,
            "cpu_usage": psutil.cpu_percent(interval=1)
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python": platform.python_version()
        }
    }

@app.get("/process_list")
def process_list():
    # Prime cpu_percent for each process: the first call always returns 0.0,
    # so take a reading, wait, then read again over the measurement window.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent(interval=None)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    time.sleep(1)  # measurement window

    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent(interval=None)
            if cpu > 10:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": cpu
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return {"heavy_processes": processes}

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    if not prompt:
        return {"error": "Missing 'prompt' in request body."}
    print("🧾 Prompt received:", prompt)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1024,        # leave room in the 2048-token context for the prompt
        temperature=0.7,
        stop=["<|im_end|>"]     # ChatML end-of-turn token
    )
    llm.reset()  # clear the model's internal state/KV cache between requests

    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
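
Once the server is running (for example with `uvicorn main:app --port 8000`, assuming the file is saved as `main.py`), the endpoints can be exercised with a short client like the sketch below. The module name, the local address and port, and the example prompt are illustrative assumptions, not part of the server code above; the sketch also assumes the `requests` package is installed.

```python
# Minimal client sketch for the API above.
# Assumes the server is running locally on uvicorn's default port (8000);
# the address and the example prompt are illustrative only.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local uvicorn address

# Check that the server is up.
print(requests.get(f"{BASE_URL}/").json())

# Send a prompt and print the model's reply.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Explain what a GGUF file is in one sentence."},
    timeout=300,  # CPU-only generation can be slow
)
resp.raise_for_status()
print(resp.json()["response"])
```

Before sending heavy prompts, `/get_sys` and `/process_list` are handy for checking how much CPU and RAM headroom the machine has left while the model is loaded.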