from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
FILENAME = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download only if not already present ===
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"Model already exists at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=16,  # Adjust for your CPU
    n_batch=64
)


@app.get("/")
def root():
    return {"message": "Mistral API is live!"}


@app.get("/get_sys")
def get_sys_specs():
    cpu_info = {
        "physical_cores": psutil.cpu_count(logical=False),
        "logical_cores": psutil.cpu_count(logical=True),
        "max_frequency_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else None,
        "cpu_usage_percent": psutil.cpu_percent(interval=1)
    }

    memory = psutil.virtual_memory()
    ram_info = {
        "total_gb": round(memory.total / (1024 ** 3), 2),
        "available_gb": round(memory.available / (1024 ** 3), 2),
        "used_percent": memory.percent
    }

    system_info = {
        "system": platform.system(),
        "machine": platform.machine(),
        "platform": platform.platform(),
        "processor": platform.processor(),
        "python_version": platform.python_version(),
    }

    return {
        "cpu": cpu_info,
        "ram": ram_info,
        "system": system_info,
        "recommended_threads": min(psutil.cpu_count(logical=False) or 2, 8)
    }


@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")
    print("🧾 Received prompt:", prompt)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1024,
        temperature=0.7,
    )

    print("📤 Raw model response:", response)
    llm.reset()

    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
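

# --- Usage sketch (assumption: this file is saved as main.py; any other module
# --- name works as long as the uvicorn target is adjusted to match) ---
#
# Launch the server via the CLI:
#   uvicorn main:app --host 0.0.0.0 --port 8000
# or by running this file directly:
if __name__ == "__main__":
    import uvicorn

    # Serve on all interfaces, uvicorn's default port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request against the /generate endpoint from another process,
# e.g. with the `requests` library:
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:8000/generate",
#       json={"prompt": "Explain GGUF quantization in one sentence."},
#   )
#   print(resp.json()["response"])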