from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing
app = FastAPI()
# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
FILENAME = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download only if not already present ===
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"Model already exists at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=16,  # Adjust for your CPU
    n_batch=64
)
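# Note (illustrative, not in the original): n_threads is hard-coded above. One
# common alternative, assuming psutil (already imported) reports physical cores,
# is to mirror the "recommended_threads" value computed in /get_sys below, e.g.:
#   n_threads=min(psutil.cpu_count(logical=False) or 2, 8)
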
@app.get("/")
def root():
    return {"message": "Mistral API is live!"}

@app.get("/get_sys")
def get_sys_specs():
    cpu_info = {
        "physical_cores": psutil.cpu_count(logical=False),
        "logical_cores": psutil.cpu_count(logical=True),
        "max_frequency_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else None,
        "cpu_usage_percent": psutil.cpu_percent(interval=1)
    }
    memory = psutil.virtual_memory()
    ram_info = {
        "total_gb": round(memory.total / (1024 ** 3), 2),
        "available_gb": round(memory.available / (1024 ** 3), 2),
        "used_percent": memory.percent
    }
    system_info = {
        "system": platform.system(),
        "machine": platform.machine(),
        "platform": platform.platform(),
        "processor": platform.processor(),
        "python_version": platform.python_version(),
    }
    return {
        "cpu": cpu_info,
        "ram": ram_info,
        "system": system_info,
        "recommended_threads": min(psutil.cpu_count(logical=False) or 2, 8)
    }

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")
    print("🧾 Received prompt:", prompt)
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1024,
        temperature=0.7,
    )
    print("📤 Raw model response:", response)
    llm.reset()
    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
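
# Example usage (illustrative, not part of the original file; assumes this
# module is saved as main.py and uvicorn is installed):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Sample request against the /generate endpoint:
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain what a GGUF file is."}'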