from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

import platform
import psutil


app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
FILENAME = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download only if not already present ===
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        # local_dir_use_symlinks is deprecated (and ignored) in newer
        # huggingface_hub releases; kept here for compatibility with older ones.
        local_dir_use_symlinks=False
    )
else:
    print(f"Model already exists at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=16,  # Adjust for your CPU
    n_batch=64
)
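# NOTE (sketch, not part of the original config): n_threads above is hard-coded.
# One alternative is to derive it from the physical core count, mirroring the
# "recommended_threads" value that /get_sys below reports, e.g.:
#
#     n_threads=min(psutil.cpu_count(logical=False) or 2, 8)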

@app.get("/")
def root():
    return {"message": "Mistral API is live!"}

@app.get("/get_sys")
def get_sys_specs():
    cpu_info = {
        "physical_cores": psutil.cpu_count(logical=False),
        "logical_cores": psutil.cpu_count(logical=True),
        "max_frequency_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else None,
        "cpu_usage_percent": psutil.cpu_percent(interval=1)
    }

    memory = psutil.virtual_memory()
    ram_info = {
        "total_gb": round(memory.total / (1024 ** 3), 2),
        "available_gb": round(memory.available / (1024 ** 3), 2),
        "used_percent": memory.percent
    }

    system_info = {
        "system": platform.system(),
        "machine": platform.machine(),
        "platform": platform.platform(),
        "processor": platform.processor(),
        "python_version": platform.python_version(),
    }

    return {
        "cpu": cpu_info,
        "ram": ram_info,
        "system": system_info,
        "recommended_threads": min(psutil.cpu_count(logical=False) or 2, 8)
    }

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")

    print("🧾 Received prompt:", prompt)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1024,
        temperature=0.7,
    )

    print("📤 Raw model response:", response)

    llm.reset()

    return {
        "response": response["choices"][0]["message"]["content"].strip()
    }
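
# --- Usage sketch (assumes this file is saved as main.py and uvicorn is installed;
# the port and prompt below are illustrative, not part of the original code) ---
#
# Start the server:
#     uvicorn main:app --host 0.0.0.0 --port 8000
#
# Example request using only the standard library:
#
#     import json, urllib.request
#     req = urllib.request.Request(
#         "http://localhost:8000/generate",
#         data=json.dumps({"prompt": "Explain GGUF quantization in one sentence."}).encode(),
#         headers={"Content-Type": "application/json"},
#     )
#     print(json.loads(urllib.request.urlopen(req).read())["response"])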