from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

app = FastAPI()
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" |
|
FILENAME = "mistral-7b-instruct-v0.1.Q4_K_M.gguf" |
|
MODEL_DIR = "models" |
|
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME) |
|
|
|
|
|

# Download the weights on first start; later runs reuse the local copy.
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model {FILENAME} from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=MODEL_DIR,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False,  # deprecated and ignored by recent huggingface_hub releases
    )
else:
    print(f"Model already exists at: {MODEL_PATH}")
    model_path = MODEL_PATH

# Load the model once at startup so every request reuses the same instance.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,    # context window size in tokens
    n_threads=4,   # CPU threads used for inference
    n_batch=64,    # prompt tokens processed per batch
)


@app.get("/")
def root():
    return {"message": "Mistral API is live!"}
@app.post("/generate") |
|
async def generate(request: Request): |
|
data = await request.json() |
|
prompt = data.get("prompt", "") |
|
|
|
print("🧾 Received prompt:", prompt) |
|
|
|
response = llm.create_chat_completion( |
|
messages=[ |
|
{"role": "system", "content": "You are a helpful assistant."}, |
|
{"role": "user", "content": prompt} |
|
], |
|
max_tokens=1024, |
|
temperature=0.7, |
|
) |
|
|
|
print("📤 Raw model response:", response) |
|
|
|
llm.reset() |
|
|
|
return { |
|
"response": response["choices"][0]["message"]["content"].strip() |
|
} |
|
|
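
# Usage sketch (assumes this file is saved as main.py and served on the default
# port 8000; the module name, port, and prompt are illustrative, not part of the
# app above):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   import requests
#   r = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Explain GGUF quantization in one sentence."},
#   )
#   print(r.json()["response"])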