# Hugging Face Space: CPU-only chat demo for Ministral-3b-instruct (GGUF via llama.cpp).
# NOTE(review): the original "Spaces: / Sleeping / Sleeping" lines were scraped
# UI status text from the Space's web page, not part of the source file.
import os

import requests
import gradio as gr
from llama_cpp import Llama

# Remote location and local cache path of the quantized GGUF model weights.
MODEL_URL = "https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF/resolve/main/Ministral-3b-instruct.Q4_1.gguf?download=true"  # truncated for clarity
MODEL_PATH = "Ministral-3b-instruct.Q4_1.gguf"

# Download the model once; skip when a local copy already exists.
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    # Stream to disk in chunks so the multi-GB file never sits in memory.
    # timeout= guards against a stalled connection hanging the Space forever
    # (the original call had no timeout, i.e. it could block indefinitely).
    with requests.get(MODEL_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(MODEL_PATH, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Model downloaded.")
# Inference settings tuned for the Space's CPU-only hardware.
_LLAMA_KWARGS = {
    "model_path": MODEL_PATH,
    "n_ctx": 4096,         # reduced context window size
    "n_threads": 2,        # reduced thread count for CPU use
    "n_gpu_layers": 0,     # no layers offloaded: CPU-only
    "chat_format": "chatml",
}
llm = Llama(**_LLAMA_KWARGS)
def chat_interface(message, history):
    """Gradio ChatInterface callback: answer ``message`` given the chat so far.

    Args:
        message: The user's latest utterance.
        history: Prior turns as (user_msg, assistant_msg) pairs, or None on
            the first turn.

    Returns:
        The assistant's reply text only. ``gr.ChatInterface`` manages the
        history itself and expects the callback to return just the reply
        string — the original ``return reply, history`` tuple would have been
        rendered verbatim in the chat window.
    """
    if history is None:
        history = []
    # Rebuild the whole conversation as role-tagged dicts for the chatml format.
    chat_prompt = []
    for user_msg, bot_msg in history:
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})
    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    reply = response["choices"][0]["message"]["content"]
    return reply
# Build the chat UI and serve it; launch() blocks until the server stops.
demo = gr.ChatInterface(fn=chat_interface, title="Ministral 3B Chat")
demo.launch()