import os
import requests
import gradio as gr
from llama_cpp import Llama

MODEL_URL = "https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF/resolve/main/Ministral-3b-instruct.Q4_1.gguf?download=true"  # truncated for clarity
MODEL_PATH = "Ministral-3b-instruct.Q4_1.gguf"

# Download the model if it is not already present locally
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Model downloaded.")

# Load the model with adjustments for CPU
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,       # Reduced context window size
    n_threads=2,      # Reduced threads for CPU use
    n_gpu_layers=0,   # Set to 0 since we're using CPU
    chat_format="chatml",
)

def chat_interface(message, history):
    # history arrives from gr.ChatInterface as a list of (user, assistant) pairs
    chat_prompt = []
    for user_msg, bot_msg in history or []:
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})

    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    reply = response["choices"][0]["message"]["content"]
    # gr.ChatInterface manages the conversation history itself, so only the reply is returned
    return reply

gr.ChatInterface(fn=chat_interface, title="Ministral 3B Chat").launch()