import os
import requests
import gradio as gr
from llama_cpp import Llama

MODEL_URL = "https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF/resolve/main/Ministral-3b-instruct.Q4_1.gguf?download=true"  # truncated for clarity
MODEL_PATH = "Ministral-3b-instruct.Q4_1.gguf"

# Download the model if it is not already present locally
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Model downloaded.")

# Load the model with adjustments for CPU
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,       # Reduced context window size
    n_threads=2,      # Reduced threads for CPU use
    n_gpu_layers=0,   # Set to 0 since we're using CPU
    chat_format="chatml",
)

def chat_interface(message, history):
    # history arrives from gr.ChatInterface as a list of (user, assistant) pairs
    chat_prompt = []
    for user_msg, bot_msg in history or []:
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})

    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    reply = response["choices"][0]["message"]["content"]
    # gr.ChatInterface manages the conversation history itself, so only the reply is returned
    return reply

gr.ChatInterface(fn=chat_interface, title="Ministral 3B Chat").launch()