from llama_cpp import Llama
import gradio as gr

# Instantiate the quantized GGUF model once at import time; chat() below
# reuses this single module-level instance for every request.
llm = Llama(
    n_threads=4,  # CPU threads for inference; tune to the cores your Space provides
    n_ctx=2048,  # context size handed to llama.cpp
    model_path="mental-health-chatbot-i1.Q4_K_M.gguf",  # swap the filename when using another quant
)

# Maps chat-history roles (Gradio "messages" format) to the labels used in the prompt.
_ROLE_LABELS = {"user": "User", "assistant": "Bot"}


def _message_text(content):
    """Return the plain text of a "messages"-format content value, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): newer Gradio releases appear to pass content as a list
        # of typed parts ({"type": "text", "text": ...}); confirm against the
        # installed version. Non-text parts (files, components) are ignored.
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        ]
        return "\n".join(texts) if texts else None
    return None


def _history_turns(history):
    """Convert chat history into a list of prompt chunks, oldest first.

    Accepts both history layouts a chat UI may supply:
    * pairs: ``[user_text, bot_text]`` per exchange (one chunk per exchange);
    * messages: ``{"role": ..., "content": ...}`` dicts (one chunk per message).
    """
    turns = []
    for entry in history or ():
        if isinstance(entry, dict):
            label = _ROLE_LABELS.get(entry.get("role"))
            text = _message_text(entry.get("content"))
            if label is None or text is None:
                continue  # system/other roles or non-text content: nothing to add
            turns.append(f"{label}: {text}\n")
        else:
            user, bot = entry
            # A missing half of an exchange is skipped instead of being
            # rendered as the literal string "None".
            chunk = ""
            if user is not None:
                chunk += f"User: {user}\n"
            if bot is not None:
                chunk += f"Bot: {bot}\n"
            if chunk:
                turns.append(chunk)
    return turns


def _fits_context(prompt, reply_budget):
    """Return True if *prompt* plus *reply_budget* tokens fits in the model context."""
    prompt_tokens = len(llm.tokenize(prompt.encode("utf-8")))
    return prompt_tokens + reply_budget <= llm.n_ctx()


def chat(message, history):
    """Generate the bot's reply to *message* given the prior conversation.

    Builds a plain "User: ... / Bot: ..." transcript from *history*, appends the
    new user message followed by an open "Bot:" cue, and asks the model to
    complete it.

    Args:
        message: The new user message (text).
        history: Prior conversation, either as ``[user, bot]`` pairs or as
            ``{"role", "content"}`` message dicts. May be empty or None.

    Returns:
        The generated reply with surrounding whitespace stripped. Generation
        stops at the first newline or "User:" marker, so the reply is a single
        line. NOTE(review): if the model's first output is a newline, this is
        an empty string.
    """
    max_reply_tokens = 128
    turns = _history_turns(history)
    tail = f"User: {message}\nBot:"

    # Drop the oldest turns until the prompt leaves room for the reply;
    # otherwise a long conversation overflows the context window and the
    # completion call fails.
    full_prompt = "".join(turns) + tail
    while turns and not _fits_context(full_prompt, max_reply_tokens):
        del turns[0]
        full_prompt = "".join(turns) + tail

    output = llm(
        full_prompt,
        max_tokens=max_reply_tokens,
        stop=["User:", "\n"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()

# Build a minimal Gradio chat front end around chat() and start serving it.
demo = gr.ChatInterface(fn=chat)
demo.launch()