from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# Hugging Face repository that hosts the quantized weights, and the GGUF
# file to pull from it.
model_name = "johnpaulbin/articulate-V1-Q8_0-GGUF"
model_file = "articulate-V1-Q8_0.gguf"

# Fetch the weights file; the returned local path is what llama.cpp loads.
model_path = hf_hub_download(filename=model_file, repo_id=model_name)

# Load the model once at import time so every chat request reuses it.
llm = Llama(
    n_gpu_layers=0,  # keep every layer on the CPU (free Spaces tier has no GPU)
    n_threads=2,  # CPU threads used for inference
    n_ctx=1024,  # context window in tokens (prompt + reply); adjust as needed
    model_path=model_path,
)

# Chat callback for Gradio, plus the helpers that normalise its history.
def _text_of(content):
    """Return the plain-text part of a history entry, or None if it has none.

    Strings are returned unchanged. A list of typed parts (dicts carrying a
    string under "text") is joined with newlines. Anything else -- None for
    a pending reply, file tuples, components -- yields None so the caller
    can skip it.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        parts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "\n".join(parts) if parts else None
    return None


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content dicts.

    Accepts both layouts: a list of (user, assistant) pairs and a list of
    {"role": ..., "content": ...} dicts. Entries without usable text or
    with a role other than "user"/"assistant" are dropped.
    """
    messages = []
    for entry in history or ():
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            user_msg, assistant_msg = entry
            turns = [("user", user_msg), ("assistant", assistant_msg)]
        for role, content in turns:
            text = _text_of(content)
            if role in ("user", "assistant") and text is not None:
                messages.append({"role": role, "content": text})
    return messages


def chat(message, history):
    """Generate the assistant's reply to ``message`` given prior ``history``.

    Args:
        message: The new user input.
        history: Earlier turns as supplied by Gradio, either as
            (user, assistant) pairs or as role/content dicts. May be empty
            or None.

    Returns:
        The generated reply text.

    Raises:
        ValueError: If the completion call still fails once only the
            current message is left in the prompt.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    while True:
        try:
            # Greedy decoding: top_k=1 always picks the most likely token;
            # the low temperature is kept only as a belt-and-braces setting.
            response = llm.create_chat_completion(
                messages=messages,
                max_tokens=100,  # limit output length
                top_k=1,
                temperature=0.01,
            )
        except ValueError:
            # llama-cpp-python raises ValueError when the prompt does not fit
            # in the context window. History only ever grows, so forget the
            # oldest turn and retry rather than failing on every later call.
            # Any other ValueError resurfaces once only the current message
            # is left.
            if len(messages) == 1:
                raise
            del messages[0]
            # Keep the prompt starting on a user turn.
            while len(messages) > 1 and messages[0]["role"] != "user":
                del messages[0]
        else:
            return response["choices"][0]["message"]["content"]

# Build the chat UI around the `chat` callback.
iface = gr.ChatInterface(
    title="Articulate V1 Chatbot",
    description="Chat with the Articulate V1 model (Llama 3-based) using greedy decoding.",
    fn=chat,
)

# Start the Gradio app.
iface.launch()