"""Gradio chat app serving a quantized TinyLlama-1.1B chat model via ctransformers."""

from functools import lru_cache

import gradio as gr
import time  # noqa: F401  (unused here; kept in case external tooling relies on it)
from ctransformers import AutoModelForCausalLM  # NOTE(review): verify this import path matches the installed ctransformers version
from huggingface_hub import hf_hub_download  # noqa: F401  (unused here; kept deliberately)

# Prefix prepended to every user turn before the [INST] wrapper.
# Must be a *string*: the original `PROMPT_TEMPLATE = ( )` was an empty tuple,
# which made `PROMPT_TEMPLATE + f"..."` raise TypeError on every request.
PROMPT_TEMPLATE = ""


@lru_cache(maxsize=1)
def load_llm():
    """Load the quantized TinyLlama chat model and return it.

    Cached with ``lru_cache(maxsize=1)`` so the GGUF weights are read from
    disk once per process instead of on every chat turn.

    Returns:
        The ctransformers causal-LM object; calling it with a prompt string
        returns the generated text.
    """
    # gpu_layers=0 runs fully on CPU; raise it to offload layers to a GPU.
    return AutoModelForCausalLM.from_pretrained(
        "s3nh/PY007-TinyLlama-1.1B-Chat-v0.2-GGUF",
        model_file="PY007-TinyLlama-1.1B-Chat-v0.2.Q4_K_M.gguf",
        model_type="llama",
        gpu_layers=0,
        max_new_tokens=1096,
        repetition_penalty=1.13,
        temperature=0.1,
    )


def llm_function(message, chat_history):
    """Generate a reply for *message*.

    Args:
        message: The user's latest chat message.
        chat_history: Prior turns supplied by ``gr.ChatInterface``; currently
            unused (each turn is answered independently), but the parameter is
            required by the ChatInterface callback contract.

    Returns:
        The model's generated text for this turn.
    """
    llm = load_llm()
    formatted_message = PROMPT_TEMPLATE + f"[INST]{message}[/INST]"
    return llm(formatted_message)


title = "这里是小兮辞"

examples = [
    "What is yellow fever.",
]

# Launch the chat UI at module level (Gradio/Spaces entry-point convention).
gr.ChatInterface(
    fn=llm_function,
    title=title,
    examples=examples,
).launch()