"""Gradio chat app serving a quantized TinyLlama-1.1B chat model via ctransformers."""

from functools import lru_cache

import gradio as gr
import time  # noqa: F401  (unused here; kept in case external tooling relies on it)
from ctransformers import AutoModelForCausalLM  # NOTE(review): verify this import path matches the installed ctransformers version
from huggingface_hub import hf_hub_download  # noqa: F401  (unused here; kept deliberately)

# Prefix prepended to every user turn before the [INST] wrapper.
# Must be a *string*: the original `PROMPT_TEMPLATE = ( )` was an empty tuple,
# which made `PROMPT_TEMPLATE + f"..."` raise TypeError on every request.
PROMPT_TEMPLATE = ""


@lru_cache(maxsize=1)
def load_llm():
    """Load the quantized TinyLlama chat model and return it.

    Cached with ``lru_cache(maxsize=1)`` so the GGUF weights are read from
    disk once per process instead of on every chat turn.

    Returns:
        The ctransformers causal-LM object; calling it with a prompt string
        returns the generated text.
    """
    # gpu_layers=0 runs fully on CPU; raise it to offload layers to a GPU.
    return AutoModelForCausalLM.from_pretrained(
        "s3nh/PY007-TinyLlama-1.1B-Chat-v0.2-GGUF",
        model_file="PY007-TinyLlama-1.1B-Chat-v0.2.Q4_K_M.gguf",
        model_type="llama",
        gpu_layers=0,
        max_new_tokens=1096,
        repetition_penalty=1.13,
        temperature=0.1,
    )


def llm_function(message, chat_history):
    """Generate a reply for *message*.

    Args:
        message: The user's latest chat message.
        chat_history: Prior turns supplied by ``gr.ChatInterface``; currently
            unused (each turn is answered independently), but the parameter is
            required by the ChatInterface callback contract.

    Returns:
        The model's generated text for this turn.
    """
    llm = load_llm()
    formatted_message = PROMPT_TEMPLATE + f"[INST]{message}[/INST]"
    return llm(formatted_message)


title = "这里是小兮辞"

examples = [
    "What is yellow fever.",
]

# Launch the chat UI at module level (Gradio/Spaces entry-point convention).
gr.ChatInterface(
    fn=llm_function,
    title=title,
    examples=examples,
).launch()