InferSC

Sleeping

File size: 2,749 Bytes

f0dc90a
 
 
 
d1d2b7d
f0dc90a
558a253
f0dc90a
 
d1d2b7d
 
 
f0dc90a
 
dbde5f6
f0dc90a
 
d1d2b7d
 
9190a90
f0dc90a
 
 
 
72e002c
f0dc90a
 
558a253
f0dc90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0013c1
255e744
ba0a926
255e744
ba0a926
255e744
d1d2b7d
255e744
 
a0013c1
255e744
f0dc90a
255e744
f0dc90a
 
90afce1

import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download

# Text shown at the top of the chat page.
title = "兮辞"
description = "\nInfer service\n"

# model_path is used both as the Hugging Face Hub repo id and as the local
# directory the weights are downloaded into; model_name is the GGUF file
# inside that repo.
model_path = "TheBloke/openbuddy-zephyr-7B-v14.1-GGUF"
model_name = "openbuddy-zephyr-7b-v14.1.Q4_K_M.gguf"

# Fetch the weights file before GPT4All is constructed, so the file is
# already present in the directory GPT4All is pointed at below.
hf_hub_download(
    repo_id=model_path,
    filename=model_name,
    local_dir=model_path,
    local_dir_use_symlinks=True,
)

print("Start the model init process")
model = GPT4All(model_name, model_path, allow_download=True, device="cpu")
print("Finish the model init process")

# Override the prompt wrapping and system prompt stored in the model config;
# the chat handler reads "promptTemplate" back when it assembles prompts.
for _config_key, _config_value in {
    "promptTemplate": "[INST] {0} [/INST]",
    "systemPrompt": "You are a helpful assistant named 兮辞.",
}.items():
    model.config[_config_key] = _config_value

# NOTE(review): this sets a private attribute of the GPT4All object directly;
# presumably it makes generate() behave as if a chat session were open --
# confirm against the installed gpt4all version.
model._is_chat_session_activated = True

# Upper bound on the number of tokens generated per reply.
max_new_tokens = 2048

def generater(message, history, temperature, top_p, top_k):
    """Stream a reply to ``message``, yielding the text produced so far.

    The prompt is the concatenation of every past turn in ``history``
    followed by the new message. Each user text is wrapped with
    ``model.config["promptTemplate"]`` and each past assistant reply is
    terminated with ``"<|im_end|>"``.

    Args:
        message: The new user message.
        history: Iterable of ``(user_message, assistant_message)`` pairs.
        temperature: Forwarded to ``model.generate`` as ``temp``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    template = model.config["promptTemplate"]

    # Collect the prompt pieces in order, then join them once.
    pieces = []
    for past_user, past_assistant in history:
        pieces.append(template.format(past_user))
        pieces.append(past_assistant + "<|im_end|>")
    pieces.append(template.format(message))
    full_prompt = "".join(pieces)

    stream = model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_new_tokens,
        streaming=True,
    )

    # Yield the whole reply so far on every chunk, not just the new chunk.
    reply_so_far = ""
    for chunk in stream:
        reply_so_far += chunk
        yield reply_so_far

def vote(data: gr.LikeData):
    """Handle a like/dislike event on a chat message.

    Placeholder: ``data.liked`` is inspected, but both outcomes currently
    do nothing and the function returns ``None`` either way.
    """
    if not data.liked:
        # Disliked: nothing is recorded yet.
        return None
    # Liked: nothing is recorded yet.
    return None

# Chat display with custom user/assistant avatars. It is passed to the
# ChatInterface below so that this configured widget is the one rendered.
chatbot = gr.Chatbot(
    avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'),
    bubble_full_width=False,
)

# Extra controls shown with the chat box. Their values are passed to
# `generater` after (message, history), so the order here must match its
# remaining parameters: temperature, top_p, top_k.
additional_inputs = [
    gr.Slider(
        label="temperature",
        value=0.5,
        minimum=0.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.",
    ),
    gr.Slider(
        label="top_p",
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        interactive=True,
        info="0.1 means only the tokens comprising the top 10% probability mass are considered. Suggest set to 1 and use temperature. 1 means 100% and will disable it",
    ),
    gr.Slider(
        label="top_k",
        value=40,
        minimum=0,
        maximum=1000,
        step=1,
        interactive=True,
        info="limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
    ),
]

# Fix: hand the custom chatbot to the interface. Without `chatbot=chatbot`
# the ChatInterface builds its own default Chatbot, so the avatar settings
# above are never displayed and the like-handler registered below is bound
# to a component that is not part of the rendered interface.
iface = gr.ChatInterface(
    fn=generater,
    chatbot=chatbot,
    title=title,
    description=description,
    additional_inputs=additional_inputs,
)

# Wrap the interface in a Blocks app to apply the custom stylesheet and to
# register the like/dislike callback on the chatbot.
with gr.Blocks(css="resourse/style/custom.css") as demo:
    chatbot.like(vote, None, None)
    iface.render()

if __name__ == "__main__":
    # queue() is called before launch(); the chat handler is a generator
    # that streams partial replies.
    demo.queue().launch()