import gradio as gr
from llm_rs import AutoModel,SessionConfig,GenerationConfig,Precision,KnownModels
from huggingface_hub import space_info
# Hugging Face Hub repo and filename of the 4-bit quantized GGML weights.
repo_name = "svjack/ggml"
file_name = "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"
# Example prompts surfaced in the gr.Examples widget of the UI below.
examples = [
    "How to promote Chinese traditional culture ?",
    "Explain the meaning of word Ottoman",
    "Explain the meaning of 👨",
    "Use following emojis to generate a short description of a scene , the emojis are 👨👩🔥❄️",
    "Use following emojis to generate a short description of a scene , the emojis are 🌲🔥👨💦",
]
# 2 inference threads / batch size 2 — sized for the 2-core Space CPU
# mentioned in the page header below.
session_config = SessionConfig(threads=2, batch_size=2)
# Downloads the model file from the Hub (cached on subsequent runs) and loads
# it as a Llama-architecture GGML model.
model = AutoModel.from_pretrained(repo_name, model_file=file_name, session_config=session_config, verbose=True, model_type=KnownModels.Llama)
# NOTE(review): dead code kept as a reference — loads the same weights from a
# local path instead of the Hub (useful for offline development).
'''
model_path = "/Users/svjack/Library/Application Support/nomic.ai/GPT4All/wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"
model = AutoModel.from_pretrained(model_path,
model_type=KnownModels.Llama)
'''
def process_stream(instruction, temperature, top_p, top_k, max_new_tokens, seed):
    """Stream a completion for *instruction*, yielding the growing answer.

    Wraps the user's text in an Alpaca-style instruction template, then yields
    the accumulated response after every generated token so the Gradio UI can
    render partial output while generation is still in progress.
    """
    # Interior lines are intentionally unindented: they are part of the
    # prompt string sent to the model.
    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
Answer:"""
    config = GenerationConfig(
        seed=seed,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )
    accumulated = ""
    for token_text in model.stream(prompt=prompt, generation_config=config):
        accumulated += token_text
        yield accumulated
# UI definition: one input column (question box, sampling options, submit
# button, streaming output panel) plus an examples row, wired to
# process_stream above.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    # Page header.
    gr.Markdown(
        """
Wizardlm-13b on CPU in Rust 🦀
This demo uses the [rustformers/llm](https://github.com/rustformers/llm) library via [llm-rs](https://github.com/LLukas22/llm-rs-python) on 2 CPU cores.
"""
    )
    # NOTE(review): dead code kept as a reference — a banner linking to a
    # companion ChatGLM3 few-shot Space (uses space_info imported above).
    '''
markdown_exp_size = "##"
lora_repo = "svjack/chatglm3-few-shot"
lora_repo_link = "svjack/chatglm3-few-shot/?input_list_index=1"
emoji_info = space_info(lora_repo).__dict__["cardData"]["emoji"]
space_cnt = 1
task_name = "[---Emojis to Image Prompt---]"
gr.Markdown(
value=f"{markdown_exp_size} {task_name} few shot prompt in ChatGLM3 Few Shot space repo (click submit to activate) : [{lora_repo_link}](https://huggingface.co/spaces/{lora_repo_link}) {emoji_info}",
visible=True,
elem_id="selected_space",
)
'''
    with gr.Row():
        with gr.Column():
            # Main question input; submitting it triggers generation (wired
            # below via instruction.submit).
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question or instruction here",
                    label="Question/Instruction",
                    elem_id="q-input",
                )
            # Sampling hyper-parameters, hidden behind a collapsed accordion.
            with gr.Accordion("Advanced Options:", open=False):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            temperature = gr.Slider(
                                label="Temperature",
                                value=0.8,
                                minimum=0.1,
                                maximum=1.0,
                                step=0.1,
                                interactive=True,
                                info="Higher values produce more diverse outputs",
                            )
                    with gr.Column():
                        with gr.Row():
                            top_p = gr.Slider(
                                label="Top-p (nucleus sampling)",
                                value=0.95,
                                minimum=0.0,
                                maximum=1.0,
                                step=0.01,
                                interactive=True,
                                info=(
                                    "Sample from the smallest possible set of tokens whose cumulative probability "
                                    "exceeds top_p. Set to 1 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            top_k = gr.Slider(
                                label="Top-k",
                                value=40,
                                minimum=5,
                                maximum=80,
                                step=1,
                                interactive=True,
                                info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
                            )
                    with gr.Column():
                        with gr.Row():
                            max_new_tokens = gr.Slider(
                                label="Maximum new tokens",
                                value=256,
                                minimum=0,
                                maximum=1024,
                                step=5,
                                interactive=True,
                                info="The maximum number of new tokens to generate",
                            )
                    with gr.Column():
                        with gr.Row():
                            # precision=0 forces the seed to an integer.
                            seed = gr.Number(
                                label="Seed",
                                value=42,
                                interactive=True,
                                info="The seed to use for the generation",
                                precision=0
                            )
            with gr.Row():
                submit = gr.Button("Submit")
            # Streaming output panel; updated incrementally by process_stream.
            with gr.Row():
                with gr.Box():
                    gr.Markdown("**Wizardlm-13b**")
                    output_7b = gr.Markdown()
    # Clickable example prompts (not cached, so each click runs the model).
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[instruction],
            cache_examples=False,
            fn=process_stream,
            outputs=output_7b,
        )
    # Both the Submit button and pressing Enter in the textbox run generation.
    submit.click(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_7b,
    )
    instruction.submit(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_7b,
    )
# Re-entering the Blocks context appends extra components to the same demo.
with demo:
    # Empty HTML placeholder (no visible effect; presumably kept as a hook
    # for injecting markup later).
    gr.HTML(
        '''
'''
    )
# queue() is required for generator (streaming) event handlers.
# NOTE(review): concurrency_count was removed in Gradio 4 — this call assumes
# a Gradio 3.x runtime; confirm against the Space's pinned gradio version.
demo.queue(max_size=4, concurrency_count=1).launch(debug=True)