# Hugging Face Space — runs on ZeroGPU ("Running on Zero") hardware.
import spaces | |
import gradio as gr | |
import torch | |
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer | |
# Model checkpoint served by this Space.
MODEL_ID = "marcelbinz/Llama-3.1-Centaur-70B"

# 4-bit NF4 quantization with double quantization keeps the 70B model small
# enough to load; de-quantized compute runs in bfloat16.
bnb_4bit_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Load the tokenizer, then the quantized model sharded automatically across
# whatever accelerators are visible.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_4bit_config,
    attn_implementation="flash_attention_2",  # fused attention kernels
    device_map="auto",
)
# Bundle model + tokenizer into a ready-to-call text-generation pipeline.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
)
@spaces.GPU  # required on ZeroGPU Spaces: attaches a GPU for this call.
             # `spaces` was imported at the top of the file but never used.
def infer(prompt):
    """Generate a single continuation token for *prompt*.

    The model is queried one token at a time (``max_new_tokens=1``), sampled
    at temperature 1.0, so the response is the model's immediate next-token
    choice. Only the newly generated text is returned, not the echoed prompt.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=1,
        do_sample=True,
        temperature=1.0,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]
# --- Gradio UI -------------------------------------------------------------
# Fixed-height prompt/response boxes side by side, plus a single Run button.
_CSS = """
#prompt-box textarea {height:200px}
#answer-box textarea {height:320px}
"""

with gr.Blocks(fill_width=True, css=_CSS) as demo:
    with gr.Row(equal_height=True):
        inp = gr.Textbox(
            label="Prompt",
            elem_id="prompt-box",
            lines=12,
            max_lines=12,
            scale=3,
        )
        outp = gr.Textbox(
            label="Response",
            elem_id="answer-box",
            lines=1,
            interactive=False,
            scale=3,
        )
    run = gr.Button("Run")
    run.click(infer, inp, outp)

# Queue requests so concurrent users are served in order, then serve the app.
demo.queue().launch()