import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "dasomaru/gemma-3-4bit-it-demo"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
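
# ZeroGPU note: no GPU is attached at import time, so the model is loaded on
# CPU here and only moved to CUDA inside the @spaces.GPU-decorated handler.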
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)


@spaces.GPU
def chat(user_input):
    # Move the model onto the GPU that ZeroGPU attaches for this call.
    model.to("cuda")

    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}],
    }]
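    # apply_chat_template renders the Gemma chat markup; add_generation_prompt=True
    # ends the prompt at the start of the model's turn.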
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
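
    # Sampling settings (temperature=1.0, top_p=0.95, top_k=64) match the
    # generation defaults published for Gemma 3; tune them as needed.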
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=1.0,
            top_p=0.95,
            top_k=64,
            do_sample=True,
        )

    # Decode only the newly generated tokens; splitting the decoded text on
    # user_input is fragile (it breaks if the reply quotes the input).
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


demo = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=2, placeholder="Type your message..."),
    outputs=gr.Textbox(lines=10),
    title="🧠 Gemma-3 4bit (ZeroGPU)",
    description="This Space uses the ZeroGPU feature. First request might take a few seconds!",
)

demo.launch()