import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "dasomaru/gemma-3-4bit-it-demo"

# 🚀 The tokenizer can be loaded ahead of time, even on CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 🚀 Load the model onto CPU first (no GPU has been allocated yet).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision, since this is a 4-bit checkpoint
    trust_remote_code=True,
)
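# Note: a hedged alternative, not part of the original app. If the checkpoint
# should actually be quantized to 4-bit at load time (rather than loaded in
# float16), transformers' BitsAndBytesConfig could be passed instead, e.g.:
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, quantization_config=quant_config, trust_remote_code=True
#   )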
@spaces.GPU  # A GPU is allocated whenever this function runs!
def chat(user_input):
    model.to("cuda")  # Move the model to the GPU inside the function!

    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=1.0,
            top_p=0.95,
            top_k=64,
            do_sample=True,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
# Set up the Gradio interface.
demo = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=2, placeholder="Type your message..."),
    outputs=gr.Textbox(lines=10),
    title="🧠 Gemma-3 4bit (ZeroGPU)",
    description="This Space uses the ZeroGPU feature. The first request might take a few seconds!",
)
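# Optional (assumption, not part of the original app): enabling Gradio's
# request queue before launching can help when several users hit the ZeroGPU
# Space at once, e.g. demo.queue(max_size=20).launch() instead of the plain
# launch below.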
demo.launch()
# Leftover commented-out example of the @spaces.GPU decorator, kept for reference:
# zero = torch.Tensor([0]).cuda()
# print(zero.device)  # <-- 'cpu' 🤔
#
# @spaces.GPU
# def greet(n):
#     print(zero.device)  # <-- 'cuda:0' 🤗
#     return f"Hello {zero + n} Tensor"
#
# demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
# demo.launch()