Spaces:

matthartman
/

gemma-3n

Runtime error

App Files Files Community

matthartman commited on 17 days ago

Commit

294ffc2

verified ·

1 Parent(s): ddf06c6

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +73 -0

app.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import gradio as gr
+import spaces
+import torch
+from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC, WebRTCData, get_cloudflare_turn_credentials_async
+from threading import Thread
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers.generation.streamers import TextIteratorStreamer
+MODEL_ID = "google/gemma-3-27b-it"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype=torch.float16,
+)
+@spaces.GPU(time_limit=120)
+def generate(data: WebRTCData, history, system_prompt="", max_new_tokens=512):
+    text = data.textbox
+    history.append({"role": "user", "content": text})
+    yield AdditionalOutputs(history)
+    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
+    messages.extend(history)
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        tokenize=True,
+    ).to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = dict(
+        input_ids=inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+    )
+    Thread(target=model.generate, kwargs=gen_kwargs).start()
+    new_message = {"role": "assistant", "content": ""}
+    for token in streamer:
+        new_message["content"] += token
+        yield AdditionalOutputs(history + [new_message])
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(type="messages")
+    webrtc = WebRTC(
+        modality="audio",
+        mode="send",
+        variant="textbox",
+        rtc_configuration=get_cloudflare_turn_credentials_async,
+    )
+    with gr.Accordion("Settings", open=False):
+        system_prompt = gr.Textbox(
+            "You are a helpful assistant.", label="System prompt"
+        )
+        max_new_tokens = gr.Slider(50, 1500, 700, label="Max new tokens")
+    webrtc.stream(
+        ReplyOnPause(generate),
+        inputs=[webrtc, chatbot, system_prompt, max_new_tokens],
+        outputs=[chatbot],
+        concurrency_limit=100,
+    )
+    webrtc.on_additional_outputs(
+        lambda old, new: new, inputs=[chatbot], outputs=[chatbot]
+    )
+if __name__ == "__main__":
+    demo.launch(ssr_mode=False)