Spaces:

Staticaliza
/

Zero-5

Paused

App Files

xet

Community

Staticaliza committed on Apr 29

Commit

fa4e434

verified ·

1 Parent(s): f5f1e3e

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -26

app.py CHANGED Viewed

@@ -1,16 +1,31 @@
 import gradio as gr
 import spaces
-import threading
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 model_id = "microsoft/bitnet-b1.58-2B-4T"
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
-@spaces.GPU(duration=15)
-def gpu():
-    print("[GPU] | GPU maintained.")
 def respond(
     message: str,
     history: list[tuple[str, str]],
@@ -30,22 +45,19 @@ def respond(
     prompt = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
-    inputs = tokenizer(prompt, return_tensors="pt")
-    input_ids = inputs.input_ids
-    attention_mask = inputs.attention_mask
     streamer = TextIteratorStreamer(
         tokenizer, skip_prompt=True, skip_special_tokens=True
     )
-    generate_kwargs = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "streamer": streamer,
-        "max_new_tokens": max_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "do_sample": True,
-    }
     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
@@ -57,16 +69,49 @@ def respond(
 demo = gr.ChatInterface(
     fn=respond,
     title="Bitnet-b1.58-2B-4T Chatbot",
-    description="powered by microsoft bitnet-b1.58-2b-4t (cpu only)",
     examples=[
-        ["hello how are you", "you are a helpful ai assistant", 512, 0.7, 0.95],
-        ["can you code a snake game in python", "you are a helpful ai assistant for coding", 2048, 0.7, 0.95],
     ],
     additional_inputs=[
-        gr.Textbox(value="you are a helpful ai assistant", label="system message"),
-        gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="top-p"),
     ],
 )

+import os
+os.system("pip install git+https://github.com/shumingma/transformers.git")
+import threading
+import torch
+import torch._dynamo
+torch._dynamo.config.suppress_errors = True
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+)
 import gradio as gr
 import spaces
 model_id = "microsoft/bitnet-b1.58-2B-4T"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)
+print(model.device)
+@spaces.GPU
 def respond(
     message: str,
     history: list[tuple[str, str]],
     prompt = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(
         tokenizer, skip_prompt=True, skip_special_tokens=True
     )
+    generate_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )
     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
 demo = gr.ChatInterface(
     fn=respond,
     title="Bitnet-b1.58-2B-4T Chatbot",
+    description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
     examples=[
+        [
+            "Hello! How are you?",
+            "You are a helpful AI assistant for everyday tasks.",
+            512,
+            0.7,
+            0.95,
+        ],
+        [
+            "Can you code a snake game in Python?",
+            "You are a helpful AI assistant for coding.",
+            2048,
+            0.7,
+            0.95,
+        ],
     ],
     additional_inputs=[
+        gr.Textbox(
+            value="You are a helpful AI assistant.",
+            label="System message"
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=8192,
+            value=2048,
+            step=1,
+            label="Max new tokens"
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=4.0,
+            value=0.7,
+            step=0.1,
+            label="Temperature"
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)"
+        ),
     ],
 )