Staticaliza committed · Commit d90f2bc · verified · 1 Parent(s): 0f7e3e6

Update app.py

Files changed (1): app.py (+107 −30)
app.py CHANGED
@@ -1,48 +1,125 @@
  import os
  import threading
  import torch
  import torch._dynamo
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import gradio as gr
  import spaces

- os.system("pip install git+https://github.com/shumingma/transformers.git")
- torch._dynamo.config.suppress_errors = True
-
  model_id = "microsoft/bitnet-b1.58-2B-4T"
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map={"": "cpu"}, trust_remote_code=True)
- model.to("cpu")

  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")

- def respond_simple(message: str, max_tokens: int, temperature: float, top_p: float):
-     inputs = tokenizer(message, return_tensors="pt").to("cpu")
-     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-     thread = threading.Thread(target=model.generate, kwargs={
          **inputs,
-         "streamer": streamer,
-         "max_new_tokens": max_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "do_sample": True
-     })
      thread.start()
-     output = ""
-     for chunk in streamer:
-         output += chunk
-     return output
-
- with gr.Blocks() as demo:
-     gr.Markdown("## bitnet-b1.58-2b-4t completion")
-     tok = gr.Slider(1, 8192, value=2048, step=1, label="max new tokens")
-     temp = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="temperature")
-     top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top-p")
-     inp = gr.Textbox(label="prompt", lines=2)
-     out = gr.Textbox(label="completion", lines=10)
-     inp.submit(respond_simple, [inp, tok, temp, top_p], out)

  if __name__ == "__main__":
      demo.launch()
 
  import os
+ os.system("pip install git+https://github.com/shumingma/transformers.git accelerate")
+
  import threading
  import torch
  import torch._dynamo
+ torch._dynamo.config.suppress_errors = True
+
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
  import gradio as gr
  import spaces

  model_id = "microsoft/bitnet-b1.58-2B-4T"
+
+ # tokenizer unchanged
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ # load on CPU by default; no device_map
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.bfloat16,  # CPU can emulate bfloat16
+     low_cpu_mem_usage=True  # reduces peak RAM (requires accelerate)
+ )
+
+ print(next(model.parameters()).device)

  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
+
+ def respond(
+     message: str,
+     history: list[tuple[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ):
+     messages = [{"role": "system", "content": system_message}]
+     for user_msg, bot_msg in history:
+         if user_msg:
+             messages.append({"role": "user", "content": user_msg})
+         if bot_msg:
+             messages.append({"role": "assistant", "content": bot_msg})
+     messages.append({"role": "user", "content": message})
+
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     inputs = tokenizer(prompt, return_tensors="pt")

+     streamer = TextIteratorStreamer(
+         tokenizer, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
          **inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         do_sample=True,
+     )
+     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
      thread.start()
+
+     response = ""
+     for new_text in streamer:
+         response += new_text
+         yield response
+
+ demo = gr.ChatInterface(
+     fn=respond,
+     title="Bitnet-b1.58-2B-4T Chatbot",
+     description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
+     examples=[
+         [
+             "Hello! How are you?",
+             "You are a helpful AI assistant for everyday tasks.",
+             512,
+             0.7,
+             0.95,
+         ],
+         [
+             "Can you code a snake game in Python?",
+             "You are a helpful AI assistant for coding.",
+             2048,
+             0.7,
+             0.95,
+         ],
+     ],
+     additional_inputs=[
+         gr.Textbox(
+             value="You are a helpful AI assistant.",
+             label="System message"
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=8192,
+             value=2048,
+             step=1,
+             label="Max new tokens"
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=4.0,
+             value=0.7,
+             step=0.1,
+             label="Temperature"
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p (nucleus sampling)"
+         ),
+     ],
+ )

  if __name__ == "__main__":
      demo.launch()
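
For reference, a minimal sketch (not part of the commit) of driving the new respond() generator directly, assuming app.py has been imported and its model and tokenizer loaded. Each item the generator yields is the full response accumulated so far, which is exactly what gr.ChatInterface streams into the chat window:

# Hypothetical smoke test: stream one turn without the Gradio UI.
final = ""
for partial in respond(
    message="Hello! How are you?",
    history=[],  # no prior turns
    system_message="You are a helpful AI assistant.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
):
    final = partial  # each yield is the full text so far, not a delta
print(final)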