Staticaliza committed on
Commit 65323fa · verified · 1 Parent(s): 4b47c09

Update app.py

Files changed (1)
  1. app.py +29 -96
app.py CHANGED
@@ -1,123 +1,56 @@
- import os
- os.system("pip install git+https://github.com/shumingma/transformers.git accelerate")
-
- import threading
- import torch
- import torch._dynamo
- torch._dynamo.config.suppress_errors = True
-
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TextIteratorStreamer,
- )
- import gradio as gr
- import spaces
 
  model_id = "microsoft/bitnet-b1.58-2B-4T"
 
- # tokenizer unchanged
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
- # load on CPU by default; no device_map
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,  # CPU can emulate bfloat16
-     low_cpu_mem_usage=True       # reduces peak RAM (requires accelerate)
- )
-
- print(next(model.parameters()).device)
 
  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
-
- def respond(
-     message: str,
-     history: list[tuple[str, str]],
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
- ):
      messages = [{"role": "system", "content": system_message}]
      for user_msg, bot_msg in history:
-         if user_msg:
-             messages.append({"role": "user", "content": user_msg})
-         if bot_msg:
-             messages.append({"role": "assistant", "content": bot_msg})
      messages.append({"role": "user", "content": message})
 
-     prompt = tokenizer.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True
-     )
-     inputs = tokenizer(prompt, return_tensors="pt")
 
-     streamer = TextIteratorStreamer(
-         tokenizer, skip_prompt=True, skip_special_tokens=True
-     )
-     generate_kwargs = dict(
-         **inputs,
-         streamer=streamer,
-         max_new_tokens=max_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         do_sample=True,
-     )
-     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
-     thread.start()
 
-     response = ""
      for new_text in streamer:
          response += new_text
          yield response
 
  demo = gr.ChatInterface(
      fn=respond,
-     title="Bitnet-b1.58-2B-4T Chatbot",
-     description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
      examples=[
-         [
-             "Hello! How are you?",
-             "You are a helpful AI assistant for everyday tasks.",
-             512,
-             0.7,
-             0.95,
-         ],
-         [
-             "Can you code a snake game in Python?",
-             "You are a helpful AI assistant for coding.",
-             2048,
-             0.7,
-             0.95,
-         ],
      ],
      additional_inputs=[
-         gr.Textbox(
-             value="You are a helpful AI assistant.",
-             label="System message"
-         ),
-         gr.Slider(
-             minimum=1,
-             maximum=8192,
-             value=2048,
-             step=1,
-             label="Max new tokens"
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=4.0,
-             value=0.7,
-             step=0.1,
-             label="Temperature"
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)"
-         ),
-     ],
  )
 
+ import os, subprocess, shlex, json, threading, gradio as gr, spaces
+ from transformers import AutoTokenizer, TextIteratorStreamer
 
  model_id = "microsoft/bitnet-b1.58-2B-4T"
+ gguf_path = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"  # update if different
+ threads = os.cpu_count() or 8
 
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
+ def bitnet_cpp_generate(prompt, n_predict, temperature, top_p):
+     cmd = f"python BitNet/run_inference.py -m {gguf_path} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -top_p {top_p} -cnv"
+     with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as proc:
+         for line in proc.stdout:
+             yield line.rstrip("\n")
 
  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
+
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
      messages = [{"role": "system", "content": system_message}]
      for user_msg, bot_msg in history:
+         if user_msg: messages.append({"role": "user", "content": user_msg})
+         if bot_msg: messages.append({"role": "assistant", "content": bot_msg})
      messages.append({"role": "user", "content": message})
 
+     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     response, streamer = "", TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     def work():
+         for tok in bitnet_cpp_generate(prompt, max_tokens, temperature, top_p):
+             streamer.put(tok)
+         streamer.end()
 
+     threading.Thread(target=work, daemon=True).start()
 
      for new_text in streamer:
          response += new_text
          yield response
 
  demo = gr.ChatInterface(
      fn=respond,
+     title="Bitnet-b1.58-2B-4T Chatbot (cpp backend)",
+     description="ultra-light cpu chat using bitnet_cpp",
      examples=[
+         ["hello!", "you are a helpful ai assistant.", 512, 0.7, 0.95],
+         ["code a snake game in python", "you are a helpful ai assistant.", 2048, 0.7, 0.95],
      ],
      additional_inputs=[
+         gr.Textbox(value="you are a helpful ai assistant.", label="system message"),
+         gr.Slider(1, 8192, 2048, 1, label="max new tokens"),
+         gr.Slider(0.1, 4.0, 0.7, 0.1, label="temperature"),
+         gr.Slider(0.1, 1.0, 0.95, 0.05, label="top-p"),
      ],
  )
 
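
Note: the new respond() relays text chunks from the bitnet_cpp_generate subprocess reader into a TextIteratorStreamer on a background thread and yields the growing response as chunks arrive. For reference, the same relay can be written with a plain queue.Queue. This is a minimal sketch, not part of the commit; it assumes a bitnet_cpp_generate-style text generator like the one added above, and stream_chunks is an illustrative name.

import queue
import threading

def stream_chunks(chunk_iter):
    # Relay chunks from a blocking generator (e.g. a subprocess stdout reader)
    # into a consumer loop as soon as they arrive; None marks end of output.
    q = queue.Queue()

    def producer():
        for chunk in chunk_iter:
            q.put(chunk)
        q.put(None)

    threading.Thread(target=producer, daemon=True).start()

    response = ""
    while True:
        chunk = q.get()
        if chunk is None:
            break
        response += chunk
        yield response

# usage sketch:
# for partial in stream_chunks(bitnet_cpp_generate(prompt, 64, 0.7, 0.95)):
#     print(partial)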