Staticaliza committed
Commit fbbda22 · verified · 1 Parent(s): 5b9ea8e

Update app.py

Files changed (1)
  1. app.py +44 -67
app.py CHANGED
@@ -1,71 +1,48 @@
-import subprocess, sys, pathlib, os, json, threading, gradio as gr, spaces
-from transformers import AutoTokenizer, TextIteratorStreamer
-
-# ─── one-time deps ────────────────────────────────────────────────────────────
-subprocess.run([sys.executable,"-m","pip","install","--quiet","--upgrade",
-                "llama-cpp-python"], check=True)  # avx2 / avx512 wheels
-# ------------------------------------------------------------------------------
-
-model_id = "microsoft/bitnet-b1.58-2B-4T"
-gguf_path = pathlib.Path("ggml-model-i2_s.gguf")
-threads = os.cpu_count() or 8
-
-if not gguf_path.exists():  # grab ready-made gguf
-    subprocess.run(["huggingface-cli","download",
-                    "microsoft/bitnet-b1.58-2B-4T-gguf",
-                    "--include","ggml-model-i2_s.gguf",
-                    "--local-dir",".","--repo-type","model"], check=True)
-
-from llama_cpp import Llama  # after wheel install
-llm = Llama(model_path=str(gguf_path),
-            n_threads=threads,
-            n_ctx=8192,  # bitnet uses 4-T rope, large ctx is fine
-            logits_all=False)
-
+import os
+import threading
+import torch
+import torch._dynamo
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import gradio as gr
+import spaces
+
+os.system("pip install git+https://github.com/shumingma/transformers.git")
+torch._dynamo.config.suppress_errors = True
+
+model_id = "microsoft/bitnet-b1.58-2B-4T"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-def bitnet_cpp_generate(prompt, n_predict, temperature):
-    # llama-cpp streams dicts; we yield just the text chunk
-    for chunk in llm(prompt,
-                     max_tokens=n_predict,
-                     temperature=temperature,
-                     stream=True):
-        yield chunk["choices"][0]["text"]
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map={"": "cpu"}).to("cpu")
+model.to("cpu")
 
 @spaces.GPU(duration=15)
-def gpu(): print("[GPU] | GPU maintained.")
-
-def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
-    msgs=[{"role":"system","content":sys_msg}]
-    for u,b in hist:
-        if u: msgs.append({"role":"user","content":u})
-        if b: msgs.append({"role":"assistant","content":b})
-    msgs.append({"role":"user","content":msg})
-    prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
-                                           add_generation_prompt=True)
-    stream = TextIteratorStreamer(tokenizer, skip_prompt=True,
-                                  skip_special_tokens=True)
-    def worker():
-        for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
-        stream.end()
-    threading.Thread(target=worker, daemon=True).start()
-    out=""
-    for chunk in stream:
-        out+=chunk
-        yield out
-
-demo = gr.ChatInterface(
-    fn=respond,
-    title="bitnet-b1.58-2B-4T (llama-cpp-python)",
-    description="fast cpu chat with bitnet via llama-cpp wheel (no manual build)",
-    examples=[["hello","you are helpful",256,0.7,0.0]],
-    additional_inputs=[
-        gr.Textbox(value="you are helpful",label="system message"),
-        gr.Slider(1,8192,1024,1,label="max new tokens"),
-        gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-        gr.Slider(0.0,1.0,0.0,0.05,label="top-p (placeholder)"),
-    ],
-)
-
-if __name__=="__main__":
+def gpu():
+    print("[GPU] | GPU maintained.")
+
+def respond_simple(message: str, max_tokens: int, temperature: float, top_p: float):
+    inputs = tokenizer(message, return_tensors="pt").to("cpu")
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    thread = threading.Thread(target=model.generate, kwargs={
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True
+    })
+    thread.start()
+    output = ""
+    for chunk in streamer:
+        output += chunk
+    return output
+
+with gr.Blocks() as demo:
+    gr.Markdown("## bitnet-b1.58-2b-4t completion")
+    tok = gr.Slider(1, 8192, value=2048, step=1, label="max new tokens")
+    temp = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="temperature")
+    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top-p")
+    inp = gr.Textbox(label="prompt", lines=2)
+    out = gr.Textbox(label="completion", lines=10)
+    inp.submit(respond_simple, [inp, tok, temp, top_p], out)
+
+if __name__ == "__main__":
     demo.launch()
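
For reference, respond_simple above follows the standard Transformers pattern of running model.generate on a background thread and reading decoded text chunks from a TextIteratorStreamer. A minimal standalone sketch of that pattern follows; the prompt and generation settings are illustrative only, and loading this checkpoint is assumed to require the patched transformers build that app.py installs.

# Minimal sketch: threaded generate() + TextIteratorStreamer (assumptions noted above).
import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "microsoft/bitnet-b1.58-2B-4T"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)  # assumes the patched transformers fork is installed

inputs = tokenizer("hello", return_tensors="pt")  # illustrative prompt
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while the main thread
# consumes decoded chunks from the streamer as they become available.
thread = threading.Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 64, "do_sample": True},
)
thread.start()

output = ""
for chunk in streamer:
    output += chunk
thread.join()
print(output)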