Staticaliza committed on
Commit 5b9ea8e · verified · 1 Parent(s): d46a062

Update app.py

Files changed (1): app.py +34 -24
app.py CHANGED
@@ -1,29 +1,36 @@
- import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
+ import subprocess, sys, pathlib, os, json, threading, gradio as gr, spaces
  from transformers import AutoTokenizer, TextIteratorStreamer

- model_id = "microsoft/bitnet-b1.58-2B-4T"
- repo_dir = pathlib.Path("BitNet")
- gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
- threads = os.cpu_count() or 8
+ # ─── one-time deps ────────────────────────────────────────────────────────────
+ subprocess.run([sys.executable,"-m","pip","install","--quiet","--upgrade",
+                 "llama-cpp-python"], check=True)  # avx2 / avx512 wheels
+ # ------------------------------------------------------------------------------

- if not repo_dir.exists():
-     subprocess.run(["git","clone","--depth","1","--recursive",
-                     "https://github.com/microsoft/BitNet.git"], check=True)
+ model_id = "microsoft/bitnet-b1.58-2B-4T"
+ gguf_path = pathlib.Path("ggml-model-i2_s.gguf")
+ threads = os.cpu_count() or 8

- if not gguf_file.exists():
+ if not gguf_path.exists():  # grab ready-made gguf
      subprocess.run(["huggingface-cli","download",
                      "microsoft/bitnet-b1.58-2B-4T-gguf",
-                     "--local-dir",".",
                      "--include","ggml-model-i2_s.gguf",
-                     "--repo-type","model"], check=True)
+                     "--local-dir",".","--repo-type","model"], check=True)
+
+ from llama_cpp import Llama  # after wheel install
+ llm = Llama(model_path=str(gguf_path),
+             n_threads=threads,
+             n_ctx=8192,  # bitnet uses 4-T rope, large ctx is fine
+             logits_all=False)

  tokenizer = AutoTokenizer.from_pretrained(model_id)

  def bitnet_cpp_generate(prompt, n_predict, temperature):
-     cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -cnv"
-     with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as p:
-         for line in p.stdout:
-             yield line.rstrip("\n")
+     # llama-cpp streams dicts; we yield just the text chunk
+     for chunk in llm(prompt,
+                      max_tokens=n_predict,
+                      temperature=temperature,
+                      stream=True):
+         yield chunk["choices"][0]["text"]

  @spaces.GPU(duration=15)
  def gpu(): print("[GPU] | GPU maintained.")
@@ -34,28 +41,31 @@ def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
      if u: msgs.append({"role":"user","content":u})
      if b: msgs.append({"role":"assistant","content":b})
      msgs.append({"role":"user","content":msg})
-     prompt=tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
-     stream=TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
+     prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
+                                            add_generation_prompt=True)
+     stream = TextIteratorStreamer(tokenizer, skip_prompt=True,
+                                   skip_special_tokens=True)
      def worker():
          for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
          stream.end()
-     threading.Thread(target=worker,daemon=True).start()
+     threading.Thread(target=worker, daemon=True).start()
      out=""
      for chunk in stream:
          out+=chunk
          yield out

- demo=gr.ChatInterface(
+ demo = gr.ChatInterface(
      fn=respond,
-     title="bitnet-b1.58-2b-4t (cpp)",
-     description="fast cpu chat via bitnet.cpp",
-     examples=[["hello","you are helpful",256,0.7,0.95]],
+     title="bitnet-b1.58-2B-4T (llama-cpp-python)",
+     description="fast cpu chat with bitnet via llama-cpp wheel (no manual build)",
+     examples=[["hello","you are helpful",256,0.7,0.0]],
      additional_inputs=[
          gr.Textbox(value="you are helpful",label="system message"),
          gr.Slider(1,8192,1024,1,label="max new tokens"),
          gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-         gr.Slider(0.0,1.0,0.95,0.05,label="top-p (ui only, not passed)"),
+         gr.Slider(0.0,1.0,0.0,0.05,label="top-p (placeholder)"),
      ],
  )

- if __name__=="__main__": demo.launch()
+ if __name__=="__main__":
+     demo.launch()
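
For reference, the streaming call that the new bitnet_cpp_generate wraps can be exercised on its own. A minimal sketch, assuming llama-cpp-python is installed and ggml-model-i2_s.gguf has already been downloaded to the working directory; the prompt text, thread count, and sampling values are illustrative only:

# Standalone sketch of the llama-cpp-python streaming loop used in app.py.
from llama_cpp import Llama

llm = Llama(model_path="ggml-model-i2_s.gguf",
            n_ctx=8192,
            n_threads=8)  # thread count is illustrative

# With stream=True the call returns an iterator of completion chunks;
# each chunk is a dict like {"choices": [{"text": "..."}], ...},
# which is why app.py yields chunk["choices"][0]["text"].
for chunk in llm("Q: what is 1.58-bit quantization?\nA:",
                 max_tokens=64,
                 temperature=0.7,
                 stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)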
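
The prompt string handed to that call is built by the tokenizer's chat template inside respond. A small sketch of just that step, assuming only the tokenizer shipped with microsoft/bitnet-b1.58-2B-4T; the example messages are illustrative:

# Sketch of the chat-template step from respond() in app.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

msgs = [
    {"role": "system", "content": "you are helpful"},
    {"role": "user", "content": "hello"},
]

# tokenize=False returns the formatted prompt string instead of token ids;
# add_generation_prompt=True appends the assistant header so generation
# continues as the assistant turn.
prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
                                       add_generation_prompt=True)
print(prompt)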