Update app.py
app.py
CHANGED
@@ -1,53 +1,59 @@
(previous version of app.py: 53 lines, truncated in the page capture; the updated file follows in full)
import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
from transformers import AutoTokenizer, TextIteratorStreamer

model_id = "microsoft/bitnet-b1.58-2B-4T"
repo_dir = pathlib.Path("BitNet")
gguf_repo = "microsoft/bitnet-b1.58-2B-4T-gguf"
gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
threads = os.cpu_count() or 8

# step 1 ── grab cpp runtime (with submodule) once
if not repo_dir.exists():
    subprocess.run(["git","clone","--depth","1","--recursive",
                    "https://github.com/microsoft/BitNet.git"], check=True)

# step 2 ── get ready-made gguf so we skip conversion/quant
if not gguf_file.exists():
    subprocess.run(["huggingface-cli","download",gguf_repo,
                    "--local-dir",".","--repo-type","model",
                    "--include","ggml-model-i2_s.gguf"], check=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)

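# run the bitnet.cpp runner as a subprocess and yield its stdout line by line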
def bitnet_cpp_generate(prompt,n_predict,temperature,top_p):
    cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -top_p {top_p} -cnv"
    with subprocess.Popen(shlex.split(cmd),stdout=subprocess.PIPE,text=True,bufsize=1) as p:
        for line in p.stdout: yield line.rstrip("\n")

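# keep-alive stub for the @spaces.GPU decorator; generation itself runs on CPU via bitnet.cpp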
@spaces.GPU(duration=15)
def gpu(): print("[GPU] | GPU maintained.")

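# map the Gradio history to chat messages, apply the chat template, and stream partial output back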
def respond(msg,hist,sys_msg,max_tokens,temp,top_p):
    msgs = [{"role":"system","content":sys_msg}]+[
        {"role":"user","content":u} if i%2==0 else {"role":"assistant","content":a}
        for i,(u,a) in enumerate(hist) if u or a
    ]+[{"role":"user","content":msg}]
    prompt = tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
    stream = TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
    def worker():
        for tok in bitnet_cpp_generate(prompt,max_tokens,temp,top_p): stream.put(tok)
        stream.end()
    threading.Thread(target=worker,daemon=True).start()
    out=""
    for chunk in stream:
        out+=chunk
        yield out

demo = gr.ChatInterface(
    fn=respond,
    title="bitnet-b1.58-2B-4T (cpp)",
    description="fast cpu chat via bitnet.cpp",
    examples=[["hello","you are helpful.",256,0.7,0.95]],
    additional_inputs=[
        gr.Textbox(value="you are helpful.",label="system"),
        gr.Slider(1,8192,1024,1,label="max tokens"),
        gr.Slider(0.1,4,0.7,0.1,label="temperature"),
        gr.Slider(0.1,1,0.95,0.05,label="top-p"),
    ],
)
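
As captured, the file ends without demo.launch(). Under the usual "python app.py" entrypoint on Spaces nothing would be served, so, assuming that entrypoint, the listing likely needs one final line:

demo.launch()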
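
A side note on step 2: the same gguf can be fetched in-process with huggingface_hub (already present as a transformers dependency), skipping the CLI subprocess. A minimal sketch reusing gguf_repo and gguf_file from the listing:

from huggingface_hub import hf_hub_download

if not gguf_file.exists():
    # drops ggml-model-i2_s.gguf into the working directory, same as the CLI call
    hf_hub_download(repo_id=gguf_repo, filename=gguf_file.name, local_dir=".")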
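
bitnet_cpp_generate builds a shell-style string and then shlex.split()s it back apart, leaning on json.dumps for quoting; prompts containing newlines or backslashes can come out mangled. Handing Popen an argv list avoids the quoting layer entirely. A sketch that keeps the runner flags exactly as the listing passes them:

def bitnet_cpp_generate(prompt, n_predict, temperature, top_p):
    # argv list: the prompt reaches run_inference.py byte-for-byte, no shell quoting involved
    cmd = ["python", "BitNet/run_inference.py",
           "-m", str(gguf_file), "-p", prompt,
           "-n", str(n_predict), "-t", str(threads),
           "-temp", str(temperature), "-top_p", str(top_p), "-cnv"]
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1) as p:
        for line in p.stdout:
            yield line.rstrip("\n")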
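
Two caveats in respond. TextIteratorStreamer is designed to take token-id tensors from model.generate (its put() inspects value.shape before decoding), so pushing already-decoded strings from the subprocess into it will raise; and the i%2 comprehension keeps only one message per (user, assistant) history pair, dropping half of every exchange. A plain queue.Queue plus a flat pair expansion fixes both; a sketch reusing tokenizer and bitnet_cpp_generate from the listing:

import queue

def respond(msg, hist, sys_msg, max_tokens, temp, top_p):
    msgs = [{"role": "system", "content": sys_msg}] + [
        {"role": role, "content": text}
        for u, a in hist                      # each history item is a (user, assistant) pair
        for role, text in (("user", u), ("assistant", a))
        if text
    ] + [{"role": "user", "content": msg}]
    prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    q, END = queue.Queue(), object()          # END marks end-of-stream
    def worker():
        for chunk in bitnet_cpp_generate(prompt, max_tokens, temp, top_p):
            q.put(chunk)
        q.put(END)
    threading.Thread(target=worker, daemon=True).start()

    out = ""
    while (chunk := q.get()) is not END:
        out += chunk
        yield out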