Update app.py
app.py CHANGED
@@ -1,42 +1,43 @@
 import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
 from transformers import AutoTokenizer, TextIteratorStreamer
 
-model_id …
-repo_dir …
-…
-…
-threads = os.cpu_count() or 8
+model_id = "microsoft/bitnet-b1.58-2B-4T"
+repo_dir = pathlib.Path("BitNet")
+gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
+threads = os.cpu_count() or 8
 
-# step 1 ── grab cpp runtime (with submodule) once
 if not repo_dir.exists():
     subprocess.run(["git","clone","--depth","1","--recursive",
                     "https://github.com/microsoft/BitNet.git"], check=True)
 
-# step 2 ── get ready-made gguf so we skip conversion/quant
 if not gguf_file.exists():
-    subprocess.run(["huggingface-cli","download",
-                    "…
-                    "--…
+    subprocess.run(["huggingface-cli","download",
+                    "microsoft/bitnet-b1.58-2B-4T-gguf",
+                    "--local-dir",".",
+                    "--include","ggml-model-i2_s.gguf",
+                    "--repo-type","model"], check=True)
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-def bitnet_cpp_generate(prompt,n_predict,temperature…
-    cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -…
-    with subprocess.Popen(shlex.split(cmd),stdout=subprocess.PIPE,text=True,bufsize=1) as p:
-        for line in p.stdout:
+def bitnet_cpp_generate(prompt, n_predict, temperature):
+    cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -cnv"
+    with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as p:
+        for line in p.stdout:
+            yield line.rstrip("\n")
 
 @spaces.GPU(duration=15)
 def gpu(): print("[GPU] | GPU maintained.")
 
-def respond(msg,hist,sys_msg,max_tokens,temp,…
-    msgs…
-…
-…
-…
-…
-…
+def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
+    msgs=[{"role":"system","content":sys_msg}]
+    for u,b in hist:
+        if u: msgs.append({"role":"user","content":u})
+        if b: msgs.append({"role":"assistant","content":b})
+    msgs.append({"role":"user","content":msg})
+    prompt=tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
+    stream=TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
     def worker():
-        for tok in bitnet_cpp_generate(prompt,max_tokens,temp…
+        for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
         stream.end()
     threading.Thread(target=worker,daemon=True).start()
     out=""
@@ -44,16 +45,16 @@ def respond(msg,hist,sys_msg,max_tokens,temp,top_p):
         out+=chunk
         yield out
 
-demo…
+demo=gr.ChatInterface(
     fn=respond,
-    title="bitnet-b1.58-…
+    title="bitnet-b1.58-2b-4t (cpp)",
     description="fast cpu chat via bitnet.cpp",
-    examples=[["hello","you are helpful…
+    examples=[["hello","you are helpful",256,0.7,0.95]],
     additional_inputs=[
-        gr.Textbox(value="you are helpful…
-        gr.Slider(1,8192,1024,1,label="max tokens"),
+        gr.Textbox(value="you are helpful",label="system message"),
+        gr.Slider(1,8192,1024,1,label="max new tokens"),
         gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-        gr.Slider(0.…
+        gr.Slider(0.0,1.0,0.95,0.05,label="top-p (ui only, not passed)"),
     ],
 )
 
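A note on the download step: shelling out to huggingface-cli works, but the same single-file fetch can be done in-process with huggingface_hub, which ships alongside transformers. A minimal sketch, not part of the commit; the names mirror the variables in app.py:

    from huggingface_hub import hf_hub_download

    # fetch only the pre-quantized i2_s GGUF; returns the local file path
    gguf_path = hf_hub_download(
        repo_id="microsoft/bitnet-b1.58-2B-4T-gguf",
        filename="ggml-model-i2_s.gguf",
        local_dir=".",
    )

This also removes the dependency on the CLI entry point being on PATH inside the Space container.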
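One thing the commit does not add is a build step: the git clone only fetches the bitnet.cpp sources, and run_inference.py drives binaries that must be compiled first. If the Space image does not build them elsewhere, something like the sketch below would be needed. The setup_env.py flags here are assumptions based on the BitNet repository's documented usage, not taken from this commit:

    # hypothetical build step (assumed setup_env.py interface from the
    # BitNet README); compiles the binaries that run_inference.py invokes
    subprocess.run([sys.executable, "setup_env.py",
                    "-md", "..",     # assumed: directory containing the gguf
                    "-q", "i2_s"],   # assumed: quant type matching the model
                   cwd=repo_dir, check=True)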
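Also worth flagging: TextIteratorStreamer.put() in transformers expects token-id tensors and decodes them itself, while worker() feeds it already-decoded text lines, so iterating the streamer may break at runtime. Since bitnet_cpp_generate already yields text, a plain queue gives the same thread-to-generator streaming without the tokenizer round-trip. A minimal sketch under that assumption (stream_text is a hypothetical helper, not in the commit):

    import queue, threading

    def stream_text(line_iter):
        # producer/consumer bridge: a worker thread pushes decoded lines,
        # the caller iterates them as they arrive; None marks end-of-stream
        q = queue.Queue()
        def worker():
            for line in line_iter:
                q.put(line)
            q.put(None)
        threading.Thread(target=worker, daemon=True).start()
        while True:
            item = q.get()
            if item is None:
                return
            yield item

Inside respond() this would collapse the worker/streamer wiring to a single loop: for chunk in stream_text(bitnet_cpp_generate(prompt, max_tokens, temp)): out += chunk; yield out.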
|