import subprocess, pathlib, os, gradio as gr, spaces
from transformers import AutoTokenizer

model_id = "microsoft/bitnet-b1.58-2B-4T"         # tokenizer + chat template
repo_dir = pathlib.Path("BitNet")                 # bitnet.cpp checkout
gguf_file = pathlib.Path("ggml-model-i2_s.gguf")  # 1.58-bit quantized weights
threads = os.cpu_count() or 8

if not repo_dir.exists():
    subprocess.run(["git","clone","--depth","1","--recursive",
                    "https://github.com/microsoft/BitNet.git"], check=True)

if not gguf_file.exists():
    subprocess.run(["huggingface-cli","download",
                    "microsoft/bitnet-b1.58-2B-4T-gguf",
                    "--local-dir",".",
                    "--include","ggml-model-i2_s.gguf",
                    "--repo-type","model"], check=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)  # used only to render the chat template

def bitnet_cpp_generate(prompt, n_predict, temperature):
    # argv as a list avoids shell-quoting bugs (the old json.dumps + shlex.split
    # round-trip mangled quotes and backslashes in the prompt). -cnv is dropped:
    # it puts llama-cli into interactive chat mode, which blocks on stdin, and
    # the chat template is already baked into the prompt.
    cmd = ["python", str(repo_dir / "run_inference.py"), "-m", str(gguf_file),
           "-p", prompt, "-n", str(n_predict), "-t", str(threads),
           "-temp", str(temperature)]
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1) as p:
        for line in p.stdout:
            yield line  # keep the newline so line breaks survive concatenation
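# quick smoke test of the streamer (assumes the build and gguf are in place):
#   for piece in bitnet_cpp_generate("hello", 32, 0.7):
#       print(piece, end="", flush=True)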

@spaces.GPU(duration=15)
def gpu():
    # never called; placeholder so the Space registers a ZeroGPU function
    print("[GPU] | GPU maintained.")

def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
    # hist arrives from gr.ChatInterface as [user, assistant] pairs
    msgs = [{"role": "system", "content": sys_msg}]
    for u, b in hist:
        if u: msgs.append({"role": "user", "content": u})
        if b: msgs.append({"role": "assistant", "content": b})
    msgs.append({"role": "user", "content": msg})
    prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    # bitnet_cpp_generate already yields text, so stream it straight through.
    # (The old TextIteratorStreamer + thread setup was broken: .put() expects
    # token-id tensors, not strings.)
    out = ""
    for chunk in bitnet_cpp_generate(prompt, max_tokens, temp):
        out += chunk
        yield out

demo = gr.ChatInterface(
    fn=respond,
    title="bitnet-b1.58-2b-4t (cpp)",
    description="fast cpu chat via bitnet.cpp",
    examples=[["hello", "you are helpful", 256, 0.7, 0.95]],
    additional_inputs=[
        gr.Textbox(value="you are helpful", label="system message"),
        # value/step as keyword args: `step` is keyword-only in Gradio 4+
        gr.Slider(minimum=1, maximum=8192, value=1024, step=1, label="max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="top-p (ui only, not passed)"),
    ],
)

if __name__ == "__main__":
    demo.launch()