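"""Hugging Face Space: CPU chat demo for microsoft/bitnet-b1.58-2B-4T.

On first launch it clones Microsoft's BitNet repo and downloads the i2_s GGUF
weights, then streams completions from bitnet.cpp's run_inference.py into a
Gradio ChatInterface.
"""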
import os, pathlib, subprocess, sys
import gradio as gr
import spaces
from transformers import AutoTokenizer
model_id = "microsoft/bitnet-b1.58-2B-4T"
repo_dir = pathlib.Path("BitNet")
gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
threads = os.cpu_count() or 8
# Fetch the bitnet.cpp sources on first run.
if not repo_dir.exists():
    subprocess.run(["git", "clone", "--depth", "1", "--recursive",
                    "https://github.com/microsoft/BitNet.git"], check=True)
# Fetch the pre-quantized i2_s GGUF weights on first run.
if not gguf_file.exists():
    subprocess.run(["huggingface-cli", "download",
                    "microsoft/bitnet-b1.58-2B-4T-gguf",
                    "--local-dir", ".",
                    "--include", "ggml-model-i2_s.gguf",
                    "--repo-type", "model"], check=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
def bitnet_cpp_generate(prompt, n_predict, temperature):
    # Build the argument list directly: the old json.dumps + shlex.split round
    # trip mis-handles prompts containing newlines or backslashes. -cnv
    # (interactive chat mode) is dropped since the prompt arrives fully
    # chat-templated and we want a one-shot, non-interactive run.
    cmd = [sys.executable, str(repo_dir / "run_inference.py"), "-m", str(gguf_file),
           "-p", prompt, "-n", str(n_predict), "-t", str(threads),
           "-temp", str(temperature)]
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1) as p:
        for line in p.stdout:
            yield line  # keep the trailing newline so multi-line replies render
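# Illustrative usage, assuming the setup above has completed: stream a
# one-off completion straight to the terminal.
#   for piece in bitnet_cpp_generate("Explain 1.58-bit weights briefly.", 64, 0.7):
#       print(piece, end="", flush=True)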
@spaces.GPU(duration=15)
def gpu():
    # ZeroGPU keep-alive stub; inference itself runs on CPU via bitnet.cpp.
    print("[GPU] | GPU maintained.")
def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
    msgs = [{"role": "system", "content": sys_msg}]
    for u, b in hist:
        if u:
            msgs.append({"role": "user", "content": u})
        if b:
            msgs.append({"role": "assistant", "content": b})
    msgs.append({"role": "user", "content": msg})
    prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    # Stream the subprocess generator directly; no worker thread needed.
    # TextIteratorStreamer is unusable here: its put() expects token-id
    # tensors, not decoded text lines.
    out = ""
    for chunk in bitnet_cpp_generate(prompt, max_tokens, temp):
        out += chunk
        yield out
demo = gr.ChatInterface(
    fn=respond,
    title="bitnet-b1.58-2b-4t (cpp)",
    description="fast cpu chat via bitnet.cpp",
    examples=[["hello", "you are helpful", 256, 0.7, 0.95]],
    additional_inputs=[
        gr.Textbox(value="you are helpful", label="system message"),
        # step is keyword-only on gr.Slider, so spell the arguments out.
        gr.Slider(minimum=1, maximum=8192, value=1024, step=1, label="max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="top-p (ui only, not passed)"),
    ],
)
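# NOTE (assumption about the Gradio version): on 3.x a streaming generator
# needs an explicit queue; on 4+ queuing is on by default, so this is a no-op.
demo.queue()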
if __name__ == "__main__":
    demo.launch()