Staticaliza committed (verified)
Commit 1b84f3f · 1 Parent(s): 7e0f1bb

Update app.py

Files changed (1)
  1. app.py +42 -36
app.py CHANGED
@@ -1,53 +1,59 @@
- import subprocess, pathlib, sys, json, shlex, threading, os, gradio as gr, spaces
  from transformers import AutoTokenizer, TextIteratorStreamer

- model_id = "microsoft/bitnet-b1.58-2B-4T"
- repo_dir = pathlib.Path("BitNet")
- gguf_dir = pathlib.Path("models/BitNet-b1.58-2B-4T")
- gguf_path = gguf_dir / "ggml-model-i2_s.gguf"

  if not repo_dir.exists():
-     subprocess.run(["git","clone","--depth","1","https://github.com/microsoft/BitNet.git"], check=True)
- if not gguf_path.exists():
-     subprocess.run([sys.executable,"BitNet/setup_env.py","-md",model_id,"-q","i2_s"], check=True)

- threads = os.cpu_count() or 8
  tokenizer = AutoTokenizer.from_pretrained(model_id)

  def bitnet_cpp_generate(prompt,n_predict,temperature,top_p):
-     cmd = f"python BitNet/run_inference.py -m {gguf_path} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -top_p {top_p} -cnv"
-     with subprocess.Popen(shlex.split(cmd),stdout=subprocess.PIPE,text=True,bufsize=1) as proc:
-         for line in proc.stdout: yield line.rstrip("\n")

  @spaces.GPU(duration=15)
  def gpu(): print("[GPU] | GPU maintained.")

- def respond(message,history,system_message,max_tokens,temperature,top_p):
-     messages=[{"role":"system","content":system_message}]
-     for u,b in history:
-         if u: messages.append({"role":"user","content":u})
-         if b: messages.append({"role":"assistant","content":b})
-     messages.append({"role":"user","content":message})
-     prompt=tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
-     response,streamer=" ",TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
-     def work():
-         for tok in bitnet_cpp_generate(prompt,max_tokens,temperature,top_p): streamer.put(tok)
-         streamer.end()
-     threading.Thread(target=work,daemon=True).start()
-     for new in streamer:
-         response+=new
-         yield response
-
- demo=gr.ChatInterface(
      fn=respond,
-     title="BitNet-b1.58-2B-4T (cpp backend)",
-     description="fast cpu chat via bitnet_cpp",
-     examples=[["hi","you are a helpful ai assistant.",512,0.7,0.95]],
      additional_inputs=[
-         gr.Textbox(value="you are a helpful ai assistant.",label="system message"),
-         gr.Slider(1,8192,2048,1,label="max new tokens"),
-         gr.Slider(0.1,4.0,0.7,0.1,label="temperature"),
-         gr.Slider(0.1,1.0,0.95,0.05,label="top-p"),
      ],
  )
 
+ import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
  from transformers import AutoTokenizer, TextIteratorStreamer

+ model_id = "microsoft/bitnet-b1.58-2B-4T"
+ repo_dir = pathlib.Path("BitNet")
+ gguf_repo = "microsoft/bitnet-b1.58-2B-4T-gguf"
+ gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
+ threads = os.cpu_count() or 8

+ # step 1 ── grab the cpp runtime (with submodules) once
  if not repo_dir.exists():
+     subprocess.run(["git","clone","--depth","1","--recursive",
+                     "https://github.com/microsoft/BitNet.git"], check=True)
+
+ # step 2 ── fetch the ready-made gguf so we skip conversion/quantization
+ if not gguf_file.exists():
+     subprocess.run(["huggingface-cli","download",gguf_repo,
+                     "--local-dir",".","--repo-type","model",
+                     "--include","ggml-model-i2_s.gguf"], check=True)

  tokenizer = AutoTokenizer.from_pretrained(model_id)

  def bitnet_cpp_generate(prompt,n_predict,temperature,top_p):
+     # pass the argv as a list so the prompt needs no shell quoting
+     cmd = ["python","BitNet/run_inference.py","-m",str(gguf_file),"-p",prompt,
+            "-n",str(n_predict),"-t",str(threads),"-temp",str(temperature),
+            "-top_p",str(top_p),"-cnv"]
+     with subprocess.Popen(cmd,stdout=subprocess.PIPE,text=True,bufsize=1) as p:
+         for line in p.stdout: yield line.rstrip("\n")

  @spaces.GPU(duration=15)
  def gpu(): print("[GPU] | GPU maintained.")

+ def respond(msg,hist,sys_msg,max_tokens,temp,top_p):
+     # each history entry is a (user, assistant) pair; keep both sides
+     msgs = [{"role":"system","content":sys_msg}]
+     for u,a in hist:
+         if u: msgs.append({"role":"user","content":u})
+         if a: msgs.append({"role":"assistant","content":a})
+     msgs.append({"role":"user","content":msg})
+     prompt = tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
+     # the cpp process already emits decoded text, so stream its lines directly
+     out=""
+     for chunk in bitnet_cpp_generate(prompt,max_tokens,temp,top_p):
+         out += chunk + "\n"
+         yield out
+
+ demo = gr.ChatInterface(
      fn=respond,
+     title="bitnet-b1.58-2B-4T (cpp)",
+     description="fast cpu chat via bitnet.cpp",
+     examples=[["hello","you are helpful.",256,0.7,0.95]],
      additional_inputs=[
+         gr.Textbox(value="you are helpful.",label="system"),
+         gr.Slider(1,8192,1024,1,label="max tokens"),
+         gr.Slider(0.1,4,0.7,0.1,label="temperature"),
+         gr.Slider(0.1,1,0.95,0.05,label="top-p"),
      ],
  )
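
The step-2 download shells out to huggingface-cli. The same fetch can be done in-process with huggingface_hub (the library the CLI wraps); a minimal sketch, using the same repo id and filename as above:

from huggingface_hub import hf_hub_download

# sketch: in-process equivalent of the step-2 huggingface-cli call
gguf_local = hf_hub_download(
    repo_id="microsoft/bitnet-b1.58-2B-4T-gguf",  # gguf_repo above
    filename="ggml-model-i2_s.gguf",
    local_dir=".",
)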
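
For reference, the message list that respond() hands to apply_chat_template looks like this; a toy example using the tokenizer loaded above:

msgs = [{"role":"system","content":"you are helpful."},
        {"role":"user","content":"hi"},
        {"role":"assistant","content":"hello!"},
        {"role":"user","content":"how are you?"}]
prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
# prompt is now one string in the model's chat format, ending with the
# assistant header so the cpp backend continues from there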
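
Note that the hunk ends at the gr.ChatInterface(...) definition with no launch call. The Spaces runtime may serve demo on its own, but running python app.py anywhere else would presumably still need one:

# presumably required when running `python app.py` outside the Spaces runtime
if __name__ == "__main__":
    demo.launch()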