Staticaliza committed on
Commit d46a062 · verified · 1 Parent(s): 1b84f3f

Update app.py

Files changed (1)
  1. app.py +29 -28
app.py CHANGED
@@ -1,42 +1,43 @@
  import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
  from transformers import AutoTokenizer, TextIteratorStreamer

- model_id = "microsoft/bitnet-b1.58-2B-4T"
- repo_dir = pathlib.Path("BitNet")
- gguf_repo = "microsoft/bitnet-b1.58-2B-4T-gguf"
- gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
- threads = os.cpu_count() or 8
+ model_id = "microsoft/bitnet-b1.58-2B-4T"
+ repo_dir = pathlib.Path("BitNet")
+ gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
+ threads = os.cpu_count() or 8

- # step 1 ── grab cpp runtime (with submodule) once
  if not repo_dir.exists():
      subprocess.run(["git","clone","--depth","1","--recursive",
                      "https://github.com/microsoft/BitNet.git"], check=True)

- # step 2 ── get ready-made gguf so we skip conversion/quant
  if not gguf_file.exists():
-     subprocess.run(["huggingface-cli","download",gguf_repo,
-                     "--local-dir",".","--repo-type","model",
-                     "--include","ggml-model-i2_s.gguf"], check=True)
+     subprocess.run(["huggingface-cli","download",
+                     "microsoft/bitnet-b1.58-2B-4T-gguf",
+                     "--local-dir",".",
+                     "--include","ggml-model-i2_s.gguf",
+                     "--repo-type","model"], check=True)

  tokenizer = AutoTokenizer.from_pretrained(model_id)

- def bitnet_cpp_generate(prompt,n_predict,temperature,top_p):
-     cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -top_p {top_p} -cnv"
-     with subprocess.Popen(shlex.split(cmd),stdout=subprocess.PIPE,text=True,bufsize=1) as p:
-         for line in p.stdout: yield line.rstrip("\n")
+ def bitnet_cpp_generate(prompt, n_predict, temperature):
+     cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -cnv"
+     with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as p:
+         for line in p.stdout:
+             yield line.rstrip("\n")

  @spaces.GPU(duration=15)
  def gpu(): print("[GPU] | GPU maintained.")

- def respond(msg,hist,sys_msg,max_tokens,temp,top_p):
-     msgs = [{"role":"system","content":sys_msg}]+[
-         {"role":"user","content":u} if i%2==0 else {"role":"assistant","content":a}
-         for i,(u,a) in enumerate(hist) if u or a
-     ]+[{"role":"user","content":msg}]
-     prompt = tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
-     stream = TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
+ def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
+     msgs=[{"role":"system","content":sys_msg}]
+     for u,b in hist:
+         if u: msgs.append({"role":"user","content":u})
+         if b: msgs.append({"role":"assistant","content":b})
+     msgs.append({"role":"user","content":msg})
+     prompt=tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
+     stream=TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
      def worker():
-         for tok in bitnet_cpp_generate(prompt,max_tokens,temp,top_p): stream.put(tok)
+         for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
          stream.end()
      threading.Thread(target=worker,daemon=True).start()
      out=""
@@ -44,16 +45,16 @@ def respond(msg,hist,sys_msg,max_tokens,temp,top_p):
          out+=chunk
          yield out

- demo = gr.ChatInterface(
+ demo=gr.ChatInterface(
      fn=respond,
-     title="bitnet-b1.58-2B-4T (cpp)",
+     title="bitnet-b1.58-2b-4t (cpp)",
      description="fast cpu chat via bitnet.cpp",
-     examples=[["hello","you are helpful.",256,0.7,0.95]],
+     examples=[["hello","you are helpful",256,0.7,0.95]],
      additional_inputs=[
-         gr.Textbox(value="you are helpful.",label="system"),
-         gr.Slider(1,8192,1024,1,label="max tokens"),
+         gr.Textbox(value="you are helpful",label="system message"),
+         gr.Slider(1,8192,1024,1,label="max new tokens"),
          gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-         gr.Slider(0.1,1,0.95,0.05,label="top-p"),
+         gr.Slider(0.0,1.0,0.95,0.05,label="top-p (ui only, not passed)"),
      ],
  )
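
For reference, the command string that the updated bitnet_cpp_generate() assembles can be reproduced in isolation. The sketch below is illustrative only: it assumes BitNet's run_inference.py accepts the -m/-p/-n/-t/-temp/-cnv flags exactly as app.py uses them above, and the prompt and sampling values are placeholders.

    # Minimal sketch: rebuild the argv list that app.py hands to subprocess.Popen.
    import json, os, shlex

    gguf_file = "ggml-model-i2_s.gguf"   # same GGUF file app.py downloads
    threads = os.cpu_count() or 8        # same thread default as app.py
    prompt = "hello"                     # placeholder prompt

    cmd = (
        f"python BitNet/run_inference.py -m {gguf_file} "
        f"-p {json.dumps(prompt)} -n 256 -t {threads} -temp 0.7 -cnv"
    )
    print(shlex.split(cmd))  # argument vector Popen would receive for these sample values

Note that json.dumps() quotes the prompt, so shlex.split() keeps it as a single argument, matching how app.py escapes user text before spawning the subprocess.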