Staticaliza committed on
Commit 5b9ea8e · verified · 1 Parent(s): d46a062

Update app.py

Files changed (1): app.py +34 -24
app.py CHANGED
@@ -1,29 +1,36 @@
- import subprocess, sys, pathlib, os, shlex, json, threading, gradio as gr, spaces
+ import subprocess, sys, pathlib, os, json, threading, gradio as gr, spaces
  from transformers import AutoTokenizer, TextIteratorStreamer

- model_id = "microsoft/bitnet-b1.58-2B-4T"
- repo_dir = pathlib.Path("BitNet")
- gguf_file = pathlib.Path("ggml-model-i2_s.gguf")
- threads = os.cpu_count() or 8
+ # ─── one-time deps ────────────────────────────────────────────────────────────
+ subprocess.run([sys.executable,"-m","pip","install","--quiet","--upgrade",
+                 "llama-cpp-python"], check=True)  # avx2 / avx512 wheels
+ # ------------------------------------------------------------------------------

- if not repo_dir.exists():
-     subprocess.run(["git","clone","--depth","1","--recursive",
-                     "https://github.com/microsoft/BitNet.git"], check=True)
+ model_id = "microsoft/bitnet-b1.58-2B-4T"
+ gguf_path = pathlib.Path("ggml-model-i2_s.gguf")
+ threads = os.cpu_count() or 8

- if not gguf_file.exists():
+ if not gguf_path.exists():  # grab ready-made gguf
      subprocess.run(["huggingface-cli","download",
                      "microsoft/bitnet-b1.58-2B-4T-gguf",
-                     "--local-dir",".",
                      "--include","ggml-model-i2_s.gguf",
-                     "--repo-type","model"], check=True)
+                     "--local-dir",".","--repo-type","model"], check=True)
+
+ from llama_cpp import Llama  # after wheel install
+ llm = Llama(model_path=str(gguf_path),
+             n_threads=threads,
+             n_ctx=8192,  # bitnet uses 4-T rope, large ctx is fine
+             logits_all=False)

  tokenizer = AutoTokenizer.from_pretrained(model_id)

  def bitnet_cpp_generate(prompt, n_predict, temperature):
-     cmd = f"python BitNet/run_inference.py -m {gguf_file} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -cnv"
-     with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as p:
-         for line in p.stdout:
-             yield line.rstrip("\n")
+     # llama-cpp streams dicts; we yield just the text chunk
+     for chunk in llm(prompt,
+                      max_tokens=n_predict,
+                      temperature=temperature,
+                      stream=True):
+         yield chunk["choices"][0]["text"]

  @spaces.GPU(duration=15)
  def gpu(): print("[GPU] | GPU maintained.")
@@ -34,28 +41,31 @@ def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
      if u: msgs.append({"role":"user","content":u})
      if b: msgs.append({"role":"assistant","content":b})
      msgs.append({"role":"user","content":msg})
-     prompt=tokenizer.apply_chat_template(msgs,tokenize=False,add_generation_prompt=True)
-     stream=TextIteratorStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
+     prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
+                                            add_generation_prompt=True)
+     stream = TextIteratorStreamer(tokenizer, skip_prompt=True,
+                                   skip_special_tokens=True)
      def worker():
          for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
          stream.end()
-     threading.Thread(target=worker,daemon=True).start()
+     threading.Thread(target=worker, daemon=True).start()
      out=""
      for chunk in stream:
          out+=chunk
          yield out

- demo=gr.ChatInterface(
+ demo = gr.ChatInterface(
      fn=respond,
-     title="bitnet-b1.58-2b-4t (cpp)",
-     description="fast cpu chat via bitnet.cpp",
-     examples=[["hello","you are helpful",256,0.7,0.95]],
+     title="bitnet-b1.58-2B-4T (llama-cpp-python)",
+     description="fast cpu chat with bitnet via llama-cpp wheel (no manual build)",
+     examples=[["hello","you are helpful",256,0.7,0.0]],
      additional_inputs=[
          gr.Textbox(value="you are helpful",label="system message"),
          gr.Slider(1,8192,1024,1,label="max new tokens"),
          gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-         gr.Slider(0.0,1.0,0.95,0.05,label="top-p (ui only, not passed)"),
+         gr.Slider(0.0,1.0,0.0,0.05,label="top-p (placeholder)"),
      ],
  )

- if __name__=="__main__": demo.launch()
+ if __name__=="__main__":
+     demo.launch()
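
For reference, the streaming call that the new bitnet_cpp_generate wraps can be exercised on its own. A minimal sketch, assuming llama-cpp-python is installed and ggml-model-i2_s.gguf has already been downloaded to the working directory; the prompt text, thread count, and sampling values are illustrative only:

# Standalone sketch of the llama-cpp-python streaming loop used in app.py.
from llama_cpp import Llama

llm = Llama(model_path="ggml-model-i2_s.gguf",
            n_ctx=8192,
            n_threads=8)  # thread count is illustrative

# With stream=True the call returns an iterator of completion chunks;
# each chunk is a dict like {"choices": [{"text": "..."}], ...},
# which is why app.py yields chunk["choices"][0]["text"].
for chunk in llm("Q: what is 1.58-bit quantization?\nA:",
                 max_tokens=64,
                 temperature=0.7,
                 stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)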
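
The prompt string handed to that call is built by the tokenizer's chat template inside respond. A small sketch of just that step, assuming only the tokenizer shipped with microsoft/bitnet-b1.58-2B-4T; the example messages are illustrative:

# Sketch of the chat-template step from respond() in app.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

msgs = [
    {"role": "system", "content": "you are helpful"},
    {"role": "user", "content": "hello"},
]

# tokenize=False returns the formatted prompt string instead of token ids;
# add_generation_prompt=True appends the assistant header so generation
# continues as the assistant turn.
prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
                                       add_generation_prompt=True)
print(prompt)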