Staticaliza committed
Commit fbbda22 · verified · 1 Parent(s): 5b9ea8e

Update app.py

Files changed (1)
  1. app.py +44 -67
app.py CHANGED
@@ -1,71 +1,48 @@
-import subprocess, sys, pathlib, os, json, threading, gradio as gr, spaces
-from transformers import AutoTokenizer, TextIteratorStreamer
-
-# ─── one-time deps ────────────────────────────────────────────────────────────
-subprocess.run([sys.executable,"-m","pip","install","--quiet","--upgrade",
-                "llama-cpp-python"], check=True)  # avx2 / avx512 wheels
-# ------------------------------------------------------------------------------
-
-model_id = "microsoft/bitnet-b1.58-2B-4T"
-gguf_path = pathlib.Path("ggml-model-i2_s.gguf")
-threads = os.cpu_count() or 8
-
-if not gguf_path.exists():  # grab ready-made gguf
-    subprocess.run(["huggingface-cli","download",
-                    "microsoft/bitnet-b1.58-2B-4T-gguf",
-                    "--include","ggml-model-i2_s.gguf",
-                    "--local-dir",".","--repo-type","model"], check=True)
-
-from llama_cpp import Llama  # after wheel install
-llm = Llama(model_path=str(gguf_path),
-            n_threads=threads,
-            n_ctx=8192,  # bitnet uses 4-T rope, large ctx is fine
-            logits_all=False)
-
+import os
+import threading
+import torch
+import torch._dynamo
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import gradio as gr
+import spaces
+
+os.system("pip install git+https://github.com/shumingma/transformers.git")
+torch._dynamo.config.suppress_errors = True
+
+model_id = "microsoft/bitnet-b1.58-2B-4T"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-def bitnet_cpp_generate(prompt, n_predict, temperature):
-    # llama-cpp streams dicts; we yield just the text chunk
-    for chunk in llm(prompt,
-                     max_tokens=n_predict,
-                     temperature=temperature,
-                     stream=True):
-        yield chunk["choices"][0]["text"]
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map={"": "cpu"}).to("cpu")
+model.to("cpu")
 
 @spaces.GPU(duration=15)
-def gpu(): print("[GPU] | GPU maintained.")
-
-def respond(msg, hist, sys_msg, max_tokens, temp, _top_p_unused):
-    msgs=[{"role":"system","content":sys_msg}]
-    for u,b in hist:
-        if u: msgs.append({"role":"user","content":u})
-        if b: msgs.append({"role":"assistant","content":b})
-    msgs.append({"role":"user","content":msg})
-    prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
-                                           add_generation_prompt=True)
-    stream = TextIteratorStreamer(tokenizer, skip_prompt=True,
-                                  skip_special_tokens=True)
-    def worker():
-        for tok in bitnet_cpp_generate(prompt, max_tokens, temp): stream.put(tok)
-        stream.end()
-    threading.Thread(target=worker, daemon=True).start()
-    out=""
-    for chunk in stream:
-        out+=chunk
-        yield out
-
-demo = gr.ChatInterface(
-    fn=respond,
-    title="bitnet-b1.58-2B-4T (llama-cpp-python)",
-    description="fast cpu chat with bitnet via llama-cpp wheel (no manual build)",
-    examples=[["hello","you are helpful",256,0.7,0.0]],
-    additional_inputs=[
-        gr.Textbox(value="you are helpful",label="system message"),
-        gr.Slider(1,8192,1024,1,label="max new tokens"),
-        gr.Slider(0.1,4,0.7,0.1,label="temperature"),
-        gr.Slider(0.0,1.0,0.0,0.05,label="top-p (placeholder)"),
-    ],
-)
-
-if __name__=="__main__":
+def gpu():
+    print("[GPU] | GPU maintained.")
+
+def respond_simple(message: str, max_tokens: int, temperature: float, top_p: float):
+    inputs = tokenizer(message, return_tensors="pt").to("cpu")
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    thread = threading.Thread(target=model.generate, kwargs={
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True
+    })
+    thread.start()
+    output = ""
+    for chunk in streamer:
+        output += chunk
+    return output
+
+with gr.Blocks() as demo:
+    gr.Markdown("## bitnet-b1.58-2b-4t completion")
+    tok = gr.Slider(1, 8192, value=2048, step=1, label="max new tokens")
+    temp = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="temperature")
+    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top-p")
+    inp = gr.Textbox(label="prompt", lines=2)
+    out = gr.Textbox(label="completion", lines=10)
+    inp.submit(respond_simple, [inp, tok, temp, top_p], out)
+
+if __name__ == "__main__":
     demo.launch()
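
For reference, respond_simple above follows the standard Transformers pattern of running model.generate on a background thread and reading decoded text chunks from a TextIteratorStreamer. A minimal standalone sketch of that pattern follows; the prompt and generation settings are illustrative only, and loading this checkpoint is assumed to require the patched transformers build that app.py installs.

# Minimal sketch: threaded generate() + TextIteratorStreamer (assumptions noted above).
import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "microsoft/bitnet-b1.58-2B-4T"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)  # assumes the patched transformers fork is installed

inputs = tokenizer("hello", return_tensors="pt")  # illustrative prompt
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while the main thread
# consumes decoded chunks from the streamer as they become available.
thread = threading.Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 64, "do_sample": True},
)
thread.start()

output = ""
for chunk in streamer:
    output += chunk
thread.join()
print(output)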