Staticaliza committed on
Commit 65323fa · verified · 1 Parent(s): 4b47c09

Update app.py

Files changed (1)
  1. app.py +29 -96
app.py CHANGED
@@ -1,123 +1,56 @@
- import os
- os.system("pip install git+https://github.com/shumingma/transformers.git accelerate")
-
- import threading
- import torch
- import torch._dynamo
- torch._dynamo.config.suppress_errors = True
-
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TextIteratorStreamer,
- )
- import gradio as gr
- import spaces
 
  model_id = "microsoft/bitnet-b1.58-2B-4T"
 
- # tokenizer unchanged
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
- # load on CPU by default; no device_map
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,  # CPU can emulate bfloat16
-     low_cpu_mem_usage=True       # reduces peak RAM (requires accelerate)
- )
-
- print(next(model.parameters()).device)
 
  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
-
- def respond(
-     message: str,
-     history: list[tuple[str, str]],
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
- ):
      messages = [{"role": "system", "content": system_message}]
      for user_msg, bot_msg in history:
-         if user_msg:
-             messages.append({"role": "user", "content": user_msg})
-         if bot_msg:
-             messages.append({"role": "assistant", "content": bot_msg})
      messages.append({"role": "user", "content": message})
 
-     prompt = tokenizer.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True
-     )
-     inputs = tokenizer(prompt, return_tensors="pt")
 
-     streamer = TextIteratorStreamer(
-         tokenizer, skip_prompt=True, skip_special_tokens=True
-     )
-     generate_kwargs = dict(
-         **inputs,
-         streamer=streamer,
-         max_new_tokens=max_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         do_sample=True,
-     )
-     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
-     thread.start()
 
-     response = ""
      for new_text in streamer:
          response += new_text
          yield response
 
  demo = gr.ChatInterface(
      fn=respond,
-     title="Bitnet-b1.58-2B-4T Chatbot",
-     description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
      examples=[
-         [
-             "Hello! How are you?",
-             "You are a helpful AI assistant for everyday tasks.",
-             512,
-             0.7,
-             0.95,
-         ],
-         [
-             "Can you code a snake game in Python?",
-             "You are a helpful AI assistant for coding.",
-             2048,
-             0.7,
-             0.95,
-         ],
      ],
      additional_inputs=[
-         gr.Textbox(
-             value="You are a helpful AI assistant.",
-             label="System message"
-         ),
-         gr.Slider(
-             minimum=1,
-             maximum=8192,
-             value=2048,
-             step=1,
-             label="Max new tokens"
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=4.0,
-             value=0.7,
-             step=0.1,
-             label="Temperature"
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)"
-         ),
-     ],
  )
 
+ import os, subprocess, shlex, json, threading, gradio as gr, spaces
+ from transformers import AutoTokenizer, TextIteratorStreamer
 
  model_id = "microsoft/bitnet-b1.58-2B-4T"
+ gguf_path = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"  # update if different
+ threads = os.cpu_count() or 8
 
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
+ def bitnet_cpp_generate(prompt, n_predict, temperature, top_p):
+     cmd = f"python BitNet/run_inference.py -m {gguf_path} -p {json.dumps(prompt)} -n {n_predict} -t {threads} -temp {temperature} -top_p {top_p} -cnv"
+     with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, text=True, bufsize=1) as proc:
+         for line in proc.stdout:
+             yield line.rstrip("\n")
 
  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
+
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
      messages = [{"role": "system", "content": system_message}]
      for user_msg, bot_msg in history:
+         if user_msg: messages.append({"role": "user", "content": user_msg})
+         if bot_msg: messages.append({"role": "assistant", "content": bot_msg})
      messages.append({"role": "user", "content": message})
 
+     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     response, streamer = "", TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     def work():
+         for tok in bitnet_cpp_generate(prompt, max_tokens, temperature, top_p):
+             streamer.put(tok)
+         streamer.end()
 
+     threading.Thread(target=work, daemon=True).start()
 
      for new_text in streamer:
          response += new_text
          yield response
 
  demo = gr.ChatInterface(
      fn=respond,
+     title="Bitnet-b1.58-2B-4T Chatbot (cpp backend)",
+     description="ultra-light cpu chat using bitnet_cpp",
      examples=[
+         ["hello!", "you are a helpful ai assistant.", 512, 0.7, 0.95],
+         ["code a snake game in python", "you are a helpful ai assistant.", 2048, 0.7, 0.95],
      ],
      additional_inputs=[
+         gr.Textbox(value="you are a helpful ai assistant.", label="system message"),
+         gr.Slider(1, 8192, 2048, 1, label="max new tokens"),
+         gr.Slider(0.1, 4.0, 0.7, 0.1, label="temperature"),
+         gr.Slider(0.1, 1.0, 0.95, 0.05, label="top-p"),
      ],
  )
 
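
Note: the new respond() relays text chunks from the bitnet_cpp_generate subprocess reader into a TextIteratorStreamer on a background thread and yields the growing response as chunks arrive. For reference, the same relay can be written with a plain queue.Queue. This is a minimal sketch, not part of the commit; it assumes a bitnet_cpp_generate-style text generator like the one added above, and stream_chunks is an illustrative name.

import queue
import threading

def stream_chunks(chunk_iter):
    # Relay chunks from a blocking generator (e.g. a subprocess stdout reader)
    # into a consumer loop as soon as they arrive; None marks end of output.
    q = queue.Queue()

    def producer():
        for chunk in chunk_iter:
            q.put(chunk)
        q.put(None)

    threading.Thread(target=producer, daemon=True).start()

    response = ""
    while True:
        chunk = q.get()
        if chunk is None:
            break
        response += chunk
        yield response

# usage sketch:
# for partial in stream_chunks(bitnet_cpp_generate(prompt, 64, 0.7, 0.95)):
#     print(partial)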