Staticaliza committed · Commit d90f2bc · verified · 1 Parent(s): 0f7e3e6

Update app.py

Files changed (1): app.py (+107 −30)
app.py CHANGED
@@ -1,48 +1,125 @@
  import os
  import threading
  import torch
  import torch._dynamo
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import gradio as gr
  import spaces

- os.system("pip install git+https://github.com/shumingma/transformers.git")
- torch._dynamo.config.suppress_errors = True
-
  model_id = "microsoft/bitnet-b1.58-2B-4T"
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map={"": "cpu"}, trust_remote_code=True)
- model.to("cpu")

  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")

- def respond_simple(message: str, max_tokens: int, temperature: float, top_p: float):
-     inputs = tokenizer(message, return_tensors="pt").to("cpu")
-     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-     thread = threading.Thread(target=model.generate, kwargs={
          **inputs,
-         "streamer": streamer,
-         "max_new_tokens": max_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "do_sample": True
-     })
      thread.start()
-     output = ""
-     for chunk in streamer:
-         output += chunk
-     return output
-
- with gr.Blocks() as demo:
-     gr.Markdown("## bitnet-b1.58-2b-4t completion")
-     tok = gr.Slider(1, 8192, value=2048, step=1, label="max new tokens")
-     temp = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="temperature")
-     top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top-p")
-     inp = gr.Textbox(label="prompt", lines=2)
-     out = gr.Textbox(label="completion", lines=10)
-     inp.submit(respond_simple, [inp, tok, temp, top_p], out)

  if __name__ == "__main__":
      demo.launch()
 
  import os
+ os.system("pip install git+https://github.com/shumingma/transformers.git accelerate")
+
  import threading
  import torch
  import torch._dynamo
+ torch._dynamo.config.suppress_errors = True
+
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
  import gradio as gr
  import spaces

  model_id = "microsoft/bitnet-b1.58-2B-4T"
+
+ # tokenizer unchanged
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ # load on CPU by default; no device_map
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.bfloat16,  # CPU can emulate bfloat16
+     low_cpu_mem_usage=True  # reduces peak RAM (requires accelerate)
+ )
+
+ print(next(model.parameters()).device)

  @spaces.GPU(duration=15)
  def gpu():
      print("[GPU] | GPU maintained.")
+
+ def respond(
+     message: str,
+     history: list[tuple[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ):
+     messages = [{"role": "system", "content": system_message}]
+     for user_msg, bot_msg in history:
+         if user_msg:
+             messages.append({"role": "user", "content": user_msg})
+         if bot_msg:
+             messages.append({"role": "assistant", "content": bot_msg})
+     messages.append({"role": "user", "content": message})
+
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     inputs = tokenizer(prompt, return_tensors="pt")

+     streamer = TextIteratorStreamer(
+         tokenizer, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
          **inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         do_sample=True,
+     )
+     thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
      thread.start()
+
+     response = ""
+     for new_text in streamer:
+         response += new_text
+         yield response
+
+ demo = gr.ChatInterface(
+     fn=respond,
+     title="Bitnet-b1.58-2B-4T Chatbot",
+     description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
+     examples=[
+         [
+             "Hello! How are you?",
+             "You are a helpful AI assistant for everyday tasks.",
+             512,
+             0.7,
+             0.95,
+         ],
+         [
+             "Can you code a snake game in Python?",
+             "You are a helpful AI assistant for coding.",
+             2048,
+             0.7,
+             0.95,
+         ],
+     ],
+     additional_inputs=[
+         gr.Textbox(
+             value="You are a helpful AI assistant.",
+             label="System message"
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=8192,
+             value=2048,
+             step=1,
+             label="Max new tokens"
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=4.0,
+             value=0.7,
+             step=0.1,
+             label="Temperature"
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p (nucleus sampling)"
+         ),
+     ],
+ )

  if __name__ == "__main__":
      demo.launch()
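
For reference, a minimal sketch (not part of the commit) of driving the new respond() generator directly, assuming app.py has been imported and its model and tokenizer loaded. Each item the generator yields is the full response accumulated so far, which is exactly what gr.ChatInterface streams into the chat window:

# Hypothetical smoke test: stream one turn without the Gradio UI.
final = ""
for partial in respond(
    message="Hello! How are you?",
    history=[],  # no prior turns
    system_message="You are a helpful AI assistant.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
):
    final = partial  # each yield is the full text so far, not a delta
print(final)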