Staticaliza committed on
Commit
5ad9406
·
verified ·
1 Parent(s): d90f2bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -115
app.py CHANGED
@@ -1,125 +1,36 @@
1
- import os
2
- os.system("pip install git+https://github.com/shumingma/transformers.git accelerate")
3
-
4
- import threading
5
- import torch
6
- import torch._dynamo
7
- torch._dynamo.config.suppress_errors = True
8
-
9
- from transformers import (
10
- AutoModelForCausalLM,
11
- AutoTokenizer,
12
- TextIteratorStreamer,
13
- )
14
- import gradio as gr
15
  import spaces
16
-
17
# Hub identifier shared by the tokenizer and the model below.
model_id = "microsoft/bitnet-b1.58-2B-4T"

# The tokenizer is loaded straight from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# No device_map is passed, so the weights are loaded on the default (CPU)
# device. bfloat16 can be emulated on CPU, and low_cpu_mem_usage lowers the
# peak RAM needed while loading (it requires accelerate).
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

# Report which device the first parameter tensor lives on.
first_parameter = next(model.parameters())
print(first_parameter.device)
30
-
31
  @spaces.GPU(duration=15)
32
  def gpu():
33
  print("[GPU] | GPU maintained.")
34
 
35
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The newest user message.
        history: Earlier ``(user, assistant)`` turns; empty entries are
            skipped.
        system_message: Text placed in the leading system turn.
        max_tokens: Passed to ``model.generate`` as ``max_new_tokens``.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.
    """
    # Rebuild the conversation in chat-template form: system turn first,
    # then every non-empty past turn, then the new user message.
    conversation = [{"role": "system", "content": system_message}]
    for past_user, past_bot in history:
        for role, text in (("user", past_user), ("assistant", past_bot)):
            if text:
                conversation.append({"role": role, "content": text})
    conversation.append({"role": "user", "content": message})

    chat_prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(chat_prompt, return_tensors="pt")

    token_stream = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Generation runs on a worker thread so this generator can consume the
    # streamer while tokens are still being produced.
    worker = threading.Thread(
        target=model.generate,
        kwargs=dict(
            **encoded,
            streamer=token_stream,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        ),
    )
    worker.start()

    reply_so_far = ""
    for piece in token_stream:
        reply_so_far += piece
        yield reply_so_far
74
 
75
# Each example row is: message, system message, max new tokens,
# temperature, top-p -- the same order as the additional inputs below.
chat_examples = [
    ["Hello! How are you?", "You are a helpful AI assistant for everyday tasks.", 512, 0.7, 0.95],
    ["Can you code a snake game in Python?", "You are a helpful AI assistant for coding.", 2048, 0.7, 0.95],
]

# Extra controls shown next to the chat box; their values are handed to
# `respond` after the message and history.
extra_controls = [
    gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
    gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]

demo = gr.ChatInterface(
    fn=respond,
    title="Bitnet-b1.58-2B-4T Chatbot",
    description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
    examples=chat_examples,
    additional_inputs=extra_controls,
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import spaces
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  @spaces.GPU(duration=15)
3
  def gpu():
4
  print("[GPU] | GPU maintained.")
5
 
6
+ import os
7
+ import subprocess
8
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
# Model file handed to run_inference.py; override it with the MODEL_PATH
# environment variable.
MODEL_PATH = os.environ.get("MODEL_PATH", "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf")


def generate(prompt, max_tokens=128, temperature=0.7):
    """Run ``run_inference.py`` on ``prompt`` and return what it printed.

    Args:
        prompt: Text to complete. A missing or whitespace-only prompt returns
            ``""`` without starting a child process.
        max_tokens: Token budget passed as ``-n``. It is truncated to an
            integer because UI sliders may deliver floats such as ``128.0``.
            NOTE(review): assumes the script parses ``-n`` as an int --
            confirm against run_inference.py.
        temperature: Sampling temperature passed as ``-temp``.

    Returns:
        The child process's standard output with surrounding whitespace
        stripped.

    Raises:
        RuntimeError: If the child process cannot be launched or exits with a
            non-zero status. The message carries the exit status and stderr,
            so a failed run is not mistaken for an empty completion.
    """
    # Imported locally so this block does not depend on the file-level
    # import list.
    import sys

    if not prompt or not prompt.strip():
        return ""

    cmd = [
        # Use the interpreter running this app rather than whatever "python"
        # resolves to on PATH (which may be missing or a different env).
        sys.executable, "run_inference.py",
        "-m", MODEL_PATH,
        "-p", prompt,
        "-n", str(int(max_tokens)),
        "-temp", str(temperature),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        raise RuntimeError(f"could not launch run_inference.py: {err}") from err

    if result.returncode != 0:
        detail = result.stderr.strip() or "no error output"
        raise RuntimeError(
            f"run_inference.py exited with status {result.returncode}: {detail}"
        )
    return result.stdout.strip()
22
+
23
# Input widgets, in the positional order `generate` receives them:
# prompt, max_tokens, temperature.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here", label="Prompt")
max_tokens_slider = gr.Slider(minimum=1, maximum=512, value=128, step=1, label="Max Tokens")
temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.01, label="Temperature")

iface = gr.Interface(
    fn=generate,
    inputs=[prompt_box, max_tokens_slider, temperature_slider],
    outputs=gr.Textbox(label="Completion"),
    title="BitNet.cpp Completion Demo",
    description="demo of bitnet.cpp inference for 1-bit llms",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()