VisoLearn committed
Commit 580e705 · verified · 1 Parent(s): 944ad0c

Update app.py

Files changed (1)
  1. app.py +79 -122
app.py CHANGED
@@ -1,189 +1,146 @@
Previous version of app.py:

  import gradio as gr
- import spaces
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import torch
  from threading import Thread
- import bitsandbytes as bnb

  phi4_model_path = "Compumacy/OpenBioLLm-70B"
-
  device = "cuda:0" if torch.cuda.is_available() else "cpu"

- # Load model with 4-bit quantization
- from transformers import BitsAndBytesConfig

- # Configure 4-bit quantization
- quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.float16,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4"
  )

- phi4_model = AutoModelForCausalLM.from_pretrained(
      phi4_model_path,
      device_map="auto",
-     quantization_config=quantization_config
  )
- phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)

- @spaces.GPU(duration=120)
  def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
      if not user_message.strip():
          return history_state, history_state
-
-     # Phi-4 model settings
-     model = phi4_model
-     tokenizer = phi4_tokenizer
-     start_tag = "<|im_start|>"
-     sep_tag = "<|im_sep|>"
-     end_tag = "<|im_end|>"
-
-     # Recommended prompt settings by Microsoft
-     system_message = "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:"
      prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
-     for message in history_state:
-         if message["role"] == "user":
-             prompt += f"{start_tag}user{sep_tag}{message['content']}{end_tag}"
-         elif message["role"] == "assistant" and message["content"]:
-             prompt += f"{start_tag}assistant{sep_tag}{message['content']}{end_tag}"
      prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"

      inputs = tokenizer(prompt, return_tensors="pt").to(device)

-     do_sample = not (temperature == 1.0 and top_k >= 100 and top_p == 1.0)
-
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
-
-     # sampling techniques
      generation_kwargs = {
-         "input_ids": inputs["input_ids"],
-         "attention_mask": inputs["attention_mask"],
          "max_new_tokens": int(max_tokens),
          "do_sample": True,
-         "temperature": temperature,  # Use the slider value
          "top_k": int(top_k),
-         "top_p": top_p,  # Use the slider value
          "repetition_penalty": repetition_penalty,
-         "streamer": streamer,
      }

-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()

-     # Stream the response
      assistant_response = ""
      new_history = history_state + [
          {"role": "user", "content": user_message},
          {"role": "assistant", "content": ""}
      ]
-     for new_token in streamer:
-         cleaned_token = new_token.replace("<|im_start|>", "").replace("<|im_sep|>", "").replace("<|im_end|>", "")
-         assistant_response += cleaned_token
-         new_history[-1]["content"] = assistant_response.strip()
          yield new_history, new_history

      yield new_history, new_history


  example_messages = {
-     "Math reasoning": "If a rectangular prism has a length of 6 cm, a width of 4 cm, and a height of 5 cm, what is the length of the longest line segment that can be drawn from one vertex to another?",
-     "Logic puzzle": "Four people (Alex, Blake, Casey, and Dana) each have a different favorite color (red, blue, green, yellow) and a different favorite fruit (apple, banana, cherry, date). Given the following clues: 1) The person who likes red doesn't like dates. 2) Alex likes yellow. 3) The person who likes blue likes cherries. 4) Blake doesn't like apples or bananas. 5) Casey doesn't like yellow or green. Who likes what color and what fruit?",
-     "Physics problem": "A ball is thrown upward with an initial velocity of 15 m/s from a height of 2 meters above the ground. Assuming the acceleration due to gravity is 9.8 m/s², determine: 1) The maximum height the ball reaches. 2) The total time the ball is in the air before hitting the ground. 3) The velocity with which the ball hits the ground."
  }


  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown(
-         """
-         # try the example problems below to see how the model breaks down complex reasoning problems.
-         ## *Running with 4-bit quantization*
-         """
-     )
-
-     history_state = gr.State([])

      with gr.Row():
          with gr.Column(scale=1):
              gr.Markdown("### Settings")
-             max_tokens_slider = gr.Slider(
-                 minimum=64,
-                 maximum=32768,
-                 step=1024,
-                 value=4096,
-                 label="Max Tokens"
-             )
              with gr.Accordion("Advanced Settings", open=False):
-                 temperature_slider = gr.Slider(
-                     minimum=0.1,
-                     maximum=2.0,
-                     value=0.8,
-                     label="Temperature"
-                 )
-                 top_k_slider = gr.Slider(
-                     minimum=1,
-                     maximum=100,
-                     step=1,
-                     value=50,
-                     label="Top-k"
-                 )
-                 top_p_slider = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=0.95,
-                     label="Top-p"
-                 )
-                 repetition_penalty_slider = gr.Slider(
-                     minimum=1.0,
-                     maximum=2.0,
-                     value=1.0,
-                     label="Repetition Penalty"
-                 )
-
          with gr.Column(scale=4):
              chatbot = gr.Chatbot(label="Chat", type="messages")
              with gr.Row():
-                 user_input = gr.Textbox(
-                     label="Your message",
-                     placeholder="Type your message here...",
-                     scale=3
-                 )
                  submit_button = gr.Button("Send", variant="primary", scale=1)
                  clear_button = gr.Button("Clear", scale=1)
              gr.Markdown("**Try these examples:**")
              with gr.Row():
-                 example1_button = gr.Button("Math reasoning")
-                 example2_button = gr.Button("Logic puzzle")
-                 example3_button = gr.Button("Physics problem")

      submit_button.click(
          fn=generate_response,
          inputs=[user_input, max_tokens_slider, temperature_slider, top_k_slider, top_p_slider, repetition_penalty_slider, history_state],
          outputs=[chatbot, history_state]
-     ).then(
-         fn=lambda: gr.update(value=""),
-         inputs=None,
-         outputs=user_input
-     )

-     clear_button.click(
-         fn=lambda: ([], []),
-         inputs=None,
-         outputs=[chatbot, history_state]
-     )
-
-     example1_button.click(
-         fn=lambda: gr.update(value=example_messages["Math reasoning"]),
-         inputs=None,
-         outputs=user_input
-     )
-     example2_button.click(
-         fn=lambda: gr.update(value=example_messages["Logic puzzle"]),
-         inputs=None,
-         outputs=user_input
-     )
-     example3_button.click(
-         fn=lambda: gr.update(value=example_messages["Physics problem"]),
-         inputs=None,
-         outputs=user_input
-     )

- demo.launch(ssr_mode=False)
Updated version of app.py:

  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+ from accelerate import init_empty_weights, load_checkpoint_and_dispatch
  import torch
  from threading import Thread

+ # Model and device configuration
  phi4_model_path = "Compumacy/OpenBioLLm-70B"
  device = "cuda:0" if torch.cuda.is_available() else "cpu"

+ # === INITIALIZE EMPTY WEIGHTS ===
+ init_empty_weights()
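Note (not part of this commit): in accelerate, init_empty_weights is a context manager, so the bare call above does nothing on its own. A minimal sketch of the usual meta-device pattern, reusing the phi4_model_path defined above:

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained(phi4_model_path)
    with init_empty_weights():
        # parameters are created on the "meta" device, so this allocates almost no memory
        empty_model = AutoModelForCausalLM.from_config(config)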
 
+ # === CONFIGURE 4-BIT QUANTIZATION ===
+ bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.float16,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4"
  )

+ # === LOAD MODEL WITH QUANTIZATION ===
+ model = AutoModelForCausalLM.from_pretrained(
+     phi4_model_path,
+     quantization_config=bnb_config,
+     torch_dtype=torch.float16,
+     device_map="auto"
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
+
+ # === OFFLOAD TO CPU/DISK ===
+ model = load_checkpoint_and_dispatch(
+     model,
      phi4_model_path,
      device_map="auto",
+     offload_folder="offload",
+     offload_state_dict=True,
+     max_memory={**{i: "12GB" for i in range(torch.cuda.device_count())}, "cpu": "30GB"}
  )
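Note (not part of this commit): from_pretrained with a BitsAndBytesConfig and device_map="auto" already quantizes and places the weights, and load_checkpoint_and_dispatch expects a local checkpoint path rather than a Hub model id, so the dispatch step above may be redundant or fail. A minimal sketch of capping memory and offloading directly at load time, with illustrative memory limits:

    model = AutoModelForCausalLM.from_pretrained(
        phi4_model_path,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory={0: "12GiB", "cpu": "30GiB"},  # illustrative per-device limits
        offload_folder="offload",                 # spill layers that do not fit to disk
    )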
 
+ # Enable gradient checkpointing if ever fine-tuning
+ model.gradient_checkpointing_enable()
+
+ # Optionally compile for PyTorch >= 2.0
+ try:
+     model = torch.compile(model)
+ except Exception:
+     pass
+
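Note (not part of this commit): torch.compile also takes a mode argument; a common variant for serving, assuming PyTorch >= 2.0, looks like:

    try:
        model = torch.compile(model, mode="reduce-overhead")
    except Exception:
        pass  # fall back to the uncompiled model on older PyTorch or unsupported setups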
+ # === RESPONSE GENERATOR ===
  def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
      if not user_message.strip():
          return history_state, history_state
+
+     # Prompt setup
+     system_message = (
+         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
+     )
+     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
      prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
+     for msg in history_state:
+         tag = msg["role"]
+         content = msg["content"]
+         prompt += f"{start_tag}{tag}{sep_tag}{content}{end_tag}"
      prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"

      inputs = tokenizer(prompt, return_tensors="pt").to(device)

+     # Streaming setup
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
      generation_kwargs = {
+         "input_ids": inputs.input_ids,
+         "attention_mask": inputs.attention_mask,
          "max_new_tokens": int(max_tokens),
          "do_sample": True,
+         "temperature": temperature,
          "top_k": int(top_k),
+         "top_p": top_p,
          "repetition_penalty": repetition_penalty,
+         "streamer": streamer
      }

+     # Run generation in thread
+     Thread(target=model.generate, kwargs=generation_kwargs).start()

      assistant_response = ""
      new_history = history_state + [
          {"role": "user", "content": user_message},
          {"role": "assistant", "content": ""}
      ]
+
+     # Stream tokens
+     for token in streamer:
+         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
+         assistant_response += clean
+         new_history[-1]["content"] = assistant_response
          yield new_history, new_history

      yield new_history, new_history
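Note (not part of this commit): TextIteratorStreamer forwards extra keyword arguments to tokenizer.decode, so if the chat tags are registered as special tokens they can be dropped at decode time instead of with string replacements, and a timeout keeps the UI loop from blocking forever if the generate thread dies. A minimal sketch under those assumptions:

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,  # drop registered special tokens while decoding
        timeout=300.0,             # raise instead of waiting indefinitely for new tokens
    )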
 
+ # === EXAMPLE MESSAGES ===
  example_messages = {
+     "Math reasoning": "If a rectangular prism has a length of 6 cm...",
+     "Logic puzzle": "Four people (Alex, Blake, Casey, ...)",
+     "Physics problem": "A ball is thrown upward with an initial velocity..."
  }
+ # === GRADIO APP ===
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # Phi-4 Chat
+     Try the example problems below to see how the model breaks down complex reasoning.
+     """)

+     history_state = gr.State([])
      with gr.Row():
          with gr.Column(scale=1):
              gr.Markdown("### Settings")
+             max_tokens_slider = gr.Slider(64, 32768, step=1024, value=2048, label="Max Tokens")
              with gr.Accordion("Advanced Settings", open=False):
+                 temperature_slider = gr.Slider(0.1, 2.0, value=0.8, label="Temperature")
+                 top_k_slider = gr.Slider(1, 100, step=1, value=50, label="Top-k")
+                 top_p_slider = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")
+                 repetition_penalty_slider = gr.Slider(1.0, 2.0, value=1.0, label="Repetition Penalty")
          with gr.Column(scale=4):
              chatbot = gr.Chatbot(label="Chat", type="messages")
              with gr.Row():
+                 user_input = gr.Textbox(placeholder="Type your message...", scale=3)
                  submit_button = gr.Button("Send", variant="primary", scale=1)
                  clear_button = gr.Button("Clear", scale=1)
              gr.Markdown("**Try these examples:**")
              with gr.Row():
+                 for name in example_messages:
+                     btn = gr.Button(name)
+                     btn.click(fn=lambda n=name: gr.update(value=example_messages[n]), inputs=None, outputs=user_input)
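Note (not part of this commit): in the button loop above, binding the loop variable through a default argument (n=name) is what keeps each button tied to its own example; a plain closure would see only the last value of name once the loop finishes. A two-line illustration of the difference:

    getters_late  = [lambda: name for name in example_messages]      # every call returns the last key
    getters_bound = [lambda n=name: n for name in example_messages]  # each call returns its own key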
      submit_button.click(
          fn=generate_response,
          inputs=[user_input, max_tokens_slider, temperature_slider, top_k_slider, top_p_slider, repetition_penalty_slider, history_state],
          outputs=[chatbot, history_state]
+     ).then(lambda: gr.update(value=""), None, user_input)

+     clear_button.click(lambda: ([], []), None, [chatbot, history_state])

+ demo.launch(ssr_mode=False)
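Note (not part of this commit): generate_response is a generator, so the streamed chat updates rely on Gradio's event queue. Recent Gradio releases enable the queue by default; on older 3.x versions it has to be turned on explicitly, for example:

    demo.queue().launch(ssr_mode=False)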