VisoLearn committed
Commit b5ca495 · verified · 1 Parent(s): 6211171

Update app.py

Files changed (1)
  1. app.py +30 -43
app.py CHANGED
@@ -1,75 +1,62 @@
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from transformers import AutoTokenizer, TextIteratorStreamer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 import torch
 from threading import Thread
 
 # Model and device configuration
 phi4_model_path = "Compumacy/OpenBioLLm-70B"
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# === INITIALIZE EMPTY WEIGHTS ===
-init_empty_weights()
-
-# === CONFIGURE 4-BIT QUANTIZATION ===
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# === GPTQ 2-bit QUANTIZATION CONFIG ===
+quantize_config = BaseQuantizeConfig(
+    load_in_4bit=False,
+    load_in_8bit=False,
+    quantization_bit=2,
+    compute_dtype=torch.float16,
+    use_double_quant=True,
+    quant_type="nf4"
 )
 
-# === LOAD MODEL WITH QUANTIZATION ===
-model = AutoModelForCausalLM.from_pretrained(
-    phi4_model_path,
-    quantization_config=bnb_config,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-
-tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
-
-# === OFFLOAD TO CPU/DISK ===
-model = load_checkpoint_and_dispatch(
-    model,
+# === LOAD GPTQ-QUANTIZED MODEL ===
+model = AutoGPTQForCausalLM.from_quantized(
     phi4_model_path,
+    quantize_config=quantize_config,
     device_map="auto",
-    offload_folder="offload",
-    offload_state_dict=True,
-    max_memory={**{i: "12GB" for i in range(torch.cuda.device_count())}, "cpu": "30GB"}
+    use_safetensors=True,
 )
 
-# Enable gradient checkpointing if ever fine-tuning
-model.gradient_checkpointing_enable()
+tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
 
-# Optionally compile for PyTorch >= 2.0
+# === OPTIONAL: TorchCompile for optimization (PyTorch >= 2.0) ===
 try:
     model = torch.compile(model)
 except Exception:
     pass
 
-# === RESPONSE GENERATOR ===
+# === STREAMING RESPONSE GENERATOR ===
 @spaces.GPU()
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
     if not user_message.strip():
         return history_state, history_state
 
-    # Prompt setup
+    # System prompt prefix
     system_message = (
         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
     )
     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
+
+    # Build full prompt
     prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
     for msg in history_state:
-        tag = msg["role"]
-        content = msg["content"]
-        prompt += f"{start_tag}{tag}{sep_tag}{content}{end_tag}"
+        prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
+    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
-    # Streaming setup
+    # Set up streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": inputs.input_ids,
@@ -83,7 +70,7 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
         "streamer": streamer
     }
 
-    # Run generation in thread
+    # Launch generation
     Thread(target=model.generate, kwargs=generation_kwargs).start()
 
     assistant_response = ""
@@ -92,7 +79,7 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
         {"role": "assistant", "content": ""}
     ]
 
-    # Stream tokens
+    # Stream tokens back to Gradio
     for token in streamer:
         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
         assistant_response += clean
@@ -111,7 +98,7 @@ example_messages = {
 # === GRADIO APP ===
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Phi-4 Chat
+    # Phi-4 Chat with GPTQ Quant
     Try the example problems below to see how the model breaks down complex reasoning.
     """ )
 
@@ -133,9 +120,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        clear_button = gr.Button("Clear", scale=1)
     gr.Markdown("**Try these examples:**")
     with gr.Row():
-        for name in example_messages:
+        for name, text in example_messages.items():
             btn = gr.Button(name)
-            btn.click(fn=lambda n=name: gr.update(value=example_messages[n]), inputs=None, outputs=user_input)
+            btn.click(fn=lambda t=text: gr.update(value=t), None, user_input)
 
     submit_button.click(
         fn=generate_response,
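
For reference, below is a minimal sketch of how auto_gptq is typically used to load an already-quantized checkpoint and stream tokens with transformers, assuming the repo at phi4_model_path ships GPTQ safetensors; the bits/group_size/desc_act values and the stream_reply helper are illustrative assumptions, not taken from this commit.

# Sketch only: assumes a GPTQ-quantized checkpoint exists at model_path.
import torch
from threading import Thread
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer, TextIteratorStreamer

model_path = "Compumacy/OpenBioLLm-70B"  # assumed to contain GPTQ weights
device = "cuda" if torch.cuda.is_available() else "cpu"

# Standard BaseQuantizeConfig fields (values are illustrative, not from this commit).
quantize_config = BaseQuantizeConfig(
    bits=4,          # GPTQ weight precision; 2/3/4/8 are supported
    group_size=128,  # quantization group size
    desc_act=False,  # disable activation-order quantization
)

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    quantize_config=quantize_config,
    device_map="auto",
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def stream_reply(prompt: str, max_new_tokens: int = 256):
    """Yield the growing reply, mirroring the app's streamer-plus-thread pattern."""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    kwargs = {"input_ids": inputs.input_ids, "max_new_tokens": max_new_tokens, "streamer": streamer}
    Thread(target=model.generate, kwargs=kwargs).start()
    text = ""
    for piece in streamer:
        text += piece
        yield text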