VisoLearn committed
Commit fa6be8b · verified · Parent: 04dde6a

Update app.py

Files changed (1): app.py (+7, -12)
app.py CHANGED
@@ -7,18 +7,21 @@ from threading import Thread
 
 # Model and device configuration
 phi4_model_path = "Compumacy/OpenBioLLm-70B"
+# Specify the base filename of the GPTQ checkpoint in the repo
+model_basename = "gptq_model-2bit-128g.safetensors"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # === GPTQ 2-bit QUANTIZATION CONFIG ===
 quantize_config = BaseQuantizeConfig(
     bits=2,          # 2-bit quantization
     group_size=128,  # grouping size
-    desc_act=False   # disable descending activations for speed
+    desc_act=False   # disable descending activations
 )
 
 # === LOAD GPTQ-QUANTIZED MODEL ===
 model = AutoGPTQForCausalLM.from_quantized(
     phi4_model_path,
+    model_basename=model_basename,
     quantize_config=quantize_config,
     device_map="auto",
     use_safetensors=True,
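
The hunk above is the main functional change in this commit: the loader is pointed at an explicit checkpoint file via model_basename instead of letting AutoGPTQ guess the filename. Below is a minimal standalone sketch of the same load call, assuming the repo ships a single GPTQ safetensors shard under that basename; note that some auto-gptq releases append the .safetensors extension themselves, so the basename may need to be passed without it. None of this sketch is part of the diff.

# Sketch only, not part of the diff: load a GPTQ checkpoint by explicit basename.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

repo_id = "Compumacy/OpenBioLLm-70B"
quantize_config = BaseQuantizeConfig(bits=2, group_size=128, desc_act=False)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoGPTQForCausalLM.from_quantized(
    repo_id,
    model_basename="gptq_model-2bit-128g",  # drop the extension if your auto-gptq version adds it
    quantize_config=quantize_config,
    device_map="auto",
    use_safetensors=True,
)
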
@@ -38,22 +41,18 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
     if not user_message.strip():
         return history_state, history_state
 
-    # System prompt prefix
     system_message = (
         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
     )
     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
 
-    # Build full prompt
+    # Build prompt
     prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
     for msg in history_state:
         prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
-    # Tokenize and move to device
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-    # Set up streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": inputs.input_ids,
@@ -67,7 +66,6 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         "streamer": streamer
     }
 
-    # Launch generation
     Thread(target=model.generate, kwargs=generation_kwargs).start()
 
     assistant_response = ""
@@ -76,7 +74,6 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         {"role": "assistant", "content": ""}
     ]
 
-    # Stream tokens back to Gradio
     for token in streamer:
         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
         assistant_response += clean
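
The generation path above follows the usual non-blocking streaming pattern: model.generate runs on a worker thread while TextIteratorStreamer hands decoded text back to the Gradio callback as it is produced. A compressed, self-contained sketch of that pattern follows; the function name and the max_new_tokens value are illustrative, not taken from this file.

# Sketch of the streaming pattern used above, not part of the diff.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, prompt, device, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    # generate() blocks until finished, so it runs in a background thread
    Thread(target=model.generate, kwargs={
        "input_ids": inputs.input_ids,
        "max_new_tokens": max_new_tokens,
        "streamer": streamer,
    }).start()
    reply = ""
    for chunk in streamer:  # yields decoded text pieces as tokens are generated
        reply += chunk
        yield reply
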
@@ -85,14 +82,13 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
 
     yield new_history, new_history
 
-# === EXAMPLE MESSAGES ===
+# === EXAMPLES ===
 example_messages = {
     "Math reasoning": "If a rectangular prism has a length of 6 cm...",
     "Logic puzzle": "Four people (Alex, Blake, Casey, ...)",
     "Physics problem": "A ball is thrown upward with an initial velocity..."
 }
 
-# === GRADIO APP ===
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # Phi-4 Chat with GPTQ Quant
@@ -131,6 +127,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 demo.launch(ssr_mode=False)
 
-# Note:
-# To get CUDA extensions (nf4, double quant, etc.) back, reinstall AutoGPTQ with CUDA support:
+# If you still see missing CUDA kernels warnings, reinstall AutoGPTQ with CUDA support:
 # pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]
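
The closing note only matters if AutoGPTQ was installed without its CUDA kernels, and the exact warning text varies by version. One quick, assumption-laden sanity check before reinstalling:

# Illustrative check: confirm PyTorch can see a GPU at all before suspecting auto-gptq.
import torch
print(torch.cuda.is_available())  # expect True on a CUDA machine
print(torch.cuda.device_count())  # number of visible GPUs
# If this prints False / 0, reinstalling auto-gptq will not help; fix the CUDA setup first.
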
 