import spaces
import gradio as gr
from transformers import AutoTokenizer, TextIteratorStreamer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import torch
from threading import Thread
# Model and device configuration
phi4_model_path = "Compumacy/OpenBioLLm-70B"
# Base filename of the GPTQ checkpoint in the repo (AutoGPTQ appends the .safetensors extension itself)
model_basename = "gptq_model-2bit-128g"
device = "cuda" if torch.cuda.is_available() else "cpu"
# === GPTQ 2-bit QUANTIZATION CONFIG ===
quantize_config = BaseQuantizeConfig(
    bits=2,            # 2-bit quantization
    group_size=128,    # quantization group size
    desc_act=False     # disable activation-order ("act-order") quantization
)
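# NOTE: bits and group_size must match how the checkpoint referenced by model_basename was actually quantized,
# otherwise loading will fail or produce garbage weights.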
# === LOAD GPTQ-QUANTIZED MODEL ===
model = AutoGPTQForCausalLM.from_quantized(
    phi4_model_path,
    model_basename=model_basename,
    quantize_config=quantize_config,
    device_map="auto",
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
# === OPTIONAL: TorchCompile for optimization (PyTorch >= 2.0) ===
try:
    model = torch.compile(model)
except Exception:
    # torch.compile is unavailable (PyTorch < 2.0) or unsupported for this model; fall back to eager mode
    pass
# === STREAMING RESPONSE GENERATOR ===
@spaces.GPU()
def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
    if not user_message.strip():
        # Nothing to generate for an empty message; emit the unchanged history once and stop
        yield history_state, history_state
        return
    system_message = (
        "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
    )
    start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
    # Build the chat-formatted prompt: system message, prior turns, then the new user turn
    prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
    for msg in history_state:
        prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
    prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
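    # TextIteratorStreamer yields decoded text chunks as model.generate() runs in another thread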
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    generation_kwargs = {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "max_new_tokens": int(max_tokens),
        "do_sample": True,
        "temperature": temperature,
        "top_k": int(top_k),
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer
    }
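    # Run generation in a background thread so this generator can consume the streamer concurrently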
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    assistant_response = ""
    new_history = history_state + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": ""}
    ]
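    # Stream tokens, strip the chat-template tags, and update the last assistant turn incrementally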
    for token in streamer:
        clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
        assistant_response += clean
        new_history[-1]["content"] = assistant_response
        yield new_history, new_history
    yield new_history, new_history
# === EXAMPLES ===
example_messages = {
"Math reasoning": "If a rectangular prism has a length of 6 cm...",
"Logic puzzle": "Four people (Alex, Blake, Casey, ...)",
"Physics problem": "A ball is thrown upward with an initial velocity..."
}
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# Phi-4 Chat with GPTQ Quant
Try the example problems below to see how the model breaks down complex reasoning.
""" )
history_state = gr.State([])
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Settings")
max_tokens_slider = gr.Slider(64, 32768, step=1024, value=2048, label="Max Tokens")
with gr.Accordion("Advanced Settings", open=False):
temperature_slider = gr.Slider(0.1, 2.0, value=0.8, label="Temperature")
top_k_slider = gr.Slider(1, 100, step=1, value=50, label="Top-k")
top_p_slider = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")
repetition_penalty_slider = gr.Slider(1.0, 2.0, value=1.0, label="Repetition Penalty")
with gr.Column(scale=4):
chatbot = gr.Chatbot(label="Chat", type="messages")
with gr.Row():
user_input = gr.Textbox(placeholder="Type your message...", scale=3)
submit_button = gr.Button("Send", variant="primary", scale=1)
clear_button = gr.Button("Clear", scale=1)
gr.Markdown("**Try these examples:**")
with gr.Row():
for name, text in example_messages.items():
btn = gr.Button(name)
btn.click(fn=lambda t=text: gr.update(value=t), inputs=None, outputs=user_input)
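    # Event wiring: stream the model's reply into the chatbot, then clear the input box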
    submit_button.click(
        fn=generate_response,
        inputs=[user_input, max_tokens_slider, temperature_slider, top_k_slider, top_p_slider, repetition_penalty_slider, history_state],
        outputs=[chatbot, history_state]
    ).then(lambda: gr.update(value=""), None, user_input)
    clear_button.click(lambda: ([], []), None, [chatbot, history_state])
demo.launch(ssr_mode=False)
# If you still see missing CUDA kernels warnings, reinstall AutoGPTQ with CUDA support:
# pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]
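# A minimal requirements.txt for this Space would need at least the packages imported above
# (left unpinned here; exact versions are an assumption):
#   gradio
#   spaces
#   transformers
#   torch
#   auto-gptq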