MedicallAssistant

Running on Zero

App Files Community

VisoLearn committed 26 days ago

Commit

b5ca495

verified ·

1 Parent(s): 6211171

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -43

app.py CHANGED Viewed

@@ -1,75 +1,62 @@
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 import torch
 from threading import Thread
 # Model and device configuration
 phi4_model_path = "Compumacy/OpenBioLLm-70B"
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# === INITIALIZE EMPTY WEIGHTS ===
-init_empty_weights()
-# === CONFIGURE 4-BIT QUANTIZATION ===
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4"
 )
-# === LOAD MODEL WITH QUANTIZATION ===
-model = AutoModelForCausalLM.from_pretrained(
-    phi4_model_path,
-    quantization_config=bnb_config,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
-# === OFFLOAD TO CPU/DISK ===
-model = load_checkpoint_and_dispatch(
-    model,
     phi4_model_path,
     device_map="auto",
-    offload_folder="offload",
-    offload_state_dict=True,
-    max_memory={**{i: "12GB" for i in range(torch.cuda.device_count())}, "cpu": "30GB"}
 )
-# Enable gradient checkpointing if ever fine-tuning
-model.gradient_checkpointing_enable()
-# Optionally compile for PyTorch >= 2.0
 try:
     model = torch.compile(model)
 except Exception:
     pass
-# === RESPONSE GENERATOR ===
 @spaces.GPU()
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
     if not user_message.strip():
         return history_state, history_state
-    # Prompt setup
     system_message = (
         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
     )
     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
     prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
     for msg in history_state:
-        tag = msg["role"]
-        content = msg["content"]
-        prompt += f"{start_tag}{tag}{sep_tag}{content}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    # Streaming setup
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": inputs.input_ids,
@@ -83,7 +70,7 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         "streamer": streamer
     }
-    # Run generation in thread
     Thread(target=model.generate, kwargs=generation_kwargs).start()
     assistant_response = ""
@@ -92,7 +79,7 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         {"role": "assistant", "content": ""}
     ]
-    # Stream tokens
     for token in streamer:
         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
         assistant_response += clean
@@ -111,7 +98,7 @@ example_messages = {
 # === GRADIO APP ===
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Phi-4 Chat
     Try the example problems below to see how the model breaks down complex reasoning.
     """ )
@@ -133,9 +120,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 clear_button = gr.Button("Clear", scale=1)
             gr.Markdown("**Try these examples:**")
             with gr.Row():
-                for name in example_messages:
                     btn = gr.Button(name)
-                    btn.click(fn=lambda n=name: gr.update(value=example_messages[n]), inputs=None, outputs=user_input)
     submit_button.click(
         fn=generate_response,

 import spaces
 import gradio as gr
+from transformers import AutoTokenizer, TextIteratorStreamer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 import torch
 from threading import Thread
 # Model and device configuration
 phi4_model_path = "Compumacy/OpenBioLLm-70B"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# === GPTQ 2-bit QUANTIZATION CONFIG ===
+quantize_config = BaseQuantizeConfig(
+    load_in_4bit=False,
+    load_in_8bit=False,
+    quantization_bit=2,
+    compute_dtype=torch.float16,
+    use_double_quant=True,
+    quant_type="nf4"
 )
+# === LOAD GPTQ-QUANTIZED MODEL ===
+model = AutoGPTQForCausalLM.from_quantized(
     phi4_model_path,
+    quantize_config=quantize_config,
     device_map="auto",
+    use_safetensors=True,
 )
+tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
+# === OPTIONAL: TorchCompile for optimization (PyTorch >= 2.0) ===
 try:
     model = torch.compile(model)
 except Exception:
     pass
+# === STREAMING RESPONSE GENERATOR ===
 @spaces.GPU()
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
     if not user_message.strip():
         return history_state, history_state
+    # System prompt prefix
     system_message = (
         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
     )
     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
+    # Build full prompt
     prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
     for msg in history_state:
+        prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
+    # Tokenize and move to device
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    # Set up streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": inputs.input_ids,
         "streamer": streamer
     }
+    # Launch generation
     Thread(target=model.generate, kwargs=generation_kwargs).start()
     assistant_response = ""
         {"role": "assistant", "content": ""}
     ]
+    # Stream tokens back to Gradio
     for token in streamer:
         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
         assistant_response += clean
 # === GRADIO APP ===
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # Phi-4 Chat with GPTQ Quant
     Try the example problems below to see how the model breaks down complex reasoning.
     """ )
                 clear_button = gr.Button("Clear", scale=1)
             gr.Markdown("**Try these examples:**")
             with gr.Row():
+                for name, text in example_messages.items():
                     btn = gr.Button(name)
+                    btn.click(fn=lambda t=text: gr.update(value=t), None, user_input)
     submit_button.click(
         fn=generate_response,