Update app.py
app.py CHANGED
@@ -7,18 +7,21 @@ from threading import Thread
 
 # Model and device configuration
 phi4_model_path = "Compumacy/OpenBioLLm-70B"
+# Specify the base filename of the GPTQ checkpoint in the repo
+model_basename = "gptq_model-2bit-128g.safetensors"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # === GPTQ 2-bit QUANTIZATION CONFIG ===
 quantize_config = BaseQuantizeConfig(
     bits=2,          # 2-bit quantization
     group_size=128,  # grouping size
-    desc_act=False   # disable descending activations
+    desc_act=False   # disable descending activations
 )
 
 # === LOAD GPTQ-QUANTIZED MODEL ===
 model = AutoGPTQForCausalLM.from_quantized(
     phi4_model_path,
+    model_basename=model_basename,
     quantize_config=quantize_config,
     device_map="auto",
     use_safetensors=True,
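For reference, the loading path this hunk configures looks roughly like the sketch below. It is a minimal, illustrative reconstruction assuming the auto-gptq and transformers packages; the repo id, checkpoint basename, and quantization settings are taken from the diff, while the tokenizer line and comment wording are assumptions.

```python
# Minimal sketch of the load path assumed by this change (auto-gptq + transformers).
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

phi4_model_path = "Compumacy/OpenBioLLm-70B"
model_basename = "gptq_model-2bit-128g.safetensors"  # GPTQ checkpoint file in the repo

# Quantization metadata must match how the checkpoint was produced.
quantize_config = BaseQuantizeConfig(
    bits=2,          # 2-bit weights
    group_size=128,  # quantization group size
    desc_act=False,  # activation reordering disabled
)

tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
model = AutoGPTQForCausalLM.from_quantized(
    phi4_model_path,
    model_basename=model_basename,   # which checkpoint file to load from the repo
    quantize_config=quantize_config,
    device_map="auto",
    use_safetensors=True,
)
```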
@@ -38,22 +41,18 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
     if not user_message.strip():
         return history_state, history_state
 
-    # System prompt prefix
     system_message = (
         "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process..."
     )
     start_tag, sep_tag, end_tag = "<|im_start|>", "<|im_sep|>", "<|im_end|>"
 
-    # Build
+    # Build prompt
     prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
     for msg in history_state:
         prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
-    # Tokenize and move to device
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-    # Set up streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": inputs.input_ids,
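The prompt format used in this hunk can be illustrated on its own. In the sketch below the helper name and sample history are hypothetical; the tag strings and the role/content message schema come from the diff.

```python
# Hypothetical stand-alone helper mirroring the prompt construction above.
def build_prompt(system_message, history, user_message,
                 start_tag="<|im_start|>", sep_tag="<|im_sep|>", end_tag="<|im_end|>"):
    prompt = f"{start_tag}system{sep_tag}{system_message}{end_tag}"
    for msg in history:  # each msg is {"role": ..., "content": ...}
        prompt += f"{start_tag}{msg['role']}{sep_tag}{msg['content']}{end_tag}"
    # Leave the assistant turn open so generation continues from it.
    prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
    return prompt

# Example with a hypothetical two-turn history:
history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
print(build_prompt("You are a careful assistant.", history, "What is 2 + 2?"))
```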
@@ -67,7 +66,6 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         "streamer": streamer
     }
 
-    # Launch generation
     Thread(target=model.generate, kwargs=generation_kwargs).start()
 
     assistant_response = ""
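The threading pattern here (generate() running on a worker thread while decoded text is consumed from a TextIteratorStreamer) can be tried in isolation. The sketch below is self-contained but uses a small public model ("gpt2") rather than the quantized checkpoint, purely for illustration.

```python
# Self-contained sketch of the TextIteratorStreamer + Thread pattern used above,
# demonstrated with a small public model instead of the GPTQ checkpoint.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True)
gen_kwargs = {**inputs, "max_new_tokens": 20, "streamer": streamer}

# generate() blocks, so it runs on a worker thread while this thread
# consumes decoded text chunks as they become available.
Thread(target=lm.generate, kwargs=gen_kwargs).start()
for chunk in streamer:
    print(chunk, end="", flush=True)
```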
@@ -76,7 +74,6 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         {"role": "assistant", "content": ""}
     ]
 
-    # Stream tokens back to Gradio
     for token in streamer:
         clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
         assistant_response += clean
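For the streaming loop itself, a hypothetical stand-alone version (the function name and the exact history shape around the visible lines are assumptions) makes the tag-cleaning and yield pattern explicit:

```python
# Hypothetical generator mirroring the loop above: strip chat tags from each
# decoded chunk and yield a growing messages-style history for Gradio.
def stream_to_history(streamer, history, start_tag="<|im_start|>",
                      sep_tag="<|im_sep|>", end_tag="<|im_end|>"):
    assistant_response = ""
    new_history = history + [{"role": "assistant", "content": ""}]
    for token in streamer:  # streamer yields decoded text chunks
        clean = token.replace(start_tag, "").replace(sep_tag, "").replace(end_tag, "")
        assistant_response += clean
        new_history[-1]["content"] = assistant_response.strip()
        yield new_history, new_history  # (chatbot value, state value)
```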
@@ -85,14 +82,13 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
 
         yield new_history, new_history
 
-# ===
+# === EXAMPLES ===
 example_messages = {
     "Math reasoning": "If a rectangular prism has a length of 6 cm...",
     "Logic puzzle": "Four people (Alex, Blake, Casey, ...)",
     "Physics problem": "A ball is thrown upward with an initial velocity..."
 }
 
-# === GRADIO APP ===
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # Phi-4 Chat with GPTQ Quant
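The surrounding UI is not shown in this hunk. A hypothetical minimal wiring of the example prompts into a gr.Blocks app could look like the sketch below; the component names and the stub respond() are illustrative, not the app's actual layout.

```python
# Hypothetical minimal Blocks layout: a messages-style Chatbot plus buttons
# that paste the example prompts into the input box. Not the app's real UI.
import gradio as gr

example_messages = {
    "Math reasoning": "If a rectangular prism has a length of 6 cm...",
    "Logic puzzle": "Four people (Alex, Blake, Casey, ...)",
    "Physics problem": "A ball is thrown upward with an initial velocity...",
}

def respond(message, history):
    history = (history or []) + [{"role": "user", "content": message},
                                 {"role": "assistant", "content": "(stub reply)"}]
    return history, history, ""  # chatbot, state, cleared textbox

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot(type="messages")
    state = gr.State([])
    box = gr.Textbox(label="Your message")
    box.submit(respond, [box, state], [chatbot, state, box])
    with gr.Row():
        for label, text in example_messages.items():
            gr.Button(label).click(lambda t=text: t, outputs=box)

demo.launch(ssr_mode=False)
```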
@@ -131,6 +127,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 demo.launch(ssr_mode=False)
 
-#
-# To get CUDA extensions (nf4, double quant, etc.) back, reinstall AutoGPTQ with CUDA support:
+# If you still see missing CUDA kernels warnings, reinstall AutoGPTQ with CUDA support:
 # pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]
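Before reinstalling, it may be worth confirming that a GPU is visible at all; a tiny, torch-only check (illustrative):

```python
# Quick sanity check: the CUDA kernels can only help if PyTorch sees a GPU.
import torch

if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))
else:
    print("No CUDA device visible; install a GPU-enabled PyTorch first, then:")
    print("pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]")
```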