MedicallAssistant

Sleeping

VisoLearn committed about 1 month ago

Commit

692b14d

verified ·

1 Parent(s): 13a516c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,12 +11,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # === GPTQ 2-bit QUANTIZATION CONFIG ===
 quantize_config = BaseQuantizeConfig(
-    load_in_4bit=False,
-    load_in_8bit=False,
-    quantization_bit=2,
-    compute_dtype=torch.float16,
-    use_double_quant=True,
-    quant_type="nf4"
 )
 # === LOAD GPTQ-QUANTIZED MODEL ===
@@ -132,4 +129,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     clear_button.click(lambda: ([], []), None, [chatbot, history_state])
-demo.launch(ssr_mode=False)

 # === GPTQ 2-bit QUANTIZATION CONFIG ===
 quantize_config = BaseQuantizeConfig(
+    bits=2,            # 2-bit quantization
+    group_size=128,    # grouping size
+    desc_act=False     # disable activation-order (act-order) quantization for faster inference
 )
 # === LOAD GPTQ-QUANTIZED MODEL ===
     clear_button.click(lambda: ([], []), None, [chatbot, history_state])
+demo.launch(ssr_mode=False)
+# Note:
+# To get AutoGPTQ's CUDA extensions back (note: nf4 and double quant are bitsandbytes options, not AutoGPTQ features), reinstall AutoGPTQ with CUDA support:
+# pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]