Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,12 +3,23 @@ import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
+import bitsandbytes as bnb
 
-phi4_model_path = "
+phi4_model_path = "Compumacy/OpenBioLLm-70B"
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-
+# Load model with 4-bit quantization
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    load_in_4bit=True,  # Enable 4-bit quantization
+    quantization_config={
+        "bnb_4bit_compute_dtype": torch.float16,
+        "bnb_4bit_use_double_quant": True,
+        "bnb_4bit_quant_type": "nf4"
+    }
+)
 phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
 
 @spaces.GPU(duration=120)
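Two notes on this hunk. `@spaces.GPU(duration=120)` is the ZeroGPU decorator: each call to the decorated function is granted a GPU for up to 120 seconds. `import bitsandbytes as bnb` is never referenced by name; the package only needs to be installed so that transformers can call into it. More importantly, recent transformers releases reject `load_in_4bit=True` passed alongside an explicit `quantization_config`, and the config is conventionally a `BitsAndBytesConfig` object rather than a raw dict. A minimal sketch of the equivalent load under those assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

phi4_model_path = "Compumacy/OpenBioLLm-70B"

# Same settings as the diff -- NF4 4-bit weights, float16 compute,
# double quantization -- expressed as a BitsAndBytesConfig object.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

phi4_model = AutoModelForCausalLM.from_pretrained(
    phi4_model_path,
    device_map="auto",
    quantization_config=bnb_config,  # replaces load_in_4bit=True + raw dict
)
phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
```

At roughly half a byte per parameter, NF4 weights for a 70B checkpoint still occupy on the order of 35 GB before activations, so `device_map="auto"` may offload some layers depending on the hardware.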
@@ -45,9 +56,9 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         "attention_mask": inputs["attention_mask"],
         "max_new_tokens": int(max_tokens),
         "do_sample": True,
-        "temperature":
+        "temperature": temperature,  # Use the slider value
         "top_k": int(top_k),
-        "top_p":
+        "top_p": top_p,  # Use the slider value
         "repetition_penalty": repetition_penalty,
         "streamer": streamer,
     }
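These kwargs feed the `TextIteratorStreamer` imported at the top of the file. The rest of `generate_response` is outside this diff, so the following is only a sketch of the standard streaming pattern those imports imply; the function name `stream_reply`, the prompt handling, and the `inputs` tokenization are illustrative, not the app's actual code:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty):
    # Tokenize the prompt (the app's real chat template is not shown in the diff);
    # phi4_model, phi4_tokenizer, and device come from the module-level code above.
    inputs = phi4_tokenizer(user_message, return_tensors="pt").to(device)

    # The streamer yields decoded text pieces as generate() produces tokens.
    streamer = TextIteratorStreamer(
        phi4_tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": int(max_tokens),
        "do_sample": True,
        "temperature": temperature,
        "top_k": int(top_k),
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
    }

    # generate() blocks, so it runs in a worker thread while we drain the streamer.
    thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for piece in streamer:
        partial += piece
        yield partial  # Gradio re-renders the growing reply on each yield
    thread.join()
```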
@@ -79,6 +90,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # try the example problems below to see how the model breaks down complex reasoning problems.
+        ## *Running with 4-bit quantization*
         """
     )
 
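For context, a sketch of how sliders matching `generate_response`'s signature would be wired up inside the `gr.Blocks` UI. The actual layout, component names, ranges, and defaults are not part of this diff and are assumptions here:

```python
import gradio as gr

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    user_message = gr.Textbox(label="Your question")
    # Assumed ranges/defaults; the real ones are outside this diff.
    max_tokens = gr.Slider(16, 2048, value=512, step=16, label="Max new tokens")
    temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="Temperature")
    top_k = gr.Slider(1, 100, value=50, step=1, label="Top-k")
    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
    repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repetition penalty")
    output = gr.Textbox(label="Response")

    # If generate_response yields partial strings (as the streamer suggests),
    # Gradio streams them into the output box as they arrive.
    gr.Button("Generate").click(
        generate_response,
        inputs=[user_message, max_tokens, temperature, top_k, top_p, repetition_penalty],
        outputs=output,
    )

demo.launch()
```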