VisoLearn committed (verified)
Commit 54dd705 · Parent(s): c1641dc

Update app.py

Files changed (1): app.py (+16, -4)
app.py CHANGED

@@ -3,12 +3,23 @@ import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
+import bitsandbytes as bnb
 
-phi4_model_path = "Daemontatox/Qwen3-14B-Griffon"
+phi4_model_path = "Compumacy/OpenBioLLm-70B"
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-phi4_model = AutoModelForCausalLM.from_pretrained(phi4_model_path, device_map="auto", torch_dtype="auto")
+# Load model with 4-bit quantization
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    load_in_4bit=True,  # Enable 4-bit quantization
+    quantization_config={
+        "bnb_4bit_compute_dtype": torch.float16,
+        "bnb_4bit_use_double_quant": True,
+        "bnb_4bit_quant_type": "nf4"
+    }
+)
 phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
 
 @spaces.GPU(duration=120)
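Note that the committed loader passes both the deprecated `load_in_4bit=True` kwarg and a raw dict as `quantization_config`; recent transformers releases reject that combination, and the documented route is a single `BitsAndBytesConfig` object (the explicit `import bitsandbytes as bnb` is also unused, since transformers imports bitsandbytes itself). A minimal sketch of the same 4-bit settings in that form, assuming a transformers build with bitsandbytes support installed:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "Compumacy/OpenBioLLm-70B"

# Same 4-bit settings as the diff, expressed as a BitsAndBytesConfig object.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize weights to 4 bits at load time
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",             # NormalFloat4 weight format
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
```

The quantization is what makes the model swap feasible at all: at roughly half a byte per parameter, the 70B weights come to about 35 GB in NF4, versus about 140 GB in fp16.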
@@ -45,9 +56,9 @@ def generate_response(user_message, max_tokens, temperature, top_k, top_p, repet
         "attention_mask": inputs["attention_mask"],
         "max_new_tokens": int(max_tokens),
         "do_sample": True,
-        "temperature": 0.8,
+        "temperature": temperature,  # Use the slider value
         "top_k": int(top_k),
-        "top_p": 0.95,
+        "top_p": top_p,  # Use the slider value
         "repetition_penalty": repetition_penalty,
         "streamer": streamer,
     }
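This hunk is the behavioral fix: `temperature` and `top_p` were hard-coded to 0.8 and 0.95, so the corresponding UI sliders (visible in the `generate_response` signature in the hunk header) had no effect on output. For context, a sketch of the streaming pattern these kwargs feed, using the `TextIteratorStreamer` and `Thread` imports from the first hunk; the `stream_reply` wrapper is hypothetical, since the diff shows only the kwargs dict:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, prompt, max_tokens,
                 temperature, top_k, top_p, repetition_penalty):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": int(max_tokens),
        "do_sample": True,
        "temperature": temperature,  # slider value, not a hard-coded 0.8
        "top_k": int(top_k),
        "top_p": top_p,              # slider value, not a hard-coded 0.95
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
    }
    # generate() blocks, so it runs on a worker thread while the
    # streamer's iterator yields decoded text on the caller's side.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    for chunk in streamer:
        yield chunk
```

Running `generate` off the main thread is what lets the Gradio callback yield partial text as it arrives instead of waiting for the full completion.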
@@ -79,6 +90,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # try the example problems below to see how the model breaks down complex reasoning problems.
+        ## *Running with 4-bit quantization*
         """
     )
 
96