david-thrower committed
Commit b1558e3 · verified · 1 Parent(s): ac23ad5

Update app.py


Another attempt at reducing the RAM footprint without slowing down the model.

Files changed (1)
app.py +20 -14
app.py CHANGED
@@ -1,24 +1,30 @@
-# import gradio as gr
-# import torch
-# from transformers import AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
-# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# print("Loading tokenizer & model…")
-# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+print("Loading tokenizer & model…")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
 
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    load_in_8bit=True,  # or try load_in_4bit=True
+    device_map="cpu"
+)
+
 #########
 
-print("Loading tokenizer & model…")
-import gradio as gr
-from transformers import AutoTokenizer
-from optimum.onnxruntime import ORTModelForCausalLM
+# print("Loading tokenizer & model…")
+# import gradio as gr
+# from transformers import AutoTokenizer
+# from optimum.onnxruntime import ORTModelForCausalLM
 
-MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
+# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
 
 #########
 
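
One caveat on the 8-bit load above, worth flagging since the goal is a smaller RAM footprint: recent transformers releases deprecate the bare load_in_8bit= kwarg in favor of passing a BitsAndBytesConfig, and bitsandbytes 8-bit kernels generally require a CUDA device, so pairing load_in_8bit=True with device_map="cpu" may fail at load time. A minimal sketch of the same idea with the newer API, assuming a GPU is available (this is not the committed code):

# Sketch only: 8-bit quantized load via BitsAndBytesConfig.
# Assumes a CUDA GPU; bitsandbytes int8 weights do not run on plain CPU.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "HuggingFaceTB/SmolLM3-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # place layers on the available GPU(s)
)

For the commented-out ONNX route, note that optimum's documented path to a quantized model goes through ORTQuantizer rather than a quantize= kwarg on ORTModelForCausalLM.from_pretrained, so that line would likely need similar rework if revisited.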