david-thrower committed
Commit 585435b · verified · 1 Parent(s): 94ccb47

Update app.py


Revert ONNX quantization attempt...
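
For context, the reverted path exported the checkpoint to ONNX and, per the still-commented lines in the diff below, was headed toward int8 dynamic quantization via optimum. A minimal sketch of what the completed attempt would presumably have looked like; the save_dir name is an illustrative assumption, not from app.py:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Export the PyTorch checkpoint to ONNX at load time
model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)

# Dynamic (weights-only) int8 quantization tuned for AVX512-VNNI CPUs
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
quantizer = ORTQuantizer.from_pretrained(model)
# save_dir below is hypothetical, not taken from the original app.py
quantizer.quantize(save_dir="smollm3-3b-onnx-int8", quantization_config=qconfig)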

Files changed (1)
  1. app.py +26 -26
app.py CHANGED
@@ -1,45 +1,45 @@
 
-# import gc
+import gc
 
-# import gradio as gr
-# import torch
-# from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig
 
-# quant_config = HqqConfig(nbits=8, group_size=64)
+quant_config = HqqConfig(nbits=8, group_size=64)
 
-# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
-# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# print("Loading tokenizer & model…")
-# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-# # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
+print("Loading tokenizer & model…")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+# model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
 
-# model =\
-#     AutoModelForCausalLM\
-#     .from_pretrained(
-#         MODEL_ID,
-#         torch_dtype=torch.float16,
-#         # device_map="cuda",
-#         quantization_config=quant_config
-#     ).to(DEVICE)
+model =\
+    AutoModelForCausalLM\
+    .from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16,
+        # device_map="cuda",
+        quantization_config=quant_config
+    ).to(DEVICE)
 
 #gc.collect()
 
 #########
 
-import gc
+# import gc
 
-import gradio as gr
-from transformers import AutoTokenizer
-from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
-from optimum.onnxruntime.configuration import AutoQuantizationConfig
+# import gradio as gr
+# from transformers import AutoTokenizer
+# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
+# from optimum.onnxruntime.configuration import AutoQuantizationConfig
 
-MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
+# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
 
 
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
+# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
 
 # print("Creating quant config")
 # qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
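
The diff covers only the first 45 lines of app.py, so the Gradio wiring that consumes tokenizer and model is not shown. A minimal sketch of how the restored HQQ-loaded model might be driven from a Gradio interface; the handler name generate_reply, the max_new_tokens value, and the interface layout are illustrative assumptions, not the author's code:

def generate_reply(message: str) -> str:
    # Hypothetical handler: build a chat-formatted prompt for the checkpoint
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(DEVICE)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    # Strip the prompt tokens and return only the new completion
    return tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)

demo = gr.Interface(fn=generate_reply, inputs="text", outputs="text")
demo.launch()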