david-thrower committed on
Commit 8140d5a · verified · 1 Parent(s): 10edbe0

Update app.py


An attempt at avx512_vnni int8 quantization using ONNX Runtime.

Files changed (1)
app.py +52 -16
app.py CHANGED
@@ -1,30 +1,66 @@
 
+# import gc
+
+# import gradio as gr
+# import torch
+# from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig
+
+# quant_config = HqqConfig(nbits=8, group_size=64)
+
+# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# print("Loading tokenizer & model…")
+# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+# # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
+
+# model =\
+# AutoModelForCausalLM\
+# .from_pretrained(
+#     MODEL_ID,
+#     torch_dtype=torch.float16,
+#     # device_map="cuda",
+#     quantization_config=quant_config
+# ).to(DEVICE)
+
+# gc.collect()
+
+#########
+
 import gc
 
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig
-
-quant_config = HqqConfig(nbits=8, group_size=64)
-
-MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-print("Loading tokenizer & model…")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-# model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
-
-model =\
-AutoModelForCausalLM\
-.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16,
-    # device_map="cuda",
-    quantization_config=quant_config
-).to(DEVICE)
+from transformers import AutoTokenizer
+from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
+from optimum.onnxruntime.configuration import AutoQuantizationConfig
+
+MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
+qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
+quantizer = ORTQuantizer.from_pretrained(MODEL_NAME)
+
+# Step 4: Perform quantization, saving output in a new directory
+quantized_model_dir = "./quantized_model"
+print("Starting quantization...")
+quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
+
+del quantizer
+del qconfig
+
+# Run garbage collection again to release memory from quantizer objects
 gc.collect()
 
+# Step 5: Load the quantized ONNX model for inference
+print("Loading quantized ONNX model for inference...")
+model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
+
+# Garbage collection again after final loading
+gc.collect()
+
 #########
 
 # print("Loading tokenizer & model…")