david-thrower committed
Commit e102b08 · verified · 1 Parent(s): 8140d5a

Update app.py

Debug model import and quantization...
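The substance of the fix: ORTQuantizer.from_pretrained() was previously handed the raw hub ID "HuggingFaceTB/SmolLM3-3B", which points at a PyTorch checkpoint containing no ONNX graph to quantize. The model is now exported to ONNX first with ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True), and the quantizer is built from the exported model object. A minimal consolidated sketch of the resulting pipeline, debug prints omitted (assuming optimum is installed with the onnxruntime extra; all identifiers come from the diff below):

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# export=True converts the PyTorch checkpoint to an ONNX graph on load
model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)

# The quantizer wraps the exported ORT model, not the hub ID
quantizer = ORTQuantizer.from_pretrained(model)

# is_static=False selects dynamic quantization: int8 weights ahead of time,
# activations quantized at runtime; tuned here for AVX512-VNNI CPUs
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
quantizer.quantize(save_dir="./quantized_model", quantization_config=qconfig)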

Files changed (1): app.py (+11 -3)
app.py CHANGED
@@ -36,19 +36,27 @@ from optimum.onnxruntime.configuration import AutoQuantizationConfig
 
 MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
 
+
+
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
 
+print("Creating quant config")
 qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
-quantizer = ORTQuantizer.from_pretrained(MODEL_NAME)
+print("Creating quant config successful")
 
+print("Creating quantizer")
+quantizer = ORTQuantizer.from_pretrained(model)
+print("Creating quantizer successful")
 # Step 4: Perform quantization saving output in a new directory
 quantized_model_dir = "./quantized_model"
 print("Starting quantization...")
 quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
-
+print("Quantization was successful. Garbage collecting...")
 
 del(quantizer)
 del(qconfig)
+del(model)
 
 # Run garbage collection again to release memory from quantizer objects
 gc.collect()
@@ -56,11 +64,11 @@ gc.collect()
 # Step 5: Load the quantized ONNX model for inference
 print("Loading quantized ONNX model for inference...")
 model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
+print("Loading model was successful. Garbage collecting.")
 
 # Garbage collection again after final loading
 gc.collect()
 
-
 #########
 
 # print("Loading tokenizer & model…")