Commit 85a27e0 · committed by DaniAffCH · 1 Parent(s): 07ad80f

[GSoC] Add block quantized models (#270)


* Gemm and MatMul block quantization support

* Refactoring

* Fix indentation

* Node-name independent

* Block quantization tool:
  - constant weight category supported
  - data type saturation added
  - handled the case in which all the elements within a block are the same
  (the per-block scheme is sketched below, after this list)

  Benchmark script modified to support block-quantized models.

  Block-quantized some models.

* Add missing block-quantized models

* Formatting

* Add blocked models to the eval script; evaluation for YuNet

* Add SFace and PPHumanSeg evaluation, block quantization tool fix, handpose blocked model fix, removed blocked CRNN EN

* Changed the evaluation metric in the block_quantize script and added a verbose mode

* Add evaluation for PP-ResNet and MobileNet

* Changed the file suffix and updated the READMEs

* Renamed to int8bq
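As a rough illustration of the scheme the tool applies per block (saturating rounding into the integer range, a zero point kept inside the range, and a guard for blocks whose elements are all equal), here is a minimal sketch; `quantize_block` is a made-up helper operating on a single 1-D block, not the tool's actual API:

```python
import numpy as np

def quantize_block(w: np.ndarray):
    """Toy per-block int8 quantization in the spirit of block_quantize.py:
    per-block scale/zero point, zero forced into the representable range,
    rounding clipped (saturated) to int8, and a guard for constant blocks."""
    info = np.iinfo(np.int8)
    qmin, qmax = info.min, info.max
    b_min = np.minimum(w.min(), 0.0)  # the range must contain zero
    b_max = np.maximum(w.max(), 0.0)
    if b_max != b_min:
        scale = (b_max - b_min) / (qmax - qmin)
        zero_point = np.rint(qmin - b_min / scale)
    else:  # all elements in the block are identical
        scale, zero_point = 1.0, 0.0
    q = np.clip(np.rint(w / scale + zero_point), qmin, qmax).astype(np.int8)  # saturation
    return q, np.float32(scale), np.int8(zero_point)

# One 64-element block, matching the block_size=64 used for the zoo models
block = np.random.randn(64).astype(np.float32)
q, scale, zp = quantize_block(block)
reconstructed = (q.astype(np.float32) - np.float32(zp)) * scale
print(np.linalg.norm(reconstructed - block) / (np.linalg.norm(block) + 1e-10))  # relative-norm error
```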

benchmark/README.md CHANGED
@@ -26,7 +26,7 @@ python benchmark.py --cfg ./config/face_detection_yunet.yaml
 # All configs
 python benchmark.py --all
 
-# All configs but only fp32 models (--fp32, --fp16, --int8 are available for now)
+# All configs but only fp32 models (--fp32, --fp16, --int8, --int8bq are available for now)
 python benchmark.py --all --fp32
 
 # All configs but exclude some of them (fill with config name keywords, not sensitive to upper/lower case, separate with colons)
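With the new flag (and assuming the `*_int8bq.onnx` files are present in the model directories), restricting a run to the block-quantized models looks like this:

```shell
# Benchmark only the block-quantized int8 models across all configs
python benchmark.py --all --int8bq
```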
benchmark/benchmark.py CHANGED
@@ -46,6 +46,7 @@ parser.add_argument("--model_exclude", type=str, help="Models to be excluded. Sp
 parser.add_argument("--fp32", action="store_true", help="Benchmark models of float32 precision only.")
 parser.add_argument("--fp16", action="store_true", help="Benchmark models of float16 precision only.")
 parser.add_argument("--int8", action="store_true", help="Benchmark models of int8 precision only.")
+parser.add_argument("--int8bq", action="store_true", help="Benchmark models of blocked int8 precision only.")
 parser.add_argument("--all", action="store_true", help="Benchmark all models")
 args = parser.parse_args()
 
@@ -194,15 +195,17 @@ if __name__ == '__main__':
     model_handler, model_paths = MODELS.get(model_config.pop('name'))
 
     _model_paths = []
-    if args.fp32 or args.fp16 or args.int8:
+    if args.fp32 or args.fp16 or args.int8 or args.int8bq:
         if args.fp32:
             _model_paths += model_paths['fp32']
         if args.fp16:
             _model_paths += model_paths['fp16']
         if args.int8:
            _model_paths += model_paths['int8']
+        if args.int8bq:
+            _model_paths += model_paths['int8bq']
     else:
-        _model_paths = model_paths['fp32'] + model_paths['fp16'] + model_paths['int8']
+        _model_paths = model_paths['fp32'] + model_paths['fp16'] + model_paths['int8'] + model_paths["int8bq"]
     # filter out excluded models
     excludes = []
     if args.model_exclude is not None:
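For reference, the selection logic above can be sketched as a standalone function; the `model_paths` contents below are hypothetical, the real dict is assembled by `models/__init__.py`:

```python
from argparse import Namespace

# Hypothetical registry entry; the real one is built from files on disk.
model_paths = {
    "fp32": [["yunet_2023mar.onnx"]],
    "fp16": [],
    "int8": [["yunet_2023mar_int8.onnx"]],
    "int8bq": [["yunet_2023mar_int8bq.onnx"]],
}

def select_paths(args: Namespace, model_paths: dict) -> list:
    """Mirror the benchmark.py filter: no precision flag means everything runs,
    otherwise only the requested precisions (now including int8bq)."""
    if not (args.fp32 or args.fp16 or args.int8 or args.int8bq):
        return (model_paths["fp32"] + model_paths["fp16"]
                + model_paths["int8"] + model_paths["int8bq"])
    selected = []
    for flag in ("fp32", "fp16", "int8", "int8bq"):
        if getattr(args, flag):
            selected += model_paths[flag]
    return selected

print(select_paths(Namespace(fp32=False, fp16=False, int8=False, int8bq=True), model_paths))
```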
models/__init__.py CHANGED
@@ -46,6 +46,7 @@ class ModuleRegistery:
         fp32_model_paths = []
         fp16_model_paths = []
         int8_model_paths = []
+        int8bq_model_paths = []
         # onnx
         ret_onnx = sorted(glob.glob(os.path.join(model_dir, "*.onnx")))
         if "object_tracking" in item.__module__:
@@ -57,6 +58,8 @@ class ModuleRegistery:
                 int8_model_paths.append([r])
             elif "fp16" in r: # exclude fp16 for now
                 fp16_model_paths.append([r])
+            elif "blocked" in r:
+                int8bq_model_paths.append([r])
             else:
                 fp32_model_paths.append([r])
         # caffe
@@ -72,6 +75,7 @@ class ModuleRegistery:
             fp32=fp32_model_paths,
             fp16=fp16_model_paths,
             int8=int8_model_paths,
+            int8bq=int8bq_model_paths
         )
 
         self._dict[item.__name__] = (item, all_model_paths)
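A loose, self-contained sketch of this kind of filename-based bucketing is below; the helper name and the exact matching rules are made up for illustration (the registry's condition for the plain `int8` branch lies outside the hunk shown above):

```python
import glob
import os

def bucket_onnx_files(model_dir: str) -> dict:
    """Hypothetical helper: split a model directory's ONNX files into
    precision buckets, routing block-quantized files to 'int8bq'."""
    buckets = dict(fp32=[], fp16=[], int8=[], int8bq=[])
    for path in sorted(glob.glob(os.path.join(model_dir, "*.onnx"))):
        name = os.path.basename(path)
        if "int8bq" in name or "blocked" in name:  # block-quantized weights
            buckets["int8bq"].append([path])
        elif "int8" in name:
            buckets["int8"].append([path])
        elif "fp16" in name:
            buckets["fp16"].append([path])
        else:
            buckets["fp32"].append([path])
    return buckets

print(bucket_onnx_files("models/face_detection_yunet"))
```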
models/face_detection_yunet/README.md CHANGED
@@ -8,15 +8,20 @@ Notes:
 - This model can detect **faces of pixels between around 10x10 to 300x300** due to the training scheme.
 - For details on training this model, please visit https://github.com/ShiqiYu/libfacedetection.train.
 - This ONNX model has fixed input shape, but OpenCV DNN infers on the exact shape of input image. See https://github.com/opencv/opencv_zoo/issues/44 for more information.
+- `face_detection_yunet_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models      | Easy AP | Medium AP | Hard AP |
 | ----------- | ------- | --------- | ------- |
-| YuNet       | 0.8871  | 0.8710    | 0.7681  |
-| YuNet quant | 0.8838  | 0.8683    | 0.7676  |
+| YuNet       | 0.8844  | 0.8656    | 0.7503  |
+| YuNet block | 0.8845  | 0.8652    | 0.7504  |
+| YuNet quant | 0.8810  | 0.8629    | 0.7503  |
+
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
+
 
 ## Demo
 
models/face_recognition_sface/README.md CHANGED
@@ -8,15 +8,18 @@ Note:
 - Model files encode MobileFaceNet instances trained on the SFace loss function, see the [SFace paper](https://arxiv.org/abs/2205.12010) for reference.
 - ONNX file conversions from [original code base](https://github.com/zhongyy/SFace) thanks to [Chengrui Wang](https://github.com/crywang).
 - (As of Sep 2021) Supporting 5-landmark warping for now, see below for details.
+- `face_recognition_sface_2021dec_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models      | Accuracy |
 | ----------- | -------- |
 | SFace       | 0.9940   |
+| SFace block | 0.9942   |
 | SFace quant | 0.9932   |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/facial_expression_recognition/README.md CHANGED
@@ -7,6 +7,7 @@ Note:
 - Progressive Teacher is contributed by [Jing Jiang](https://scholar.google.com/citations?user=OCwcfAwAAAAJ&hl=zh-CN).
 - [MobileFaceNet](https://link.springer.com/chapter/10.1007/978-3-319-97909-0_46) is used as the backbone and the model is able to classify seven basic facial expressions (angry, disgust, fearful, happy, neutral, sad, surprised).
 - [facial_expression_recognition_mobilefacenet_2022july.onnx](https://github.com/opencv/opencv_zoo/raw/master/models/facial_expression_recognition/facial_expression_recognition_mobilefacenet_2022july.onnx) is implemented thanks to [Chengrui Wang](https://github.com/crywang).
+- `facial_expression_recognition_mobilefacenet_2022july_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation on [RAF-DB](http://whdeng.cn/RAF/model1.html).
 
models/handpose_estimation_mediapipe/README.md CHANGED
@@ -14,6 +14,7 @@ This model is converted from TFlite to ONNX using following tools:
 **Note**:
 - The int8-quantized model may produce invalid results due to a significant drop of accuracy.
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#hands for models of larger scale.
+- `handpose_estimation_mediapipe_2023feb_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/human_segmentation_pphumanseg/README.md CHANGED
@@ -2,6 +2,9 @@
 
 This model is ported from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub) using [this script from OpenCV](https://github.com/opencv/opencv/blob/master/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_humanseg.py).
 
+**Note**:
+- `human_segmentation_pphumanseg_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### Python
@@ -46,11 +49,13 @@ Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models             | Accuracy       | mIoU          |
 | ------------------ | -------------- | ------------- |
-| PPHumanSeg         | 0.9581         | 0.8996        |
-| PPHumanSeg quant   | 0.4365         | 0.2788        |
+| PPHumanSeg         | 0.9656         | 0.9164        |
+| PPHumanSeg block   | 0.9655         | 0.9162        |
+| PPHumanSeg quant   | 0.7285         | 0.3642        |
 
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ---
 ## License
models/image_classification_mobilenet/README.md CHANGED
@@ -4,16 +4,22 @@ MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicatio
 
 MobileNetV2: Inverted Residuals and Linear Bottlenecks
 
+**Note**:
+- `image_classification_mobilenetvX_2022apr_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models             | Top-1 Accuracy | Top-5 Accuracy |
 | ------------------ | -------------- | -------------- |
 | MobileNet V1       | 67.64          | 87.97          |
+| MobileNet V1 block | 67.21          | 87.62          |
 | MobileNet V1 quant | 55.53          | 78.74          |
 | MobileNet V2       | 69.44          | 89.23          |
+| MobileNet V2 block | 68.66          | 88.90          |
 | MobileNet V2 quant | 68.37          | 88.56          |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/image_classification_ppresnet/README.md CHANGED
@@ -4,14 +4,19 @@ Deep Residual Learning for Image Recognition
 
 This model is ported from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub) using [this script from OpenCV](https://github.com/opencv/opencv/blob/master/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_resnet50.py).
 
+**Note**:
+- `image_classification_ppresnet50_2022jan_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models          | Top-1 Accuracy | Top-5 Accuracy |
 | --------------- | -------------- | -------------- |
 | PP-ResNet       | 82.28          | 96.15          |
+| PP-ResNet block | 82.27          | 96.15          |
 | PP-ResNet quant | 0.22           | 0.96           |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/license_plate_detection_yunet/README.md CHANGED
@@ -4,6 +4,9 @@ This model is contributed by Dong Xu (徐栋) from [watrix.ai](watrix.ai) (银
 
 Please note that the model is trained with Chinese license plates, so the detection results of other license plates with this model may be limited.
 
+**Note**:
+- `license_plate_detection_lpd_yunet_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 Run the following command to try the demo:
models/object_detection_nanodet/README.md CHANGED
@@ -2,8 +2,10 @@
 
 Nanodet: NanoDet is a FCOS-style one-stage anchor-free object detection model which uses Generalized Focal Loss as classification and regression loss. In NanoDet-Plus, we propose a novel label assignment strategy with a simple assign guidance module (AGM) and a dynamic soft label assigner (DSLA) to solve the optimal label assignment problem in lightweight model training.
 
-Note:
+**Note**:
 - This version of nanodet: Nanodet-m-plus-1.5x_416
+- `object_detection_nanodet_2022nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 ## Demo
 
models/object_detection_yolox/README.md CHANGED
@@ -8,8 +8,10 @@ Key features of the YOLOX object detector
 - **SimOTA advanced label assignment strategy** reduces training time and avoids additional solver hyperparameters
 - **Strong data augmentations like MixUp and Mosaic** to boost YOLOX performance
 
-Note:
+**Note**:
 - This version of YoloX: YoloX_s
+- `object_detection_yolox_2022nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 ## Demo
 
models/object_tracking_vittrack/README.md CHANGED
@@ -8,7 +8,10 @@ Video demo: https://youtu.be/MJiPnu1ZQRI
 
 This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC 2023 project [**Realtime object tracking models**](https://github.com/opencv/opencv/wiki/GSoC_2023#idea-realtime-object-tracking-models)
 
-**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**
+**Note**:
+- OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.
+- `object_tracking_vittrack_2023sep_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 # Demo
 ## Python
models/optical_flow_estimation_raft/README.md CHANGED
@@ -1,6 +1,8 @@
 # RAFT
 This model is originally created by Zachary Teed and Jia Deng of Princeton University. The source code for the model is at [their repository on GitHub](https://github.com/princeton-vl/RAFT), and the original [research paper](https://arxiv.org/abs/2003.12039) is published on [Arxiv](https://arxiv.org/abs/2003.12039). The model was converted to ONNX by [PINTO0309](https://github.com/PINTO0309) in his [model zoo](https://github.com/PINTO0309/PINTO_model_zoo/tree/main/252_RAFT). The ONNX model has several variations depending on the training dataset and input dimensions. The model used in this demo is trained on the Sintel dataset with an input size of 360 $\times$ 480.
 
+**Note**:
+- `optical_flow_estimation_raft_2023aug_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/palm_detection_mediapipe/README.md CHANGED
@@ -9,6 +9,7 @@ SSD Anchors are generated from [GenMediaPipePalmDectionSSDAnchors](https://githu
 
 **Note**:
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#hands for models of larger scale.
+- `palm_detection_mediapipe_2023feb_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/person_detection_mediapipe/README.md CHANGED
@@ -7,6 +7,9 @@ This model detects upper body and full body keypoints of a person, and is downlo
 
 SSD Anchors are generated from [GenMediaPipePalmDectionSSDAnchors](https://github.com/VimalMollyn/GenMediaPipePalmDectionSSDAnchors)
 
+**Note**:
+- `person_detection_mediapipe_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### Python
models/person_reid_youtureid/README.md CHANGED
@@ -2,9 +2,9 @@
 
 This model is provided by Tencent Youtu Lab [[Credits]](https://github.com/opencv/opencv/blob/394e640909d5d8edf9c1f578f8216d513373698c/samples/dnn/person_reid.py#L6-L11).
 
-Note:
-
+**Note**:
 - Model source: https://github.com/ReID-Team/ReID_extra_testdata
+- `person_reid_youtu_2021nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/pose_estimation_mediapipe/README.md CHANGED
@@ -10,6 +10,8 @@ This model is converted from TFlite to ONNX using following tools:
 
 **Note**:
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#pose for models of larger scale.
+- `pose_estimation_mediapipe_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### python
models/text_detection_ppocr/README.md CHANGED
@@ -2,7 +2,7 @@
 
 PP-OCRv3: More Attempts for the Improvement of Ultra Lightweight OCR System.
 
-Note:
+**Note**:
 
 - The int8 quantization model may produce unstable results due to some loss of accuracy.
 - Original Paddle Models source of English: [here](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar).
@@ -10,6 +10,7 @@ Note:
 - `IC15` in the filename means the model is trained on [IC15 dataset](https://rrc.cvc.uab.es/?ch=4&com=introduction), which can detect English text instances only.
 - `TD500` in the filename means the model is trained on [TD500 dataset](http://www.iapr-tc11.org/mediawiki/index.php/MSRA_Text_Detection_500_Database_(MSRA-TD500)), which can detect both English & Chinese instances.
 - Visit https://docs.opencv.org/master/d4/d43/tutorial_dnn_text_spotting.html for more information.
+- `text_detection_xx_ppocrv3_2023may_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/text_recognition_crnn/README.md CHANGED
@@ -15,7 +15,7 @@ Results of accuracy evaluation with [tools/eval](../../tools/eval) at different
 
 \*: 'FP16' or 'INT8' stands for 'model quantized into FP16' or 'model quantized into int8'
 
-Note:
+**Note**:
 
 - Model source:
   - `text_recognition_CRNN_EN_2021sep.onnx`: https://docs.opencv.org/4.5.2/d9/d1e/tutorial_dnn_OCR.html (CRNN_VGG_BiLSTM_CTC.onnx)
@@ -25,6 +25,7 @@ Note:
 - `text_recognition_CRNN_CH_2021sep.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters (see `CHARSET_CH_94` for details in `crnn.py`).
 - `text_recognition_CRNN_CN_2021nov.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), some Chinese characters and some special characters (see `CHARSET_CN_3944` for details in `crnn.py`).
 - For details on training this model series, please visit https://github.com/zihaomu/deep-text-recognition-benchmark.
+- `text_recognition_CRNN_XX_2021xxx_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
tools/eval/README.md CHANGED
@@ -146,7 +146,7 @@ python eval.py -m sface -d lfw -dr /path/to/lfw
 
 ### Prepare data
 
-Please visit http://iapr-tc11.org/mediawiki/index.php/ICDAR_2003_Robust_Reading_Competitions to download the ICDAR2003 dataset and the labels.
+Please visit http://iapr-tc11.org/mediawiki/index.php/ICDAR_2003_Robust_Reading_Competitions to download the ICDAR2003 dataset and the labels. You only need to download the Robust Word Recognition [TrialTrain Set](http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/word.zip).
 
 ```shell
 $ tree -L 2 /path/to/icdar
@@ -199,6 +199,20 @@ python eval.py -m crnn -d iiit5k -dr /path/to/iiit5k
 ### Prepare data
 Please download the mini_supervisely data from [here](https://paddleseg.bj.bcebos.com/humanseg/data/mini_supervisely.zip) which includes the validation dataset and unzip it.
 
+```shell
+$ tree -L 2 /path/to/mini_supervisely
+.
+├── Annotations
+│   ├── ache-adult-depression-expression-41253.png
+│   ├── ...
+├── Images
+│   ├── ache-adult-depression-expression-41253.jpg
+│   ├── ...
+├── test.txt
+├── train.txt
+└── val.txt
+```
+
 ### Evaluation
 
 Run evaluation with the following command:
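Once the `*_bq` entries are registered in `tools/eval/eval.py` (next file), a block-quantized model is evaluated like any other; the model key below matches the new registration, while the dataset key and path are assumptions following the pattern of the existing examples in this README:

```shell
# Evaluate the block-quantized PPHumanSeg model (dataset key assumed)
python eval.py -m pphumanseg_bq -d mini_supervisely -dr /path/to/mini_supervisely
```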
tools/eval/eval.py CHANGED
@@ -33,6 +33,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv1_2022apr_int8.onnx"),
         topK=5,
         loadLabel=False),
+    mobilenetv1_bq=dict(
+        name="MobileNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv1_2022apr_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     mobilenetv2=dict(
         name="MobileNet",
         topic="image_classification",
@@ -45,6 +51,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv2_2022apr_int8.onnx"),
         topK=5,
         loadLabel=False),
+    mobilenetv2_bq=dict(
+        name="MobileNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv2_2022apr_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     ppresnet=dict(
         name="PPResNet",
         topic="image_classification",
@@ -57,6 +69,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_ppresnet/image_classification_ppresnet50_2022jan_int8.onnx"),
         topK=5,
         loadLabel=False),
+    ppresnet_bq=dict(
+        name="PPResNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_ppresnet/image_classification_ppresnet50_2022jan_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     yunet=dict(
         name="YuNet",
         topic="face_detection",
@@ -71,6 +89,13 @@ models = dict(
         topK=5000,
         confThreshold=0.3,
         nmsThreshold=0.45),
+    yunet_bq=dict(
+        name="YuNet",
+        topic="face_detection",
+        modelPath=os.path.join(root_dir, "models/face_detection_yunet/face_detection_yunet_2023mar_int8bq.onnx"),
+        topK=5000,
+        confThreshold=0.3,
+        nmsThreshold=0.45),
     sface=dict(
         name="SFace",
         topic="face_recognition",
@@ -79,6 +104,10 @@ models = dict(
         name="SFace",
         topic="face_recognition",
         modelPath=os.path.join(root_dir, "models/face_recognition_sface/face_recognition_sface_2021dec_int8.onnx")),
+    sface_bq=dict(
+        name="SFace",
+        topic="face_recognition",
+        modelPath=os.path.join(root_dir, "models/face_recognition_sface/face_recognition_sface_2021dec_int8bq.onnx")),
     crnn_en=dict(
         name="CRNN",
         topic="text_recognition",
@@ -95,6 +124,10 @@ models = dict(
         name="PPHumanSeg",
         topic="human_segmentation",
         modelPath=os.path.join(root_dir, "models/human_segmentation_pphumanseg/human_segmentation_pphumanseg_2023mar_int8.onnx")),
+    pphumanseg_bq=dict(
+        name="PPHumanSeg",
+        topic="human_segmentation",
+        modelPath=os.path.join(root_dir, "models/human_segmentation_pphumanseg/human_segmentation_pphumanseg_2023mar_int8bq.onnx")),
 )
 
 datasets = dict(
tools/quantize/README.md CHANGED
@@ -54,6 +54,8 @@ python quantize-inc.py model1
 
 ## Blockwise quantization usage
 
+Block-quantized models under each model directory are generated with `--block_size=64`
+
 `block_quantize.py` requires Python>=3.7
 
 To perform weight-only blockwise quantization:
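A typical invocation might look like the sketch below; `--block_size` and `--verbose` are visible in this change set, while `--input_model` and `--output_model` are inferred from the argparse destinations in `block_quantize.py` and should be checked against the script's `--help`:

```shell
# Weight-only blockwise int8 quantization with the block size used for the zoo models
python block_quantize.py --input_model face_detection_yunet_2023mar.onnx \
                         --output_model face_detection_yunet_2023mar_int8bq.onnx \
                         --block_size 64 --verbose
```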
tools/quantize/block_quantize.py CHANGED
@@ -8,7 +8,8 @@ if sys.version_info < MIN_PYTHON_VERSION:
 import argparse
 import os
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Dict, Tuple
+from enum import Enum, auto
 
 import numpy as np
 import onnx
@@ -22,12 +23,19 @@ SUPPORTED_OPS = {"Conv", "Gemm", "MatMul"}
 ONNX_OPSET = 21
 
 
+class WeightCategory(Enum):
+    INITIALIZER = auto()
+    CONSTANT = auto()
+    NONE = auto()
+
+
 @dataclass
 class BlockQuantizeConfig:
     input_model_path: str
     output_model_path: str
     block_size: int
     bits: int
+    verbose: bool
 
 
 @dataclass
@@ -75,9 +83,13 @@ def block_quantize_tensor(
     y_scale_elementwise = np.repeat(scale, repeats=repeats, axis=block_axis)
     y_zero_point_elementwise = np.repeat(zero_point, repeats=repeats, axis=block_axis)
 
-    y = np.rint(x / y_scale_elementwise + y_zero_point_elementwise).astype(
-        BITS_TO_NUMPY_TYPE[n_bits]
-    )
+    type_info = np.iinfo(BITS_TO_NUMPY_TYPE[n_bits])
+    min_value = type_info.min
+    max_value = type_info.max
+
+    y = np.rint(x / y_scale_elementwise + y_zero_point_elementwise)
+    y = np.clip(y, min_value, max_value)
+    y = y.astype(BITS_TO_NUMPY_TYPE[n_bits])
 
     return y
 
@@ -129,6 +141,13 @@ class BlockQuantizer:
         self.initializers_map = {
             init.name: init for init in self.model.graph.initializer
         }
+        self.costants_map = {
+            node.output[0]: next(
+                attr.t for attr in node.attribute if attr.name == "value"
+            )
+            for node in self.model.graph.node
+            if node.op_type == "Constant"
+        }
 
     def validate_conf(self):
         if not os.path.isfile(self.conf.input_model_path):
@@ -155,34 +174,59 @@ class BlockQuantizer:
                 f"Bits must be one of the following values: [{allowed_values}]."
             )
 
-    def get_initializer_tensor(self, name: str) -> Optional[np.ndarray]:
+    def get_weight_category(self, name: str) -> WeightCategory:
         if name in self.initializers_map:
+            return WeightCategory.INITIALIZER
+        if name in self.costants_map:
+            return WeightCategory.CONSTANT
+        else:
+            return WeightCategory.NONE
+
+    def get_weight_tensor(self, name: str, category: WeightCategory) -> np.ndarray:
+        if category == WeightCategory.INITIALIZER:
             return onnx.numpy_helper.to_array(self.initializers_map[name])
+        elif category == WeightCategory.CONSTANT:
+            return onnx.numpy_helper.to_array(self.costants_map[name])
+        else:
+            raise AssertionError("Invalid weight category")
 
-        return None
+    def remove_fp32_weights(self, name: str, category: WeightCategory):
+        if category == WeightCategory.INITIALIZER:
+            self.graph.initializer.remove(
+                next(init for init in self.graph.initializer if init.name == name)
+            )
+        elif category == WeightCategory.CONSTANT:
+            self.graph.node.remove(
+                next(
+                    node
+                    for node in self.graph.node
+                    if node.op_type == "Constant" and node.output[0] == name
+                )
+            )
+        else:
+            raise AssertionError("Invalid weight category")
 
     def compute_scale_zeropoint(
         self, b_min: np.ndarray, b_max: np.ndarray
     ) -> Tuple[np.ndarray, np.ndarray]:
         assert (
-            b_min < b_max
-        ).all(), (
-            "minimum must be lower than maximum when computing scale and zero point"
-        )
+            b_min <= b_max
+        ).all(), "minimum must not be greater than maximum when computing scale and zero point"
 
         # zero must be present in the range, this enforces qmin <= zero_point <= qmax
         b_min = np.minimum(b_min, np.zeros_like(b_min, dtype=b_min.dtype))
         b_max = np.maximum(b_max, np.zeros_like(b_max, dtype=b_max.dtype))
 
-        qmin = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).min
-        qmax = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).max
+        type_info = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits])
+        qmin = type_info.min
+        qmax = type_info.max
 
         dq = qmax - qmin
 
-        scales = (b_max - b_min) / dq
-        zeropoints = np.rint(qmin - b_min / scales).astype(
-            BITS_TO_NUMPY_TYPE[self.conf.bits]
-        )
+        scales = np.where(b_max != b_min, (b_max - b_min) / dq, 1.0)
+
+        zeropoints = np.where(b_max != b_min, np.rint(qmin - b_min / scales), 0.0)
+        zeropoints = zeropoints.astype(BITS_TO_NUMPY_TYPE[self.conf.bits])
 
         return (scales, zeropoints)
 
@@ -221,7 +265,8 @@
             quantized_weight, quantization_axis, scales, zeropoints
         )
 
-        qerror = np.linalg.norm(reconstructed_mat - weight)
+        # Relative norm
+        qerror = np.linalg.norm(reconstructed_mat - weight) / (np.linalg.norm(weight) + 1e-10)
 
         res = BlockQuantizeResult(
             quantized_weight,
@@ -241,16 +286,32 @@
 
         return size_mb
 
-    def display_summary(self, sqe: List):
-        mse = sum(sqe) / len(sqe)
+    def display_summary(self, sqe: Dict[str, int]):
+        sqe_v = list(sqe.values())
+        if len(sqe_v) == 0:
+            mse = 0
+            print(
+                "Warning: No weights have been quantized, likely due to unsupported layers."
+            )
+        else:
+            mse = sum(sqe_v) / len(sqe_v)
         original_model_size = self.get_model_size(self.conf.input_model_path)
         quantized_model_size = self.get_model_size(self.conf.output_model_path)
 
+        if self.conf.verbose:
+            sorted_sqe = sorted(sqe.items(), key=lambda item: item[1], reverse=True)
+            longest_key_len = max(len(key) for key in sqe.keys())
+
+            print("Quantization error (Relative Norm) sorted in descending order:")
+
+            for key, value in sorted_sqe:
+                print(f"{key:<{longest_key_len}} : {value}")
+
         print("Done! Results saved in", self.conf.output_model_path)
        print("\nSummary of Results:\n")
         print(f"{'Metric':<30} {'Value':<10}")
         print(f"{'-'*40}")
-        print(f"{'Mean Squared Quantization Error':<30} {mse:.6f}")
+        print(f"{'Relative Norm Error':<31} {mse:.6f}")
         print(f"{'Original Model Size (KB)':<31} {original_model_size:,.2f}")
         print(f"{'Block-Quantized Model Size (KB)':<30} {quantized_model_size:,.2f}")
 
@@ -258,7 +319,7 @@
         print("Quantizing the model...")
 
         quantized_inputs = []
-        sqe = []
+        sqe = {}
 
         node_idx = 0
 
@@ -267,7 +328,13 @@
 
             if node.op_type in SUPPORTED_OPS:
                 for input_idx, input_name in enumerate(node.input):
-                    weight = self.get_initializer_tensor(input_name)
+                    weightCategory = self.get_weight_category(input_name)
+
+                    # Skip quantization if weights are taken as external input
+                    if weightCategory == WeightCategory.NONE:
+                        continue
+
+                    weight = self.get_weight_tensor(input_name, weightCategory)
 
                     quantized_weights_name = f"{input_name}_quantized"
                     quantized_node_name = f"{input_name}_quantized_node"
@@ -279,9 +346,8 @@
                     shape_name = f"{input_name}_shape"
                     reshaped_weights_name = f"{input_name}_reshaped"
 
-                    # Skip quantization if weights are taken as external input
-                    # or if they don't contain enough elements to create at least 1 block
-                    if weight is None or weight.size < self.conf.block_size:
+                    # Skip quantization if weights don't contain enough elements to create at least 1 block
+                    if weight.size < self.conf.block_size:
                         continue
 
                     reshape_needed = weight.ndim > 2
@@ -295,9 +361,15 @@
                         )
                         continue
 
-                    quantized_inputs.append(input_name)
+
                     block_quantize_res = self.block_quantize(weight)
 
+                    # Skip quantization if it wouldn't reduce the model size
+                    if block_quantize_res.block_size == 1:
+                        continue
+
+                    quantized_inputs.append(input_name)
+
                     dequantize_node = create_dequantize_node(
                         quantized_node_name,
                         quantized_weights_name,
@@ -352,14 +424,7 @@
                         ]
                     )
 
-                    # Removing fp32 weights
-                    self.graph.initializer.remove(
-                        next(
-                            init
-                            for init in self.graph.initializer
-                            if init.name == input_name
-                        )
-                    )
+                    self.remove_fp32_weights(input_name, weightCategory)
 
                     node.input[input_idx] = (
                         reshaped_weights_name
@@ -374,11 +439,12 @@
 
                     self.graph.node.insert(0, dequantize_node)
                     node_idx += 1
-                    self.graph.value_info.insert(0, shape_info)
+                    if reshape_needed:
+                        self.graph.value_info.insert(0, shape_info)
                     self.graph.value_info.insert(0, dequantized_weights_info)
 
-                    sqe.append(block_quantize_res.quantization_error**2)
-
+                    sqe[input_name] = block_quantize_res.quantization_error
+
             node_idx += 1
 
         onnx.checker.check_model(self.model, full_check=True)
@@ -421,6 +487,13 @@ def setup_args() -> argparse.Namespace:
         default="block_quantized_model.onnx",
         required=False,
     )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output",
+        required=False,
+    )
 
     return parser.parse_args()
 
@@ -433,6 +506,7 @@ if __name__ == "__main__":
         output_model_path=args.output_model,
        block_size=args.block_size,
         bits=args.bits,
+        verbose=args.verbose
     )
 
     quantizer = BlockQuantizer(quantization_config)
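As a quick sanity check on the output (a sketch, not part of the tool; file names are placeholders), a block-quantized model should pass the ONNX checker, contain `DequantizeLinear` nodes in front of the quantized weights, and be noticeably smaller on disk:

```python
import os

import onnx

PATH_FP32 = "face_detection_yunet_2023mar.onnx"        # placeholder
PATH_BQ = "face_detection_yunet_2023mar_int8bq.onnx"   # placeholder

model = onnx.load(PATH_BQ)
onnx.checker.check_model(model, full_check=True)

# Count the DequantizeLinear nodes inserted in front of the quantized weights
n_dq = sum(1 for node in model.graph.node if node.op_type == "DequantizeLinear")
print(f"DequantizeLinear nodes: {n_dq}")

# Compare file sizes; the tool prints the same kind of summary in KB
for p in (PATH_FP32, PATH_BQ):
    print(f"{p}: {os.path.getsize(p) / 1024:,.2f} KB")
```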