Commit 85a27e0 · committed by DaniAffCH · 1 Parent(s): 07ad80f

[GSoC] Add block quantized models (#270)


* Gemm and MatMul block quantization support

* Refactoring

* Fix indentation

* Node-name independent

* Block quantization tool:
  - constant weight category supported
  - data type saturation added
  - handled the case in which all the elements within a block are the same
  (the per-block scheme is sketched below, after this list)

  Benchmark script modified to support block-quantized models.

  Block-quantized some models.

* Add missing block-quantized models

* Formatting

* Add blocked models to the eval script; evaluation for YuNet

* Add SFace and PPHumanSeg evaluation, block quantization tool fix, handpose blocked model fix, removed blocked CRNN EN

* Changed the evaluation metric in the block_quantize script and added a verbose mode

* Add evaluation for PP-ResNet and MobileNet

* Changed the file suffix and updated the READMEs

* Renamed to int8bq
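As a rough illustration of the scheme the tool applies per block (saturating rounding into the integer range, a zero point kept inside the range, and a guard for blocks whose elements are all equal), here is a minimal sketch; `quantize_block` is a made-up helper operating on a single 1-D block, not the tool's actual API:

```python
import numpy as np

def quantize_block(w: np.ndarray):
    """Toy per-block int8 quantization in the spirit of block_quantize.py:
    per-block scale/zero point, zero forced into the representable range,
    rounding clipped (saturated) to int8, and a guard for constant blocks."""
    info = np.iinfo(np.int8)
    qmin, qmax = info.min, info.max
    b_min = np.minimum(w.min(), 0.0)  # the range must contain zero
    b_max = np.maximum(w.max(), 0.0)
    if b_max != b_min:
        scale = (b_max - b_min) / (qmax - qmin)
        zero_point = np.rint(qmin - b_min / scale)
    else:  # all elements in the block are identical
        scale, zero_point = 1.0, 0.0
    q = np.clip(np.rint(w / scale + zero_point), qmin, qmax).astype(np.int8)  # saturation
    return q, np.float32(scale), np.int8(zero_point)

# One 64-element block, matching the block_size=64 used for the zoo models
block = np.random.randn(64).astype(np.float32)
q, scale, zp = quantize_block(block)
reconstructed = (q.astype(np.float32) - np.float32(zp)) * scale
print(np.linalg.norm(reconstructed - block) / (np.linalg.norm(block) + 1e-10))  # relative-norm error
```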

benchmark/README.md CHANGED
@@ -26,7 +26,7 @@ python benchmark.py --cfg ./config/face_detection_yunet.yaml
 # All configs
 python benchmark.py --all
 
-# All configs but only fp32 models (--fp32, --fp16, --int8 are available for now)
+# All configs but only fp32 models (--fp32, --fp16, --int8, --int8bq are available for now)
 python benchmark.py --all --fp32
 
 # All configs but exclude some of them (fill with config name keywords, not sensitive to upper/lower case, separate with colons)
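With the new flag (and assuming the `*_int8bq.onnx` files are present in the model directories), restricting a run to the block-quantized models looks like this:

```shell
# Benchmark only the block-quantized int8 models across all configs
python benchmark.py --all --int8bq
```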
benchmark/benchmark.py CHANGED
@@ -46,6 +46,7 @@ parser.add_argument("--model_exclude", type=str, help="Models to be excluded. Sp
 parser.add_argument("--fp32", action="store_true", help="Benchmark models of float32 precision only.")
 parser.add_argument("--fp16", action="store_true", help="Benchmark models of float16 precision only.")
 parser.add_argument("--int8", action="store_true", help="Benchmark models of int8 precision only.")
+parser.add_argument("--int8bq", action="store_true", help="Benchmark models of blocked int8 precision only.")
 parser.add_argument("--all", action="store_true", help="Benchmark all models")
 args = parser.parse_args()
 
@@ -194,15 +195,17 @@ if __name__ == '__main__':
     model_handler, model_paths = MODELS.get(model_config.pop('name'))
 
     _model_paths = []
-    if args.fp32 or args.fp16 or args.int8:
+    if args.fp32 or args.fp16 or args.int8 or args.int8bq:
         if args.fp32:
             _model_paths += model_paths['fp32']
         if args.fp16:
             _model_paths += model_paths['fp16']
         if args.int8:
            _model_paths += model_paths['int8']
+        if args.int8bq:
+            _model_paths += model_paths['int8bq']
     else:
-        _model_paths = model_paths['fp32'] + model_paths['fp16'] + model_paths['int8']
+        _model_paths = model_paths['fp32'] + model_paths['fp16'] + model_paths['int8'] + model_paths["int8bq"]
     # filter out excluded models
     excludes = []
     if args.model_exclude is not None:
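For reference, the selection logic above can be sketched as a standalone function; the `model_paths` contents below are hypothetical, the real dict is assembled by `models/__init__.py`:

```python
from argparse import Namespace

# Hypothetical registry entry; the real one is built from files on disk.
model_paths = {
    "fp32": [["yunet_2023mar.onnx"]],
    "fp16": [],
    "int8": [["yunet_2023mar_int8.onnx"]],
    "int8bq": [["yunet_2023mar_int8bq.onnx"]],
}

def select_paths(args: Namespace, model_paths: dict) -> list:
    """Mirror the benchmark.py filter: no precision flag means everything runs,
    otherwise only the requested precisions (now including int8bq)."""
    if not (args.fp32 or args.fp16 or args.int8 or args.int8bq):
        return (model_paths["fp32"] + model_paths["fp16"]
                + model_paths["int8"] + model_paths["int8bq"])
    selected = []
    for flag in ("fp32", "fp16", "int8", "int8bq"):
        if getattr(args, flag):
            selected += model_paths[flag]
    return selected

print(select_paths(Namespace(fp32=False, fp16=False, int8=False, int8bq=True), model_paths))
```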
models/__init__.py CHANGED
@@ -46,6 +46,7 @@ class ModuleRegistery:
         fp32_model_paths = []
         fp16_model_paths = []
         int8_model_paths = []
+        int8bq_model_paths = []
         # onnx
         ret_onnx = sorted(glob.glob(os.path.join(model_dir, "*.onnx")))
         if "object_tracking" in item.__module__:
@@ -57,6 +58,8 @@ class ModuleRegistery:
                 int8_model_paths.append([r])
             elif "fp16" in r: # exclude fp16 for now
                 fp16_model_paths.append([r])
+            elif "blocked" in r:
+                int8bq_model_paths.append([r])
             else:
                 fp32_model_paths.append([r])
         # caffe
@@ -72,6 +75,7 @@ class ModuleRegistery:
             fp32=fp32_model_paths,
             fp16=fp16_model_paths,
             int8=int8_model_paths,
+            int8bq=int8bq_model_paths
         )
 
         self._dict[item.__name__] = (item, all_model_paths)
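A loose, self-contained sketch of this kind of filename-based bucketing is below; the helper name and the exact matching rules are made up for illustration (the registry's condition for the plain `int8` branch lies outside the hunk shown above):

```python
import glob
import os

def bucket_onnx_files(model_dir: str) -> dict:
    """Hypothetical helper: split a model directory's ONNX files into
    precision buckets, routing block-quantized files to 'int8bq'."""
    buckets = dict(fp32=[], fp16=[], int8=[], int8bq=[])
    for path in sorted(glob.glob(os.path.join(model_dir, "*.onnx"))):
        name = os.path.basename(path)
        if "int8bq" in name or "blocked" in name:  # block-quantized weights
            buckets["int8bq"].append([path])
        elif "int8" in name:
            buckets["int8"].append([path])
        elif "fp16" in name:
            buckets["fp16"].append([path])
        else:
            buckets["fp32"].append([path])
    return buckets

print(bucket_onnx_files("models/face_detection_yunet"))
```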
models/face_detection_yunet/README.md CHANGED
@@ -8,15 +8,20 @@ Notes:
 - This model can detect **faces of pixels between around 10x10 to 300x300** due to the training scheme.
 - For details on training this model, please visit https://github.com/ShiqiYu/libfacedetection.train.
 - This ONNX model has fixed input shape, but OpenCV DNN infers on the exact shape of input image. See https://github.com/opencv/opencv_zoo/issues/44 for more information.
+- `face_detection_yunet_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models      | Easy AP | Medium AP | Hard AP |
 | ----------- | ------- | --------- | ------- |
-| YuNet       | 0.8871  | 0.8710    | 0.7681  |
-| YuNet quant | 0.8838  | 0.8683    | 0.7676  |
+| YuNet       | 0.8844  | 0.8656    | 0.7503  |
+| YuNet block | 0.8845  | 0.8652    | 0.7504  |
+| YuNet quant | 0.8810  | 0.8629    | 0.7503  |
+
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
+
 
 ## Demo
 
models/face_recognition_sface/README.md CHANGED
@@ -8,15 +8,18 @@ Note:
 - Model files encode MobileFaceNet instances trained on the SFace loss function, see the [SFace paper](https://arxiv.org/abs/2205.12010) for reference.
 - ONNX file conversions from [original code base](https://github.com/zhongyy/SFace) thanks to [Chengrui Wang](https://github.com/crywang).
 - (As of Sep 2021) Supporting 5-landmark warping for now, see below for details.
+- `face_recognition_sface_2021dec_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models      | Accuracy |
 | ----------- | -------- |
 | SFace       | 0.9940   |
+| SFace block | 0.9942   |
 | SFace quant | 0.9932   |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/facial_expression_recognition/README.md CHANGED
@@ -7,6 +7,7 @@ Note:
 - Progressive Teacher is contributed by [Jing Jiang](https://scholar.google.com/citations?user=OCwcfAwAAAAJ&hl=zh-CN).
 - [MobileFaceNet](https://link.springer.com/chapter/10.1007/978-3-319-97909-0_46) is used as the backbone and the model is able to classify seven basic facial expressions (angry, disgust, fearful, happy, neutral, sad, surprised).
 - [facial_expression_recognition_mobilefacenet_2022july.onnx](https://github.com/opencv/opencv_zoo/raw/master/models/facial_expression_recognition/facial_expression_recognition_mobilefacenet_2022july.onnx) is implemented thanks to [Chengrui Wang](https://github.com/crywang).
+- `facial_expression_recognition_mobilefacenet_2022july_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 Results of accuracy evaluation on [RAF-DB](http://whdeng.cn/RAF/model1.html).
 
models/handpose_estimation_mediapipe/README.md CHANGED
@@ -14,6 +14,7 @@ This model is converted from TFlite to ONNX using following tools:
 **Note**:
 - The int8-quantized model may produce invalid results due to a significant drop of accuracy.
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#hands for models of larger scale.
+- `handpose_estimation_mediapipe_2023feb_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/human_segmentation_pphumanseg/README.md CHANGED
@@ -2,6 +2,9 @@
 
 This model is ported from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub) using [this script from OpenCV](https://github.com/opencv/opencv/blob/master/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_humanseg.py).
 
+**Note**:
+- `human_segmentation_pphumanseg_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### Python
@@ -46,11 +49,13 @@ Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models             | Accuracy       | mIoU          |
 | ------------------ | -------------- | ------------- |
-| PPHumanSeg         | 0.9581         | 0.8996        |
-| PPHumanSeg quant   | 0.4365         | 0.2788        |
+| PPHumanSeg         | 0.9656         | 0.9164        |
+| PPHumanSeg block   | 0.9655         | 0.9162        |
+| PPHumanSeg quant   | 0.7285         | 0.3642        |
 
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ---
 ## License
models/image_classification_mobilenet/README.md CHANGED
@@ -4,16 +4,22 @@ MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicatio
 
 MobileNetV2: Inverted Residuals and Linear Bottlenecks
 
+**Note**:
+- `image_classification_mobilenetvX_2022apr_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models             | Top-1 Accuracy | Top-5 Accuracy |
 | ------------------ | -------------- | -------------- |
 | MobileNet V1       | 67.64          | 87.97          |
+| MobileNet V1 block | 67.21          | 87.62          |
 | MobileNet V1 quant | 55.53          | 78.74          |
 | MobileNet V2       | 69.44          | 89.23          |
+| MobileNet V2 block | 68.66          | 88.90          |
 | MobileNet V2 quant | 68.37          | 88.56          |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/image_classification_ppresnet/README.md CHANGED
@@ -4,14 +4,19 @@ Deep Residual Learning for Image Recognition
 
 This model is ported from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub) using [this script from OpenCV](https://github.com/opencv/opencv/blob/master/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_resnet50.py).
 
+**Note**:
+- `image_classification_ppresnet50_2022jan_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 Results of accuracy evaluation with [tools/eval](../../tools/eval).
 
 | Models          | Top-1 Accuracy | Top-5 Accuracy |
 | --------------- | -------------- | -------------- |
 | PP-ResNet       | 82.28          | 96.15          |
+| PP-ResNet block | 82.27          | 96.15          |
 | PP-ResNet quant | 0.22           | 0.96           |
 
 \*: 'quant' stands for 'quantized'.
+\*\*: 'block' stands for 'blockwise quantized'.
 
 ## Demo
 
models/license_plate_detection_yunet/README.md CHANGED
@@ -4,6 +4,9 @@ This model is contributed by Dong Xu (徐栋) from [watrix.ai](watrix.ai) (银
 
 Please note that the model is trained with Chinese license plates, so the detection results of other license plates with this model may be limited.
 
+**Note**:
+- `license_plate_detection_lpd_yunet_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 Run the following command to try the demo:
models/object_detection_nanodet/README.md CHANGED
@@ -2,8 +2,10 @@
 
 Nanodet: NanoDet is a FCOS-style one-stage anchor-free object detection model which uses Generalized Focal Loss as classification and regression loss. In NanoDet-Plus, we propose a novel label assignment strategy with a simple assign guidance module (AGM) and a dynamic soft label assigner (DSLA) to solve the optimal label assignment problem in lightweight model training.
 
-Note:
+**Note**:
 - This version of nanodet: Nanodet-m-plus-1.5x_416
+- `object_detection_nanodet_2022nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 ## Demo
 
models/object_detection_yolox/README.md CHANGED
@@ -8,8 +8,10 @@ Key features of the YOLOX object detector
 - **SimOTA advanced label assignment strategy** reduces training time and avoids additional solver hyperparameters
 - **Strong data augmentations like MixUp and Mosaic** to boost YOLOX performance
 
-Note:
+**Note**:
 - This version of YoloX: YoloX_s
+- `object_detection_yolox_2022nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 ## Demo
 
models/object_tracking_vittrack/README.md CHANGED
@@ -8,7 +8,10 @@ Video demo: https://youtu.be/MJiPnu1ZQRI
 
 This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC 2023 project [**Realtime object tracking models**](https://github.com/opencv/opencv/wiki/GSoC_2023#idea-realtime-object-tracking-models)
 
-**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**
+**Note**:
+- OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.
+- `object_tracking_vittrack_2023sep_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 
 # Demo
 ## Python
models/optical_flow_estimation_raft/README.md CHANGED
@@ -1,6 +1,8 @@
 # RAFT
 This model is originally created by Zachary Teed and Jia Deng of Princeton University. The source code for the model is at [their repository on GitHub](https://github.com/princeton-vl/RAFT), and the original [research paper](https://arxiv.org/abs/2003.12039) is published on [Arxiv](https://arxiv.org/abs/2003.12039). The model was converted to ONNX by [PINTO0309](https://github.com/PINTO0309) in his [model zoo](https://github.com/PINTO0309/PINTO_model_zoo/tree/main/252_RAFT). The ONNX model has several variations depending on the training dataset and input dimensions. The model used in this demo is trained on the Sintel dataset with an input size of 360 $\times$ 480.
 
+**Note**:
+- `optical_flow_estimation_raft_2023aug_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/palm_detection_mediapipe/README.md CHANGED
@@ -9,6 +9,7 @@ SSD Anchors are generated from [GenMediaPipePalmDectionSSDAnchors](https://githu
 
 **Note**:
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#hands for models of larger scale.
+- `palm_detection_mediapipe_2023feb_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/person_detection_mediapipe/README.md CHANGED
@@ -7,6 +7,9 @@ This model detects upper body and full body keypoints of a person, and is downlo
 
 SSD Anchors are generated from [GenMediaPipePalmDectionSSDAnchors](https://github.com/VimalMollyn/GenMediaPipePalmDectionSSDAnchors)
 
+**Note**:
+- `person_detection_mediapipe_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### Python
models/person_reid_youtureid/README.md CHANGED
@@ -2,9 +2,9 @@
 
 This model is provided by Tencent Youtu Lab [[Credits]](https://github.com/opencv/opencv/blob/394e640909d5d8edf9c1f578f8216d513373698c/samples/dnn/person_reid.py#L6-L11).
 
-Note:
-
+**Note**:
 - Model source: https://github.com/ReID-Team/ReID_extra_testdata
+- `person_reid_youtu_2021nov_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/pose_estimation_mediapipe/README.md CHANGED
@@ -10,6 +10,8 @@ This model is converted from TFlite to ONNX using following tools:
 
 **Note**:
 - Visit https://github.com/google/mediapipe/blob/master/docs/solutions/models.md#pose for models of larger scale.
+- `pose_estimation_mediapipe_2023mar_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
+
 ## Demo
 
 ### python
models/text_detection_ppocr/README.md CHANGED
@@ -2,7 +2,7 @@
 
 PP-OCRv3: More Attempts for the Improvement of Ultra Lightweight OCR System.
 
-Note:
+**Note**:
 
 - The int8 quantization model may produce unstable results due to some loss of accuracy.
 - Original Paddle Models source of English: [here](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar).
@@ -10,6 +10,7 @@ Note:
 - `IC15` in the filename means the model is trained on [IC15 dataset](https://rrc.cvc.uab.es/?ch=4&com=introduction), which can detect English text instances only.
 - `TD500` in the filename means the model is trained on [TD500 dataset](http://www.iapr-tc11.org/mediawiki/index.php/MSRA_Text_Detection_500_Database_(MSRA-TD500)), which can detect both English & Chinese instances.
 - Visit https://docs.opencv.org/master/d4/d43/tutorial_dnn_text_spotting.html for more information.
+- `text_detection_xx_ppocrv3_2023may_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
models/text_recognition_crnn/README.md CHANGED
@@ -15,7 +15,7 @@ Results of accuracy evaluation with [tools/eval](../../tools/eval) at different
 
 \*: 'FP16' or 'INT8' stands for 'model quantized into FP16' or 'model quantized into int8'
 
-Note:
+**Note**:
 
 - Model source:
   - `text_recognition_CRNN_EN_2021sep.onnx`: https://docs.opencv.org/4.5.2/d9/d1e/tutorial_dnn_OCR.html (CRNN_VGG_BiLSTM_CTC.onnx)
@@ -25,6 +25,7 @@ Note:
 - `text_recognition_CRNN_CH_2021sep.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters (see `CHARSET_CH_94` for details in `crnn.py`).
 - `text_recognition_CRNN_CN_2021nov.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), some Chinese characters and some special characters (see `CHARSET_CN_3944` for details in `crnn.py`).
 - For details on training this model series, please visit https://github.com/zihaomu/deep-text-recognition-benchmark.
+- `text_recognition_CRNN_XX_2021xxx_int8bq.onnx` represents the block-quantized version in int8 precision and is generated using [block_quantize.py](../../tools/quantize/block_quantize.py) with `block_size=64`.
 
 ## Demo
 
tools/eval/README.md CHANGED
@@ -146,7 +146,7 @@ python eval.py -m sface -d lfw -dr /path/to/lfw
 
 ### Prepare data
 
-Please visit http://iapr-tc11.org/mediawiki/index.php/ICDAR_2003_Robust_Reading_Competitions to download the ICDAR2003 dataset and the labels.
+Please visit http://iapr-tc11.org/mediawiki/index.php/ICDAR_2003_Robust_Reading_Competitions to download the ICDAR2003 dataset and the labels. You only need to download the Robust Word Recognition [TrialTrain Set](http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/word.zip).
 
 ```shell
 $ tree -L 2 /path/to/icdar
@@ -199,6 +199,20 @@ python eval.py -m crnn -d iiit5k -dr /path/to/iiit5k
 ### Prepare data
 Please download the mini_supervisely data from [here](https://paddleseg.bj.bcebos.com/humanseg/data/mini_supervisely.zip) which includes the validation dataset and unzip it.
 
+```shell
+$ tree -L 2 /path/to/mini_supervisely
+.
+├── Annotations
+│   ├── ache-adult-depression-expression-41253.png
+│   ├── ...
+├── Images
+│   ├── ache-adult-depression-expression-41253.jpg
+│   ├── ...
+├── test.txt
+├── train.txt
+└── val.txt
+```
+
 ### Evaluation
 
 Run evaluation with the following command:
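Once the `*_bq` entries are registered in `tools/eval/eval.py` (next file), a block-quantized model is evaluated like any other; the model key below matches the new registration, while the dataset key and path are assumptions following the pattern of the existing examples in this README:

```shell
# Evaluate the block-quantized PPHumanSeg model (dataset key assumed)
python eval.py -m pphumanseg_bq -d mini_supervisely -dr /path/to/mini_supervisely
```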
tools/eval/eval.py CHANGED
@@ -33,6 +33,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv1_2022apr_int8.onnx"),
         topK=5,
         loadLabel=False),
+    mobilenetv1_bq=dict(
+        name="MobileNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv1_2022apr_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     mobilenetv2=dict(
         name="MobileNet",
         topic="image_classification",
@@ -45,6 +51,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv2_2022apr_int8.onnx"),
         topK=5,
         loadLabel=False),
+    mobilenetv2_bq=dict(
+        name="MobileNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_mobilenet/image_classification_mobilenetv2_2022apr_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     ppresnet=dict(
         name="PPResNet",
         topic="image_classification",
@@ -57,6 +69,12 @@ models = dict(
         modelPath=os.path.join(root_dir, "models/image_classification_ppresnet/image_classification_ppresnet50_2022jan_int8.onnx"),
         topK=5,
         loadLabel=False),
+    ppresnet_bq=dict(
+        name="PPResNet",
+        topic="image_classification",
+        modelPath=os.path.join(root_dir, "models/image_classification_ppresnet/image_classification_ppresnet50_2022jan_int8bq.onnx"),
+        topK=5,
+        loadLabel=False),
     yunet=dict(
         name="YuNet",
         topic="face_detection",
@@ -71,6 +89,13 @@ models = dict(
         topK=5000,
         confThreshold=0.3,
         nmsThreshold=0.45),
+    yunet_bq=dict(
+        name="YuNet",
+        topic="face_detection",
+        modelPath=os.path.join(root_dir, "models/face_detection_yunet/face_detection_yunet_2023mar_int8bq.onnx"),
+        topK=5000,
+        confThreshold=0.3,
+        nmsThreshold=0.45),
     sface=dict(
         name="SFace",
         topic="face_recognition",
@@ -79,6 +104,10 @@ models = dict(
         name="SFace",
         topic="face_recognition",
         modelPath=os.path.join(root_dir, "models/face_recognition_sface/face_recognition_sface_2021dec_int8.onnx")),
+    sface_bq=dict(
+        name="SFace",
+        topic="face_recognition",
+        modelPath=os.path.join(root_dir, "models/face_recognition_sface/face_recognition_sface_2021dec_int8bq.onnx")),
     crnn_en=dict(
         name="CRNN",
         topic="text_recognition",
@@ -95,6 +124,10 @@ models = dict(
         name="PPHumanSeg",
         topic="human_segmentation",
         modelPath=os.path.join(root_dir, "models/human_segmentation_pphumanseg/human_segmentation_pphumanseg_2023mar_int8.onnx")),
+    pphumanseg_bq=dict(
+        name="PPHumanSeg",
+        topic="human_segmentation",
+        modelPath=os.path.join(root_dir, "models/human_segmentation_pphumanseg/human_segmentation_pphumanseg_2023mar_int8bq.onnx")),
 )
 
 datasets = dict(
tools/quantize/README.md CHANGED
@@ -54,6 +54,8 @@ python quantize-inc.py model1
 
 ## Blockwise quantization usage
 
+Block-quantized models under each model directory are generated with `--block_size=64`
+
 `block_quantize.py` requires Python>=3.7
 
 To perform weight-only blockwise quantization:
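A typical invocation might look like the sketch below; `--block_size` and `--verbose` are visible in this change set, while `--input_model` and `--output_model` are inferred from the argparse destinations in `block_quantize.py` and should be checked against the script's `--help`:

```shell
# Weight-only blockwise int8 quantization with the block size used for the zoo models
python block_quantize.py --input_model face_detection_yunet_2023mar.onnx \
                         --output_model face_detection_yunet_2023mar_int8bq.onnx \
                         --block_size 64 --verbose
```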
tools/quantize/block_quantize.py CHANGED
@@ -8,7 +8,8 @@ if sys.version_info < MIN_PYTHON_VERSION:
 import argparse
 import os
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Dict, Tuple
+from enum import Enum, auto
 
 import numpy as np
 import onnx
@@ -22,12 +23,19 @@ SUPPORTED_OPS = {"Conv", "Gemm", "MatMul"}
 ONNX_OPSET = 21
 
 
+class WeightCategory(Enum):
+    INITIALIZER = auto()
+    CONSTANT = auto()
+    NONE = auto()
+
+
 @dataclass
 class BlockQuantizeConfig:
     input_model_path: str
     output_model_path: str
     block_size: int
     bits: int
+    verbose: bool
 
 
 @dataclass
@@ -75,9 +83,13 @@ def block_quantize_tensor(
     y_scale_elementwise = np.repeat(scale, repeats=repeats, axis=block_axis)
     y_zero_point_elementwise = np.repeat(zero_point, repeats=repeats, axis=block_axis)
 
-    y = np.rint(x / y_scale_elementwise + y_zero_point_elementwise).astype(
-        BITS_TO_NUMPY_TYPE[n_bits]
-    )
+    type_info = np.iinfo(BITS_TO_NUMPY_TYPE[n_bits])
+    min_value = type_info.min
+    max_value = type_info.max
+
+    y = np.rint(x / y_scale_elementwise + y_zero_point_elementwise)
+    y = np.clip(y, min_value, max_value)
+    y = y.astype(BITS_TO_NUMPY_TYPE[n_bits])
 
     return y
 
@@ -129,6 +141,13 @@ class BlockQuantizer:
         self.initializers_map = {
             init.name: init for init in self.model.graph.initializer
         }
+        self.costants_map = {
+            node.output[0]: next(
+                attr.t for attr in node.attribute if attr.name == "value"
+            )
+            for node in self.model.graph.node
+            if node.op_type == "Constant"
+        }
 
     def validate_conf(self):
         if not os.path.isfile(self.conf.input_model_path):
@@ -155,34 +174,59 @@ class BlockQuantizer:
                 f"Bits must be one of the following values: [{allowed_values}]."
             )
 
-    def get_initializer_tensor(self, name: str) -> Optional[np.ndarray]:
+    def get_weight_category(self, name: str) -> WeightCategory:
         if name in self.initializers_map:
+            return WeightCategory.INITIALIZER
+        if name in self.costants_map:
+            return WeightCategory.CONSTANT
+        else:
+            return WeightCategory.NONE
+
+    def get_weight_tensor(self, name: str, category: WeightCategory) -> np.ndarray:
+        if category == WeightCategory.INITIALIZER:
             return onnx.numpy_helper.to_array(self.initializers_map[name])
+        elif category == WeightCategory.CONSTANT:
+            return onnx.numpy_helper.to_array(self.costants_map[name])
+        else:
+            raise AssertionError("Invalid weight category")
 
-        return None
+    def remove_fp32_weights(self, name: str, category: WeightCategory):
+        if category == WeightCategory.INITIALIZER:
+            self.graph.initializer.remove(
+                next(init for init in self.graph.initializer if init.name == name)
+            )
+        elif category == WeightCategory.CONSTANT:
+            self.graph.node.remove(
+                next(
+                    node
+                    for node in self.graph.node
+                    if node.op_type == "Constant" and node.output[0] == name
+                )
+            )
+        else:
+            raise AssertionError("Invalid weight category")
 
     def compute_scale_zeropoint(
         self, b_min: np.ndarray, b_max: np.ndarray
     ) -> Tuple[np.ndarray, np.ndarray]:
         assert (
-            b_min < b_max
-        ).all(), (
-            "minimum must be lower than maximum when computing scale and zero point"
-        )
+            b_min <= b_max
+        ).all(), "minimum must not be greater than maximum when computing scale and zero point"
 
         # zero must be present in the range, this enforces qmin <= zero_point <= qmax
         b_min = np.minimum(b_min, np.zeros_like(b_min, dtype=b_min.dtype))
         b_max = np.maximum(b_max, np.zeros_like(b_max, dtype=b_max.dtype))
 
-        qmin = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).min
-        qmax = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).max
+        type_info = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits])
+        qmin = type_info.min
+        qmax = type_info.max
 
         dq = qmax - qmin
 
-        scales = (b_max - b_min) / dq
-        zeropoints = np.rint(qmin - b_min / scales).astype(
-            BITS_TO_NUMPY_TYPE[self.conf.bits]
-        )
+        scales = np.where(b_max != b_min, (b_max - b_min) / dq, 1.0)
+
+        zeropoints = np.where(b_max != b_min, np.rint(qmin - b_min / scales), 0.0)
+        zeropoints = zeropoints.astype(BITS_TO_NUMPY_TYPE[self.conf.bits])
 
         return (scales, zeropoints)
 
@@ -221,7 +265,8 @@
             quantized_weight, quantization_axis, scales, zeropoints
         )
 
-        qerror = np.linalg.norm(reconstructed_mat - weight)
+        # Relative norm
+        qerror = np.linalg.norm(reconstructed_mat - weight) / (np.linalg.norm(weight) + 1e-10)
 
         res = BlockQuantizeResult(
             quantized_weight,
@@ -241,16 +286,32 @@
 
         return size_mb
 
-    def display_summary(self, sqe: List):
-        mse = sum(sqe) / len(sqe)
+    def display_summary(self, sqe: Dict[str, int]):
+        sqe_v = list(sqe.values())
+        if len(sqe_v) == 0:
+            mse = 0
+            print(
+                "Warning: No weights have been quantized, likely due to unsupported layers."
+            )
+        else:
+            mse = sum(sqe_v) / len(sqe_v)
         original_model_size = self.get_model_size(self.conf.input_model_path)
         quantized_model_size = self.get_model_size(self.conf.output_model_path)
 
+        if self.conf.verbose:
+            sorted_sqe = sorted(sqe.items(), key=lambda item: item[1], reverse=True)
+            longest_key_len = max(len(key) for key in sqe.keys())
+
+            print("Quantization error (Relative Norm) sorted in descending order:")
+
+            for key, value in sorted_sqe:
+                print(f"{key:<{longest_key_len}} : {value}")
+
         print("Done! Results saved in", self.conf.output_model_path)
        print("\nSummary of Results:\n")
         print(f"{'Metric':<30} {'Value':<10}")
         print(f"{'-'*40}")
-        print(f"{'Mean Squared Quantization Error':<30} {mse:.6f}")
+        print(f"{'Relative Norm Error':<31} {mse:.6f}")
         print(f"{'Original Model Size (KB)':<31} {original_model_size:,.2f}")
         print(f"{'Block-Quantized Model Size (KB)':<30} {quantized_model_size:,.2f}")
 
@@ -258,7 +319,7 @@
         print("Quantizing the model...")
 
         quantized_inputs = []
-        sqe = []
+        sqe = {}
 
         node_idx = 0
 
@@ -267,7 +328,13 @@
 
             if node.op_type in SUPPORTED_OPS:
                 for input_idx, input_name in enumerate(node.input):
-                    weight = self.get_initializer_tensor(input_name)
+                    weightCategory = self.get_weight_category(input_name)
+
+                    # Skip quantization if weights are taken as external input
+                    if weightCategory == WeightCategory.NONE:
+                        continue
+
+                    weight = self.get_weight_tensor(input_name, weightCategory)
 
                     quantized_weights_name = f"{input_name}_quantized"
                     quantized_node_name = f"{input_name}_quantized_node"
@@ -279,9 +346,8 @@
                     shape_name = f"{input_name}_shape"
                     reshaped_weights_name = f"{input_name}_reshaped"
 
-                    # Skip quantization if weights are taken as external input
-                    # or if they don't contain enough elements to create at least 1 block
-                    if weight is None or weight.size < self.conf.block_size:
+                    # Skip quantization if weights don't contain enough elements to create at least 1 block
+                    if weight.size < self.conf.block_size:
                         continue
 
                     reshape_needed = weight.ndim > 2
@@ -295,9 +361,15 @@
                         )
                         continue
 
-                    quantized_inputs.append(input_name)
+
                     block_quantize_res = self.block_quantize(weight)
 
+                    # Skip quantization if it wouldn't reduce the model size
+                    if block_quantize_res.block_size == 1:
+                        continue
+
+                    quantized_inputs.append(input_name)
+
                     dequantize_node = create_dequantize_node(
                         quantized_node_name,
                         quantized_weights_name,
@@ -352,14 +424,7 @@
                         ]
                     )
 
-                    # Removing fp32 weights
-                    self.graph.initializer.remove(
-                        next(
-                            init
-                            for init in self.graph.initializer
-                            if init.name == input_name
-                        )
-                    )
+                    self.remove_fp32_weights(input_name, weightCategory)
 
                     node.input[input_idx] = (
                         reshaped_weights_name
@@ -374,11 +439,12 @@
 
                     self.graph.node.insert(0, dequantize_node)
                     node_idx += 1
-                    self.graph.value_info.insert(0, shape_info)
+                    if reshape_needed:
+                        self.graph.value_info.insert(0, shape_info)
                     self.graph.value_info.insert(0, dequantized_weights_info)
 
-                    sqe.append(block_quantize_res.quantization_error**2)
-
+                    sqe[input_name] = block_quantize_res.quantization_error
+
             node_idx += 1
 
         onnx.checker.check_model(self.model, full_check=True)
@@ -421,6 +487,13 @@ def setup_args() -> argparse.Namespace:
         default="block_quantized_model.onnx",
         required=False,
     )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output",
+        required=False,
+    )
 
     return parser.parse_args()
 
@@ -433,6 +506,7 @@ if __name__ == "__main__":
         output_model_path=args.output_model,
        block_size=args.block_size,
         bits=args.bits,
+        verbose=args.verbose
     )
 
     quantizer = BlockQuantizer(quantization_config)
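As a quick sanity check on the output (a sketch, not part of the tool; file names are placeholders), a block-quantized model should pass the ONNX checker, contain `DequantizeLinear` nodes in front of the quantized weights, and be noticeably smaller on disk:

```python
import os

import onnx

PATH_FP32 = "face_detection_yunet_2023mar.onnx"        # placeholder
PATH_BQ = "face_detection_yunet_2023mar_int8bq.onnx"   # placeholder

model = onnx.load(PATH_BQ)
onnx.checker.check_model(model, full_check=True)

# Count the DequantizeLinear nodes inserted in front of the quantized weights
n_dq = sum(1 for node in model.graph.node if node.op_type == "DequantizeLinear")
print(f"DequantizeLinear nodes: {n_dq}")

# Compare file sizes; the tool prints the same kind of summary in KB
for p in (PATH_FP32, PATH_BQ):
    print(f"{p}: {os.path.getsize(p) / 1024:,.2f} KB")
```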