qc903113684 committed
Commit 7c475da · verified · 1 parent: 72d88ef

Upload 90 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/README.md +64 -0
  2. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt +34 -0
  3. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy +3 -0
  4. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/hand.jpg +0 -0
  5. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp +923 -0
  6. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handDetctor_w8a8.qnn216.ctx.bin +3 -0
  7. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handLandmark_w8a8.qnn216.ctx.bin +3 -0
  8. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  9. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  10. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/anchors_palm.npy +3 -0
  11. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py +513 -0
  12. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazehand_landmark.py +115 -0
  13. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazepalm.py +157 -0
  14. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py +420 -0
  15. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/export_jit.py +66 -0
  16. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/hand.jpg +0 -0
  17. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/visualization.py +125 -0
  18. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/README.md +64 -0
  19. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt +34 -0
  20. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy +3 -0
  21. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/hand.jpg +0 -0
  22. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/run_test.cpp +923 -0
  23. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/anchors_palm.npy +3 -0
  24. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazehand_landmark.pth +3 -0
  25. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazepalm.pth +3 -0
  26. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector.pt +3 -0
  27. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector_w8a16.qnn216.ctx.bin +3 -0
  28. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark.pt +3 -0
  29. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark_w8a16.qnn216.ctx.bin +3 -0
  30. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/0000.jpg +0 -0
  31. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  32. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  33. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazebase.py +513 -0
  34. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazehand_landmark.py +115 -0
  35. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazepalm.py +157 -0
  36. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/demo_qnn.py +386 -0
  37. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/export_jit.py +66 -0
  38. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/hand.jpg +0 -0
  39. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/visualization.py +125 -0
  40. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/README.md +64 -0
  41. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt +34 -0
  42. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy +3 -0
  43. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/hand.jpg +0 -0
  44. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp +923 -0
  45. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handDetctor_fp16.qnn216.ctx.bin +3 -0
  46. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handLandmark_fp16.qnn216.ctx.bin +3 -0
  47. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  48. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  49. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/anchors_palm.npy +3 -0
  50. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py +513 -0
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
+ ## Model Information
+ ### Source model
+ - Input shape: [1x3x256x256], [1x3x256x256]
+ - Number of parameters: 1.76M, 2.01M
+ - Model size: 7.11MB, 8.09MB
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
+
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+ ### Converted model
+
+ - Precision: INT8
+ - Backend: QNN2.16
+ - Target Device: FV01 QCS6490
+
+ ## Inference with AidLite SDK
+
+ ### SDK installation
+ Model Farm uses the AidLite SDK as its model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/).
+
+ - Install AidLite SDK
+
+ ```bash
+ # Install the appropriate version of the AidLite SDK
+ sudo aid-pkg update
+ sudo aid-pkg install aidlite-sdk
+ # Install the QNN variant that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+ sudo aid-pkg install aidlite-{QNN VERSION}
+ ```
+
+ - Verify AidLite SDK
+
+ ```bash
+ # AidLite SDK C++ library check
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+ # AidLite SDK Python library check
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+ ```
+
+ ### Run demo
+ #### python
+ ```bash
+ cd python
+ python3 demo_qnn.py
+ ```
+
+
+ #### c++
+ ```bash
+ # The cnpy library is required to load the .npy file (run these commands from the terminal's default directory)
+ git clone https://github.com/rogersce/cnpy.git
+ cd cnpy
+ mkdir build && cd build
+ cmake ..
+ make
+ sudo make install
+
+ cd mediapipe-hand/model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp
+ mkdir build && cd build
+ cmake ..
+ make
+ ./run_test
+ ```
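For quick reference, the sketch below condenses the AidLite C++ inference flow that the full demo (cpp/run_test.cpp, added later in this commit) follows for the palm-detector model. The API calls, model path, and tensor shapes are taken from this repository; the zero-filled input buffer and the shortened error handling are assumptions for illustration only, since the real demo feeds a resized, padded, /255-normalized 256x256 RGB image and then post-processes the outputs.

```cpp
// Minimal sketch: load the INT8 palm-detector context binary with AidLite and run one inference.
// Mirrors the calls used in run_test.cpp; the input here is a dummy zero buffer.
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>
#include <aidlux/aidlite/aidlite.hpp>

using namespace Aidlux::Aidlite;

int main() {
    Model* model = Model::create_instance("../../models/m_handDetctor_w8a8.qnn216.ctx.bin");
    Config* config = Config::create_instance();
    if (model == nullptr || config == nullptr) return EXIT_FAILURE;

    config->implement_type = ImplementType::TYPE_LOCAL;
    config->framework_type = FrameworkType::TYPE_QNN;   // QNN2.16 backend
    config->accelerate_type = AccelerateType::TYPE_DSP; // run on the DSP/HTP
    config->is_quantify_model = 1;

    // Shapes from the README: input [1,3,256,256], outputs [1,2944,18] and [1,2944,1].
    std::vector<std::vector<uint32_t>> input_shapes = {{1, 3, 256, 256}};
    std::vector<std::vector<uint32_t>> output_shapes = {{1, 2944, 18}, {1, 2944, 1}};
    model->set_model_properties(input_shapes, DataType::TYPE_FLOAT32,
                                output_shapes, DataType::TYPE_FLOAT32);

    std::unique_ptr<Interpreter> interpreter =
        InterpreterBuilder::build_interpretper_from_model_and_config(model, config);
    if (interpreter == nullptr || interpreter->init() != EXIT_SUCCESS ||
        interpreter->load_model() != EXIT_SUCCESS) {
        return EXIT_FAILURE;
    }

    // Dummy NCHW input; the full demo fills this with the preprocessed image instead.
    std::vector<float> input(1 * 3 * 256 * 256, 0.0f);
    interpreter->set_input_tensor(0, input.data());
    interpreter->invoke();

    float* raw_boxes = nullptr;   // [1, 2944, 18]
    float* raw_scores = nullptr;  // [1, 2944, 1]
    uint32_t len0 = 0, len1 = 0;
    interpreter->get_output_tensor(0, (void**)&raw_boxes, &len0);
    interpreter->get_output_tensor(1, (void**)&raw_scores, &len1);
    printf("invoke done, output lengths: %u %u\n", len0, len1);

    interpreter->destory();
    return 0;
}
```

In the full demo these two raw tensors are then decoded against anchors_float32.npy, filtered with weighted non-max suppression, used to crop a rotated hand ROI, and finally passed to the landmark model.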
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
+ cmake_minimum_required (VERSION 3.5)
+ project("run_test")
+
+ find_package(OpenCV REQUIRED)
+ find_library(CNPY_LIB cnpy REQUIRED)
+
+ message(STATUS "OpenCV library status:")
+ message(STATUS ">version: ${OpenCV_VERSION}")
+ message(STATUS "Include: ${OpenCV_INCLUDE_DIRS}")
+
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+
+ include_directories(
+     /usr/local/include
+     /usr/include/opencv4
+ )
+
+ link_directories(
+     /usr/local/lib/
+ )
+
+ file(GLOB SRC_LISTS
+     ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
+ )
+
+ add_executable(run_test ${SRC_LISTS})
+
+ target_link_libraries(run_test
+     aidlite
+     ${OpenCV_LIBS}
+     pthread
+     jsoncpp
+     ${CNPY_LIB}
+ )
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
+ size 47232
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices
32
+ float dy = -0.5f; // set according to the model definition
33
+ float dscale = 2.6f; // scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetctor_w8a8.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_w8a8.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw the hand keypoints and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // Draw the keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // Draw the connection lines (both endpoints should be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // equivalent to int(256 * h / w)
146
+
147
+ // Adjust the scale according to the aspect ratio
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW and normalize (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU from the first 4 coordinates only (the box coordinates come first)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // Sort by confidence in descending order
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // Update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // Weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // use the average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO: currently keeps only the highest-scoring detection (single-hand demo)
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices
340
+ float dy, float dscale, float theta0
341
+ ) {
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // shift yc
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // Get the positions of the two keypoints
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // Write the outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta should be adjusted to the actual input; a fixed value is used here
383
+ // theta[i] = angle; // use the computed angle if needed
384
+ theta[i] = -0.8461;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // also return the box corner points
398
+ int resolution = 256
399
+ ) {
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // The 4 unit-square corners after the transform (same order as in the Python code)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // Compute the affine transform from the first three points
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // Check input sizes
483
+ const int num_faces = 1; // assume a single hand
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // Extract the rotation and translation from the affine matrix
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // Apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // Store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid function
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp function
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // Single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch item
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ result = fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ result = fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " hands.\n";
859
+ // A hand was detected; continue processing boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "no hand detected!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handDetctor_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28dab2c773e07727138edb87bf4ba8259d81b7fe424db7b69e5d67a6fbc28ac4
+ size 3589672
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handLandmark_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d03fe17aac4e260aaa0d810aee512c4a5e9a9c64a63dadd1ff1bf8bd0d98dc1
+ size 7031432
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB).
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB).
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
+ size 94336
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaing the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaining the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 values are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calculated from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, num_coords + 1).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
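The IoU helpers above are plain module-level functions, so they can be exercised in isolation. A minimal sketch (not part of the uploaded files; the box values are made up for illustration), assuming blazebase.py is importable from the working directory:

```python
import torch
from blazebase import jaccard, overlap_similarity

# Boxes in the same [min, min, max, max] layout used by the detections above.
box_a = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
box_b = torch.tensor([[1.0, 1.0, 3.0, 3.0],    # overlaps box_a, IoU = 1/7
                      [2.0, 2.0, 4.0, 4.0]])   # touches box_a only at a corner, IoU = 0
print(jaccard(box_a, box_b))                   # tensor([[0.1429, 0.0000]])
print(overlap_similarity(box_a[0], box_b))     # tensor([0.1429, 0.0000])
```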
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazehand_landmark.py ADDED
@@ -0,0 +1,115 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeLandmark, BlazeBlock
7
+
8
+ class BlazeHandLandmark(BlazeLandmark):
9
+ """The hand landmark model from MediaPipe.
10
+
11
+ """
12
+ def __init__(self):
13
+ super(BlazeHandLandmark, self).__init__()
14
+
15
+ # size of ROIs used for input
16
+ self.resolution = 256
17
+
18
+ self._define_layers()
19
+
20
+ def _define_layers(self):
21
+ self.backbone1 = nn.Sequential(
22
+ nn.Conv2d(in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=0, bias=True),
23
+ nn.ReLU(inplace=True),
24
+
25
+ BlazeBlock(24, 24, 5),
26
+ BlazeBlock(24, 24, 5),
27
+ BlazeBlock(24, 48, 5, 2),
28
+ )
29
+
30
+ self.backbone2 = nn.Sequential(
31
+ BlazeBlock(48, 48, 5),
32
+ BlazeBlock(48, 48, 5),
33
+ BlazeBlock(48, 96, 5, 2),
34
+ )
35
+
36
+ self.backbone3 = nn.Sequential(
37
+ BlazeBlock(96, 96, 5),
38
+ BlazeBlock(96, 96, 5),
39
+ BlazeBlock(96, 96, 5, 2),
40
+ )
41
+
42
+ self.backbone4 = nn.Sequential(
43
+ BlazeBlock(96, 96, 5),
44
+ BlazeBlock(96, 96, 5),
45
+ BlazeBlock(96, 96, 5, 2),
46
+ )
47
+
48
+ self.blaze5 = BlazeBlock(96, 96, 5)
49
+ self.blaze6 = BlazeBlock(96, 96, 5)
50
+ self.conv7 = nn.Conv2d(96, 48, 1, bias=True)
51
+
52
+ self.backbone8 = nn.Sequential(
53
+ BlazeBlock(48, 48, 5),
54
+ BlazeBlock(48, 48, 5),
55
+ BlazeBlock(48, 48, 5),
56
+ BlazeBlock(48, 48, 5),
57
+ BlazeBlock(48, 96, 5, 2),
58
+ BlazeBlock(96, 96, 5),
59
+ BlazeBlock(96, 96, 5),
60
+ BlazeBlock(96, 96, 5),
61
+ BlazeBlock(96, 96, 5),
62
+ BlazeBlock(96, 288, 5, 2),
63
+ BlazeBlock(288, 288, 5),
64
+ BlazeBlock(288, 288, 5),
65
+ BlazeBlock(288, 288, 5),
66
+ BlazeBlock(288, 288, 5),
67
+ BlazeBlock(288, 288, 5, 2),
68
+ BlazeBlock(288, 288, 5),
69
+ BlazeBlock(288, 288, 5),
70
+ BlazeBlock(288, 288, 5),
71
+ BlazeBlock(288, 288, 5),
72
+ BlazeBlock(288, 288, 5, 2),
73
+ BlazeBlock(288, 288, 5),
74
+ BlazeBlock(288, 288, 5),
75
+ BlazeBlock(288, 288, 5),
76
+ BlazeBlock(288, 288, 5),
77
+ BlazeBlock(288, 288, 5, 2),
78
+ BlazeBlock(288, 288, 5),
79
+ BlazeBlock(288, 288, 5),
80
+ BlazeBlock(288, 288, 5),
81
+ BlazeBlock(288, 288, 5),
82
+ )
83
+
84
+ self.hand_flag = nn.Conv2d(288, 1, 2, bias=True)
85
+ self.handed = nn.Conv2d(288, 1, 2, bias=True)
86
+ self.landmarks = nn.Conv2d(288, 63, 2, bias=True)
87
+
88
+
89
+ def forward(self, x):
90
+ if x.shape[0] == 0:
91
+ return torch.zeros((0,)), torch.zeros((0,)), torch.zeros((0, 21, 3))
92
+
93
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
94
+
95
+ x = self.backbone1(x)
96
+ y = self.backbone2(x)
97
+ z = self.backbone3(y)
98
+ w = self.backbone4(z)
99
+
100
+ z = z + F.interpolate(w, scale_factor=2, mode='bilinear')
101
+ z = self.blaze5(z)
102
+
103
+ y = y + F.interpolate(z, scale_factor=2, mode='bilinear')
104
+ y = self.blaze6(y)
105
+ y = self.conv7(y)
106
+
107
+ x = x + F.interpolate(y, scale_factor=2, mode='bilinear')
108
+
109
+ x = self.backbone8(x)
110
+
111
+ hand_flag = self.hand_flag(x).view(-1).sigmoid()
112
+ handed = self.handed(x).view(-1).sigmoid()
113
+ landmarks = self.landmarks(x).view(-1, 21, 3) / 256
114
+
115
+ return hand_flag, handed, landmarks
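A quick shape check for this landmark head, useful before tracing or exporting it. This is only a sketch; loading the .pth weights (path assumed, as in python/export_jit.py) is optional when only checking shapes:

```python
import torch
from blazehand_landmark import BlazeHandLandmark

hand_lm = BlazeHandLandmark()
# hand_lm.load_weights("../models/blazehand_landmark.pth")  # assumed path, as in export_jit.py

roi = torch.zeros((1, 3, 256, 256))   # one 256x256 hand ROI
with torch.no_grad():
    flag, handed, landmarks = hand_lm(roi)
print(flag.shape, handed.shape, landmarks.shape)
# torch.Size([1]) torch.Size([1]) torch.Size([1, 21, 3])
```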
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazepalm.py ADDED
@@ -0,0 +1,157 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeDetector, BlazeBlock
7
+
8
+
9
+ class BlazePalm(BlazeDetector):
10
+ """The palm detection model from MediaPipe. """
11
+ def __init__(self):
12
+ super(BlazePalm, self).__init__()
13
+
14
+ # These are the settings from the MediaPipe example graph
15
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt
16
+ self.num_classes = 1
17
+ self.num_anchors = 2944
18
+ self.num_coords = 18
19
+ self.score_clipping_thresh = 100.0
20
+ self.x_scale = 256.0
21
+ self.y_scale = 256.0
22
+ self.h_scale = 256.0
23
+ self.w_scale = 256.0
24
+ self.min_score_thresh = 0.5
25
+ self.min_suppression_threshold = 0.3
26
+ self.num_keypoints = 7
27
+
28
+ # These settings are for converting detections to ROIs which can then
29
+ # be extracted and feed into the landmark network
30
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
31
+ self.detection2roi_method = 'box'
32
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt
33
+ self.kp1 = 0
34
+ self.kp2 = 2
35
+ self.theta0 = np.pi/2
36
+ self.dscale = 2.6
37
+ self.dy = -0.5
38
+
39
+ self._define_layers()
40
+
41
+ def _define_layers(self):
42
+ self.backbone1 = nn.Sequential(
43
+ nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=0, bias=True),
44
+ nn.ReLU(inplace=True),
45
+
46
+ BlazeBlock(32, 32),
47
+ BlazeBlock(32, 32),
48
+ BlazeBlock(32, 32),
49
+ BlazeBlock(32, 32),
50
+ BlazeBlock(32, 32),
51
+ BlazeBlock(32, 32),
52
+ BlazeBlock(32, 32),
53
+
54
+ BlazeBlock(32, 64, stride=2),
55
+ BlazeBlock(64, 64),
56
+ BlazeBlock(64, 64),
57
+ BlazeBlock(64, 64),
58
+ BlazeBlock(64, 64),
59
+ BlazeBlock(64, 64),
60
+ BlazeBlock(64, 64),
61
+ BlazeBlock(64, 64),
62
+
63
+ BlazeBlock(64, 128, stride=2),
64
+ BlazeBlock(128, 128),
65
+ BlazeBlock(128, 128),
66
+ BlazeBlock(128, 128),
67
+ BlazeBlock(128, 128),
68
+ BlazeBlock(128, 128),
69
+ BlazeBlock(128, 128),
70
+ BlazeBlock(128, 128),
71
+
72
+ )
73
+
74
+ self.backbone2 = nn.Sequential(
75
+ BlazeBlock(128, 256, stride=2),
76
+ BlazeBlock(256, 256),
77
+ BlazeBlock(256, 256),
78
+ BlazeBlock(256, 256),
79
+ BlazeBlock(256, 256),
80
+ BlazeBlock(256, 256),
81
+ BlazeBlock(256, 256),
82
+ BlazeBlock(256, 256),
83
+ )
84
+
85
+ self.backbone3 = nn.Sequential(
86
+ BlazeBlock(256, 256, stride=2),
87
+ BlazeBlock(256, 256),
88
+ BlazeBlock(256, 256),
89
+ BlazeBlock(256, 256),
90
+ BlazeBlock(256, 256),
91
+ BlazeBlock(256, 256),
92
+ BlazeBlock(256, 256),
93
+ BlazeBlock(256, 256),
94
+ )
95
+
96
+ self.conv_transpose1 = nn.ConvTranspose2d(in_channels=256, out_channels=256, kernel_size=2, stride=2, padding=0, bias=True)
97
+ self.blaze1 = BlazeBlock(256, 256)
98
+
99
+ self.conv_transpose2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2, padding=0, bias=True)
100
+ self.blaze2 = BlazeBlock(128, 128)
101
+
102
+ self.classifier_32 = nn.Conv2d(128, 2, 1, bias=True)
103
+ self.classifier_16 = nn.Conv2d(256, 2, 1, bias=True)
104
+ self.classifier_8 = nn.Conv2d(256, 6, 1, bias=True)
105
+
106
+ self.regressor_32 = nn.Conv2d(128, 36, 1, bias=True)
107
+ self.regressor_16 = nn.Conv2d(256, 36, 1, bias=True)
108
+ self.regressor_8 = nn.Conv2d(256, 108, 1, bias=True)
109
+
110
+ def forward(self, x):
111
+ b = x.shape[0] # batch size, needed for reshaping later
112
+
113
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
114
+
115
+ x = self.backbone1(x) # (b, 128, 32, 32)
116
+ y = self.backbone2(x) # (b, 256, 16, 16)
117
+ z = self.backbone3(y) # (b, 256, 8, 8)
118
+
119
+ y = y + F.relu(self.conv_transpose1(z), True)
120
+ y = self.blaze1(y)
121
+
122
+ x = x + F.relu(self.conv_transpose2(y), True)
123
+ x = self.blaze2(x)
124
+
125
+
126
+ # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
127
+ # permute the output from the conv layers before reshaping it.
128
+
129
+ c1 = self.classifier_8(z) # (b, 6, 8, 8)
130
+ c1 = c1.permute(0, 2, 3, 1) # (b, 8, 8, 6)
131
+ c1 = c1.reshape(b, -1, 1) # (b, 384, 1)
132
+
133
+ c2 = self.classifier_16(y) # (b, 2, 16, 16)
134
+ c2 = c2.permute(0, 2, 3, 1) # (b, 16, 16, 2)
135
+ c2 = c2.reshape(b, -1, 1) # (b, 512, 1)
136
+
137
+ c3 = self.classifier_32(x) # (b, 2, 32, 32)
138
+ c3 = c3.permute(0, 2, 3, 1) # (b, 32, 32, 2)
139
+ c3 = c3.reshape(b, -1, 1) # (b, 2048, 1)
140
+
141
+ c = torch.cat((c3, c2, c1), dim=1) # (b, 2944, 1)
142
+
143
+ r1 = self.regressor_8(z) # (b, 108, 8, 8)
144
+ r1 = r1.permute(0, 2, 3, 1) # (b, 8, 8, 108)
145
+ r1 = r1.reshape(b, -1, 18) # (b, 384, 18)
146
+
147
+ r2 = self.regressor_16(y) # (b, 36, 16, 16)
148
+ r2 = r2.permute(0, 2, 3, 1) # (b, 16, 16, 36)
149
+ r2 = r2.reshape(b, -1, 18) # (b, 512, 18)
150
+
151
+ r3 = self.regressor_32(x) # (b, 36, 32, 32)
152
+ r3 = r3.permute(0, 2, 3, 1) # (b, 32, 32, 36)
153
+ r3 = r3.reshape(b, -1, 18) # (b, 2048, 18)
154
+
155
+ r = torch.cat((r3, r2, r1), dim=1) # (b, 2944, 18)
156
+
157
+ return [r, c]
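A minimal sketch of running this detector head on a dummy input and handing its raw outputs to the BlazeDetector postprocessing inherited from blazebase.py. The anchor path is an assumption (the file ships as python/anchors_palm.npy, and load_anchors is used the same way in python/export_jit.py); with random weights and a blank input the detection list is simply empty, but the raw shapes match the converted QNN model ([1, 2944, 18] and [1, 2944, 1]):

```python
import torch
from blazepalm import BlazePalm

palm = BlazePalm()
palm.load_anchors("anchors_palm.npy")   # assumed path
palm.min_score_thresh = 0.75

x = torch.zeros((1, 3, 256, 256))
with torch.no_grad():
    r, c = palm(x)                      # (1, 2944, 18), (1, 2944, 1)

dets = palm._tensors_to_detections(r, c, palm.anchors)
hands = palm._weighted_non_max_suppression(dets[0])
print(r.shape, c.shape, len(hands))
```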
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py ADDED
@@ -0,0 +1,420 @@
1
+ import numpy as np
2
+ import torch
3
+ import cv2
4
+ import sys
5
+ from blazebase import resize_pad, denormalize_detections
6
+ from visualization import draw_landmarks, draw_roi, HAND_CONNECTIONS
7
+ import time
8
+ import aidlite
9
+ import os
10
+
11
+
12
+ class post_mediapipe_hand:
13
+ def __init__(self):
14
+ self.kp1 = 0
15
+ self.kp2 = 2
16
+ self.theta0 = 1.5707963267948966
17
+ self.dscale = 2.6
18
+ self.dy = -0.5
19
+ self.x_scale = 256.0
20
+ self.y_scale = 256.0
21
+ self.h_scale = 256.0
22
+ self.w_scale = 256.0
23
+ self.num_keypoints = 7
24
+ self.num_classes = 1
25
+ self.num_anchors = 2944
26
+ self.num_coords = 18
27
+ self.min_score_thresh = 0.75
28
+ self.score_clipping_thresh = 100.0
29
+ self.min_suppression_threshold = 0.3
30
+ self.resolution = 256
31
+
32
+
33
+ def detection2roi(self,detection):
34
+ xc = (detection[:,1] + detection[:,3]) / 2
35
+ yc = (detection[:,0] + detection[:,2]) / 2
36
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
37
+ yc += self.dy * scale
38
+ scale *= self.dscale
39
+ # compute box rotation
40
+ x0 = detection[:,4+2*self.kp1]
41
+ y0 = detection[:,4+2*self.kp1+1]
42
+ x1 = detection[:,4+2*self.kp2]
43
+ y1 = detection[:,4+2*self.kp2+1]
44
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
45
+ return xc, yc, scale, theta
46
+
47
+ def _decode_boxes( self,raw_boxes, anchors):
48
+ boxes = torch.zeros_like(raw_boxes)
49
+
50
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
51
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
52
+
53
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
54
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
55
+
56
+ boxes[..., 0] = y_center - h / 2. # ymin
57
+ boxes[..., 1] = x_center - w / 2. # xmin
58
+ boxes[..., 2] = y_center + h / 2. # ymax
59
+ boxes[..., 3] = x_center + w / 2. # xmax
60
+
61
+ for k in range(self.num_keypoints):
62
+ offset = 4 + k*2
63
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
64
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
65
+ boxes[..., offset ] = keypoint_x
66
+ boxes[..., offset + 1] = keypoint_y
67
+ return boxes
68
+
69
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
70
+ assert raw_box_tensor.ndimension() == 3
71
+ assert raw_box_tensor.shape[1] == self.num_anchors
72
+ assert raw_box_tensor.shape[2] == self.num_coords
73
+
74
+ assert raw_score_tensor.ndimension() == 3
75
+ assert raw_score_tensor.shape[1] == self.num_anchors
76
+ assert raw_score_tensor.shape[2] == self.num_classes
77
+
78
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
79
+
80
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
81
+
82
+ thresh = self.score_clipping_thresh
83
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
84
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
85
+
86
+ # Note: we stripped off the last dimension from the scores tensor
87
+ # because there is only has one class. Now we can simply use a mask
88
+ # to filter out the boxes with too low confidence.
89
+ mask = detection_scores >= self.min_score_thresh
90
+
91
+ # Because each image from the batch can have a different number of
92
+ # detections, process them one at a time using a loop.
93
+ output_detections = []
94
+ for i in range(raw_box_tensor.shape[0]):
95
+ boxes = detection_boxes[i, mask[i]]
96
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
97
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
98
+
99
+ return output_detections
100
+
101
+ def extract_roi( self,frame, xc, yc, theta, scale):
102
+ # take points on unit square and transform them according to the roi
103
+ points = torch.tensor([[-1, -1, 1, 1],
104
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
105
+ points = points * scale.view(-1,1,1)/2
106
+ theta = theta.view(-1, 1, 1)
107
+ R = torch.cat((
108
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
109
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
110
+ ), 1)
111
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
112
+ points = R @ points + center
113
+
114
+ # use the points to compute the affine transform that maps
115
+ # these points back to the output square
116
+ res = self.resolution
117
+ points1 = np.array([[0, 0, res-1],
118
+ [0, res-1, 0]], dtype=np.float32).T
119
+ affines = []
120
+ imgs = []
121
+ for i in range(points.shape[0]):
122
+ pts = points[i, :, :3].detach().numpy().T
123
+ M = cv2.getAffineTransform(pts, points1)
124
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
125
+ img = torch.tensor(img, device=scale.device)
126
+ imgs.append(img)
127
+ affine = cv2.invertAffineTransform(M).astype('float32')
128
+ affine = torch.tensor(affine, device=scale.device)
129
+ affines.append(affine)
130
+ if imgs:
131
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
132
+ affines = torch.stack(affines)
133
+ else:
134
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
135
+ affines = torch.zeros((0, 2, 3), device=scale.device)
136
+
137
+ return imgs, affines, points
138
+
139
+ def denormalize_landmarks(self, landmarks, affines):
140
+ landmarks[:,:,:2] *= self.resolution
141
+ for i in range(len(landmarks)):
142
+ landmark, affine = landmarks[i], affines[i]
143
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
144
+ landmarks[i,:,:2] = landmark
145
+ return landmarks
146
+
147
+ def intersect(self,box_a, box_b):
148
+ A = box_a.size(0)
149
+ B = box_b.size(0)
150
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
151
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
152
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
153
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
154
+ inter = torch.clamp((max_xy - min_xy), min=0)
155
+ return inter[:, :, 0] * inter[:, :, 1]
156
+
157
+ def jaccard(self,box_a, box_b):
158
+ inter = self.intersect(box_a, box_b)
159
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
160
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
161
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
162
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
163
+ union = area_a + area_b - inter
164
+ return inter / union # [A,B]
165
+
166
+
167
+ def overlap_similarity(self,box, other_boxes):
168
+ """Computes the IOU between a bounding box and set of other boxes."""
169
+ return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
170
+
171
+ def _weighted_non_max_suppression(self,detections):
172
+ if len(detections) == 0: return []
173
+ output_detections = []
174
+
175
+ # Sort the detections from highest to lowest score.
176
+ remaining = torch.argsort(detections[:, num_coords], descending=True)
177
+
178
+ while len(remaining) > 0:
179
+ detection = detections[remaining[0]]
180
+
181
+ # Compute the overlap between the first box and the other
182
+ # remaining boxes. (Note that the other_boxes also include
183
+ # the first_box.)
184
+ first_box = detection[:4]
185
+ other_boxes = detections[remaining, :4]
186
+ ious = self.overlap_similarity(first_box, other_boxes)
187
+
188
+ # If two detections don't overlap enough, they are considered
189
+ # to be from different faces.
190
+ mask = ious > self.min_suppression_threshold
191
+ overlapping = remaining[mask]
192
+ remaining = remaining[~mask]
193
+
194
+ # Take an average of the coordinates from the overlapping
195
+ # detections, weighted by their confidence scores.
196
+ weighted_detection = detection.clone()
197
+ if len(overlapping) > 1:
198
+ coordinates = detections[overlapping, :num_coords]
199
+ scores = detections[overlapping, num_coords:num_coords+1]
200
+ total_score = scores.sum()
201
+ weighted = (coordinates * scores).sum(dim=0) / total_score
202
+ weighted_detection[:num_coords] = weighted
203
+ weighted_detection[num_coords] = total_score / len(overlapping)
204
+
205
+ output_detections.append(weighted_detection)
206
+
207
+ return output_detections
208
+
209
+ def draw_detections(img, detections, with_keypoints=True):
210
+ if isinstance(detections, torch.Tensor):
211
+ detections = detections.detach().numpy()
212
+
213
+ if detections.ndim == 1:
214
+ detections = np.expand_dims(detections, axis=0)
215
+
216
+ n_keypoints = detections.shape[1] // 2 - 2
217
+
218
+ for i in range(detections.shape[0]):
219
+ ymin = detections[i, 0]
220
+ xmin = detections[i, 1]
221
+ ymax = detections[i, 2]
222
+ xmax = detections[i, 3]
223
+
224
+ start_point = (int(xmin), int(ymin))
225
+ end_point = (int(xmax), int(ymax))
226
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
227
+
228
+ if with_keypoints:
229
+ for k in range(n_keypoints):
230
+ kp_x = int(detections[i, 4 + k*2 ])
231
+ kp_y = int(detections[i, 4 + k*2 + 1])
232
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
233
+ return img
234
+
235
+
236
+
237
+ post_process=post_mediapipe_hand()
238
+
239
+ class handDetectionQnn:
240
+ def __init__(self):
241
+ super().__init__()
242
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handDetctor_w8a8.qnn216.ctx.bin"))
243
+ if self.model is None:
244
+ print("Create model failed !")
245
+ return
246
+
247
+ self.config = aidlite.Config.create_instance()
248
+ if self.config is None:
249
+ print("build_interpretper_from_model_and_config failed !")
250
+ return
251
+
252
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
253
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
254
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
255
+ self.config.is_quantify_model = 1
256
+
257
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
258
+ if self.interpreter is None:
259
+ print("build_interpretper_from_model_and_config failed !")
260
+ return
261
+ input_shapes = [[1,3, 256, 256]]
262
+ output_shapes = [[1, 2944,18],[1,2944,1]]
263
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
264
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
265
+
266
+ if self.interpreter is None:
267
+ print("build_interpretper_from_model_and_config failed !")
268
+ result = self.interpreter.init()
269
+ if result != 0:
270
+ print(f"interpreter init failed !")
271
+ result = self.interpreter.load_model()
272
+ if result != 0:
273
+ print("interpreter load model failed !")
274
+
275
+ print(" model load success!")
276
+
277
+ def __call__(self, input):
278
+ self.interpreter.set_input_tensor(0,input)
279
+ invoke_time=[]
280
+ invoke_nums =10
281
+ for i in range(invoke_nums):
282
+ result = self.interpreter.set_input_tensor(0, input.data)
283
+ if result != 0:
284
+ print("interpreter set_input_tensor() failed")
285
+ t1=time.time()
286
+ result = self.interpreter.invoke()
287
+ cost_time = (time.time()-t1)*1000
288
+ invoke_time.append(cost_time)
289
+
290
+ max_invoke_time = max(invoke_time)
291
+ min_invoke_time = min(invoke_time)
292
+ mean_invoke_time = sum(invoke_time)/invoke_nums
293
+ var_invoketime=np.var(invoke_time)
294
+ print("====================================")
295
+ print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
296
+ print("====================================")
297
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1, 2944,18).copy()
298
+ features_1 = self.interpreter.get_output_tensor(1).reshape(1, 2944,1).copy()
299
+ return features_0,features_1
300
+
301
+
302
+ class handLandmarkQnn:
303
+ def __init__(self):
304
+ super().__init__()
305
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handLandmark_w8a8.qnn216.ctx.bin"))
306
+ if self.model is None:
307
+ print("Create model failed !")
308
+ return
309
+
310
+ self.config = aidlite.Config.create_instance()
311
+ if self.config is None:
312
+ print("build_interpretper_from_model_and_config failed !")
313
+ return
314
+
315
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
316
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
317
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
318
+ self.config.is_quantify_model = 1
319
+
320
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
321
+ if self.interpreter is None:
322
+ print("build_interpretper_from_model_and_config failed !")
323
+ return
324
+ input_shapes = [[1, 3, 256, 256]]
325
+ output_shapes = [[1],[1],[1,21,3]]
326
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
327
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
328
+
329
+ if self.interpreter is None:
330
+ print("build_interpretper_from_model_and_config failed !")
331
+ result = self.interpreter.init()
332
+ if result != 0:
333
+ print(f"interpreter init failed !")
334
+ result = self.interpreter.load_model()
335
+ if result != 0:
336
+ print("interpreter load model failed !")
337
+
338
+ print(" model load success!")
339
+
340
+ def __call__(self, input):
341
+ self.interpreter.set_input_tensor(0,input)
342
+ invoke_time=[]
343
+ invoke_nums =10
344
+ for i in range(invoke_nums):
345
+ result = self.interpreter.set_input_tensor(0, input.data)
346
+ if result != 0:
347
+ print("interpreter set_input_tensor() failed")
348
+ t1=time.time()
349
+ result = self.interpreter.invoke()
350
+ cost_time = (time.time()-t1)*1000
351
+ invoke_time.append(cost_time)
352
+
353
+ max_invoke_time = max(invoke_time)
354
+ min_invoke_time = min(invoke_time)
355
+ mean_invoke_time = sum(invoke_time)/invoke_nums
356
+ var_invoketime=np.var(invoke_time)
357
+ print("====================================")
358
+ print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
359
+ print("====================================")
360
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
361
+ features_1 = self.interpreter.get_output_tensor(2).reshape(1,21,3).copy()
362
+ return features_0,features_1
363
+
364
+
365
+
366
+ anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_palm.npy")), dtype=torch.float32, device='cpu')
367
+ # anchors_np = anchors.cpu().numpy().astype(np.float32)
368
+ # np.save("anchors_float32.npy", anchors_np)
369
+ hand_detc = handDetectionQnn()
370
+ hand_rec = handLandmarkQnn()
371
+
372
+ image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"hand.jpg")
373
+
374
+ frame_ct=0
375
+ image = cv2.imread(image_path)
376
+
377
+ frame = np.ascontiguousarray(image[:,:,::-1])
378
+
379
+ img1, img2, scale, pad = resize_pad(frame)
380
+
381
+ input = (img1 / 255).astype(np.float32)
382
+ input = np.transpose(input, (2, 0, 1))
383
+ input = input[np.newaxis, ...]
384
+ t0 = time.time()
385
+ out = hand_detc(input)
386
+ use_time = round((time.time() - t0) * 1000, 2)
387
+ detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)
388
+
389
+ filtered_detections = []
390
+ num_coords = 18
391
+ for i in range(len(detections)):
392
+ faces = post_process._weighted_non_max_suppression(detections[i])
393
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1))
394
+ filtered_detections.append(faces)
395
+
396
+ face_detections = denormalize_detections(filtered_detections[0], scale, pad)
397
+
398
+ xc, yc, scale, theta = post_process.detection2roi(face_detections)
399
+
400
+ img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
401
+ if box.size()[0]!=0:
402
+ t2 = time.time()
403
+ flags, normalized_landmarks = hand_rec(img.numpy())
404
+
405
+ use_time = round((time.time() - t2) * 1000, 2)
406
+
407
+ landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)
408
+
409
+ for i in range(len(flags)):
410
+ landmark, flag = landmarks[i], flags[i]
411
+ if flag>.4: # 0.5
412
+ draw_landmarks(frame, landmark[:,:2], HAND_CONNECTIONS, size=2)
413
+ else:
414
+ print("not detect palm !")
415
+
416
+ draw_roi(frame, box)
417
+ draw_detections(frame, face_detections)
418
+ cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1])
419
+ hand_detc.interpreter.destory()
420
+ hand_rec.interpreter.destory()
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/export_jit.py ADDED
@@ -0,0 +1,66 @@
 
1
+ import numpy as np
2
+ import torch
3
+ import os
4
+ from typing import Callable, Tuple
5
+ from blazepalm import BlazePalm
6
+ from blazehand_landmark import BlazeHandLandmark
7
+
8
+
9
+ gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10
+ torch.set_grad_enabled(False)
11
+
12
+
13
+
14
+ class HandDetector(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
18
+ anchors: torch.Tensor,
19
+ ):
20
+ super().__init__()
21
+ self.detector = detector
22
+ self.anchors = anchors
23
+
24
+ def forward(self, image):
25
+ return self.detector(image)
26
+
27
+ class HandLandmarkDetector(torch.nn.Module):
28
+ def __init__(
29
+ self,
30
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
31
+ ):
32
+ super().__init__()
33
+ self.detector = detector
34
+
35
+ def forward(self, image):
36
+ return self.detector(image)
37
+
38
+
39
+
40
+
41
+ palm_detector = BlazePalm().to(gpu)
42
+ palm_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazepalm.pth"))
43
+ palm_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy"))
44
+ palm_detector.min_score_thresh = .75
45
+
46
+ num_params = sum(p.numel() for p in palm_detector.parameters() if p.requires_grad)
47
+ print(f'Number of palm_detector parameters: {num_params}')
48
+
49
+ hand_regressor = BlazeHandLandmark().to(gpu)
50
+ hand_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazehand_landmark.pth"))
51
+ num_params = sum(p.numel() for p in hand_regressor.parameters() if p.requires_grad)
52
+ print(f'Number of hand_landmark parameters: {num_params}')
53
+
54
+ hand_detect = HandDetector(palm_detector,palm_detector.anchors)
55
+ hand_regres = HandLandmarkDetector(hand_regressor)
56
+
57
+ hand_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
58
+
59
+ source_model = torch.jit.trace(hand_detect.to("cpu"),hand_d_in)
60
+ source_model.save("m_handDetector.pt")
61
+ print("export hand detect ok!")
62
+
63
+ hand_r_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
64
+ source_model = torch.jit.trace(hand_regres.to("cpu"), hand_r_in)
65
+ source_model.save("m_handLandmark.pt")
66
+ print("export hand landmark ok!")
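A quick way to sanity-check the traced modules before feeding them to the QNN conversion tooling. A sketch only, assuming the two .pt files written above sit in the current directory:

```python
import torch

det = torch.jit.load("m_handDetector.pt")
lmk = torch.jit.load("m_handLandmark.pt")

x = torch.zeros((1, 3, 256, 256))
r, c = det(x)
flag, handed, pts = lmk(x)
print(r.shape, c.shape)       # torch.Size([1, 2944, 18]) torch.Size([1, 2944, 1])
print(flag.shape, pts.shape)  # torch.Size([1]) torch.Size([1, 21, 3])
```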
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/visualization.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import cv2
3
+ import torch
4
+
5
+ def draw_detections(img, detections, with_keypoints=True):
6
+ if isinstance(detections, torch.Tensor):
7
+ detections = detections.cpu().numpy()
8
+
9
+ if detections.ndim == 1:
10
+ detections = np.expand_dims(detections, axis=0)
11
+
12
+ n_keypoints = detections.shape[1] // 2 - 2
13
+
14
+ for i in range(detections.shape[0]):
15
+ ymin = detections[i, 0]
16
+ xmin = detections[i, 1]
17
+ ymax = detections[i, 2]
18
+ xmax = detections[i, 3]
19
+
20
+ start_point = (int(xmin), int(ymin))
21
+ end_point = (int(xmax), int(ymax))
22
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
23
+
24
+ if with_keypoints:
25
+ for k in range(n_keypoints):
26
+ kp_x = int(detections[i, 4 + k*2 ])
27
+ kp_y = int(detections[i, 4 + k*2 + 1])
28
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
29
+ return img
30
+
31
+
32
+ def draw_roi(img, roi):
33
+ for i in range(roi.shape[0]):
34
+ (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i]
35
+ cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2)
36
+ cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2)
37
+ cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2)
38
+ cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2)
39
+
40
+
41
+ def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
42
+ points = points[:,:2]
43
+ for point in points:
44
+ x, y = point
45
+ x, y = int(x), int(y)
46
+ cv2.circle(img, (x, y), size, color, thickness=size)
47
+ for connection in connections:
48
+ x0, y0 = points[connection[0]]
49
+ x1, y1 = points[connection[1]]
50
+ x0, y0 = int(x0), int(y0)
51
+ x1, y1 = int(x1), int(y1)
52
+ cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size)
53
+
54
+
55
+
56
+ # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
57
+ # 8 12 16 20
58
+ # | | | |
59
+ # 7 11 15 19
60
+ # 4 | | | |
61
+ # | 6 10 14 18
62
+ # 3 | | | |
63
+ # | 5---9---13--17
64
+ # 2 \ /
65
+ # \ \ /
66
+ # 1 \ /
67
+ # \ \ /
68
+ # ------0-
69
+ HAND_CONNECTIONS = [
70
+ (0, 1), (1, 2), (2, 3), (3, 4),
71
+ (5, 6), (6, 7), (7, 8),
72
+ (9, 10), (10, 11), (11, 12),
73
+ (13, 14), (14, 15), (15, 16),
74
+ (17, 18), (18, 19), (19, 20),
75
+ (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
76
+ ]
77
+
78
+ POSE_CONNECTIONS = [
79
+ (0,1), (1,2), (2,3), (3,7),
80
+ (0,4), (4,5), (5,6), (6,8),
81
+ (9,10),
82
+ (11,13), (13,15), (15,17), (17,19), (19,15), (15,21),
83
+ (12,14), (14,16), (16,18), (18,20), (20,16), (16,22),
84
+ (11,12), (12,24), (24,23), (23,11)
85
+ ]
86
+
87
+ # Vertex indices can be found in
88
+ # github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
89
+ # Found in github.com/google/mediapipe/python/solutions/face_mesh.py
90
+ FACE_CONNECTIONS = [
91
+ # Lips.
92
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
93
+ (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
94
+ (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
95
+ (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
96
+ (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
97
+ (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
98
+ (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
99
+ (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
100
+ # Left eye.
101
+ (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
102
+ (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
103
+ (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
104
+ (398, 362),
105
+ # Left eyebrow.
106
+ (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
107
+ (293, 334), (334, 296), (296, 336),
108
+ # Right eye.
109
+ (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
110
+ (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
111
+ (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
112
+ (173, 133),
113
+ # Right eyebrow.
114
+ (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
115
+ (105, 66), (66, 107),
116
+ # Face oval.
117
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
118
+ (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
119
+ (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
120
+ (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
121
+ (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
122
+ (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
123
+ (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
124
+ (109, 10)
125
+ ]
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: [1x3x256x256], [1x3x256x256]
4
+ - Number of parameters: 1.76M, 2.01M
5
+ - Model size: 7.11MB, 8.09MB
6
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
7
+
8
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: W8A16
13
+ - Backend: QNN2.16
14
+ - Target Device: FV01 QCS6490
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Download the QNN version that matches the backend above, e.g. to install QNN2.23 AidLite: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # aidlite sdk c++ check
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # aidlite sdk python check
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ #### python
43
+ ```bash
44
+ cd python
45
+ python3 demo_qnn.py
46
+ ```
47
+
48
+ #### c++
49
+ ```bash
50
+ # Loading .npy files requires the cnpy library (run the following from the default terminal directory)
51
+ git clone https://github.com/rogersce/cnpy.git
52
+ cd cnpy
53
+ mkdir build && cd build
54
+ cmake ..
55
+ make
56
+ sudo make install
57
+
58
+ cd mediapipe-hand/model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp
59
+ mkdir build && cd build
60
+ cmake ..
61
+ make
62
+ ./run_test
63
+ ```
64
+
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
1
+ cmake_minimum_required (VERSION 3.5)
2
+ project("run_test")
3
+
4
+ find_package(OpenCV REQUIRED)
5
+ find_library(CNPY_LIB cnpy REQUIRED)
6
+
7
+ message(STATUS "OpenCV Library status:")
8
+ message(STATUS ">version:${OpenCV_VERSION}")
9
+ message(STATUS "Include:${OpenCV_INCLUDE_DIRS}")
10
+
11
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
12
+
13
+ include_directories(
14
+ /usr/local/include
15
+ /usr/include/opencv4
16
+ )
17
+
18
+ link_directories(
19
+ /usr/local/lib/
20
+ )
21
+
22
+ file(GLOB SRC_LISTS
23
+ ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
24
+ )
25
+
26
+ add_executable(run_test ${SRC_LISTS})
27
+
28
+ target_link_libraries(run_test
29
+ aidlite
30
+ ${OpenCV_LIBS}
31
+ pthread
32
+ jsoncpp
33
+ ${CNPY_LIB}
34
+ )
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
3
+ size 47232
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices
32
+ float dy = -0.5f; // vertical offset defined by the model
33
+ float dscale = 2.6f; // scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetector_w8a16.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_w8a16.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw hand keypoints and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // Draw the keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // Draw the connection lines (both endpoints must be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w)
146
+
147
+ // Adjust the scale according to the image width and height
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW and normalize it (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU using only the first 4 coordinates (the box occupies the first 4 values)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // Sort by confidence in descending order
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // Update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // Weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // use the average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices used to estimate rotation
340
+ float dy, float dscale, float theta0
341
+ ) {
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // shift the box center vertically by dy * scale
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // positions of the two keypoints used for the rotation estimate
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // write the outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta should be derived from the keypoint angle computed above;
+ // a fixed value is used here as a temporary workaround.
383
+ // theta[i] = angle; // use the computed rotation angle instead
384
+ theta[i] = -0.8461f;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // also returns the transformed box corner points
398
+ int resolution = 256
399
+ ) {
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // The four unit-square corners transformed into image coordinates (same order as the Python reference)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // Compute the affine transform from the first three corners
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // validate input sizes
483
+ const int num_faces = 1; // assume a single detected hand (identifier kept from the face pipeline)
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional depth value
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // split the affine into its 2x2 rotation/scale part and 2x1 translation
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f (one per hand landmark)
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence score
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch entry
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ result = fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ result = fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " hand ROI(s).\n";
859
+ // a hand was detected; continue processing with boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "no hand detected!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
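The IoU and weighted-blend arithmetic used by weighted_non_max_suppression in run_test.cpp above is easy to spot-check offline. The following minimal NumPy sketch is not part of the repository: it mirrors that math on two hand-picked boxes, and the simplified layout (4 box coordinates plus a score, keypoints omitted) is an illustrative assumption; the real detections carry 18 coordinates plus a score.

    import numpy as np

    def iou(a, b):
        # same min/max intersection-over-union as the C++ IoU() helper
        x1, y1 = max(a[0], b[0]), max(a[1], b[1])
        x2, y2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
        area_a = max(0.0, a[2] - a[0]) * max(0.0, a[3] - a[1])
        area_b = max(0.0, b[2] - b[0]) * max(0.0, b[3] - b[1])
        union = area_a + area_b - inter
        return inter / union if union > 0 else 0.0

    # two overlapping detections: 4 box coords + score (keypoints omitted for brevity)
    d1 = np.array([0.10, 0.10, 0.50, 0.50, 0.9])
    d2 = np.array([0.12, 0.12, 0.52, 0.52, 0.6])
    print(iou(d1, d2))          # ~0.82, well above the 0.3 suppression threshold

    # weighted blend as in weighted_non_max_suppression:
    # coordinates averaged by score, final score is the mean of the overlapping scores
    scores = np.array([d1[-1], d2[-1]])
    coords = np.vstack([d1[:4], d2[:4]])
    blended = (coords * scores[:, None]).sum(axis=0) / scores.sum()
    print(blended, scores.mean())   # [0.108 0.108 0.508 0.508] 0.75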
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
3
+ size 94336
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazehand_landmark.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0be6683b0a2f003a3dd3f38da5d12eee3368828d707a04fda247a9793bcb80
3
+ size 8090697
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazepalm.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f39b855c35b7d31bee1d9fdcdf0a819763bcfd8d59dabfed00d04b0eafd3eba
3
+ size 7088188
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeba4a7e513d23ddd1cd96fae5d22eb620118d0d786e830dc40f8aab149d29d
3
+ size 7460035
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector_w8a16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14348a3ca0562d7cb3e31c46eef90b02cf5bfa5a87e1020e782b466245bf7ecb
3
+ size 3601960
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1789c79df6bb361e045b83c7361053722ffbd9e443507fc1dfd33c0abae82f0
3
+ size 8486422
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark_w8a16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0252aa6a2c6865f78d77d8403804e1570c17d9a81d79d2b760fa81de893dcc61
3
+ size 7039624
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/0000.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB). View file
 
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB). View file
 
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaing the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaing the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 valuse are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calcualted from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only has one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, 17).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
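resize_pad above keeps the aspect ratio by scaling the long side to 256 and padding the short side, returning the scale factor and padding needed to map detections back to the source image. A minimal usage sketch, not part of the repository; the 1280x720 landscape frame size is an arbitrary example:

    import numpy as np
    from blazebase import resize_pad

    frame = np.zeros((720, 1280, 3), dtype=np.uint8)   # dummy landscape frame
    img1, img2, scale, pad = resize_pad(frame)
    print(img1.shape, img2.shape)   # (256, 256, 3) (128, 128, 3)
    print(scale, pad)               # 5.0 (280, 0): scale-back factor and (pad_y, pad_x) in source pixels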
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazehand_landmark.py ADDED
@@ -0,0 +1,115 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeLandmark, BlazeBlock
7
+
8
+ class BlazeHandLandmark(BlazeLandmark):
9
+ """The hand landmark model from MediaPipe.
10
+
11
+ """
12
+ def __init__(self):
13
+ super(BlazeHandLandmark, self).__init__()
14
+
15
+ # size of ROIs used for input
16
+ self.resolution = 256
17
+
18
+ self._define_layers()
19
+
20
+ def _define_layers(self):
21
+ self.backbone1 = nn.Sequential(
22
+ nn.Conv2d(in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=0, bias=True),
23
+ nn.ReLU(inplace=True),
24
+
25
+ BlazeBlock(24, 24, 5),
26
+ BlazeBlock(24, 24, 5),
27
+ BlazeBlock(24, 48, 5, 2),
28
+ )
29
+
30
+ self.backbone2 = nn.Sequential(
31
+ BlazeBlock(48, 48, 5),
32
+ BlazeBlock(48, 48, 5),
33
+ BlazeBlock(48, 96, 5, 2),
34
+ )
35
+
36
+ self.backbone3 = nn.Sequential(
37
+ BlazeBlock(96, 96, 5),
38
+ BlazeBlock(96, 96, 5),
39
+ BlazeBlock(96, 96, 5, 2),
40
+ )
41
+
42
+ self.backbone4 = nn.Sequential(
43
+ BlazeBlock(96, 96, 5),
44
+ BlazeBlock(96, 96, 5),
45
+ BlazeBlock(96, 96, 5, 2),
46
+ )
47
+
48
+ self.blaze5 = BlazeBlock(96, 96, 5)
49
+ self.blaze6 = BlazeBlock(96, 96, 5)
50
+ self.conv7 = nn.Conv2d(96, 48, 1, bias=True)
51
+
52
+ self.backbone8 = nn.Sequential(
53
+ BlazeBlock(48, 48, 5),
54
+ BlazeBlock(48, 48, 5),
55
+ BlazeBlock(48, 48, 5),
56
+ BlazeBlock(48, 48, 5),
57
+ BlazeBlock(48, 96, 5, 2),
58
+ BlazeBlock(96, 96, 5),
59
+ BlazeBlock(96, 96, 5),
60
+ BlazeBlock(96, 96, 5),
61
+ BlazeBlock(96, 96, 5),
62
+ BlazeBlock(96, 288, 5, 2),
63
+ BlazeBlock(288, 288, 5),
64
+ BlazeBlock(288, 288, 5),
65
+ BlazeBlock(288, 288, 5),
66
+ BlazeBlock(288, 288, 5),
67
+ BlazeBlock(288, 288, 5, 2),
68
+ BlazeBlock(288, 288, 5),
69
+ BlazeBlock(288, 288, 5),
70
+ BlazeBlock(288, 288, 5),
71
+ BlazeBlock(288, 288, 5),
72
+ BlazeBlock(288, 288, 5, 2),
73
+ BlazeBlock(288, 288, 5),
74
+ BlazeBlock(288, 288, 5),
75
+ BlazeBlock(288, 288, 5),
76
+ BlazeBlock(288, 288, 5),
77
+ BlazeBlock(288, 288, 5, 2),
78
+ BlazeBlock(288, 288, 5),
79
+ BlazeBlock(288, 288, 5),
80
+ BlazeBlock(288, 288, 5),
81
+ BlazeBlock(288, 288, 5),
82
+ )
83
+
84
+ self.hand_flag = nn.Conv2d(288, 1, 2, bias=True)
85
+ self.handed = nn.Conv2d(288, 1, 2, bias=True)
86
+ self.landmarks = nn.Conv2d(288, 63, 2, bias=True)
87
+
88
+
89
+ def forward(self, x):
90
+ if x.shape[0] == 0:
91
+ return torch.zeros((0,)), torch.zeros((0,)), torch.zeros((0, 21, 3))
92
+
93
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
94
+
95
+ x = self.backbone1(x)
96
+ y = self.backbone2(x)
97
+ z = self.backbone3(y)
98
+ w = self.backbone4(z)
99
+
100
+ z = z + F.interpolate(w, scale_factor=2, mode='bilinear')
101
+ z = self.blaze5(z)
102
+
103
+ y = y + F.interpolate(z, scale_factor=2, mode='bilinear')
104
+ y = self.blaze6(y)
105
+ y = self.conv7(y)
106
+
107
+ x = x + F.interpolate(y, scale_factor=2, mode='bilinear')
108
+
109
+ x = self.backbone8(x)
110
+
111
+ hand_flag = self.hand_flag(x).view(-1).sigmoid()
112
+ handed = self.handed(x).view(-1).sigmoid()
113
+ landmarks = self.landmarks(x).view(-1, 21, 3) / 256
114
+
115
+ return hand_flag, handed, landmarks
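A minimal sketch, not part of the repository, that runs the landmark network on a dummy ROI batch to confirm the output shapes; the weight path is an assumption based on the models/ directory of this repo:

    import torch
    from blazehand_landmark import BlazeHandLandmark

    net = BlazeHandLandmark()
    net.load_weights("../models/blazehand_landmark.pth")   # assumed relative path
    roi = torch.zeros((1, 3, 256, 256))                    # one normalized 256x256 ROI
    with torch.no_grad():
        flag, handed, landmarks = net(roi)
    print(flag.shape, handed.shape, landmarks.shape)       # [1], [1], [1, 21, 3]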
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazepalm.py ADDED
@@ -0,0 +1,157 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeDetector, BlazeBlock
7
+
8
+
9
+ class BlazePalm(BlazeDetector):
10
+ """The palm detection model from MediaPipe. """
11
+ def __init__(self):
12
+ super(BlazePalm, self).__init__()
13
+
14
+ # These are the settings from the MediaPipe example graph
15
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt
16
+ self.num_classes = 1
17
+ self.num_anchors = 2944
18
+ self.num_coords = 18
19
+ self.score_clipping_thresh = 100.0
20
+ self.x_scale = 256.0
21
+ self.y_scale = 256.0
22
+ self.h_scale = 256.0
23
+ self.w_scale = 256.0
24
+ self.min_score_thresh = 0.5
25
+ self.min_suppression_threshold = 0.3
26
+ self.num_keypoints = 7
27
+
28
+ # These settings are for converting detections to ROIs which can then
29
+ # be extracted and feed into the landmark network
30
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
31
+ self.detection2roi_method = 'box'
32
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt
33
+ self.kp1 = 0
34
+ self.kp2 = 2
35
+ self.theta0 = np.pi/2
36
+ self.dscale = 2.6
37
+ self.dy = -0.5
38
+
39
+ self._define_layers()
40
+
41
+ def _define_layers(self):
42
+ self.backbone1 = nn.Sequential(
43
+ nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=0, bias=True),
44
+ nn.ReLU(inplace=True),
45
+
46
+ BlazeBlock(32, 32),
47
+ BlazeBlock(32, 32),
48
+ BlazeBlock(32, 32),
49
+ BlazeBlock(32, 32),
50
+ BlazeBlock(32, 32),
51
+ BlazeBlock(32, 32),
52
+ BlazeBlock(32, 32),
53
+
54
+ BlazeBlock(32, 64, stride=2),
55
+ BlazeBlock(64, 64),
56
+ BlazeBlock(64, 64),
57
+ BlazeBlock(64, 64),
58
+ BlazeBlock(64, 64),
59
+ BlazeBlock(64, 64),
60
+ BlazeBlock(64, 64),
61
+ BlazeBlock(64, 64),
62
+
63
+ BlazeBlock(64, 128, stride=2),
64
+ BlazeBlock(128, 128),
65
+ BlazeBlock(128, 128),
66
+ BlazeBlock(128, 128),
67
+ BlazeBlock(128, 128),
68
+ BlazeBlock(128, 128),
69
+ BlazeBlock(128, 128),
70
+ BlazeBlock(128, 128),
71
+
72
+ )
73
+
74
+ self.backbone2 = nn.Sequential(
75
+ BlazeBlock(128, 256, stride=2),
76
+ BlazeBlock(256, 256),
77
+ BlazeBlock(256, 256),
78
+ BlazeBlock(256, 256),
79
+ BlazeBlock(256, 256),
80
+ BlazeBlock(256, 256),
81
+ BlazeBlock(256, 256),
82
+ BlazeBlock(256, 256),
83
+ )
84
+
85
+ self.backbone3 = nn.Sequential(
86
+ BlazeBlock(256, 256, stride=2),
87
+ BlazeBlock(256, 256),
88
+ BlazeBlock(256, 256),
89
+ BlazeBlock(256, 256),
90
+ BlazeBlock(256, 256),
91
+ BlazeBlock(256, 256),
92
+ BlazeBlock(256, 256),
93
+ BlazeBlock(256, 256),
94
+ )
95
+
96
+ self.conv_transpose1 = nn.ConvTranspose2d(in_channels=256, out_channels=256, kernel_size=2, stride=2, padding=0, bias=True)
97
+ self.blaze1 = BlazeBlock(256, 256)
98
+
99
+ self.conv_transpose2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2, padding=0, bias=True)
100
+ self.blaze2 = BlazeBlock(128, 128)
101
+
102
+ self.classifier_32 = nn.Conv2d(128, 2, 1, bias=True)
103
+ self.classifier_16 = nn.Conv2d(256, 2, 1, bias=True)
104
+ self.classifier_8 = nn.Conv2d(256, 6, 1, bias=True)
105
+
106
+ self.regressor_32 = nn.Conv2d(128, 36, 1, bias=True)
107
+ self.regressor_16 = nn.Conv2d(256, 36, 1, bias=True)
108
+ self.regressor_8 = nn.Conv2d(256, 108, 1, bias=True)
109
+
110
+ def forward(self, x):
111
+ b = x.shape[0] # batch size, needed for reshaping later
112
+
113
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
114
+
115
+ x = self.backbone1(x) # (b, 128, 32, 32)
116
+ y = self.backbone2(x) # (b, 256, 16, 16)
117
+ z = self.backbone3(y) # (b, 256, 8, 8)
118
+
119
+ y = y + F.relu(self.conv_transpose1(z), True)
120
+ y = self.blaze1(y)
121
+
122
+ x = x + F.relu(self.conv_transpose2(y), True)
123
+ x = self.blaze2(x)
124
+
125
+
126
+ # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
127
+ # permute the output from the conv layers before reshaping it.
128
+
129
+ c1 = self.classifier_8(z) # (b, 2, 16, 16)
130
+ c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2)
131
+ c1 = c1.reshape(b, -1, 1) # (b, 512, 1)
132
+
133
+ c2 = self.classifier_16(y) # (b, 6, 8, 8)
134
+ c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6)
135
+ c2 = c2.reshape(b, -1, 1) # (b, 384, 1)
136
+
137
+ c3 = self.classifier_32(x) # (b, 6, 8, 8)
138
+ c3 = c3.permute(0, 2, 3, 1) # (b, 8, 8, 6)
139
+ c3 = c3.reshape(b, -1, 1) # (b, 384, 1)
140
+
141
+ c = torch.cat((c3, c2, c1), dim=1) # (b, 896, 1)
142
+
143
+ r1 = self.regressor_8(z) # (b, 32, 16, 16)
144
+ r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32)
145
+ r1 = r1.reshape(b, -1, 18) # (b, 512, 16)
146
+
147
+ r2 = self.regressor_16(y) # (b, 96, 8, 8)
148
+ r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96)
149
+ r2 = r2.reshape(b, -1, 18) # (b, 384, 16)
150
+
151
+ r3 = self.regressor_32(x) # (b, 96, 8, 8)
152
+ r3 = r3.permute(0, 2, 3, 1) # (b, 8, 8, 96)
153
+ r3 = r3.reshape(b, -1, 18) # (b, 384, 16)
154
+
155
+ r = torch.cat((r3, r2, r1), dim=1) # (b, 896, 16)
156
+
157
+ return [r, c]
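A minimal sketch, not part of the repository, that runs the PyTorch palm detector end to end on a single 256x256 RGB image; the weight and anchor paths are assumptions based on the models/ directory, and the 0.75 threshold mirrors the value used in demo_qnn.py:

    import numpy as np
    from blazepalm import BlazePalm

    net = BlazePalm()
    net.load_weights("../models/blazepalm.pth")        # assumed relative path
    net.load_anchors("../models/anchors_palm.npy")     # 2944 x 4 anchor table
    net.min_score_thresh = 0.75                        # same threshold demo_qnn.py uses

    img = np.zeros((256, 256, 3), dtype=np.uint8)      # placeholder RGB frame, already resized/padded
    detections = net.predict_on_image(img)             # tensor of shape (N, 19): box, 7 keypoints, score
    print(detections.shape)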
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/demo_qnn.py ADDED
@@ -0,0 +1,386 @@
1
+ import numpy as np
2
+ import torch
3
+ import cv2
4
+ import sys
5
+ from blazebase import resize_pad, denormalize_detections
6
+ from visualization import draw_landmarks, draw_roi, HAND_CONNECTIONS
7
+ import time
8
+ import aidlite
9
+ import os
10
+
11
+
12
+ class post_mediapipe_hand:
13
+ def __init__(self):
14
+ self.kp1 = 0
15
+ self.kp2 = 2
16
+ self.theta0 = 1.5707963267948966
17
+ self.dscale = 2.6
18
+ self.dy = -0.5
19
+ self.x_scale = 256.0
20
+ self.y_scale = 256.0
21
+ self.h_scale = 256.0
22
+ self.w_scale = 256.0
23
+ self.num_keypoints = 7
24
+ self.num_classes = 1
25
+ self.num_anchors = 2944
26
+ self.num_coords = 18
27
+ self.min_score_thresh = 0.75
28
+ self.score_clipping_thresh = 100.0
29
+ self.min_suppression_threshold = 0.3
30
+ self.resolution = 256
31
+
32
+
33
+ def detection2roi(self,detection):
34
+ xc = (detection[:,1] + detection[:,3]) / 2
35
+ yc = (detection[:,0] + detection[:,2]) / 2
36
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
37
+ yc += self.dy * scale
38
+ scale *= self.dscale
39
+ # compute box rotation
40
+ x0 = detection[:,4+2*self.kp1]
41
+ y0 = detection[:,4+2*self.kp1+1]
42
+ x1 = detection[:,4+2*self.kp2]
43
+ y1 = detection[:,4+2*self.kp2+1]
44
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
45
+ return xc, yc, scale, theta
46
+
47
+ def _decode_boxes( self,raw_boxes, anchors):
48
+ boxes = torch.zeros_like(raw_boxes)
49
+
50
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
51
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
52
+
53
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
54
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
55
+
56
+ boxes[..., 0] = y_center - h / 2. # ymin
57
+ boxes[..., 1] = x_center - w / 2. # xmin
58
+ boxes[..., 2] = y_center + h / 2. # ymax
59
+ boxes[..., 3] = x_center + w / 2. # xmax
60
+
61
+ for k in range(self.num_keypoints):
62
+ offset = 4 + k*2
63
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
64
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
65
+ boxes[..., offset ] = keypoint_x
66
+ boxes[..., offset + 1] = keypoint_y
67
+ return boxes
68
+
69
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
70
+ assert raw_box_tensor.ndimension() == 3
71
+ assert raw_box_tensor.shape[1] == self.num_anchors
72
+ assert raw_box_tensor.shape[2] == self.num_coords
73
+
74
+ assert raw_score_tensor.ndimension() == 3
75
+ assert raw_score_tensor.shape[1] == self.num_anchors
76
+ assert raw_score_tensor.shape[2] == self.num_classes
77
+
78
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
79
+
80
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
81
+
82
+ thresh = self.score_clipping_thresh
83
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
84
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
85
+
86
+ # Note: we stripped off the last dimension from the scores tensor
87
+ # because there is only one class. Now we can simply use a mask
88
+ # to filter out the boxes with too low confidence.
89
+ mask = detection_scores >= self.min_score_thresh
90
+
91
+ # Because each image from the batch can have a different number of
92
+ # detections, process them one at a time using a loop.
93
+ output_detections = []
94
+ for i in range(raw_box_tensor.shape[0]):
95
+ boxes = detection_boxes[i, mask[i]]
96
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
97
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
98
+
99
+ return output_detections
100
+
101
+ def extract_roi( self,frame, xc, yc, theta, scale):
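+ # Crop a rotated square patch around each ROI via an affine warp; returns the normalized crops, the inverse affine matrices, and the ROI corner points.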
102
+ # take points on unit square and transform them according to the roi
103
+ points = torch.tensor([[-1, -1, 1, 1],
104
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
105
+ points = points * scale.view(-1,1,1)/2
106
+ theta = theta.view(-1, 1, 1)
107
+ R = torch.cat((
108
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
109
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
110
+ ), 1)
111
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
112
+ points = R @ points + center
113
+
114
+ # use the points to compute the affine transform that maps
115
+ # these points back to the output square
116
+ res = self.resolution
117
+ points1 = np.array([[0, 0, res-1],
118
+ [0, res-1, 0]], dtype=np.float32).T
119
+ affines = []
120
+ imgs = []
121
+ for i in range(points.shape[0]):
122
+ pts = points[i, :, :3].detach().numpy().T
123
+ M = cv2.getAffineTransform(pts, points1)
124
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
125
+ img = torch.tensor(img, device=scale.device)
126
+ imgs.append(img)
127
+ affine = cv2.invertAffineTransform(M).astype('float32')
128
+ affine = torch.tensor(affine, device=scale.device)
129
+ affines.append(affine)
130
+ if imgs:
131
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
132
+ affines = torch.stack(affines)
133
+ else:
134
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
135
+ affines = torch.zeros((0, 2, 3), device=scale.device)
136
+
137
+ return imgs, affines, points
138
+
139
+ def denormalize_landmarks(self, landmarks, affines):
140
+ landmarks[:,:,:2] *= self.resolution
141
+ for i in range(len(landmarks)):
142
+ landmark, affine = landmarks[i], affines[i]
143
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
144
+ landmarks[i,:,:2] = landmark
145
+ return landmarks
146
+
147
+ def intersect(self,box_a, box_b):
148
+ A = box_a.size(0)
149
+ B = box_b.size(0)
150
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
151
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
152
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
153
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
154
+ inter = torch.clamp((max_xy - min_xy), min=0)
155
+ return inter[:, :, 0] * inter[:, :, 1]
156
+
157
+ def jaccard(self,box_a, box_b):
158
+ inter = self.intersect(box_a, box_b)
159
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
160
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
161
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
162
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
163
+ union = area_a + area_b - inter
164
+ return inter / union # [A,B]
165
+
166
+
167
+ def overlap_similarity(self,box, other_boxes):
168
+ """Computes the IOU between a bounding box and set of other boxes."""
169
+ return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
170
+
171
+ def _weighted_non_max_suppression(self,detections):
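+ # MediaPipe-style weighted NMS: overlapping detections are merged into a confidence-weighted average instead of being discarded.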
172
+ if len(detections) == 0: return []
173
+ output_detections = []
174
+
175
+ # Sort the detections from highest to lowest score.
176
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
177
+
178
+ while len(remaining) > 0:
179
+ detection = detections[remaining[0]]
180
+
181
+ # Compute the overlap between the first box and the other
182
+ # remaining boxes. (Note that the other_boxes also include
183
+ # the first_box.)
184
+ first_box = detection[:4]
185
+ other_boxes = detections[remaining, :4]
186
+ ious = self.overlap_similarity(first_box, other_boxes)
187
+
188
+ # If two detections don't overlap enough, they are considered
189
+ # to be from different faces.
190
+ mask = ious > self.min_suppression_threshold
191
+ overlapping = remaining[mask]
192
+ remaining = remaining[~mask]
193
+
194
+ # Take an average of the coordinates from the overlapping
195
+ # detections, weighted by their confidence scores.
196
+ weighted_detection = detection.clone()
197
+ if len(overlapping) > 1:
198
+ coordinates = detections[overlapping, :self.num_coords]
199
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
200
+ total_score = scores.sum()
201
+ weighted = (coordinates * scores).sum(dim=0) / total_score
202
+ weighted_detection[:self.num_coords] = weighted
203
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
204
+
205
+ output_detections.append(weighted_detection)
206
+
207
+ return output_detections
208
+
209
+ def draw_detections(self, img, detections, with_keypoints=True):
210
+ if isinstance(detections, torch.Tensor):
211
+ detections = detections.detach().numpy()
212
+
213
+ if detections.ndim == 1:
214
+ detections = np.expand_dims(detections, axis=0)
215
+
216
+ n_keypoints = detections.shape[1] // 2 - 2
217
+
218
+ for i in range(detections.shape[0]):
219
+ ymin = detections[i, 0]
220
+ xmin = detections[i, 1]
221
+ ymax = detections[i, 2]
222
+ xmax = detections[i, 3]
223
+
224
+ start_point = (int(xmin), int(ymin))
225
+ end_point = (int(xmax), int(ymax))
226
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
227
+
228
+ if with_keypoints:
229
+ for k in range(n_keypoints):
230
+ kp_x = int(detections[i, 4 + k*2 ])
231
+ kp_y = int(detections[i, 4 + k*2 + 1])
232
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
233
+ return img
234
+
235
+
236
+
237
+ post_process=post_mediapipe_hand()
238
+
239
+ class handDetectionQnn:
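+ # Palm detector wrapper: loads the QNN context binary with AidLite (DSP backend); input 1x3x256x256, outputs 1x2944x18 box/keypoint regressions and 1x2944x1 scores.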
240
+ def __init__(self):
241
+ super().__init__()
242
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handDetector_w8a16.qnn216.ctx.bin"))
243
+ if self.model is None:
244
+ print("Create model failed !")
245
+ return
246
+
247
+ self.config = aidlite.Config.create_instance()
248
+ if self.config is None:
249
+ print("build_interpretper_from_model_and_config failed !")
250
+ return
251
+
252
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
253
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
254
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
255
+ self.config.is_quantify_model = 1
256
+
257
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
258
+ if self.interpreter is None:
259
+ print("build_interpretper_from_model_and_config failed !")
260
+ return
261
+ input_shapes = [[1,3, 256, 256]]
262
+ output_shapes = [[1, 2944,18],[1,2944,1]]
263
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
264
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
265
+
266
+ if self.interpreter is None:
267
+ print("build_interpretper_from_model_and_config failed !")
268
+ result = self.interpreter.init()
269
+ if result != 0:
270
+ print(f"interpreter init failed !")
271
+ result = self.interpreter.load_model()
272
+ if result != 0:
273
+ print("interpreter load model failed !")
274
+
275
+ print(" model load success!")
276
+
277
+ def __call__(self, input):
278
+ self.interpreter.set_input_tensor(0,input)
279
+ self.interpreter.invoke()
280
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1, 2944,18).copy()
281
+ features_1 = self.interpreter.get_output_tensor(1).reshape(1, 2944,1).copy()
282
+ return features_0,features_1
283
+
284
+
285
+ class handLandmarkQnn:
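+ # Hand landmark wrapper: loads the QNN context binary with AidLite (DSP backend); input 1x3x256x256, outputs a hand-presence flag and 1x21x3 normalized landmarks.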
286
+ def __init__(self):
287
+ super().__init__()
288
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handLandmark_w8a16.qnn216.ctx.bin"))
289
+ if self.model is None:
290
+ print("Create model failed !")
291
+ return
292
+
293
+ self.config = aidlite.Config.create_instance()
294
+ if self.config is None:
295
+ print("build_interpretper_from_model_and_config failed !")
296
+ return
297
+
298
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
299
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
300
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
301
+ self.config.is_quantify_model = 1
302
+
303
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
304
+ if self.interpreter is None:
305
+ print("build_interpretper_from_model_and_config failed !")
306
+ return
307
+ input_shapes = [[1, 3, 256, 256]]
308
+ output_shapes = [[1],[1],[1,21,3]]
309
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
310
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
311
+
312
+ if self.interpreter is None:
313
+ print("build_interpretper_from_model_and_config failed !")
314
+ result = self.interpreter.init()
315
+ if result != 0:
316
+ print(f"interpreter init failed !")
317
+ result = self.interpreter.load_model()
318
+ if result != 0:
319
+ print("interpreter load model failed !")
320
+
321
+ print(" model load success!")
322
+
323
+ def __call__(self, input):
324
+ self.interpreter.set_input_tensor(0,input)
325
+ self.interpreter.invoke()
326
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
327
+ features_1 = self.interpreter.get_output_tensor(2).reshape(1,21,3).copy()
328
+ return features_0,features_1
329
+
330
+
331
+
332
+ anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy")), dtype=torch.float32, device='cpu')
333
+ hand_detc = handDetectionQnn()
334
+ hand_rec = handLandmarkQnn()
335
+
336
+ image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"hand.jpg")
337
+
338
+ frame_ct=0
339
+ image = cv2.imread(image_path)
340
+
341
+ frame = np.ascontiguousarray(image[:,:,::-1])
342
+
343
+ img1, img2, scale, pad = resize_pad(frame)
344
+
345
+ input = (img1 / 255).astype(np.float32)
346
+ input = np.transpose(input, (2, 0, 1))
347
+ input = input[np.newaxis, ...]
348
+ t0 = time.time()
349
+ out = hand_detc(input)
350
+ use_time = round((time.time() - t0) * 1000, 2)
351
+ print(f"face detction inference_time:{use_time} ms")
352
+ detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)
353
+
354
+ filtered_detections = []
355
+ num_coords = 18
356
+ for i in range(len(detections)):
357
+ faces = post_process._weighted_non_max_suppression(detections[i])
358
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1))
359
+ filtered_detections.append(faces)
360
+
361
+ face_detections = denormalize_detections(filtered_detections[0], scale, pad)
362
+
363
+ xc, yc, scale, theta = post_process.detection2roi(face_detections)
364
+
365
+ img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
366
+ if box.size()[0]!=0:
367
+ t2 = time.time()
368
+ flags, normalized_landmarks = hand_rec(img.numpy())
369
+
370
+ use_time = round((time.time() - t2) * 1000, 2)
371
+ print(f"landmark inference_time:{use_time} ms")
372
+
373
+ landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)
374
+
375
+ for i in range(len(flags)):
376
+ landmark, flag = landmarks[i], flags[i]
377
+ if flag>.4: # 0.5
378
+ draw_landmarks(frame, landmark[:,:2], HAND_CONNECTIONS, size=2)
379
+ else:
380
+ print("not detect palm !")
381
+
382
+ draw_roi(frame, box)
383
+ draw_detections(frame, face_detections)
384
+ cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1])
385
+ hand_detc.interpreter.destory()
386
+ hand_rec.interpreter.destory()
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/export_jit.py ADDED
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import torch
3
+ import os
4
+ from typing import Callable, Tuple
5
+ from blazepalm import BlazePalm
6
+ from blazehand_landmark import BlazeHandLandmark
7
+
8
+
9
+ gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10
+ torch.set_grad_enabled(False)
11
+
12
+
13
+
14
+ class HandDetector(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
18
+ anchors: torch.Tensor,
19
+ ):
20
+ super().__init__()
21
+ self.detector = detector
22
+ self.anchors = anchors
23
+
24
+ def forward(self, image):
25
+ return self.detector(image)
26
+
27
+ class HandLandmarkDetector(torch.nn.Module):
28
+ def __init__(
29
+ self,
30
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
31
+ ):
32
+ super().__init__()
33
+ self.detector = detector
34
+
35
+ def forward(self, image):
36
+ return self.detector(image)
37
+
38
+
39
+
40
+
41
+ palm_detector = BlazePalm().to(gpu)
42
+ palm_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazepalm.pth"))
43
+ palm_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy"))
44
+ palm_detector.min_score_thresh = .75
45
+
46
+ num_params = sum(p.numel() for p in palm_detector.parameters() if p.requires_grad)
47
+ print(f'Number of palm_detector parameters: {num_params}')
48
+
49
+ hand_regressor = BlazeHandLandmark().to(gpu)
50
+ hand_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazehand_landmark.pth"))
51
+ num_params = sum(p.numel() for p in hand_regressor.parameters() if p.requires_grad)
52
+ print(f'Number of hand_landmark parameters: {num_params}')
53
+
54
+ hand_detect = HandDetector(palm_detector,palm_detector.anchors)
55
+ hand_regres = HandLandmarkDetector(hand_regressor)
56
+
57
+ hand_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
58
+
59
+ source_model = torch.jit.trace(hand_detect.to("cpu"),hand_d_in)
60
+ source_model.save("m_handDetector.pt")
61
+ print("export hand detect ok!")
62
+
63
+ hand_r_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
64
+ source_model = torch.jit.trace(hand_regres.to("cpu"), hand_r_in)
65
+ source_model.save("m_handLandmark.pt")
66
+ print("export hand landmark ok!")
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/visualization.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import cv2
3
+ import torch
4
+
5
+ def draw_detections(img, detections, with_keypoints=True):
6
+ if isinstance(detections, torch.Tensor):
7
+ detections = detections.cpu().numpy()
8
+
9
+ if detections.ndim == 1:
10
+ detections = np.expand_dims(detections, axis=0)
11
+
12
+ n_keypoints = detections.shape[1] // 2 - 2
13
+
14
+ for i in range(detections.shape[0]):
15
+ ymin = detections[i, 0]
16
+ xmin = detections[i, 1]
17
+ ymax = detections[i, 2]
18
+ xmax = detections[i, 3]
19
+
20
+ start_point = (int(xmin), int(ymin))
21
+ end_point = (int(xmax), int(ymax))
22
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
23
+
24
+ if with_keypoints:
25
+ for k in range(n_keypoints):
26
+ kp_x = int(detections[i, 4 + k*2 ])
27
+ kp_y = int(detections[i, 4 + k*2 + 1])
28
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
29
+ return img
30
+
31
+
32
+ def draw_roi(img, roi):
33
+ for i in range(roi.shape[0]):
34
+ (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i]
35
+ cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2)
36
+ cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2)
37
+ cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2)
38
+ cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2)
39
+
40
+
41
+ def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
42
+ points = points[:,:2]
43
+ for point in points:
44
+ x, y = point
45
+ x, y = int(x), int(y)
46
+ cv2.circle(img, (x, y), size, color, thickness=size)
47
+ for connection in connections:
48
+ x0, y0 = points[connection[0]]
49
+ x1, y1 = points[connection[1]]
50
+ x0, y0 = int(x0), int(y0)
51
+ x1, y1 = int(x1), int(y1)
52
+ cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size)
53
+
54
+
55
+
56
+ # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
57
+ # 8 12 16 20
58
+ # | | | |
59
+ # 7 11 15 19
60
+ # 4 | | | |
61
+ # | 6 10 14 18
62
+ # 3 | | | |
63
+ # | 5---9---13--17
64
+ # 2 \ /
65
+ # \ \ /
66
+ # 1 \ /
67
+ # \ \ /
68
+ # ------0-
69
+ HAND_CONNECTIONS = [
70
+ (0, 1), (1, 2), (2, 3), (3, 4),
71
+ (5, 6), (6, 7), (7, 8),
72
+ (9, 10), (10, 11), (11, 12),
73
+ (13, 14), (14, 15), (15, 16),
74
+ (17, 18), (18, 19), (19, 20),
75
+ (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
76
+ ]
77
+
78
+ POSE_CONNECTIONS = [
79
+ (0,1), (1,2), (2,3), (3,7),
80
+ (0,4), (4,5), (5,6), (6,8),
81
+ (9,10),
82
+ (11,13), (13,15), (15,17), (17,19), (19,15), (15,21),
83
+ (12,14), (14,16), (16,18), (18,20), (20,16), (16,22),
84
+ (11,12), (12,24), (24,23), (23,11)
85
+ ]
86
+
87
+ # Vertex indices can be found in
88
+ # github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
89
+ # Found in github.com/google/mediapipe/python/solutions/face_mesh.py
90
+ FACE_CONNECTIONS = [
91
+ # Lips.
92
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
93
+ (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
94
+ (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
95
+ (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
96
+ (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
97
+ (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
98
+ (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
99
+ (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
100
+ # Left eye.
101
+ (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
102
+ (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
103
+ (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
104
+ (398, 362),
105
+ # Left eyebrow.
106
+ (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
107
+ (293, 334), (334, 296), (296, 336),
108
+ # Right eye.
109
+ (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
110
+ (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
111
+ (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
112
+ (173, 133),
113
+ # Right eyebrow.
114
+ (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
115
+ (105, 66), (66, 107),
116
+ # Face oval.
117
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
118
+ (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
119
+ (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
120
+ (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
121
+ (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
122
+ (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
123
+ (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
124
+ (109, 10)
125
+ ]
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: [1x3x256x256], [1x3x256x256]
4
+ - Number of parameters: 1.76M, 2.01M
5
+ - Model size: 7.11MB, 8.09MB
6
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
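+ (The two values in each item above are for the palm detector and the hand landmark model, respectively.)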
7
+
8
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: FP16
13
+ - Backend: QNN2.16
14
+ - Target Device: SNM972 QCS8550
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Install the QNN version of AidLite that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # Check the AidLite SDK C++ library version
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # Check the AidLite SDK Python library version
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ #### python
43
+ ```bash
44
+ cd python
45
+ python3 demo_qnn.py
46
+ ```
47
+
48
+ #### c++
49
+ ```bash
50
+ # The cnpy library is required for loading .npy files (run the following from the default terminal directory)
51
+ git clone https://github.com/rogersce/cnpy.git
52
+ cd cnpy
53
+ mkdir build && cd build
54
+ cmake ..
55
+ make
56
+ sudo make install
57
+
58
+ cd mediapipe-hand/model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp
59
+ mkdir build && cd build
60
+ cmake ..
61
+ make
62
+ ./run_test
63
+ ```
64
+
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
1
+ cmake_minimum_required (VERSION 3.5)
2
+ project("run_test")
3
+
4
+ find_package(OpenCV REQUIRED)
5
+ find_library(CNPY_LIB cnpy REQUIRED)
6
+
7
+ message(STATUS "oPENCV Library status:")
8
+ message(STATUS ">version:${OpenCV_VERSION}")
9
+ message(STATUS "Include:${OpenCV_INCLUDE_DIRS}")
10
+
11
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
12
+
13
+ include_directories(
14
+ /usr/local/include
15
+ /usr/include/opencv4
16
+ )
17
+
18
+ link_directories(
19
+ /usr/local/lib/
20
+ )
21
+
22
+ file(GLOB SRC_LISTS
23
+ ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
24
+ )
25
+
26
+ add_executable(run_test ${SRC_LISTS})
27
+
28
+ target_link_libraries(run_test
29
+ aidlite
30
+ ${OpenCV_LIBS}
31
+ pthread
32
+ jsoncpp
33
+ ${CNPY_LIB}
34
+ )
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
3
+ size 47232
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices used to estimate rotation
32
+ float dy = -0.5f; // vertical offset, per the model definition
33
+ float dscale = 2.6f; // ROI scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetctor_fp16.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_fp16.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw hand landmark points and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // draw keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // draw connection lines (both endpoints should be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // equivalent to int(256 * h / w)
146
+
147
+ // choose the scale according to the aspect ratio
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW float and normalize (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU using only the first 4 coordinates (the box corners)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
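+ // Weighted NMS: group detections by IoU and replace each group with a confidence-weighted average box (mirrors the Python post-processing).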
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // sort by confidence, descending
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO: currently only the highest-scoring detection is kept (single-hand demo)
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices
340
+ float dy, float dscale, float theta0
341
+ ) {
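+ // Convert detection boxes into rotated square ROIs (center, scale, rotation) for the landmark model.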
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // vertical center offset
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // get the positions of the two keypoints
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // write outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta is hard-coded for the demo image; adjust for real inputs
383
+ // theta[i] = angle; // use the computed angle if needed
384
+ theta[i] = -0.8461;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // 添加返回点坐标
398
+ int resolution = 256
399
+ ) {
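+ // Warp each rotated ROI to a resolution x resolution crop and keep the inverse affine so landmarks can be mapped back to the original image.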
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // the four unit-square corners after rotation/scaling (same order as in the Python code)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // compute the affine transform from the first three points
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // check input sizes
483
+ const int num_faces = 1; // assume a single hand
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // extract the rotation and translation parts of the affine matrix
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f per hand
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence score
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch element
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " faces.\n";
859
+ // hand detected, continue processing boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
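+ // The landmark model returns a hand-presence flag (output index 0) and 21 landmarks
+ // (x, y, z) normalized to the landmark crop (output index 2); the remaining output
+ // index is not read here.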
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "not detect face!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handDetctor_fp16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9773e6e34cd5ff5a6f78602b4229c0f1faa3e938d267f29e97c8fc3cf43a16
3
+ size 4243224
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handLandmark_fp16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e303d20df51ba36ebe46213d6ee39b327b1dc9e52a546f2d12dd81ac4bfc3d7c
3
+ size 6796200
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB).
 
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB).
 
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
3
+ size 94336
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaining the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
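+ # Worked example: a 480x640 (HxW) frame takes the else branch, giving h1 = 192, w1 = 256,
+ # padh = 64, padw = 0; img1 is the padded 256x256 image, img2 its 128x128 resize,
+ # scale = 480/192 = 2.5 and pad = (80, 0) in original-image pixels.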
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaining the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 values are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
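+ # Example: with scale = 2.5 and pad = (80, 0) from resize_pad, a normalized ymin of 0.5
+ # maps back to 0.5 * 2.5 * 256 - 80 = 240 pixels in the original image.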
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
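+ # The landmarks arrive in normalized crop coordinates; multiplying by self.resolution
+ # and applying the stored inverse affine (computed in extract_roi) maps them back onto
+ # the original frame.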
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calculated from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
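+ # Summary: the ROI is a square centered on the detection (box center or keypoint kp1),
+ # sized from the box width or 2x the kp1-kp2 distance, rotated by
+ # atan2(y0 - y1, x0 - x1) - theta0, then shifted by dy*scale along y and scaled by dscale.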
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
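+ # Each tensor in the returned list has shape (num_detections, num_coords + 1): the
+ # decoded box and keypoints followed by the sigmoid confidence as the last column.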
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, 17).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
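+ # Example: box_a = [[0, 0, 1, 1]] and box_b = [[0, 0.5, 1, 1.5]] overlap over an area
+ # of 0.5; each box has area 1, so the returned IoU is 0.5 / 1.5 = 1/3.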
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)