qc903113684 committed
Commit dd398c8 · verified · 1 parent: feb6c7d

Upload 35 files

Files changed (35)
  1. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/README.md +48 -0
  2. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt +32 -0
  3. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/dogs.jpg +0 -0
  4. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp +899 -0
  5. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin +3 -0
  6. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/dogs.jpg +0 -0
  7. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/onnx_export.py +50 -0
  8. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/prompt.py +456 -0
  9. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/run_test.py +224 -0
  10. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/tools_pt.py +372 -0
  11. model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/utils.py +86 -0
  12. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/README.md +48 -0
  13. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/models/cutoff_fastsam_x_fp16.qnn216.ctx.bin +3 -0
  14. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/dogs.jpg +0 -0
  15. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/onnx_export.py +50 -0
  16. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/prompt.py +456 -0
  17. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/run_test.py +224 -0
  18. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/tools_pt.py +372 -0
  19. model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/utils.py +86 -0
  20. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/README.md +48 -0
  21. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin +3 -0
  22. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/dogs.jpg +0 -0
  23. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/onnx_export.py +50 -0
  24. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/prompt.py +456 -0
  25. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/run_test.py +224 -0
  26. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/tools_pt.py +372 -0
  27. model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/utils.py +86 -0
  28. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/README.md +48 -0
  29. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin +3 -0
  30. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/dogs.jpg +0 -0
  31. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/onnx_export.py +50 -0
  32. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/prompt.py +456 -0
  33. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/run_test.py +224 -0
  34. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/tools_pt.py +372 -0
  35. model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/utils.py +86 -0
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/README.md ADDED
@@ -0,0 +1,48 @@
+ ## Model Information
+ ### Source model
+ - Input shape: 640x640
+ - Number of parameters: 68.89M
+ - Model size: 277.39M
+ - Output shape: 1x37x8400, 1x32x160x160
+
+ Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
+
+ ### Converted model
+
+ - Precision: INT8
+ - Backend: QNN2.16
+ - Target Device: FV01 QCS6490
+
+ ## Inference with AidLite SDK
+
+ ### SDK installation
+ Model Farm uses the AidLite SDK as its model inference SDK. For details, see the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/).
+
+ - Install AidLite SDK
+
+ ```bash
+ # Install the appropriate version of the AidLite SDK
+ sudo aid-pkg update
+ sudo aid-pkg install aidlite-sdk
+ # Install the QNN variant that matches the backend above, e.g. for QNN 2.23: sudo aid-pkg install aidlite-qnn223
+ sudo aid-pkg install aidlite-{QNN VERSION}
+ ```
+
+ - Verify AidLite SDK
+
+ ```bash
+ # Check the AidLite SDK C++ library version
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+ # Check the AidLite SDK Python library version
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+ ```
+
+ ### Run demo
+ ```bash
+ cd fastsam_x/model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite
+ export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0
+
+ python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
+ ```
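A C++ version of the same demo ships under cpp/ (see CMakeLists.txt and run_test.cpp below). The build-and-run steps here are a suggested sketch, not part of the committed README: they assume an on-device toolchain with the OpenCV and jsoncpp development packages installed, and the flag names are taken from the argument parser in cpp/run_test.cpp.

```bash
# Build and run the C++ demo (sketch; paths relative to the package root)
cd cpp
mkdir -p build && cd build
cmake .. && make
./run_test --target_model ../../models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin \
           --imgs ../dogs.jpg --invoke_nums 10
# Writes result_with_mask.png in the current directory
```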
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,32 @@
+ cmake_minimum_required (VERSION 3.5)
+ project("run_test")
+
+ find_package(OpenCV REQUIRED)
+
+ message(STATUS "OpenCV library status:")
+ message(STATUS "  version: ${OpenCV_VERSION}")
+ message(STATUS "  include: ${OpenCV_INCLUDE_DIRS}")
+
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+
+ include_directories(
+     /usr/local/include
+     /usr/include/opencv4
+ )
+
+ link_directories(
+     /usr/local/lib/
+ )
+
+ file(GLOB SRC_LISTS
+     ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
+ )
+
+ add_executable(run_test ${SRC_LISTS})
+
+ target_link_libraries(run_test
+     aidlite
+     ${OpenCV_LIBS}
+     pthread
+     jsoncpp
+ )
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/dogs.jpg ADDED
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,899 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <cstring> // for memcpy
10
+ #include <algorithm>
11
+ #include <opencv2/dnn.hpp>
12
+ #include <random>
13
+
14
+ using namespace cv;
15
+ using namespace std;
16
+ using namespace Aidlux::Aidlite;
17
+
18
+
19
+ struct Tensor {
20
+ float* data;
21
+ std::vector<int> shape; // dimension info, e.g. [1, 37, 8400]
22
+ };
23
+
24
+ struct Detection {
25
+ float x1, y1, x2, y2; // bbox
26
+ float conf; // confidence
27
+ int anchor_idx; // anchor index for this detection (newly added field)
28
+ std::vector<float> mask_vec; // 32-dim mask coefficient vector
29
+ };
30
+
31
+ struct Args {
32
+ std::string target_model = "../../models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin";
33
+ std::string imgs = "../dogs.jpg";
34
+ int invoke_nums = 10;
35
+ std::string model_type = "QNN";
36
+ };
37
+
38
+
39
+ Args parse_args(int argc, char* argv[]) {
40
+ Args args;
41
+ for (int i = 1; i < argc; ++i) {
42
+ std::string arg = argv[i];
43
+ if (arg == "--target_model" && i + 1 < argc) {
44
+ args.target_model = argv[++i];
45
+ } else if (arg == "--imgs" && i + 1 < argc) {
46
+ args.imgs = argv[++i];
47
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
48
+ args.invoke_nums = std::stoi(argv[++i]);
49
+ } else if (arg == "--model_type" && i + 1 < argc) {
50
+ args.model_type = argv[++i];
51
+ }
52
+ }
53
+ return args;
54
+ }
55
+
56
+ std::string to_lower(const std::string& str) {
57
+ std::string lower_str = str;
58
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
59
+ return std::tolower(c);
60
+ });
61
+ return lower_str;
62
+ }
63
+
64
+ std::vector<float> convert_to_NCHW(const cv::Mat& float_img) {
65
+ int H = float_img.rows;
66
+ int W = float_img.cols;
67
+
68
+ std::vector<float> nchw_data(1 * 3 * H * W); // 1=batch size
69
+
70
+ // Convert HWC to NCHW (copy channel by channel)
71
+ int channel_size = H * W;
72
+ for (int c = 0; c < 3; ++c) {
73
+ for (int h = 0; h < H; ++h) {
74
+ for (int w = 0; w < W; ++w) {
75
+ float val = float_img.at<cv::Vec3f>(h, w)[c];
76
+ nchw_data[c * channel_size + h * W + w] = val;
77
+ }
78
+ }
79
+ }
80
+
81
+ return nchw_data;
82
+ }
83
+
84
+ int transpose(float* src, unsigned int* src_dims, unsigned int* tsp_dims, float* dest){
85
+
86
+ int current_coordinate[4] = {0, 0, 0, 0};
87
+ for(int a = 0; a < src_dims[0]; ++a){
88
+ current_coordinate[0] = a;
89
+ for(int b = 0; b < src_dims[1]; ++b){
90
+ current_coordinate[1] = b;
91
+ for(int c = 0; c < src_dims[2]; ++c){
92
+ current_coordinate[2] = c;
93
+ for(int d = 0; d < src_dims[3]; ++d){
94
+ current_coordinate[3] = d;
95
+
96
+ int old_index = current_coordinate[0]*src_dims[1]*src_dims[2]*src_dims[3] +
97
+ current_coordinate[1]*src_dims[2]*src_dims[3] +
98
+ current_coordinate[2]*src_dims[3] +
99
+ current_coordinate[3];
100
+
101
+ int new_index = current_coordinate[tsp_dims[0]]*src_dims[tsp_dims[1]]*src_dims[tsp_dims[2]]*src_dims[tsp_dims[3]] +
102
+ current_coordinate[tsp_dims[1]]*src_dims[tsp_dims[2]]*src_dims[tsp_dims[3]] +
103
+ current_coordinate[tsp_dims[2]]*src_dims[tsp_dims[3]] +
104
+ current_coordinate[tsp_dims[3]];
105
+
106
+ dest[new_index] = src[old_index];
107
+ }
108
+ }
109
+ }
110
+ }
111
+
112
+ return EXIT_SUCCESS;
113
+ }
114
+
115
+ // Concatenate three float* tensors along axis=1
116
+ void concat_along_axis1(
117
+ const float* in1, int C1,
118
+ const float* in2, int C2,
119
+ const float* in3, int C3,
120
+ int N, int W, // N=1, W=8400
121
+ float* out // output size: N * (C1+C2+C3) * W
122
+ ) {
123
+ int offset = 0;
124
+ int block = W; // elements per channel = W
125
+
126
+ // copy in1: [N, C1, W]
127
+ for (int c = 0; c < C1; ++c) {
128
+ std::memcpy(out + offset, in1 + c * block, sizeof(float) * block);
129
+ offset += block;
130
+ }
131
+
132
+ // copy in2: [N, C2, W]
133
+ for (int c = 0; c < C2; ++c) {
134
+ std::memcpy(out + offset, in2 + c * block, sizeof(float) * block);
135
+ offset += block;
136
+ }
137
+
138
+ // copy in3: [N, C3, W]
139
+ for (int c = 0; c < C3; ++c) {
140
+ std::memcpy(out + offset, in3 + c * block, sizeof(float) * block);
141
+ offset += block;
142
+ }
143
+ }
144
+
145
+
146
+ // Build qnn_out, the counterpart of the Python output list
147
+ std::vector<Tensor> create_qnn_out(float* output_concat1, float* outdata0) {
148
+ std::vector<Tensor> qnn_out;
149
+
150
+ // First tensor: [1, 37, 8400]
151
+ Tensor t1;
152
+ t1.data = output_concat1;
153
+ t1.shape = {1, 37, 8400};
154
+ qnn_out.push_back(t1);
155
+
156
+ // Second tensor: [1, 32, 160, 160]
157
+ Tensor t2;
158
+ t2.data = outdata0;
159
+ t2.shape = {1, 32, 160, 160};
160
+ qnn_out.push_back(t2);
161
+
162
+ return qnn_out;
163
+ }
164
+
165
+ void xywh2xyxy(const std::vector<std::vector<float>>& boxes_xywh, std::vector<std::vector<float>>& boxes_xyxy) {
166
+ boxes_xyxy.clear();
167
+ for (const auto& box : boxes_xywh) {
168
+ float x = box[0], y = box[1], w = box[2], h = box[3];
169
+ float x1 = x - w / 2.0f;
170
+ float y1 = y - h / 2.0f;
171
+ float x2 = x + w / 2.0f;
172
+ float y2 = y + h / 2.0f;
173
+ boxes_xyxy.push_back({x1, y1, x2, y2});
174
+ }
175
+ }
176
+
177
+ // std::vector<Detection> non_max_suppression_qnn(
178
+ // const Tensor& det_output, // qnn_out[0], shape [1, 84, 8400]
179
+ // float conf_thres,
180
+ // float iou_thres,
181
+ // int max_det,
182
+ // int num_classes,
183
+ // bool agnostic = false,
184
+ // const std::vector<int>& class_filter = {} // 可选类过滤
185
+ // ) {
186
+ // std::vector<Detection> candidates;
187
+
188
+ // int num_outputs = det_output.shape[1]; // 通常是 4 + num_classes
189
+ // int num_anchors = det_output.shape[2]; // 8400
190
+ // num_classes = num_outputs - 5;
191
+ // float* data = det_output.data;
192
+ // // for (int i =0 ; i<20;++i) {
193
+ // // float val = data[i];
194
+ // // std::cout << "det_output data[" << i << "] = " << val << std::endl;
195
+ // // }
196
+ // // std::cout << "shape: ["
197
+ // // << det_output.shape[0] << ", "
198
+ // // << det_output.shape[1] << ", "
199
+ // // << det_output.shape[2] << "]" << std::endl;
200
+ // // std::cout << "num_outputs: " << num_outputs << ", num_anchors: " << num_anchors << std::endl;
201
+ // for (int i = 0; i < num_anchors; ++i) {
202
+ // float x = data[0 * num_outputs * num_anchors + 0 * num_anchors + i];
203
+ // float y = data[0 * num_outputs * num_anchors + 1 * num_anchors + i];
204
+ // float w = data[0 * num_outputs * num_anchors + 2 * num_anchors + i];
205
+ // float h = data[0 * num_outputs * num_anchors + 3 * num_anchors + i];
206
+ // float obj_conf = data[4 * num_anchors + i];
207
+
208
+ // // 置信度最大类别
209
+ // float max_conf = -1.0f;
210
+ // int cls_id = -1;
211
+ // for (int c = 0; c < num_classes; ++c) {
212
+
213
+ // // float class_conf = data[0 * num_outputs * num_anchors + (5 + c) * num_anchors + i];
214
+ // float class_conf = data[(5 + c) * num_anchors + i];
215
+ // float conf = obj_conf * class_conf ;
216
+ // if (conf > max_conf) {
217
+ // max_conf = conf;
218
+ // cls_id = c;
219
+ // }
220
+ // }
221
+ // if (max_conf < conf_thres) continue;
222
+ // if (!class_filter.empty() &&
223
+ // std::find(class_filter.begin(), class_filter.end(), cls_id) == class_filter.end()) {
224
+ // continue;
225
+ // }
226
+
227
+
228
+ // Detection det;
229
+ // det.x1 = x - w / 2.0f;
230
+ // det.y1 = y - h / 2.0f;
231
+ // det.x2 = x + w / 2.0f;
232
+ // det.y2 = y + h / 2.0f;
233
+ // det.conf = max_conf;
234
+ // det.class_id = agnostic ? 0 : cls_id;
235
+ // det.anchor_idx = i;
236
+ // candidates.push_back(det);
237
+ // }
238
+ // // NMS
239
+ // std::vector<cv::Rect> boxes;
240
+ // std::vector<float> scores;
241
+ // for (const auto& d : candidates) {
242
+ // int x = static_cast<int>(d.x1);
243
+ // int y = static_cast<int>(d.y1);
244
+ // int w = static_cast<int>(d.x2 - d.x1);
245
+ // int h = static_cast<int>(d.y2 - d.y1);
246
+ // boxes.emplace_back(cv::Rect(x, y, w, h));
247
+ // scores.push_back(d.conf);
248
+ // }
249
+
250
+ // std::vector<int> keep;
251
+ // cv::dnn::NMSBoxes(boxes, scores, conf_thres, iou_thres, keep, 1.f, max_det);
252
+
253
+ // std::vector<Detection> result;
254
+ // for (int idx : keep) {
255
+ // result.push_back(candidates[idx]);
256
+ // }
257
+
258
+ // return result;
259
+ // }
260
+
261
+
262
+ std::vector<Detection> non_max_suppression_qnn(
263
+ const Tensor& det_output,
264
+ float conf_thres,
265
+ float iou_thres,
266
+ int max_det,
267
+ int mask_dim = 32
268
+ ) {
269
+ std::vector<Detection> candidates;
270
+ int num_outputs = det_output.shape[1]; // 37 = 4 + 1 + 32
271
+ int num_anchors = det_output.shape[2];
272
+ float* data = det_output.data;
273
+
274
+ for (int i = 0; i < num_anchors; ++i) {
275
+ float x = data[0 * num_outputs * num_anchors + 0 * num_anchors + i];
276
+ float y = data[0 * num_outputs * num_anchors + 1 * num_anchors + i];
277
+ float w = data[0 * num_outputs * num_anchors + 2 * num_anchors + i];
278
+ float h = data[0 * num_outputs * num_anchors + 3 * num_anchors + i];
279
+ float conf = data[0 * num_outputs * num_anchors + 4 * num_anchors + i];
280
+
281
+ if (conf < conf_thres)
282
+ continue;
283
+
284
+ Detection det;
285
+ det.x1 = x - w / 2.0f;
286
+ det.y1 = y - h / 2.0f;
287
+ det.x2 = x + w / 2.0f;
288
+ det.y2 = y + h / 2.0f;
289
+ det.conf = conf;
290
+ det.anchor_idx = i;
291
+
292
+ // Extract the 32-dim mask coefficient vector
293
+ det.mask_vec.resize(mask_dim);
294
+ for (int m = 0; m < mask_dim; ++m) {
295
+ det.mask_vec[m] = data[0 * num_outputs * num_anchors + (5 + m) * num_anchors + i];
296
+ }
297
+
298
+ candidates.push_back(det);
299
+ }
300
+
301
+ // OpenCV NMS
302
+ std::vector<cv::Rect> boxes;
303
+ std::vector<float> scores;
304
+ for (const auto& d : candidates) {
305
+ boxes.emplace_back(cv::Rect(cv::Point(d.x1, d.y1), cv::Point(d.x2, d.y2)));
306
+ scores.push_back(d.conf);
307
+ }
308
+
309
+ std::vector<int> keep;
310
+ cv::dnn::NMSBoxes(boxes, scores, conf_thres, iou_thres, keep, 1.f, max_det);
311
+
312
+ std::vector<Detection> result;
313
+ for (int idx : keep) {
314
+ result.push_back(candidates[idx]);
315
+ }
316
+
317
+ return result;
318
+ }
319
+
320
+
321
+
322
+
323
+
324
+
325
+ void adjust_bboxes_to_image_border(std::vector<Detection>& boxes, int image_h, int image_w, int threshold = 20) {
326
+ for (auto& box : boxes) {
327
+ if (box.x1 < threshold) box.x1 = 0;
328
+ if (box.y1 < threshold) box.y1 = 0;
329
+ if (box.x2 > image_w - threshold) box.x2 = image_w;
330
+ if (box.y2 > image_h - threshold) box.y2 = image_h;
331
+ }
332
+ }
333
+
334
+ std::vector<int> bbox_iou(const Detection& box1,
335
+ const std::vector<Detection>& boxes,
336
+ float iou_thres,
337
+ int image_h,
338
+ int image_w) {
339
+ std::vector<Detection> adjusted_boxes = boxes;
340
+ adjust_bboxes_to_image_border(adjusted_boxes, image_h, image_w);
341
+
342
+ std::vector<int> indices;
343
+ float area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
344
+
345
+ for (size_t i = 0; i < adjusted_boxes.size(); ++i) {
346
+ const auto& box2 = adjusted_boxes[i];
347
+
348
+ float inter_x1 = std::max(box1.x1, box2.x1);
349
+ float inter_y1 = std::max(box1.y1, box2.y1);
350
+ float inter_x2 = std::min(box1.x2, box2.x2);
351
+ float inter_y2 = std::min(box1.y2, box2.y2);
352
+
353
+ float inter_w = std::max(0.0f, inter_x2 - inter_x1);
354
+ float inter_h = std::max(0.0f, inter_y2 - inter_y1);
355
+ float inter_area = inter_w * inter_h;
356
+
357
+ float area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
358
+ float union_area = area1 + area2 - inter_area;
359
+
360
+ float iou = union_area > 0 ? inter_area / union_area : 0.0f;
361
+
362
+ if (iou > iou_thres) {
363
+ indices.push_back(i);
364
+ }
365
+ }
366
+
367
+ return indices;
368
+ }
369
+
370
+ void inject_full_box(std::vector<Detection>& detections, int image_h, int image_w, float iou_thres = 0.9f) {
371
+ if (detections.empty()) {
372
+ std::cout << "No object detected." << std::endl;
373
+ return;
374
+ }
375
+
376
+ // Build a full-image box (x1=0, y1=0, x2=w, y2=h, conf=1.0)
377
+ Detection full_box;
378
+ full_box.x1 = 0.0f;
379
+ full_box.y1 = 0.0f;
380
+ full_box.x2 = static_cast<float>(image_w);
381
+ full_box.y2 = static_cast<float>(image_h);
382
+ full_box.conf = 1.0f;
383
+ // full_box.class_id = -1; // undefined class, customize as needed
384
+
385
+ // Find detections with high IoU against the full-image box
386
+ std::vector<int> matched = bbox_iou(full_box, detections, iou_thres, image_h, image_w);
387
+ if (!matched.empty()) {
388
+ for (int idx : matched) {
389
+ detections[idx] = full_box;
390
+ }
391
+ }
392
+ }
393
+
394
+ // ----------- clip_boxes -----------
395
+ void clip_boxes(std::vector<Detection>& boxes, const cv::Size& shape) {
396
+ int width = shape.width;
397
+ int height = shape.height;
398
+ for (auto& box : boxes) {
399
+ box.x1 = std::max(0.f, std::min(box.x1, static_cast<float>(width)));
400
+ box.y1 = std::max(0.f, std::min(box.y1, static_cast<float>(height)));
401
+ box.x2 = std::max(0.f, std::min(box.x2, static_cast<float>(width)));
402
+ box.y2 = std::max(0.f, std::min(box.y2, static_cast<float>(height)));
403
+ }
404
+ }
405
+
406
+ // ----------- scale_boxes -----------
407
+ // Input: inference image size img1_shape (h, w) and detection boxes (vector<Detection>)
+ // Output: box coordinates mapped into the original image coordinate system img0_shape (h0, w0)
+ // Approximate port of the Python version; modifies boxes in place
410
+ void scale_boxes(const cv::Size& img1_shape, std::vector<Detection>& boxes, const cv::Size& img0_shape) {
411
+ // Compute the scaling ratio and padding
412
+ float gain = std::min(img1_shape.height / (float)img0_shape.height, img1_shape.width / (float)img0_shape.width);
413
+ float pad_w = (img1_shape.width - img0_shape.width * gain) / 2.0f;
414
+ float pad_h = (img1_shape.height - img0_shape.height * gain) / 2.0f;
415
+
416
+ // Adjust the boxes
417
+ for (auto& box : boxes) {
418
+ box.x1 = (box.x1 - pad_w) / gain;
419
+ box.x2 = (box.x2 - pad_w) / gain;
420
+ box.y1 = (box.y1 - pad_h) / gain;
421
+ box.y2 = (box.y2 - pad_h) / gain;
422
+ }
423
+
424
+ // Clip box coordinates to the image
425
+ clip_boxes(boxes, img0_shape);
426
+ }
427
+
428
+
429
+ // ----------- crop_masks -----------
430
+ // masks: vector<cv::Mat>, each a single-channel float mask of size HxW
+ // boxes: corresponding detection boxes
+ // Returns each mask cropped to its bounding box
433
+ std::vector<cv::Mat> crop_masks(const std::vector<cv::Mat>& masks, const std::vector<Detection>& boxes) {
434
+ assert(masks.size() == boxes.size());
435
+ if (masks.empty()) return {};
436
+
437
+ int H = masks[0].rows;
438
+ int W = masks[0].cols;
439
+
440
+ // Build the r, c coordinate matrices
441
+ cv::Mat r(1, W, CV_32F);
442
+ for (int i = 0; i < W; ++i) r.at<float>(0, i) = float(i);
443
+ cv::Mat c(H, 1, CV_32F);
444
+ for (int i = 0; i < H; ++i) c.at<float>(i, 0) = float(i);
445
+
446
+ std::vector<cv::Mat> cropped_masks;
447
+ cropped_masks.reserve(masks.size());
448
+
449
+ for (size_t i = 0; i < masks.size(); ++i) {
450
+ const cv::Mat& mask = masks[i];
451
+ const Detection& box = boxes[i];
452
+
453
+ cv::Mat r_mat, c_mat;
454
+ cv::repeat(r, H, 1, r_mat); // (H, W)
455
+ cv::repeat(c, 1, W, c_mat); // (H, W)
456
+
457
+ cv::Mat mask_r1, mask_r2, mask_c1, mask_c2;
458
+ cv::compare(r_mat, box.x1, mask_r1, cv::CMP_GE);
459
+ cv::compare(r_mat, box.x2, mask_r2, cv::CMP_LT);
460
+ cv::compare(c_mat, box.y1, mask_c1, cv::CMP_GE);
461
+ cv::compare(c_mat, box.y2, mask_c2, cv::CMP_LT);
462
+
463
+ cv::Mat region_mask = (mask_r1 & mask_r2 & mask_c1 & mask_c2);
464
+ region_mask.convertTo(region_mask, CV_32F, 1.0 / 255.0);
465
+
466
+ cv::Mat cropped = mask.mul(region_mask);
467
+ cropped_masks.push_back(cropped);
468
+ }
469
+ return cropped_masks;
470
+ }
471
+
472
+ // ----------- process_mask_native -----------
473
+ // protos: Tensor c x mh x mw
474
+ // masks_in: per-detection mask coefficients (N x mask_channels)
+ // bboxes: vector<Detection>
+ // shape: original image (h, w)
+ // Computes the masks as sigmoid(masks_in @ protos), then resize -> crop -> threshold
+ // cv::Mat matrix multiplication is used as the approximation here
479
+ std::vector<cv::Mat> process_mask_native(const Tensor& protos,
480
+ const std::vector<cv::Mat>& masks_in,
481
+ const std::vector<Detection>& bboxes,
482
+ const cv::Size& shape)
483
+ {
484
+ int c = protos.shape[1]; // number of proto channels
485
+ int mh = protos.shape[2];
486
+ int mw = protos.shape[3];
487
+ int N = static_cast<int>(masks_in.size());
488
+
489
+ // Wrap protos.data as a cv::Mat of shape (c, mh*mw)
490
+ cv::Mat proto_mat(c, mh * mw, CV_32F, protos.data);
491
+
492
+ std::vector<cv::Mat> results;
493
+ results.reserve(N);
494
+
495
+ for (int i = 0; i < N; ++i) {
496
+ // masks_in[i] is one coefficient vector, expected to be a 1 x c cv::Mat
+ // Matrix multiply: masks_in[i] (1 x c) * proto_mat (c x mh*mw) => 1 x (mh*mw)
498
+ std::cout << "proto_mat size: " << proto_mat.rows << " x " << proto_mat.cols << std::endl;
499
+ std::cout << "masks_in[" << i << "] size: " << masks_in[i].rows << " x " << masks_in[i].cols << std::endl;
500
+ cv::Mat mask_flat = masks_in[i] * proto_mat; // 1 x (mh*mw)
501
+
502
+ mask_flat = mask_flat.reshape(1, mh); // reshape to (mh, mw)
503
+
504
+ // sigmoid
505
+ cv::Mat mask_sigmoid;
506
+ cv::exp(-mask_flat, mask_sigmoid);
507
+ mask_sigmoid = 1.0 / (1.0 + mask_sigmoid);
508
+
509
+ // Resize to the target size shape (h, w)
510
+ cv::Mat mask_resized;
511
+ cv::resize(mask_sigmoid, mask_resized, shape, 0, 0, cv::INTER_LINEAR);
512
+
513
+ // Crop the mask to its box
514
+ std::vector<cv::Mat> temp_vec = { mask_resized };
515
+ auto mask_cropped = crop_masks(temp_vec, std::vector<Detection>{bboxes[i]});
516
+ assert(mask_cropped.size() == 1);
517
+
518
+ // threshold 0.5
519
+ cv::Mat mask_bin;
520
+ cv::threshold(mask_cropped[0], mask_bin, 0.5, 1.0, cv::THRESH_BINARY);
521
+
522
+ results.push_back(mask_bin);
523
+ }
524
+ return results;
525
+ }
526
+
527
+ // ----------- process_mask -----------
528
+ // Like process_mask_native, but rescales the bboxes and optionally upsamples the masks
+ // masks_in: N x mask_channels (corresponding to pred[:, 6:])
+ // bboxes are the already-scaled boxes; image_shape is the original image size
531
+ std::vector<cv::Mat> process_mask(const Tensor& protos,
532
+ const std::vector<cv::Mat>& masks_in,
533
+ std::vector<Detection> bboxes,
534
+ const cv::Size& image_shape,
535
+ bool upsample = true)
536
+ {
537
+ int c = protos.shape[1];
538
+ int mh = protos.shape[2];
539
+ int mw = protos.shape[3];
540
+ int N = static_cast<int>(masks_in.size());
541
+
542
+ // Wrap protos.data as a cv::Mat of shape (c, mh*mw)
543
+ cv::Mat proto_mat(c, mh * mw, CV_32F, protos.data);
544
+
545
+ // Rescale the bboxes to the downsampled mask size
546
+ for (auto& box : bboxes) {
547
+ box.x1 *= (float)mw / image_shape.width;
548
+ box.x2 *= (float)mw / image_shape.width;
549
+ box.y1 *= (float)mh / image_shape.height;
550
+ box.y2 *= (float)mh / image_shape.height;
551
+ }
552
+
553
+ std::vector<cv::Mat> results;
554
+ results.reserve(N);
555
+
556
+ for (int i = 0; i < N; ++i) {
557
+ // Matrix multiply with the prototypes
558
+ cv::Mat mask_flat = masks_in[i] * proto_mat;
559
+ mask_flat = mask_flat.reshape(1, mh);
560
+
561
+ // sigmoid
562
+ cv::Mat mask_sigmoid;
563
+ cv::exp(-mask_flat, mask_sigmoid);
564
+ mask_sigmoid = 1.0 / (1.0 + mask_sigmoid);
565
+
566
+ // Crop the mask to its box
567
+ std::vector<cv::Mat> temp_vec = { mask_sigmoid };
568
+ auto mask_cropped = crop_masks(temp_vec, std::vector<Detection>{bboxes[i]});
569
+ assert(mask_cropped.size() == 1);
570
+ cv::Mat mask = mask_cropped[0];
571
+
572
+ // Upsample back to the original image size
573
+ if (upsample) {
574
+ cv::Mat mask_upsampled;
575
+ cv::resize(mask, mask_upsampled, image_shape, 0, 0, cv::INTER_LINEAR);
576
+ mask = mask_upsampled;
577
+ }
578
+
579
+ // threshold
580
+ cv::Mat mask_bin;
581
+ cv::threshold(mask, mask_bin, 0.5, 1.0, cv::THRESH_BINARY);
582
+
583
+ results.push_back(mask_bin);
584
+ }
585
+ return results;
586
+ }
587
+
588
+ cv::Mat plot_to_result(const cv::Mat& image,
589
+ const std::vector<cv::Mat>& masks,
590
+ bool mask_random_color = true,
591
+ bool withContours = true,
592
+ bool retina = false) {
593
+ // Color conversion
594
+ cv::Mat rgb_img;
595
+ cv::cvtColor(image, rgb_img, cv::COLOR_BGR2RGB);
596
+ int original_h = rgb_img.rows;
597
+ int original_w = rgb_img.cols;
598
+
599
+ // Prepare an RGBA canvas for visualization
600
+ cv::Mat rgba(rgb_img.size(), CV_8UC4);
601
+ for (int y = 0; y < rgb_img.rows; ++y) {
602
+ for (int x = 0; x < rgb_img.cols; ++x) {
603
+ cv::Vec3b pix = rgb_img.at<cv::Vec3b>(y, x);
604
+ rgba.at<cv::Vec4b>(y, x) = cv::Vec4b(pix[0], pix[1], pix[2], 255); // BGR -> BGRA
605
+ }
606
+ }
607
+
608
+ // Overlay each mask with a semi-transparent color
609
+ for (size_t i = 0; i < masks.size(); ++i) {
610
+ cv::Mat mask = masks[i];
611
+ if (mask.type() != CV_8UC1)
612
+ mask.convertTo(mask, CV_8UC1);
613
+
614
+ // Morphological cleanup
615
+ cv::morphologyEx(mask, mask, cv::MORPH_CLOSE, cv::Mat::ones(3, 3, CV_8U));
616
+ cv::morphologyEx(mask, mask, cv::MORPH_OPEN, cv::Mat::ones(8, 8, CV_8U));
617
+
618
+ if (!retina) {
619
+ cv::resize(mask, mask, cv::Size(original_w, original_h), 0, 0, cv::INTER_NEAREST);
620
+ }
621
+
622
+ // Pick a color
623
+ cv::Scalar color;
624
+ if (mask_random_color) {
625
+ std::mt19937 rng(i + 42);
626
+ color = cv::Scalar(rng() % 255, rng() % 255, rng() % 255); // BGR
627
+ } else {
628
+ color = cv::Scalar(255, 144, 30); // blue tone
629
+ }
630
+
631
+ // Blend into the RGBA image
632
+ for (int y = 0; y < mask.rows; ++y) {
633
+ for (int x = 0; x < mask.cols; ++x) {
634
+ if (mask.at<uchar>(y, x) > 0) {
635
+ cv::Vec4b& pix = rgba.at<cv::Vec4b>(y, x);
636
+ for (int c = 0; c < 3; ++c) {
637
+ pix[c] = static_cast<uchar>(0.6 * color[c] + 0.4 * pix[c]); // blend the colors
638
+ }
639
+ pix[3] = 255;
640
+ }
641
+ }
642
+ }
643
+ }
644
+
645
+ // Draw mask contours
646
+ if (withContours) {
647
+ for (const auto& mask : masks) {
648
+ cv::Mat bin;
649
+ mask.convertTo(bin, CV_8UC1);
650
+ if (!retina)
651
+ cv::resize(bin, bin, cv::Size(original_w, original_h), 0, 0, cv::INTER_NEAREST);
652
+
653
+ std::vector<std::vector<cv::Point>> contours;
654
+ cv::findContours(bin, contours, cv::RETR_TREE, cv::CHAIN_APPROX_SIMPLE);
655
+ cv::drawContours(rgba, contours, -1, cv::Scalar(255, 255, 255, 255), 2);
656
+ }
657
+ }
658
+
659
+ // Convert to a 3-channel BGR output
660
+ cv::Mat result;
661
+ cv::cvtColor(rgba, result, cv::COLOR_RGBA2BGR);
662
+ return result;
663
+ }
664
+
665
+
666
+
667
+
668
+
669
+
670
+ int invoke(const Args& args) {
671
+ std::cout << "Start main ... ... Model Path: " << args.target_model << "\n"
672
+ << "Image Path: " << args.imgs << "\n"
673
+ << "Inference Nums: " << args.invoke_nums << "\n"
674
+ << "Model Type: " << args.model_type << "\n";
675
+ Model* model = Model::create_instance(args.target_model);
676
+ if(model == nullptr){
677
+ printf("Create model failed !\n");
678
+ return EXIT_FAILURE;
679
+ }
680
+ Config* config = Config::create_instance();
681
+ if(config == nullptr){
682
+ printf("Create config failed !\n");
683
+ return EXIT_FAILURE;
684
+ }
685
+ config->implement_type = ImplementType::TYPE_LOCAL;
686
+ std::string model_type_lower = to_lower(args.model_type);
687
+ if (model_type_lower == "qnn"){
688
+ config->framework_type = FrameworkType::TYPE_QNN;
689
+ } else if (model_type_lower == "snpe2" || model_type_lower == "snpe") {
690
+ config->framework_type = FrameworkType::TYPE_SNPE2;
691
+ }
692
+ config->accelerate_type = AccelerateType::TYPE_DSP;
693
+ config->is_quantify_model = 1;
694
+
695
+ unsigned int model_h = 640;
696
+ unsigned int model_w = 640;
697
+ std::vector<std::vector<uint32_t>> input_shapes = {{1,model_h,model_w,3}};
698
+ std::vector<std::vector<uint32_t>> output_shapes = {{1,1,8400},{1,4,8400},{1,32,8400},{1,160,160,32}};
699
+ model->set_model_properties(input_shapes, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
700
+ std::unique_ptr<Interpreter> fast_interpreter = InterpreterBuilder::build_interpretper_from_model_and_config(model, config);
701
+ if(fast_interpreter == nullptr){
702
+ printf("build_interpretper_from_model_and_config failed !\n");
703
+ return EXIT_FAILURE;
704
+ }
705
+ int result = fast_interpreter->init();
706
+ if(result != EXIT_SUCCESS){
707
+ printf("interpreter->init() failed !\n");
708
+ return EXIT_FAILURE;
709
+ }
710
+ // load model
711
+ result = fast_interpreter->load_model();
712
+ if(result != EXIT_SUCCESS){
713
+ printf("interpreter->load_model() failed !\n");
714
+ return EXIT_FAILURE;
715
+ }
716
+ printf("detect model load success!\n");
717
+
718
+ cv::Mat frame = cv::imread(args.imgs);
719
+ if (frame.empty()) {
720
+ printf("detect image load failed!\n");
721
+ return 1;
722
+ }
723
+ printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
724
+ cv::Mat input_data;
725
+ cv::Mat frame_clone = frame.clone();
726
+ cv::cvtColor(frame_clone, frame_clone, cv::COLOR_BGR2RGB);
727
+ cv::resize(frame_clone, frame_clone, cv::Size(model_w, model_h));
728
+ cv::Mat float_img;
729
+ frame_clone.convertTo(float_img, CV_32FC3, 1.0 / 255.0);
730
+ std::vector<float> input_tensor = convert_to_NCHW(float_img);
731
+
732
+
733
+ float *outdata0 = nullptr;
734
+ float *outdata1 = nullptr;
735
+ float *outdata2 = nullptr;
736
+ float *outdata3 = nullptr;
737
+ std::vector<float> invoke_time;
738
+ for (int i = 0; i < args.invoke_nums; ++i) {
739
+ result = fast_interpreter->set_input_tensor(0, input_tensor.data());
740
+ if(result != EXIT_SUCCESS){
741
+ printf("interpreter->set_input_tensor() failed !\n");
742
+ return EXIT_FAILURE;
743
+ }
744
+ auto t1 = std::chrono::high_resolution_clock::now();
745
+ result = fast_interpreter->invoke();
746
+ auto t2 = std::chrono::high_resolution_clock::now();
747
+ std::chrono::duration<double> cost_time = t2 - t1;
748
+ invoke_time.push_back(cost_time.count() * 1000);
749
+ if(result != EXIT_SUCCESS){
750
+ printf("interpreter->invoke() failed !\n");
751
+ return EXIT_FAILURE;
752
+ }
753
+ // [1,160,160,32]
754
+ uint32_t out_data_0 = 0;
755
+ result = fast_interpreter->get_output_tensor(0, (void**)&outdata0, &out_data_0);
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter->get_output_tensor() 1 failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+
761
+ // [1,32,8400]
762
+ uint32_t out_data_1 = 0;
763
+ result = fast_interpreter->get_output_tensor(1, (void**)&outdata1, &out_data_1);
764
+ if(result != EXIT_SUCCESS){
765
+ printf("interpreter->get_output_tensor() 1 failed !\n");
766
+ return EXIT_FAILURE;
767
+ }
768
+
769
+ // [1,1,8400]
770
+ uint32_t out_data_2 = 0;
771
+ result = fast_interpreter->get_output_tensor(2, (void**)&outdata2, &out_data_2);
772
+ if(result != EXIT_SUCCESS){
773
+ printf("interpreter->get_output_tensor() 1 failed !\n");
774
+ return EXIT_FAILURE;
775
+ }
776
+
777
+ // [1,4,8400]
778
+ uint32_t out_data_3 = 0;
779
+ result = fast_interpreter->get_output_tensor(3, (void**)&outdata3, &out_data_3);
780
+ if(result != EXIT_SUCCESS){
781
+ printf("interpreter->get_output_tensor() 1 failed !\n");
782
+ return EXIT_FAILURE;
783
+ }
784
+
785
+ }
786
+
787
+ float max_invoke_time = *std::max_element(invoke_time.begin(), invoke_time.end());
788
+ float min_invoke_time = *std::min_element(invoke_time.begin(), invoke_time.end());
789
+ float mean_invoke_time = std::accumulate(invoke_time.begin(), invoke_time.end(), 0.0f) / args.invoke_nums;
790
+ float var_invoketime = 0.0f;
791
+ for (auto time : invoke_time) {
792
+ var_invoketime += (time - mean_invoke_time) * (time - mean_invoke_time);
793
+ }
794
+ var_invoketime /= args.invoke_nums;
795
+ printf("=======================================\n");
796
+ printf("QNN inference %d times :\n --mean_invoke_time is %f \n --max_invoke_time is %f \n --min_invoke_time is %f \n --var_invoketime is %f\n",
797
+ args.invoke_nums, mean_invoke_time, max_invoke_time, min_invoke_time, var_invoketime);
798
+ printf("=======================================\n");
799
+
800
+ // post process
801
+ // outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
802
+
803
+ unsigned int src_dims[4] = {1, 160,160,32};
804
+ unsigned int tsp_dims[4] = {0,3,1,2};
805
+ unsigned int stride_data_num = 1*160*160*32;
806
+ float* format_data = new float[stride_data_num];
807
+ transpose(outdata0, src_dims, tsp_dims, format_data);
808
+ // cv::Mat proto_buffer(32,160*160, CV_32F, format_data);
809
+ // std::cout << "proto_buffer dims: " << proto_buffer.rows << "x" << proto_buffer.cols << std::endl;
810
+
811
+ for (int i = 0; i < 8400; ++i) {
812
+ outdata2[i] = 1.0f / (1.0f + std::exp(-outdata2[i]));
813
+ }
814
+
815
+ const int N = 1;
816
+ const int W = 8400;
817
+ const int C1 = 4, C2 = 1, C3 = 32;
818
+ const int total_C = C1 + C2 + C3;
819
+ // Allocate the concatenated output buffer
820
+ float output_concat1[total_C * W]; // [1, 37, 8400]
821
+ concat_along_axis1(outdata3, C1, outdata2, C2, outdata1, C3, N, W, output_concat1);
822
+
823
+ std::vector<Tensor> qnn_out = create_qnn_out(output_concat1, format_data);
824
+
825
+ float conf_thres = 0.25f;
826
+ float iou_thres = 0.45f;
827
+ int max_det = 300;
828
+ int num_classes = 1;
829
+ bool agnostic = false;
830
+
831
+ int total_channels = qnn_out[0].shape[1]; // 37
832
+ int mask_dim = 32; // models exported from YOLOv8 typically use mask_dim = 32
833
+ num_classes = total_channels - 5 - mask_dim;
834
+ std::cout << "num_classes = " << num_classes << ", mask_dim = " << mask_dim << std::endl;
835
+
836
+ std::vector<Detection> dets = non_max_suppression_qnn(qnn_out[0], conf_thres, iou_thres, max_det, mask_dim);
837
+ std::cout << "dets size: " << dets.size() << std::endl;
838
+ inject_full_box(dets, 640, 640, 0.9f);
839
+
840
+ std::vector<cv::Mat> orig_imgs;
841
+ orig_imgs.push_back(frame.clone());
842
+
843
+ const Tensor& proto_tensor = qnn_out.back();
844
+ size_t batch_size = dets.size();
845
+ std::vector<std::vector<cv::Mat>> results(batch_size);
846
+ // scale_boxes maps the detection boxes back to the original image
847
+ std::cout << "infer_img_shape: " << proto_tensor.shape[0] << "x" << proto_tensor.shape[1] << "x" << proto_tensor.shape[3] << " x " << proto_tensor.shape[2] << std::endl;
848
+ cv::Size infer_img_shape(proto_tensor.shape[3], proto_tensor.shape[2]); // [1,c,h,w]
849
+ scale_boxes(infer_img_shape, dets, orig_imgs[0].size());
850
+
851
+ // Extract the mask coefficient vector for each detection from the raw output
+ // masks_in must hold N cv::Mat entries of size 1 x c (one per detection)
853
+ std::vector<cv::Mat> masks_in;
854
+ float* data = qnn_out[0].data;
855
+ int C = qnn_out[0].shape[1]; // 37
856
+ int W1 = qnn_out[0].shape[2]; // 8400
857
+ // int mask_channels = 3;
858
+ // int num_cls = 29;
859
+ for (size_t i = 0; i < dets.size(); ++i) {
860
+ int anchor_idx = dets[i].anchor_idx;
861
+ cv::Mat mask_vec(1, mask_dim, CV_32F);
862
+ std::cout << "Fixed mask_vec[" << i << "] = " << mask_vec << std::endl;
863
+ for (int m = 0; m < mask_dim; ++m) {
864
+ int ch = 5 + m;
865
+ mask_vec.at<float>(0, m) = data[ch * W + anchor_idx];
866
+ }
867
+ masks_in.push_back(mask_vec);
868
+ }
869
+ // masks_in now holds the 1 x c (float) coefficient vector for each detection box
870
+
871
+ // The call below is for demonstration:
872
+ std::vector<cv::Mat> masks;
873
+ masks = process_mask_native(proto_tensor, masks_in, dets, orig_imgs[0].size());
874
+
875
+ // for (int i = 0; i < masks.size(); ++i) {
876
+ // // Save raw mask
877
+ // cv::imwrite("mask_raw_" + std::to_string(i) + ".png", masks[i] * 255);
878
+
879
+ // }
880
+
881
+ results[0] = masks;
882
+ if (results.empty()) {
883
+ return -1;
884
+ }
885
+
886
+ std::vector<cv::Mat> ann = results[0]; // only the first batch item is processed
887
+
888
+ cv::Mat result_img = plot_to_result(frame, ann);
889
+ cv::imwrite("result_with_mask.png", result_img);
890
+
891
+ fast_interpreter->destory();
892
+ return 0;
893
+ }
894
+
895
+
896
+ int main(int argc, char* argv[]) {
897
+ Args args = parse_args(argc, argv);
898
+ return invoke(args);
899
+ }
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e0334b0766f80b127bf90f14831339b6c94e66e4bbf0767c40e1602028accd0
+ size 82550024
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/dogs.jpg ADDED
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/onnx_export.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+ import cv2
+ import os
+ import sys
+
+ from ultralytics.models.fastsam import FastSAM
+
+ class Fast_SAM(torch.nn.Module):
+     """Exportable FastSAM model, end-to-end."""
+
+     def __init__(self) -> None:
+         super().__init__()
+         pt_name = './models/FastSAM-s.pt'
+         self.model = FastSAM(pt_name).model
+
+     def forward(self, image: torch.Tensor):
+         """
+         Run FastSAM on `image`, and produce high quality segmentation masks.
+         Faster than SAM as it is based on YOLOv8.
+
+         Parameters:
+             image: Pixel values pre-processed for encoder consumption.
+                 Range: float[0, 1]
+                 3-channel Color Space: BGR
+         Returns:
+             Tuple of (detections, mask prototypes).
+         """
+         predictions = self.model(image)
+         # Return predictions as a flat tuple instead of a nested tuple.
+         return (predictions[0], predictions[1][2])
+
+
+ model = Fast_SAM()
+ num_params = sum(p.numel() for p in model.parameters())
+ print(f'Number of FastSAM-s parameters: {num_params}')
+ dummy_input = torch.randn([1, 3, 640, 640], dtype=torch.float32)
+ source_model = torch.jit.trace(
+     model.to("cpu"), dummy_input, check_trace=False
+ )
+ torch.onnx.export(model,              # model being run
+     dummy_input,                      # model input (or a tuple for multiple inputs)
+     "./models/fastsam_s.onnx",        # where to save the model
+     export_params=True,               # store the trained parameter weights inside the model file
+     opset_version=12,                 # the ONNX version to export the model to
+     do_constant_folding=True,         # whether to execute constant folding for optimization
+     input_names=['input'],            # the model's input names
+     output_names=['boxes', 'mask'],   # the model's output names
+     verbose=True,
+ )
+ print("Convert to onnx successfully!")
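For context (not part of the committed script): a typical way to run this export from the package root, assuming a Python environment with torch and ultralytics installed and the FastSAM-s checkpoint placed under ./models/, would be:

```bash
# Assumes ./models/FastSAM-s.pt exists; package versions are not pinned here
pip install ultralytics
python3 ./python/onnx_export.py   # writes ./models/fastsam_s.onnx
```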
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/prompt.py ADDED
@@ -0,0 +1,456 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import torch
7
+ from utils import image_to_np_ndarray
8
+ from PIL import Image
9
+
10
+
11
+ class FastSAMPrompt:
12
+
13
+ def __init__(self, image, results, device='cpu'):
14
+ if isinstance(image, str) or isinstance(image, Image.Image):
15
+ image = image_to_np_ndarray(image)
16
+ self.device = device
17
+ self.results = results
18
+ self.img = image
19
+
20
+ def _segment_image(self, image, bbox):
21
+ if isinstance(image, Image.Image):
22
+ image_array = np.array(image)
23
+ else:
24
+ image_array = image
25
+ segmented_image_array = np.zeros_like(image_array)
26
+ x1, y1, x2, y2 = bbox
27
+ segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
28
+ segmented_image = Image.fromarray(segmented_image_array)
29
+ black_image = Image.new('RGB', image.size, (255, 255, 255))
30
+ # transparency_mask = np.zeros_like((), dtype=np.uint8)
31
+ transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
32
+ transparency_mask[y1:y2, x1:x2] = 255
33
+ transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
34
+ black_image.paste(segmented_image, mask=transparency_mask_image)
35
+ return black_image
36
+
37
+ def _format_results(self, result, filter=0):
38
+ annotations = []
39
+ n = len(result.masks.data)
40
+ for i in range(n):
41
+ annotation = {}
42
+ mask = result.masks.data[i] == 1.0
43
+
44
+ if torch.sum(mask) < filter:
45
+ continue
46
+ annotation['id'] = i
47
+ annotation['segmentation'] = mask.cpu().numpy()
48
+ annotation['bbox'] = result.boxes.data[i]
49
+ annotation['score'] = result.boxes.conf[i]
50
+ annotation['area'] = annotation['segmentation'].sum()
51
+ annotations.append(annotation)
52
+ return annotations
53
+
54
+ def filter_masks(annotations): # filter the overlapping masks
55
+ annotations.sort(key=lambda x: x['area'], reverse=True)
56
+ to_remove = set()
57
+ for i in range(0, len(annotations)):
58
+ a = annotations[i]
59
+ for j in range(i + 1, len(annotations)):
60
+ b = annotations[j]
61
+ if i != j and j not in to_remove:
62
+ # check if
63
+ if b['area'] < a['area']:
64
+ if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
65
+ to_remove.add(j)
66
+
67
+ return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
68
+
69
+ def _get_bbox_from_mask(self, mask):
70
+ mask = mask.astype(np.uint8)
71
+ contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
72
+ x1, y1, w, h = cv2.boundingRect(contours[0])
73
+ x2, y2 = x1 + w, y1 + h
74
+ if len(contours) > 1:
75
+ for b in contours:
76
+ x_t, y_t, w_t, h_t = cv2.boundingRect(b)
77
+ # Merge multiple bounding boxes into one.
78
+ x1 = min(x1, x_t)
79
+ y1 = min(y1, y_t)
80
+ x2 = max(x2, x_t + w_t)
81
+ y2 = max(y2, y_t + h_t)
82
+ h = y2 - y1
83
+ w = x2 - x1
84
+ return [x1, y1, x2, y2]
85
+
86
+ def plot_to_result(self,
87
+ annotations,
88
+ bboxes=None,
89
+ points=None,
90
+ point_label=None,
91
+ mask_random_color=True,
92
+ better_quality=True,
93
+ retina=False,
94
+ withContours=True) -> np.ndarray:
95
+ if isinstance(annotations[0], dict):
96
+ annotations = [annotation['segmentation'] for annotation in annotations]
97
+ image = self.img
98
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
99
+ original_h = image.shape[0]
100
+ original_w = image.shape[1]
101
+ if sys.platform == "darwin":
102
+ plt.switch_backend("TkAgg")
103
+ plt.figure(figsize=(original_w / 100, original_h / 100))
104
+ # Add subplot with no margin.
105
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
106
+ plt.margins(0, 0)
107
+ plt.gca().xaxis.set_major_locator(plt.NullLocator())
108
+ plt.gca().yaxis.set_major_locator(plt.NullLocator())
109
+
110
+ plt.imshow(image)
111
+ if better_quality:
112
+ if isinstance(annotations[0], torch.Tensor):
113
+ annotations = np.array(annotations.cpu())
114
+ for i, mask in enumerate(annotations):
115
+ mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
116
+ annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
117
+ if self.device == 'cpu':
118
+ annotations = np.array(annotations)
119
+ self.fast_show_mask(
120
+ annotations,
121
+ plt.gca(),
122
+ random_color=mask_random_color,
123
+ bboxes=bboxes,
124
+ points=points,
125
+ pointlabel=point_label,
126
+ retinamask=retina,
127
+ target_height=original_h,
128
+ target_width=original_w,
129
+ )
130
+ else:
131
+ if isinstance(annotations[0], np.ndarray):
132
+ annotations = torch.from_numpy(annotations)
133
+ self.fast_show_mask_gpu(
134
+ annotations,
135
+ plt.gca(),
136
+ random_color=mask_random_color,
137
+ bboxes=bboxes,
138
+ points=points,
139
+ pointlabel=point_label,
140
+ retinamask=retina,
141
+ target_height=original_h,
142
+ target_width=original_w,
143
+ )
144
+ if isinstance(annotations, torch.Tensor):
145
+ annotations = annotations.cpu().numpy()
146
+ if withContours:
147
+ contour_all = []
148
+ temp = np.zeros((original_h, original_w, 1))
149
+ for i, mask in enumerate(annotations):
150
+ if type(mask) == dict:
151
+ mask = mask['segmentation']
152
+ annotation = mask.astype(np.uint8)
153
+ if not retina:
154
+ annotation = cv2.resize(
155
+ annotation,
156
+ (original_w, original_h),
157
+ interpolation=cv2.INTER_NEAREST,
158
+ )
159
+ contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
160
+ for contour in contours:
161
+ contour_all.append(contour)
162
+ cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
163
+ color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
164
+ contour_mask = temp / 255 * color.reshape(1, 1, -1)
165
+ plt.imshow(contour_mask)
166
+
167
+ plt.axis('off')
168
+ fig = plt.gcf()
169
+ plt.draw()
170
+
171
+ try:
172
+ buf = fig.canvas.tostring_rgb()
173
+ except AttributeError:
174
+ fig.canvas.draw()
175
+ buf = fig.canvas.tostring_rgb()
176
+ cols, rows = fig.canvas.get_width_height()
177
+ img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
178
+ result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
179
+ plt.close()
180
+ return result
181
+
182
+ # Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be separated, and they do not necessarily need to be class methods but standalone utility functions that the user can chain in their scripts for more fine-grained control.
183
+ def plot(self,
184
+ annotations,
185
+ output_path,
186
+ bboxes=None,
187
+ points=None,
188
+ point_label=None,
189
+ mask_random_color=True,
190
+ better_quality=True,
191
+ retina=False,
192
+ withContours=True):
193
+ if len(annotations) == 0:
194
+ return None
195
+ result = self.plot_to_result(
196
+ annotations,
197
+ bboxes,
198
+ points,
199
+ point_label,
200
+ mask_random_color,
201
+ better_quality,
202
+ retina,
203
+ withContours,
204
+ )
205
+
206
+ path = os.path.dirname(os.path.abspath(output_path))
207
+ if not os.path.exists(path):
208
+ os.makedirs(path)
209
+ result = result[:, :, ::-1]
210
+ cv2.imwrite(output_path, result)
211
+
212
+ # CPU post process
213
+ def fast_show_mask(
214
+ self,
215
+ annotation,
216
+ ax,
217
+ random_color=False,
218
+ bboxes=None,
219
+ points=None,
220
+ pointlabel=None,
221
+ retinamask=True,
222
+ target_height=960,
223
+ target_width=960,
224
+ ):
225
+ msak_sum = annotation.shape[0]
226
+ height = annotation.shape[1]
227
+ weight = annotation.shape[2]
228
+ # Sort annotations based on area.
229
+ areas = np.sum(annotation, axis=(1, 2))
230
+ sorted_indices = np.argsort(areas)
231
+ annotation = annotation[sorted_indices]
232
+
233
+ index = (annotation != 0).argmax(axis=0)
234
+ if random_color:
235
+ color = np.random.random((msak_sum, 1, 1, 3))
236
+ else:
237
+ color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
238
+ transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
239
+ visual = np.concatenate([color, transparency], axis=-1)
240
+ mask_image = np.expand_dims(annotation, -1) * visual
241
+
242
+ show = np.zeros((height, weight, 4))
243
+ h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
244
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
245
+ # Use vectorized indexing to update the values of 'show'.
246
+ show[h_indices, w_indices, :] = mask_image[indices]
247
+ if bboxes is not None:
248
+ for bbox in bboxes:
249
+ x1, y1, x2, y2 = bbox
250
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
251
+ # draw point
252
+ if points is not None:
253
+ plt.scatter(
254
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
255
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
256
+ s=20,
257
+ c='y',
258
+ )
259
+ plt.scatter(
260
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
261
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
262
+ s=20,
263
+ c='m',
264
+ )
265
+
266
+ if not retinamask:
267
+ show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
268
+ ax.imshow(show)
269
+
270
+ def fast_show_mask_gpu(
271
+ self,
272
+ annotation,
273
+ ax,
274
+ random_color=False,
275
+ bboxes=None,
276
+ points=None,
277
+ pointlabel=None,
278
+ retinamask=True,
279
+ target_height=960,
280
+ target_width=960,
281
+ ):
282
+ msak_sum = annotation.shape[0]
283
+ height = annotation.shape[1]
284
+ weight = annotation.shape[2]
285
+ areas = torch.sum(annotation, dim=(1, 2))
286
+ sorted_indices = torch.argsort(areas, descending=False)
287
+ annotation = annotation[sorted_indices]
288
+ # Find the index of the first non-zero value at each position.
289
+ index = (annotation != 0).to(torch.long).argmax(dim=0)
290
+ if random_color:
291
+ color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
292
+ else:
293
+ color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
294
+ 30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
295
+ transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
296
+ visual = torch.cat([color, transparency], dim=-1)
297
+ mask_image = torch.unsqueeze(annotation, -1) * visual
298
+ # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
299
+ show = torch.zeros((height, weight, 4)).to(annotation.device)
300
+ try:
301
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
302
+ except:
303
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
304
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
305
+ # Use vectorized indexing to update the values of 'show'.
306
+ show[h_indices, w_indices, :] = mask_image[indices]
307
+ show_cpu = show.cpu().numpy()
308
+ if bboxes is not None:
309
+ for bbox in bboxes:
310
+ x1, y1, x2, y2 = bbox
311
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
312
+ # draw point
313
+ if points is not None:
314
+ plt.scatter(
315
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
316
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
317
+ s=20,
318
+ c='y',
319
+ )
320
+ plt.scatter(
321
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
322
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
323
+ s=20,
324
+ c='m',
325
+ )
326
+ if not retinamask:
327
+ show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
328
+ ax.imshow(show_cpu)
329
+
330
+ # clip
331
+ @torch.no_grad()
332
+ def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
333
+ preprocessed_images = [preprocess(image).to(device) for image in elements]
334
+ try:
335
+ import clip # for linear_assignment
336
+
337
+ except (ImportError, AssertionError, AttributeError):
338
+ from ultralytics.yolo.utils.checks import check_requirements
339
+
340
+ check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
341
+ import clip
342
+
343
+
344
+ tokenized_text = clip.tokenize([search_text]).to(device)
345
+ stacked_images = torch.stack(preprocessed_images)
346
+ image_features = model.encode_image(stacked_images)
347
+ text_features = model.encode_text(tokenized_text)
348
+ image_features /= image_features.norm(dim=-1, keepdim=True)
349
+ text_features /= text_features.norm(dim=-1, keepdim=True)
350
+ probs = 100.0 * image_features @ text_features.T
351
+ return probs[:, 0].softmax(dim=0)
352
+
353
+ def _crop_image(self, format_results):
354
+
355
+ image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
356
+ ori_w, ori_h = image.size
357
+ annotations = format_results
358
+ mask_h, mask_w = annotations[0]['segmentation'].shape
359
+ if ori_w != mask_w or ori_h != mask_h:
360
+ image = image.resize((mask_w, mask_h))
361
+ cropped_boxes = []
362
+ cropped_images = []
363
+ not_crop = []
364
+ filter_id = []
365
+ # annotations, _ = filter_masks(annotations)
366
+ # filter_id = list(_)
367
+ for _, mask in enumerate(annotations):
368
+ if np.sum(mask['segmentation']) <= 100:
369
+ filter_id.append(_)
370
+ continue
371
+ bbox = self._get_bbox_from_mask(mask['segmentation']) # bbox of the mask
372
+ cropped_boxes.append(self._segment_image(image, bbox))
373
+ # cropped_boxes.append(segment_image(image,mask["segmentation"]))
374
+ cropped_images.append(bbox) # Save the bounding box of the cropped image.
375
+
376
+ return cropped_boxes, cropped_images, not_crop, filter_id, annotations
377
+
378
+ def box_prompt(self, bbox=None, bboxes=None):
379
+ if self.results == None:
380
+ return []
381
+ assert bbox or bboxes
382
+ if bboxes is None:
383
+ bboxes = [bbox]
384
+ max_iou_index = []
385
+ for bbox in bboxes:
386
+ assert (bbox[2] != 0 and bbox[3] != 0)
387
+ masks = self.results[0].masks.data
388
+ target_height = self.img.shape[0]
389
+ target_width = self.img.shape[1]
390
+ h = masks.shape[1]
391
+ w = masks.shape[2]
392
+ if h != target_height or w != target_width:
393
+ bbox = [
394
+ int(bbox[0] * w / target_width),
395
+ int(bbox[1] * h / target_height),
396
+ int(bbox[2] * w / target_width),
397
+ int(bbox[3] * h / target_height), ]
398
+ bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
399
+ bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
400
+ bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
401
+ bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
402
+
403
+ # IoUs = torch.zeros(len(masks), dtype=torch.float32)
404
+ bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
405
+
406
+ masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
407
+ orig_masks_area = torch.sum(masks, dim=(1, 2))
408
+
409
+ union = bbox_area + orig_masks_area - masks_area
410
+ IoUs = masks_area / union
411
+ max_iou_index.append(int(torch.argmax(IoUs)))
412
+ max_iou_index = list(set(max_iou_index))
413
+ return np.array(masks[max_iou_index].cpu().numpy())
414
+
415
+ def point_prompt(self, points, pointlabel): # numpy
416
+ if self.results == None:
417
+ return []
418
+ masks = self._format_results(self.results[0], 0)
419
+ target_height = self.img.shape[0]
420
+ target_width = self.img.shape[1]
421
+ h = masks[0]['segmentation'].shape[0]
422
+ w = masks[0]['segmentation'].shape[1]
423
+ if h != target_height or w != target_width:
424
+ points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
425
+ onemask = np.zeros((h, w))
426
+ masks = sorted(masks, key=lambda x: x['area'], reverse=True)
427
+ for i, annotation in enumerate(masks):
428
+ if type(annotation) == dict:
429
+ mask = annotation['segmentation']
430
+ else:
431
+ mask = annotation
432
+ for i, point in enumerate(points):
433
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
434
+ onemask[mask] = 1
435
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
436
+ onemask[mask] = 0
437
+ onemask = onemask >= 1
438
+ return np.array([onemask])
439
+
440
+ def text_prompt(self, text):
441
+ if self.results == None:
442
+ return []
443
+ format_results = self._format_results(self.results[0], 0)
444
+ cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
445
+ clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
446
+ scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
447
+ max_idx = scores.argsort()
448
+ max_idx = max_idx[-1]
449
+ max_idx += sum(np.array(filter_id) <= int(max_idx))
450
+ return np.array([annotations[max_idx]['segmentation']])
451
+
452
+ def everything_prompt(self):
453
+ if self.results == None:
454
+ return []
455
+ return self.results[0].masks.data
456
+
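A minimal usage sketch of the prompt interface defined above (illustrative only: it assumes `everything_results` has already been produced by the detector post-processing in `run_test.py`, and the point/box coordinates are hypothetical):

```python
# Sketch: driving FastSAMPrompt with the different prompt modes.
from prompt import FastSAMPrompt

prompt_process = FastSAMPrompt("python/dogs.jpg", everything_results, device="cpu")

ann_all = prompt_process.everything_prompt()                                  # all masks
ann_pt  = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])    # mask under one foreground point
ann_box = prompt_process.box_prompt(bbox=[200, 200, 500, 500])                # mask best matching an xyxy box
# ann_txt = prompt_process.text_prompt("a dog")                               # needs the optional CLIP package

prompt_process.plot(annotations=ann_pt,
                    output_path="python/dogs_point_result.jpg",
                    better_quality=True,
                    withContours=True)
```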
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/run_test.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime
6
+ import time
7
+ import matplotlib.pyplot as plt
8
+ import torch
9
+ from ultralytics.engine.results import Results
10
+ from tools_pt import *
11
+ from prompt import FastSAMPrompt
12
+ import aidlite
13
+ import argparse
14
+ import ast
15
+
16
+ # Cosine-similarity metric used to compare model outputs
17
+ def get_acc(onnx_out,other_out):
18
+ cosine_similarity=np.dot(np.array(onnx_out),np.array(other_out))/(np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
19
+ return cosine_similarity
20
+
21
+ def cal_sigmoid(x):
22
+ return 1 / (1 + np.exp(-x))
23
+
24
+ class qnn_predict(object):
25
+ def __init__(self,inputshape,outputshape,args) -> None:
26
+ aidlite.set_log_level(aidlite.LogLevel.INFO)
27
+ aidlite.log_to_stderr()
28
+ print(f"Aidlite library version : {aidlite.get_library_version()}")
29
+ print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
30
+ config = aidlite.Config.create_instance()
31
+ if config is None:
32
+ print("Create model failed !")
33
+ config.implement_type = aidlite.ImplementType.TYPE_LOCAL
34
+ config.framework_type = aidlite.FrameworkType.TYPE_QNN
35
+ config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
36
+ config.is_quantify_model = 1
37
+
38
+ model = aidlite.Model.create_instance(args.target_model)
39
+ if model is None:
40
+ print("Create model failed !")
41
+
42
+ self.input_shape=inputshape
43
+ self.out_shape = outputshape
44
+ model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
45
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
46
+ if self.interpreter is None:
47
+ print("build_interpretper_from_model_and_config failed !")
48
+ result = self.interpreter.init()
49
+ if result != 0:
50
+ print(f"interpreter init failed !")
51
+ result = self.interpreter.load_model()
52
+ if result != 0:
53
+ print("interpreter load model failed !")
54
+ print("detect model load success!")
55
+
56
+ self.conf = 0.4
57
+ self.iou=0.9
58
+ self.size = 640
59
+ self.agnostic_nms=False
60
+ self.max_det = 300
61
+ self.names=['object']
62
+ self.classes =None
63
+ self.retina_masks=True
64
+
65
+ def pretreat_img(self,img):
66
+ scale = 1/255.
67
+ img_size = cv2.resize(img, (self.size,self.size), interpolation=cv2.INTER_LINEAR)
68
+ float_img = img_size.astype('float32')
69
+ float_img = float_img* scale
70
+ float_img = float_img[:, :, ::-1]
71
+ return float_img
72
+
73
+ def postprocess(self, preds, img, orig_imgs):
74
+ """TODO: filter by classes."""
75
+ p = non_max_suppression(torch.from_numpy(preds[0]),
76
+ self.conf,
77
+ self.iou,
78
+ agnostic=self.agnostic_nms,
79
+ max_det=self.max_det,
80
+ nc=len(self.names),
81
+ classes=self.classes)
82
+
83
+ results = []
84
+ if len(p) == 0 or len(p[0]) == 0:
85
+ print("No object detected.")
86
+ return results
87
+
88
+ full_box = torch.zeros_like(p[0][0])
89
+ full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
90
+ full_box = full_box.view(1, -1)
91
+ critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
92
+ if critical_iou_index.numel() != 0:
93
+ full_box[0][4] = p[0][critical_iou_index][:,4]
94
+ full_box[0][6:] = p[0][critical_iou_index][:,6:]
95
+ p[0][critical_iou_index] = full_box
96
+
97
+ #proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
98
+ proto=torch.from_numpy(preds[-1])
99
+ for i, pred in enumerate(p):
100
+ orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
101
+ path =img[0] #self.batch[0]
102
+ img_path = path[i] if isinstance(path, list) else path
103
+ if not len(pred): # save empty boxes
104
+ results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
105
+ continue
106
+ if self.retina_masks:
107
+ if not isinstance(orig_imgs, torch.Tensor):
108
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
109
+ masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
110
+ else:
111
+ masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
112
+ if not isinstance(orig_imgs, torch.Tensor):
113
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
114
+ results.append(
115
+ Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
116
+ return results
117
+
118
+ def qnn_run(self, orig_imgs,img_path,args):
119
+ input_img_f = self.pretreat_img(orig_imgs)  # resize image, HWC layout
120
+ # print("qnn_input:",input_img_f)
121
+ # encoder texts
122
+ input_img = np.expand_dims(input_img_f, 0)
123
+
124
+ invoke_time=[]
125
+ for i in range(args.invoke_nums):
126
+ result = self.interpreter.set_input_tensor(0, input_img.data)
127
+ t0 = time.time()
128
+ result = self.interpreter.invoke()
129
+ t1 = time.time()
130
+ cost_time=(t1-t0)*1000
131
+ invoke_time.append(cost_time)
132
+ mask_ = self.interpreter.get_output_tensor(0)
133
+ concat_ = self.interpreter.get_output_tensor(1)
134
+ mul_ = self.interpreter.get_output_tensor(3)
135
+ split_ = self.interpreter.get_output_tensor(2)
136
+ mask_ = mask_.reshape( * self.out_shape[3])
137
+ mask_=mask_.transpose((0, 3, 1,2))
138
+ concat_ = concat_.reshape( *self.out_shape[2])
139
+ mul_ = mul_.reshape( *self.out_shape[1])
140
+ split_ = split_.reshape( *self.out_shape[0])
141
+ sig_ = cal_sigmoid(split_)
142
+
143
+ output_concat = np.concatenate((mul_,sig_),axis=1)
144
+ output_concat = np.concatenate((output_concat,concat_),axis=1)
145
+
146
+ # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
147
+ ## timing statistics
148
+ max_invoke_time = max(invoke_time)
149
+ min_invoke_time = min(invoke_time)
150
+ mean_invoke_time = sum(invoke_time)/args.invoke_nums
151
+ var_invoketime=np.var(invoke_time)
152
+ print("========================================")
153
+ print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
154
+ print("========================================")
155
+
156
+ qnn_out = [np.array(output_concat),np.array(mask_)]
157
+ # print("qnn predict out:",qnn_out)
158
+
159
+ nchw_img = input_img.transpose(0,3,1,2)
160
+ everything_results = self.postprocess( qnn_out, nchw_img, [orig_imgs])
161
+ # print("everything_results: ",everything_results)
162
+
163
+ prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")
164
+
165
+ # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
166
+ try:
167
+ if args.point_prompt ==[[0,0]]:
168
+ ann = prompt_process.everything_prompt()
169
+ else:
170
+ ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
171
+ out_name = os.path.basename(img_path).split(".")[0]
172
+ if True: # savepic
173
+ outpath = "python/"
174
+ if not os.path.exists(outpath):
175
+ os.mkdir(outpath)
176
+ prompt_process.plot(
177
+ annotations=ann,
178
+ output_path=os.path.join(outpath,out_name+"_result_int8.jpg"),
179
+ mask_random_color=True,
180
+ better_quality=True,
181
+ retina=False,
182
+ withContours=True,
183
+ )
184
+ else:
185
+ plt.figure()
186
+ prompt_process.fast_show_mask(annotation=ann,
187
+ ax = plt)
188
+ except Exception as e:
189
+ print(f"Warning: an error occurred while predicting image {img_path} - {e}")
190
+ return [mask_.reshape(-1),output_concat.reshape(-1)]
191
+
192
+
193
+
194
+ def parser_args():
195
+ parser = argparse.ArgumentParser(description="Run model benchmarks")
196
+ parser.add_argument('--target_model',type=str,default='models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin.aidem',help="inference model path")
197
+ parser.add_argument('--source_model',type=str,default='models/fastsam_x.onnx',help="original model path")
198
+ parser.add_argument('--imgs',type=str,default='python/dogs.jpg',help="Predict images path")
199
+ parser.add_argument('--invoke_nums',type=int,default=10,help="Inference nums")
200
+ parser.add_argument('--point_prompt',type=str,default="[[0,0]]",help="example:[[x1,y1],[x2,y2]]")
201
+ args = parser.parse_args()
202
+ return args
203
+
204
+
205
+ if __name__ == "__main__":
206
+ args = parser_args()
207
+ inputshape=[[1,640,640,3]]
208
+ outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
209
+ args.point_prompt = ast.literal_eval(args.point_prompt)
210
+
211
+ predict = qnn_predict(inputshape,outputshape,args)
212
+ if os.path.isdir(args.imgs):
213
+ img_files = os.listdir(args.imgs)
214
+ for fi in img_files:
215
+ img_path = os.path.join(args.imgs,fi)
216
+ im0s = cv2.imread(img_path) # BGR
217
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
218
+ predict.qnn_run(im0s,img_path,args)
219
+ else:
220
+ img_path = args.imgs
221
+ im0s = cv2.imread(img_path) # BGR
222
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
223
+ qnn_result = predict.qnn_run(im0s,img_path,args)
224
+ print("Prediction completion and the results are saved !")
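For reference, the reassembly that `qnn_run` performs on the four QNN output tensors can be reproduced on dummy data; this is a shape-only sketch of how the 1x37x8400 detection head and the 1x32x160x160 prototype tensor are rebuilt before post-processing:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Dummy tensors with the shapes declared in `outputshape` above.
split_  = np.random.randn(1, 1, 8400).astype(np.float32)       # raw objectness logits
mul_    = np.random.randn(1, 4, 8400).astype(np.float32)       # box regression (xywh)
concat_ = np.random.randn(1, 32, 8400).astype(np.float32)      # mask coefficients
mask_   = np.random.randn(1, 160, 160, 32).astype(np.float32)  # prototype masks, NHWC

# [xywh | sigmoid(objectness) | 32 mask coefficients] -> (1, 37, 8400)
head  = np.concatenate((mul_, sigmoid(split_), concat_), axis=1)
proto = mask_.transpose(0, 3, 1, 2)                             # NHWC -> NCHW, (1, 32, 160, 160)
print(head.shape, proto.shape)
```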
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/tools_pt.py ADDED
@@ -0,0 +1,372 @@
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.functional as F
6
+
7
+
8
+
9
+ def clip_boxes(boxes, shape):
10
+ """
11
+ Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
12
+
13
+ Args:
14
+ boxes (torch.Tensor): the bounding boxes to clip
15
+ shape (tuple): the shape of the image
16
+ """
17
+ if isinstance(boxes, torch.Tensor): # faster individually
18
+ boxes[..., 0].clamp_(0, shape[1]) # x1
19
+ boxes[..., 1].clamp_(0, shape[0]) # y1
20
+ boxes[..., 2].clamp_(0, shape[1]) # x2
21
+ boxes[..., 3].clamp_(0, shape[0]) # y2
22
+ else: # np.array (faster grouped)
23
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
24
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
25
+
26
+ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
27
+ """
28
+ Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
29
+ (img1_shape) to the shape of a different image (img0_shape).
30
+
31
+ Args:
32
+ img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
33
+ boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
34
+ img0_shape (tuple): the shape of the target image, in the format of (height, width).
35
+ ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
36
+ calculated based on the size difference between the two images.
37
+ padding (bool): If True, assume the boxes are based on a YOLO-style augmented image. If False, then do regular
38
+ rescaling.
39
+
40
+ Returns:
41
+ boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
42
+ """
43
+ if ratio_pad is None: # calculate from img0_shape
44
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
45
+ pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
46
+ (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
47
+ else:
48
+ gain = ratio_pad[0][0]
49
+ pad = ratio_pad[1]
50
+
51
+ if padding:
52
+ boxes[..., [0, 2]] -= pad[0] # x padding
53
+ boxes[..., [1, 3]] -= pad[1] # y padding
54
+ boxes[..., :4] /= gain
55
+ clip_boxes(boxes, img0_shape)
56
+ return boxes
57
+
58
+
59
+ def xywh2xyxy(x):
60
+ """
61
+ Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
62
+ top-left corner and (x2, y2) is the bottom-right corner.
63
+
64
+ Args:
65
+ x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
66
+
67
+ Returns:
68
+ y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
69
+ """
70
+ assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
71
+ y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
72
+ dw = x[..., 2] / 2 # half-width
73
+ dh = x[..., 3] / 2 # half-height
74
+ y[..., 0] = x[..., 0] - dw # top left x
75
+ y[..., 1] = x[..., 1] - dh # top left y
76
+ y[..., 2] = x[..., 0] + dw # bottom right x
77
+ y[..., 3] = x[..., 1] + dh # bottom right y
78
+ return y
79
+
80
+
81
+ def non_max_suppression(
82
+ prediction,
83
+ conf_thres=0.25,
84
+ iou_thres=0.45,
85
+ classes=None,
86
+ agnostic=False,
87
+ multi_label=False,
88
+ labels=(),
89
+ max_det=300,
90
+ nc=0, # number of classes (optional)
91
+ max_time_img=0.05,
92
+ max_nms=30000,
93
+ max_wh=7680,
94
+ ):
95
+ """
96
+ Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
97
+
98
+ Args:
99
+ prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
100
+ containing the predicted boxes, classes, and masks. The tensor should be in the format
101
+ output by a model, such as YOLO.
102
+ conf_thres (float): The confidence threshold below which boxes will be filtered out.
103
+ Valid values are between 0.0 and 1.0.
104
+ iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
105
+ Valid values are between 0.0 and 1.0.
106
+ classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
107
+ agnostic (bool): If True, the model is agnostic to the number of classes, and all
108
+ classes will be considered as one.
109
+ multi_label (bool): If True, each box may have multiple labels.
110
+ labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
111
+ list contains the apriori labels for a given image. The list should be in the format
112
+ output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
113
+ max_det (int): The maximum number of boxes to keep after NMS.
114
+ nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
115
+ max_time_img (float): The maximum time (seconds) for processing one image.
116
+ max_nms (int): The maximum number of boxes into torchvision.ops.nms().
117
+ max_wh (int): The maximum box width and height in pixels
118
+
119
+ Returns:
120
+ (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
121
+ shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
122
+ (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
123
+ """
124
+
125
+ # Checks
126
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
127
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
128
+ if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation model, output = (inference_out, loss_out)
129
+ prediction = prediction[0] # select only inference output
130
+
131
+ device = prediction.device
132
+ mps = 'mps' in device.type # Apple MPS
133
+ if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
134
+ prediction = prediction.cpu()
135
+ bs = prediction.shape[0] # batch size
136
+ nc = nc or (prediction.shape[1] - 4) # number of classes
137
+ nm = prediction.shape[1] - nc - 4
138
+ mi = 4 + nc # mask start index
139
+ xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
140
+
141
+ # Settings
142
+ # min_wh = 2 # (pixels) minimum box width and height
143
+ time_limit = 0.5 + max_time_img * bs # seconds to quit after
144
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
145
+
146
+ prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
147
+ prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
148
+
149
+ t = time.time()
150
+ output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
151
+ for xi, x in enumerate(prediction): # image index, image inference
152
+ # Apply constraints
153
+ # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
154
+ x = x[xc[xi]] # confidence
155
+
156
+ # Cat apriori labels if autolabelling
157
+ if labels and len(labels[xi]):
158
+ lb = labels[xi]
159
+ v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
160
+ v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
161
+ v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
162
+ x = torch.cat((x, v), 0)
163
+
164
+ # If none remain process next image
165
+ if not x.shape[0]:
166
+ continue
167
+
168
+ # Detections matrix nx6 (xyxy, conf, cls)
169
+ box, cls, mask = x.split((4, nc, nm), 1)
170
+
171
+ if multi_label:
172
+ i, j = torch.where(cls > conf_thres)
173
+ x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
174
+ else: # best class only
175
+ conf, j = cls.max(1, keepdim=True)
176
+ x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
177
+
178
+ # Filter by class
179
+ if classes is not None:
180
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
181
+
182
+ # Check shape
183
+ n = x.shape[0] # number of boxes
184
+ if not n: # no boxes
185
+ continue
186
+ if n > max_nms: # excess boxes
187
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
188
+
189
+ # Batched NMS
190
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
191
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
192
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
193
+ i = i[:max_det] # limit detections
194
+
195
+ # # Experimental
196
+ # merge = False # use merge-NMS
197
+ # if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
198
+ # # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
199
+ # from .metrics import box_iou
200
+ # iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
201
+ # weights = iou * scores[None] # box weights
202
+ # x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
203
+ # redundant = True # require redundant detections
204
+ # if redundant:
205
+ # i = i[iou.sum(1) > 1] # require redundancy
206
+
207
+ output[xi] = x[i]
208
+ if mps:
209
+ output[xi] = output[xi].to(device)
210
+ # if (time.time() - t) > time_limit:
211
+ # LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
212
+ # break # time limit exceeded
213
+
214
+ return output
215
+
216
+
217
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
218
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
219
+ Args:
220
+ boxes: (n, 4)
221
+ image_shape: (height, width)
222
+ threshold: pixel threshold
223
+ Returns:
224
+ adjusted_boxes: adjusted bounding boxes
225
+ '''
226
+
227
+ # Image dimensions
228
+ h, w = image_shape
229
+
230
+ # Adjust boxes
231
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
232
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
233
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
234
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
235
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
236
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
237
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
238
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
239
+
240
+ return boxes
241
+
242
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
243
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
244
+ Args:
245
+ box1: (4, )
246
+ boxes: (n, 4)
247
+ Returns:
248
+ high_iou_indices: Indices of boxes with IoU > thres
249
+ '''
250
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
251
+ # obtain coordinates for intersections
252
+ x1 = torch.max(box1[0], boxes[:, 0])
253
+ y1 = torch.max(box1[1], boxes[:, 1])
254
+ x2 = torch.min(box1[2], boxes[:, 2])
255
+ y2 = torch.min(box1[3], boxes[:, 3])
256
+
257
+ # compute the area of intersection
258
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
259
+
260
+ # compute the area of both individual boxes
261
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
262
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
263
+
264
+ # compute the area of union
265
+ union = box1_area + box2_area - intersection
266
+
267
+ # compute the IoU
268
+ iou = intersection / union # Should be shape (n, )
269
+ if raw_output:
270
+ if iou.numel() == 0:
271
+ return 0
272
+ return iou
273
+
274
+ # get indices of boxes with IoU > thres
275
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
276
+
277
+ return high_iou_indices
278
+
279
+
280
+ def scale_masks(masks, shape, padding=True):
281
+ """
282
+ Rescale segment masks to shape.
283
+
284
+ Args:
285
+ masks (torch.Tensor): (N, C, H, W).
286
+ shape (tuple): Height and width.
287
+ padding (bool): If True, assume the boxes are based on a YOLO-style augmented image. If False, then do regular
288
+ rescaling.
289
+ """
290
+ mh, mw = masks.shape[2:]
291
+ gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
292
+ pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
293
+ if padding:
294
+ pad[0] /= 2
295
+ pad[1] /= 2
296
+ top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
297
+ bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
298
+ masks = masks[..., top:bottom, left:right]
299
+
300
+ masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
301
+ return masks
302
+
303
+
304
+ def process_mask_native(protos, masks_in, bboxes, shape):
305
+ """
306
+ It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
307
+
308
+ Args:
309
+ protos (torch.Tensor): [mask_dim, mask_h, mask_w]
310
+ masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
311
+ bboxes (torch.Tensor): [n, 4], n is number of masks after nms
312
+ shape (tuple): the size of the input image (h,w)
313
+
314
+ Returns:
315
+ masks (torch.Tensor): The returned masks with dimensions [h, w, n]
316
+ """
317
+ c, mh, mw = protos.shape # CHW
318
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
319
+ masks = scale_masks(masks[None], shape)[0] # CHW
320
+ masks = crop_mask(masks, bboxes) # CHW
321
+ return masks.gt_(0.5)
322
+
323
+ def crop_mask(masks, boxes):
324
+ """
325
+ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
326
+
327
+ Args:
328
+ masks (torch.Tensor): [n, h, w] tensor of masks
329
+ boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
330
+
331
+ Returns:
332
+ (torch.Tensor): The masks are being cropped to the bounding box.
333
+ """
334
+ _, h, w = masks.shape
335
+ x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
336
+ r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
337
+ c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
338
+
339
+ return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
340
+
341
+ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
342
+ """
343
+ Apply masks to bounding boxes using the output of the mask head.
344
+
345
+ Args:
346
+ protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
347
+ masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
348
+ bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
349
+ shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
350
+ upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
351
+
352
+ Returns:
353
+ (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
354
+ are the height and width of the input image. The mask is applied to the bounding boxes.
355
+ """
356
+
357
+ c, mh, mw = protos.shape # CHW
358
+ ih, iw = shape
359
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
360
+
361
+ downsampled_bboxes = bboxes.clone()
362
+ downsampled_bboxes[:, 0] *= mw / iw
363
+ downsampled_bboxes[:, 2] *= mw / iw
364
+ downsampled_bboxes[:, 3] *= mh / ih
365
+ downsampled_bboxes[:, 1] *= mh / ih
366
+
367
+ masks = crop_mask(masks, downsampled_bboxes) # CHW
368
+ if upsample:
369
+ masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
370
+ return masks.gt_(0.5)
371
+
372
+
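A self-contained sketch of how the helpers above chain together; the tensors are random stand-ins with FastSAM-like shapes (1 class, 32 mask coefficients), not real model output:

```python
import torch
from tools_pt import non_max_suppression, process_mask_native, scale_boxes

pred  = torch.rand(1, 37, 8400)      # (bs, 4 + nc + nm, num_boxes): xywh, confidence, mask coeffs
proto = torch.rand(32, 160, 160)     # prototype masks for a single image

dets = non_max_suppression(pred, conf_thres=0.4, iou_thres=0.9, nc=1)[0]      # (n, 6 + 32)
if len(dets):
    boxes = scale_boxes((640, 640), dets[:, :4].clone(), (480, 640, 3))       # map to original image size
    masks = process_mask_native(proto, dets[:, 6:], boxes, (480, 640))        # (n, 480, 640) boolean masks
    print(boxes.shape, masks.shape)
```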
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/utils.py ADDED
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+
5
+
6
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
7
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
8
+ Args:
9
+ boxes: (n, 4)
10
+ image_shape: (height, width)
11
+ threshold: pixel threshold
12
+ Returns:
13
+ adjusted_boxes: adjusted bounding boxes
14
+ '''
15
+
16
+ # Image dimensions
17
+ h, w = image_shape
18
+
19
+ # Adjust boxes
20
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
21
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
22
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
23
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
24
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
25
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
26
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
27
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
28
+
29
+ return boxes
30
+
31
+
32
+
33
+ def convert_box_xywh_to_xyxy(box):
34
+ x1 = box[0]
35
+ y1 = box[1]
36
+ x2 = box[0] + box[2]
37
+ y2 = box[1] + box[3]
38
+ return [x1, y1, x2, y2]
39
+
40
+
41
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
42
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
43
+ Args:
44
+ box1: (4, )
45
+ boxes: (n, 4)
46
+ Returns:
47
+ high_iou_indices: Indices of boxes with IoU > thres
48
+ '''
49
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
50
+ # obtain coordinates for intersections
51
+ x1 = torch.max(box1[0], boxes[:, 0])
52
+ y1 = torch.max(box1[1], boxes[:, 1])
53
+ x2 = torch.min(box1[2], boxes[:, 2])
54
+ y2 = torch.min(box1[3], boxes[:, 3])
55
+
56
+ # compute the area of intersection
57
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
58
+
59
+ # compute the area of both individual boxes
60
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
61
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
62
+
63
+ # compute the area of union
64
+ union = box1_area + box2_area - intersection
65
+
66
+ # compute the IoU
67
+ iou = intersection / union # Should be shape (n, )
68
+ if raw_output:
69
+ if iou.numel() == 0:
70
+ return 0
71
+ return iou
72
+
73
+ # get indices of boxes with IoU > thres
74
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
75
+
76
+ return high_iou_indices
77
+
78
+
79
+ def image_to_np_ndarray(image):
80
+ if type(image) is str:
81
+ return np.array(Image.open(image))
82
+ elif issubclass(type(image), Image.Image):
83
+ return np.array(image)
84
+ elif type(image) is np.ndarray:
85
+ return image
86
+ return None
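A quick sketch of how these two helpers are typically combined (illustrative coordinates only):

```python
import torch
from utils import convert_box_xywh_to_xyxy, bbox_iou

box1 = torch.tensor(convert_box_xywh_to_xyxy([100, 100, 200, 150]), dtype=torch.float)  # xywh -> [100, 100, 300, 250]
candidates = torch.tensor([[110, 105, 290, 240],
                           [  5,   5,  50,  60]], dtype=torch.float)

ious = bbox_iou(box1, candidates, image_shape=(640, 640), raw_output=True)   # IoU value per candidate
keep = bbox_iou(box1, candidates, iou_thres=0.5, image_shape=(640, 640))     # indices with IoU > 0.5
print(ious, keep)
```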
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/README.md ADDED
@@ -0,0 +1,48 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: 640x640
4
+ - Number of parameters: 68.89M
5
+ - Model size: 277.39M
6
+ - Output shape: 1x37x8400,1x32x160x160
7
+
8
+ Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: FP16
13
+ - Backend: QNN2.16
14
+ - Target Device: SNM972 QCS8550
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - Install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Download the QNN version that matches the backend above, e.g. to install the QNN2.23 AidLite package: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # aidlite sdk c++ check
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # aidlite sdk python check
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ ```bash
43
+ cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite
44
+ export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0
45
+
46
+ python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_fp16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
47
+ ```
48
+
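To segment around a specific point instead of running everything mode, `run_test.py` also accepts a `--point_prompt` argument (a Python-style list of `[x, y]` pixel coordinates; a single foreground point is the safest choice), for example:

```bash
python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_fp16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10 --point_prompt "[[620,360]]"
```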
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/models/cutoff_fastsam_x_fp16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc4b65549ba459736b2fcfe9cd190c63d70108c3edd5c35f9af310af19e5871
3
+ size 148172000
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/dogs.jpg ADDED
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/onnx_export.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import cv2
3
+ import os
4
+ import sys
5
+
6
+ from ultralytics.models.fastsam import FastSAM
7
+
8
+ class Fast_SAM(torch.nn.Module):
9
+ """Exportable FastSAM model, end-to-end."""
10
+
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+ pt_name ='./models/FastSAM-s.pt'
14
+ self.model =FastSAM(pt_name).model
15
+
16
+ def forward(self, image: torch.Tensor):
17
+ """
18
+ Run FastSAM on `image`, and produce high quality segmentation masks.
19
+ Faster than SAM as it is based on YOLOv8.
20
+
21
+ Parameters:
22
+ image: Pixel values pre-processed for encoder consumption.
23
+ Range: float[0, 1]
24
+ 3-channel Color Space: BGR
25
+ Returns:
26
+
27
+ """
28
+ predictions = self.model(image)
29
+ # Return predictions as a tuple instead of nested tuple.
30
+ return (predictions[0], predictions[1][2])
31
+
32
+
33
+ model = Fast_SAM()
34
+ num_params = sum(p.numel() for p in model.parameters())
35
+ print(f'Number of FastSAM-s parameters: {num_params}')
36
+ dummy_input = torch.randn( [1,3,640,640],dtype=torch.float32 )
37
+ source_model = torch.jit.trace(
38
+ model.to("cpu"), dummy_input, check_trace=False
39
+ )
40
+ torch.onnx.export(model, # model being run
41
+ dummy_input, # model input (or a tuple for multiple inputs)
42
+ "./models/fastsam_s.onnx", # where to save the model
43
+ export_params=True, # store the trained parameter weights inside the model file
44
+ opset_version=12, # the ONNX version to export the model to
45
+ do_constant_folding=True, # whether to execute constant folding for optimization
46
+ input_names = ['input'], # the model's input names
47
+ output_names = ['boxes','mask'],
48
+ verbose=True,
49
+ )
50
+ print("Convert to onnx successfully!")
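A quick way to sanity-check the exported graph is to run it once through onnxruntime on a random input; this is a sketch and simply assumes the input/output names and 640x640 shape used in the export call above:

```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./models/fastsam_s.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
boxes, mask = sess.run(["boxes", "mask"], {"input": dummy})
print(boxes.shape, mask.shape)   # expected: (1, 37, 8400) and (1, 32, 160, 160)
```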
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/prompt.py ADDED
@@ -0,0 +1,456 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import torch
7
+ from utils import image_to_np_ndarray
8
+ from PIL import Image
9
+
10
+
11
+ class FastSAMPrompt:
12
+
13
+ def __init__(self, image, results, device='cpu'):
14
+ if isinstance(image, str) or isinstance(image, Image.Image):
15
+ image = image_to_np_ndarray(image)
16
+ self.device = device
17
+ self.results = results
18
+ self.img = image
19
+
20
+ def _segment_image(self, image, bbox):
21
+ if isinstance(image, Image.Image):
22
+ image_array = np.array(image)
23
+ else:
24
+ image_array = image
25
+ segmented_image_array = np.zeros_like(image_array)
26
+ x1, y1, x2, y2 = bbox
27
+ segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
28
+ segmented_image = Image.fromarray(segmented_image_array)
29
+ black_image = Image.new('RGB', image.size, (255, 255, 255))
30
+ # transparency_mask = np.zeros_like((), dtype=np.uint8)
31
+ transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
32
+ transparency_mask[y1:y2, x1:x2] = 255
33
+ transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
34
+ black_image.paste(segmented_image, mask=transparency_mask_image)
35
+ return black_image
36
+
37
+ def _format_results(self, result, filter=0):
38
+ annotations = []
39
+ n = len(result.masks.data)
40
+ for i in range(n):
41
+ annotation = {}
42
+ mask = result.masks.data[i] == 1.0
43
+
44
+ if torch.sum(mask) < filter:
45
+ continue
46
+ annotation['id'] = i
47
+ annotation['segmentation'] = mask.cpu().numpy()
48
+ annotation['bbox'] = result.boxes.data[i]
49
+ annotation['score'] = result.boxes.conf[i]
50
+ annotation['area'] = annotation['segmentation'].sum()
51
+ annotations.append(annotation)
52
+ return annotations
53
+
54
+ def filter_masks(annotations): # filter out overlapping masks
55
+ annotations.sort(key=lambda x: x['area'], reverse=True)
56
+ to_remove = set()
57
+ for i in range(0, len(annotations)):
58
+ a = annotations[i]
59
+ for j in range(i + 1, len(annotations)):
60
+ b = annotations[j]
61
+ if i != j and j not in to_remove:
62
+ # check if
63
+ if b['area'] < a['area']:
64
+ if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
65
+ to_remove.add(j)
66
+
67
+ return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
68
+
69
+ def _get_bbox_from_mask(self, mask):
70
+ mask = mask.astype(np.uint8)
71
+ contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
72
+ x1, y1, w, h = cv2.boundingRect(contours[0])
73
+ x2, y2 = x1 + w, y1 + h
74
+ if len(contours) > 1:
75
+ for b in contours:
76
+ x_t, y_t, w_t, h_t = cv2.boundingRect(b)
77
+ # Merge multiple bounding boxes into one.
78
+ x1 = min(x1, x_t)
79
+ y1 = min(y1, y_t)
80
+ x2 = max(x2, x_t + w_t)
81
+ y2 = max(y2, y_t + h_t)
82
+ h = y2 - y1
83
+ w = x2 - x1
84
+ return [x1, y1, x2, y2]
85
+
86
+ def plot_to_result(self,
87
+ annotations,
88
+ bboxes=None,
89
+ points=None,
90
+ point_label=None,
91
+ mask_random_color=True,
92
+ better_quality=True,
93
+ retina=False,
94
+ withContours=True) -> np.ndarray:
95
+ if isinstance(annotations[0], dict):
96
+ annotations = [annotation['segmentation'] for annotation in annotations]
97
+ image = self.img
98
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
99
+ original_h = image.shape[0]
100
+ original_w = image.shape[1]
101
+ if sys.platform == "darwin":
102
+ plt.switch_backend("TkAgg")
103
+ plt.figure(figsize=(original_w / 100, original_h / 100))
104
+ # Add subplot with no margin.
105
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
106
+ plt.margins(0, 0)
107
+ plt.gca().xaxis.set_major_locator(plt.NullLocator())
108
+ plt.gca().yaxis.set_major_locator(plt.NullLocator())
109
+
110
+ plt.imshow(image)
111
+ if better_quality:
112
+ if isinstance(annotations[0], torch.Tensor):
113
+ annotations = np.array(annotations.cpu())
114
+ for i, mask in enumerate(annotations):
115
+ mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
116
+ annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
117
+ if self.device == 'cpu':
118
+ annotations = np.array(annotations)
119
+ self.fast_show_mask(
120
+ annotations,
121
+ plt.gca(),
122
+ random_color=mask_random_color,
123
+ bboxes=bboxes,
124
+ points=points,
125
+ pointlabel=point_label,
126
+ retinamask=retina,
127
+ target_height=original_h,
128
+ target_width=original_w,
129
+ )
130
+ else:
131
+ if isinstance(annotations[0], np.ndarray):
132
+ annotations = torch.from_numpy(annotations)
133
+ self.fast_show_mask_gpu(
134
+ annotations,
135
+ plt.gca(),
136
+ random_color=mask_random_color,
137
+ bboxes=bboxes,
138
+ points=points,
139
+ pointlabel=point_label,
140
+ retinamask=retina,
141
+ target_height=original_h,
142
+ target_width=original_w,
143
+ )
144
+ if isinstance(annotations, torch.Tensor):
145
+ annotations = annotations.cpu().numpy()
146
+ if withContours:
147
+ contour_all = []
148
+ temp = np.zeros((original_h, original_w, 1))
149
+ for i, mask in enumerate(annotations):
150
+ if type(mask) == dict:
151
+ mask = mask['segmentation']
152
+ annotation = mask.astype(np.uint8)
153
+ if not retina:
154
+ annotation = cv2.resize(
155
+ annotation,
156
+ (original_w, original_h),
157
+ interpolation=cv2.INTER_NEAREST,
158
+ )
159
+ contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
160
+ for contour in contours:
161
+ contour_all.append(contour)
162
+ cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
163
+ color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
164
+ contour_mask = temp / 255 * color.reshape(1, 1, -1)
165
+ plt.imshow(contour_mask)
166
+
167
+ plt.axis('off')
168
+ fig = plt.gcf()
169
+ plt.draw()
170
+
171
+ try:
172
+ buf = fig.canvas.tostring_rgb()
173
+ except AttributeError:
174
+ fig.canvas.draw()
175
+ buf = fig.canvas.tostring_rgb()
176
+ cols, rows = fig.canvas.get_width_height()
177
+ img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
178
+ result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
179
+ plt.close()
180
+ return result
181
+
182
+ # Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be separated, and they do not necessarily need to be class methods but standalone utility functions that the user can chain in scripts for more fine-grained control.
183
+ def plot(self,
184
+ annotations,
185
+ output_path,
186
+ bboxes=None,
187
+ points=None,
188
+ point_label=None,
189
+ mask_random_color=True,
190
+ better_quality=True,
191
+ retina=False,
192
+ withContours=True):
193
+ if len(annotations) == 0:
194
+ return None
195
+ result = self.plot_to_result(
196
+ annotations,
197
+ bboxes,
198
+ points,
199
+ point_label,
200
+ mask_random_color,
201
+ better_quality,
202
+ retina,
203
+ withContours,
204
+ )
205
+
206
+ path = os.path.dirname(os.path.abspath(output_path))
207
+ if not os.path.exists(path):
208
+ os.makedirs(path)
209
+ result = result[:, :, ::-1]
210
+ cv2.imwrite(output_path, result)
211
+
212
+ # CPU post process
213
+ def fast_show_mask(
214
+ self,
215
+ annotation,
216
+ ax,
217
+ random_color=False,
218
+ bboxes=None,
219
+ points=None,
220
+ pointlabel=None,
221
+ retinamask=True,
222
+ target_height=960,
223
+ target_width=960,
224
+ ):
225
+ msak_sum = annotation.shape[0]
226
+ height = annotation.shape[1]
227
+ weight = annotation.shape[2]
228
+ #Sort annotations based on area.
229
+ areas = np.sum(annotation, axis=(1, 2))
230
+ sorted_indices = np.argsort(areas)
231
+ annotation = annotation[sorted_indices]
232
+
233
+ index = (annotation != 0).argmax(axis=0)
234
+ if random_color:
235
+ color = np.random.random((msak_sum, 1, 1, 3))
236
+ else:
237
+ color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
238
+ transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
239
+ visual = np.concatenate([color, transparency], axis=-1)
240
+ mask_image = np.expand_dims(annotation, -1) * visual
241
+
242
+ show = np.zeros((height, weight, 4))
243
+ h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
244
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
245
+ # Use vectorized indexing to update the values of 'show'.
246
+ show[h_indices, w_indices, :] = mask_image[indices]
247
+ if bboxes is not None:
248
+ for bbox in bboxes:
249
+ x1, y1, x2, y2 = bbox
250
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
251
+ # draw point
252
+ if points is not None:
253
+ plt.scatter(
254
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
255
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
256
+ s=20,
257
+ c='y',
258
+ )
259
+ plt.scatter(
260
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
261
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
262
+ s=20,
263
+ c='m',
264
+ )
265
+
266
+ if not retinamask:
267
+ show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
268
+ ax.imshow(show)
269
+
270
+ def fast_show_mask_gpu(
271
+ self,
272
+ annotation,
273
+ ax,
274
+ random_color=False,
275
+ bboxes=None,
276
+ points=None,
277
+ pointlabel=None,
278
+ retinamask=True,
279
+ target_height=960,
280
+ target_width=960,
281
+ ):
282
+ msak_sum = annotation.shape[0]
283
+ height = annotation.shape[1]
284
+ weight = annotation.shape[2]
285
+ areas = torch.sum(annotation, dim=(1, 2))
286
+ sorted_indices = torch.argsort(areas, descending=False)
287
+ annotation = annotation[sorted_indices]
288
+ # Find the index of the first non-zero value at each position.
289
+ index = (annotation != 0).to(torch.long).argmax(dim=0)
290
+ if random_color:
291
+ color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
292
+ else:
293
+ color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
294
+ 30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
295
+ transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
296
+ visual = torch.cat([color, transparency], dim=-1)
297
+ mask_image = torch.unsqueeze(annotation, -1) * visual
298
+ # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
299
+ show = torch.zeros((height, weight, 4)).to(annotation.device)
300
+ try:
301
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
302
+ except:
303
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
304
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
305
+ # Use vectorized indexing to update the values of 'show'.
306
+ show[h_indices, w_indices, :] = mask_image[indices]
307
+ show_cpu = show.cpu().numpy()
308
+ if bboxes is not None:
309
+ for bbox in bboxes:
310
+ x1, y1, x2, y2 = bbox
311
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
312
+ # draw point
313
+ if points is not None:
314
+ plt.scatter(
315
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
316
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
317
+ s=20,
318
+ c='y',
319
+ )
320
+ plt.scatter(
321
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
322
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
323
+ s=20,
324
+ c='m',
325
+ )
326
+ if not retinamask:
327
+ show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
328
+ ax.imshow(show_cpu)
329
+
330
+ # clip
331
+ @torch.no_grad()
332
+ def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
333
+ preprocessed_images = [preprocess(image).to(device) for image in elements]
334
+ try:
335
+ import clip # for linear_assignment
336
+
337
+ except (ImportError, AssertionError, AttributeError):
338
+ from ultralytics.yolo.utils.checks import check_requirements
339
+
340
+ check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
341
+ import clip
342
+
343
+
344
+ tokenized_text = clip.tokenize([search_text]).to(device)
345
+ stacked_images = torch.stack(preprocessed_images)
346
+ image_features = model.encode_image(stacked_images)
347
+ text_features = model.encode_text(tokenized_text)
348
+ image_features /= image_features.norm(dim=-1, keepdim=True)
349
+ text_features /= text_features.norm(dim=-1, keepdim=True)
350
+ probs = 100.0 * image_features @ text_features.T
351
+ return probs[:, 0].softmax(dim=0)
352
+
353
+ def _crop_image(self, format_results):
354
+
355
+ image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
356
+ ori_w, ori_h = image.size
357
+ annotations = format_results
358
+ mask_h, mask_w = annotations[0]['segmentation'].shape
359
+ if ori_w != mask_w or ori_h != mask_h:
360
+ image = image.resize((mask_w, mask_h))
361
+ cropped_boxes = []
362
+ cropped_images = []
363
+ not_crop = []
364
+ filter_id = []
365
+ # annotations, _ = filter_masks(annotations)
366
+ # filter_id = list(_)
367
+ for _, mask in enumerate(annotations):
368
+ if np.sum(mask['segmentation']) <= 100:
369
+ filter_id.append(_)
370
+ continue
371
+ bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
372
+ cropped_boxes.append(self._segment_image(image, bbox))
373
+ # cropped_boxes.append(segment_image(image,mask["segmentation"]))
374
+ cropped_images.append(bbox) # Save the bounding box of the cropped image.
375
+
376
+ return cropped_boxes, cropped_images, not_crop, filter_id, annotations
377
+
378
+ def box_prompt(self, bbox=None, bboxes=None):
379
+ if self.results == None:
380
+ return []
381
+ assert bbox or bboxes
382
+ if bboxes is None:
383
+ bboxes = [bbox]
384
+ max_iou_index = []
385
+ for bbox in bboxes:
386
+ assert (bbox[2] != 0 and bbox[3] != 0)
387
+ masks = self.results[0].masks.data
388
+ target_height = self.img.shape[0]
389
+ target_width = self.img.shape[1]
390
+ h = masks.shape[1]
391
+ w = masks.shape[2]
392
+ if h != target_height or w != target_width:
393
+ bbox = [
394
+ int(bbox[0] * w / target_width),
395
+ int(bbox[1] * h / target_height),
396
+ int(bbox[2] * w / target_width),
397
+ int(bbox[3] * h / target_height), ]
398
+ bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
399
+ bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
400
+ bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
401
+ bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
402
+
403
+ # IoUs = torch.zeros(len(masks), dtype=torch.float32)
404
+ bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
405
+
406
+ masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
407
+ orig_masks_area = torch.sum(masks, dim=(1, 2))
408
+
409
+ union = bbox_area + orig_masks_area - masks_area
410
+ IoUs = masks_area / union
411
+ max_iou_index.append(int(torch.argmax(IoUs)))
412
+ max_iou_index = list(set(max_iou_index))
413
+ return np.array(masks[max_iou_index].cpu().numpy())
414
+
415
+ def point_prompt(self, points, pointlabel): # numpy
416
+ if self.results == None:
417
+ return []
418
+ masks = self._format_results(self.results[0], 0)
419
+ target_height = self.img.shape[0]
420
+ target_width = self.img.shape[1]
421
+ h = masks[0]['segmentation'].shape[0]
422
+ w = masks[0]['segmentation'].shape[1]
423
+ if h != target_height or w != target_width:
424
+ points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
425
+ onemask = np.zeros((h, w))
426
+ masks = sorted(masks, key=lambda x: x['area'], reverse=True)
427
+ for i, annotation in enumerate(masks):
428
+ if type(annotation) == dict:
429
+ mask = annotation['segmentation']
430
+ else:
431
+ mask = annotation
432
+ for i, point in enumerate(points):
433
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
434
+ onemask[mask] = 1
435
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
436
+ onemask[mask] = 0
437
+ onemask = onemask >= 1
438
+ return np.array([onemask])
439
+
440
+ def text_prompt(self, text):
441
+ if self.results == None:
442
+ return []
443
+ format_results = self._format_results(self.results[0], 0)
444
+ cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
445
+ clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
446
+ scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
447
+ max_idx = scores.argsort()
448
+ max_idx = max_idx[-1]
449
+ max_idx += sum(np.array(filter_id) <= int(max_idx))
450
+ return np.array([annotations[max_idx]['segmentation']])
451
+
452
+ def everything_prompt(self):
453
+ if self.results == None:
454
+ return []
455
+ return self.results[0].masks.data
456
+
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/run_test.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime
6
+ import time
7
+ import matplotlib.pyplot as plt
8
+ import torch
9
+ from ultralytics.engine.results import Results
10
+ from tools_pt import *
11
+ from prompt import FastSAMPrompt
12
+ import aidlite
13
+ import argparse
14
+ import ast
15
+
16
+ # Cosine-similarity metric used to compare model outputs
17
+ def get_acc(onnx_out,other_out):
18
+ cosine_similarity=np.dot(np.array(onnx_out),np.array(other_out))/(np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
19
+ return cosine_similarity
20
+
21
+ def cal_sigmoid(x):
22
+ return 1 / (1 + np.exp(-x))
23
+
24
+ class qnn_predict(object):
25
+ def __init__(self,inputshape,outputshape,args) -> None:
26
+ aidlite.set_log_level(aidlite.LogLevel.INFO)
27
+ aidlite.log_to_stderr()
28
+ print(f"Aidlite library version : {aidlite.get_library_version()}")
29
+ print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
30
+ config = aidlite.Config.create_instance()
31
+ if config is None:
32
+ print("Create model failed !")
33
+ config.implement_type = aidlite.ImplementType.TYPE_LOCAL
34
+ config.framework_type = aidlite.FrameworkType.TYPE_QNN
35
+ config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
36
+ config.is_quantify_model = 1
37
+
38
+ model = aidlite.Model.create_instance(args.target_model)
39
+ if model is None:
40
+ print("Create model failed !")
41
+
42
+ self.input_shape=inputshape
43
+ self.out_shape = outputshape
44
+ model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
45
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
46
+ if self.interpreter is None:
47
+ print("build_interpretper_from_model_and_config failed !")
48
+ result = self.interpreter.init()
49
+ if result != 0:
50
+ print(f"interpreter init failed !")
51
+ result = self.interpreter.load_model()
52
+ if result != 0:
53
+ print("interpreter load model failed !")
54
+ print("detect model load success!")
55
+
56
+ self.conf = 0.4
57
+ self.iou=0.9
58
+ self.size = 640
59
+ self.agnostic_nms=False
60
+ self.max_det = 300
61
+ self.names=['object']
62
+ self.classes =None
63
+ self.retina_masks=True
64
+
65
+ def pretreat_img(self,img):
66
+ scale = 1/255.
67
+ img_size = cv2.resize(img, (self.size,self.size), interpolation=cv2.INTER_LINEAR)
68
+ float_img = img_size.astype('float32')
69
+ float_img = float_img* scale
70
+ float_img = float_img[:, :, ::-1]
71
+ return float_img
72
+
73
+ def postprocess(self, preds, img, orig_imgs):
74
+ """TODO: filter by classes."""
75
+ p = non_max_suppression(torch.from_numpy(preds[0]),
76
+ self.conf,
77
+ self.iou,
78
+ agnostic=self.agnostic_nms,
79
+ max_det=self.max_det,
80
+ nc=len(self.names),
81
+ classes=self.classes)
82
+
83
+ results = []
84
+ if len(p) == 0 or len(p[0]) == 0:
85
+ print("No object detected.")
86
+ return results
87
+
88
+ full_box = torch.zeros_like(p[0][0])
89
+ full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
90
+ full_box = full_box.view(1, -1)
91
+ critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
92
+ if critical_iou_index.numel() != 0:
93
+ full_box[0][4] = p[0][critical_iou_index][:,4]
94
+ full_box[0][6:] = p[0][critical_iou_index][:,6:]
95
+ p[0][critical_iou_index] = full_box
96
+
97
+ #proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
98
+ proto=torch.from_numpy(preds[-1])
99
+ for i, pred in enumerate(p):
100
+ orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
101
+ path =img[0] #self.batch[0]
102
+ img_path = path[i] if isinstance(path, list) else path
103
+ if not len(pred): # save empty boxes
104
+ results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
105
+ continue
106
+ if self.retina_masks:
107
+ if not isinstance(orig_imgs, torch.Tensor):
108
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
109
+ masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
110
+ else:
111
+ masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
112
+ if not isinstance(orig_imgs, torch.Tensor):
113
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
114
+ results.append(
115
+ Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
116
+ return results
117
+
118
+ def qnn_run(self, orig_imgs,img_path,args):
119
+ input_img_f = self.pretreat_img(orig_imgs) # resize image to model input size (HWC)
120
+ # print("qnn_input:",input_img_f)
121
+ # encoder texts
122
+ input_img = np.expand_dims(input_img_f, 0)
123
+
124
+ invoke_time=[]
125
+ for i in range(args.invoke_nums):
126
+ result = self.interpreter.set_input_tensor(0, input_img.data)
127
+ t0 = time.time()
128
+ result = self.interpreter.invoke()
129
+ t1 = time.time()
130
+ cost_time=(t1-t0)*1000
131
+ invoke_time.append(cost_time)
132
+ mask_ = self.interpreter.get_output_tensor(0)
133
+ concat_ = self.interpreter.get_output_tensor(1)
134
+ mul_ = self.interpreter.get_output_tensor(3)
135
+ split_ = self.interpreter.get_output_tensor(2)
136
+ mask_ = mask_.reshape( * self.out_shape[3])
137
+ mask_=mask_.transpose((0, 3, 1,2))
138
+ concat_ = concat_.reshape( *self.out_shape[2])
139
+ mul_ = mul_.reshape( *self.out_shape[1])
140
+ split_ = split_.reshape( *self.out_shape[0])
141
+ sig_ = cal_sigmoid(split_)
142
+
143
+ output_concat = np.concatenate((mul_,sig_),axis=1)
144
+ output_concat = np.concatenate((output_concat,concat_),axis=1)
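+ # The split DSP outputs are stitched back into the original 1x37x8400 detection head:
+ # 4 box coordinates (mul_), 1 sigmoid class score (sig_) and 32 mask coefficients (concat_),
+ # which is the layout expected by non_max_suppression.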
145
+
146
+ # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
147
+ ## timing statistics
148
+ max_invoke_time = max(invoke_time)
149
+ min_invoke_time = min(invoke_time)
150
+ mean_invoke_time = sum(invoke_time)/args.invoke_nums
151
+ var_invoketime=np.var(invoke_time)
152
+ print("========================================")
153
+ print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
154
+ print("========================================")
155
+
156
+ qnn_out = [np.array(output_concat),np.array(mask_)]
157
+ # print("qnn predict out:",qnn_out)
158
+
159
+ nchw_img = input_img.transpose(0,3,1,2)
160
+ everything_results = self.postprocess( qnn_out, nchw_img, [orig_imgs])
161
+ # print("everything_results: ",everything_results)
162
+
163
+ prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")
164
+
165
+ # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
166
+ try:
167
+ if args.point_prompt ==[[0,0]]:
168
+ ann = prompt_process.everything_prompt()
169
+ else:
170
+ ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
171
+ out_name = os.path.basename(img_path).split(".")[0]
172
+ if True: # savepic
173
+ outpath = "python/"
174
+ if not os.path.exists(outpath):
175
+ os.mkdir(outpath)
176
+ prompt_process.plot(
177
+ annotations=ann,
178
+ output_path=os.path.join(outpath,out_name+"_result.jpg"),
179
+ mask_random_color=True,
180
+ better_quality=True,
181
+ retina=False,
182
+ withContours=True,
183
+ )
184
+ else:
185
+ plt.figure()
186
+ prompt_process.fast_show_mask(annotation=ann,
187
+ ax = plt)
188
+ except Exception as e:
189
+ print(f"Waning : An error occurred in the picture {img_path} prediction -{e}")
190
+ return [mask_.reshape(-1),output_concat.reshape(-1)]
191
+
192
+
193
+
194
+ def parser_args():
195
+ parser = argparse.ArgumentParser(description="Run model benchmarks")
196
+ parser.add_argument('--target_model',type=str,default='models/cutoff_fastsam_x_fp16.qnn216.ctx.bin.aidem',help="inference model path")
197
+ parser.add_argument('--source_model',type=str,default='models/fastsam_x.onnx',help="original model path")
198
+ parser.add_argument('--imgs',type=str,default='python/dogs.jpg',help="Predict images path")
199
+ parser.add_argument('--invoke_nums',type=int,default=10,help="Inference nums")
200
+ parser.add_argument('--point_prompt',type=str,default="[[0,0]]",help="example:[[x1,y1],[x2,y2]]")
201
+ args = parser.parse_args()
202
+ return args
203
+
204
+
205
+ if __name__ == "__main__":
206
+ args = parser_args()
207
+ inputshape=[[1,640,640,3]]
208
+ outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
209
+ args.point_prompt = ast.literal_eval(args.point_prompt)
210
+
211
+ predict = qnn_predict(inputshape,outputshape,args)
212
+ if os.path.isdir(args.imgs):
213
+ img_files = os.listdir(args.imgs)
214
+ for fi in img_files:
215
+ img_path = os.path.join(args.imgs,fi)
216
+ im0s = cv2.imread(img_path) # BGR
217
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
218
+ predict.qnn_run(im0s,img_path,args)
219
+ else:
220
+ img_path = args.imgs
221
+ im0s = cv2.imread(img_path) # BGR
222
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
223
+ qnn_result = predict.qnn_run(im0s,img_path,args)
224
+ print("Prediction completion and the results are saved !")
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/tools_pt.py ADDED
@@ -0,0 +1,372 @@
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.functional as F
6
+
7
+
8
+
9
+ def clip_boxes(boxes, shape):
10
+ """
11
+ Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
12
+
13
+ Args:
14
+ boxes (torch.Tensor): the bounding boxes to clip
15
+ shape (tuple): the shape of the image
16
+ """
17
+ if isinstance(boxes, torch.Tensor): # faster individually
18
+ boxes[..., 0].clamp_(0, shape[1]) # x1
19
+ boxes[..., 1].clamp_(0, shape[0]) # y1
20
+ boxes[..., 2].clamp_(0, shape[1]) # x2
21
+ boxes[..., 3].clamp_(0, shape[0]) # y2
22
+ else: # np.array (faster grouped)
23
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
24
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
25
+
26
+ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
27
+ """
28
+ Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
29
+ (img1_shape) to the shape of a different image (img0_shape).
30
+
31
+ Args:
32
+ img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
33
+ boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
34
+ img0_shape (tuple): the shape of the target image, in the format of (height, width).
35
+ ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
36
+ calculated based on the size difference between the two images.
37
+ padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
38
+ rescaling.
39
+
40
+ Returns:
41
+ boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
42
+ """
43
+ if ratio_pad is None: # calculate from img0_shape
44
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
45
+ pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
46
+ (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
47
+ else:
48
+ gain = ratio_pad[0][0]
49
+ pad = ratio_pad[1]
50
+
51
+ if padding:
52
+ boxes[..., [0, 2]] -= pad[0] # x padding
53
+ boxes[..., [1, 3]] -= pad[1] # y padding
54
+ boxes[..., :4] /= gain
55
+ clip_boxes(boxes, img0_shape)
56
+ return boxes
57
+
58
+
59
+ def xywh2xyxy(x):
60
+ """
61
+ Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
62
+ top-left corner and (x2, y2) is the bottom-right corner.
63
+
64
+ Args:
65
+ x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
66
+
67
+ Returns:
68
+ y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
69
+ """
70
+ assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
71
+ y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
72
+ dw = x[..., 2] / 2 # half-width
73
+ dh = x[..., 3] / 2 # half-height
74
+ y[..., 0] = x[..., 0] - dw # top left x
75
+ y[..., 1] = x[..., 1] - dh # top left y
76
+ y[..., 2] = x[..., 0] + dw # bottom right x
77
+ y[..., 3] = x[..., 1] + dh # bottom right y
78
+ return y
79
+
80
+
81
+ def non_max_suppression(
82
+ prediction,
83
+ conf_thres=0.25,
84
+ iou_thres=0.45,
85
+ classes=None,
86
+ agnostic=False,
87
+ multi_label=False,
88
+ labels=(),
89
+ max_det=300,
90
+ nc=0, # number of classes (optional)
91
+ max_time_img=0.05,
92
+ max_nms=30000,
93
+ max_wh=7680,
94
+ ):
95
+ """
96
+ Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
97
+
98
+ Args:
99
+ prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
100
+ containing the predicted boxes, classes, and masks. The tensor should be in the format
101
+ output by a model, such as YOLO.
102
+ conf_thres (float): The confidence threshold below which boxes will be filtered out.
103
+ Valid values are between 0.0 and 1.0.
104
+ iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
105
+ Valid values are between 0.0 and 1.0.
106
+ classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
107
+ agnostic (bool): If True, the model is agnostic to the number of classes, and all
108
+ classes will be considered as one.
109
+ multi_label (bool): If True, each box may have multiple labels.
110
+ labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
111
+ list contains the apriori labels for a given image. The list should be in the format
112
+ output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
113
+ max_det (int): The maximum number of boxes to keep after NMS.
114
+ nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
115
+ max_time_img (float): The maximum time (seconds) for processing one image.
116
+ max_nms (int): The maximum number of boxes into torchvision.ops.nms().
117
+ max_wh (int): The maximum box width and height in pixels
118
+
119
+ Returns:
120
+ (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
121
+ shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
122
+ (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
123
+ """
124
+
125
+ # Checks
126
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
127
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
128
+ if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation model, output = (inference_out, loss_out)
129
+ prediction = prediction[0] # select only inference output
130
+
131
+ device = prediction.device
132
+ mps = 'mps' in device.type # Apple MPS
133
+ if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
134
+ prediction = prediction.cpu()
135
+ bs = prediction.shape[0] # batch size
136
+ nc = nc or (prediction.shape[1] - 4) # number of classes
137
+ nm = prediction.shape[1] - nc - 4
138
+ mi = 4 + nc # mask start index
139
+ xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
140
+
141
+ # Settings
142
+ # min_wh = 2 # (pixels) minimum box width and height
143
+ time_limit = 0.5 + max_time_img * bs # seconds to quit after
144
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
145
+
146
+ prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
147
+ prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
148
+
149
+ t = time.time()
150
+ output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
151
+ for xi, x in enumerate(prediction): # image index, image inference
152
+ # Apply constraints
153
+ # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
154
+ x = x[xc[xi]] # confidence
155
+
156
+ # Cat apriori labels if autolabelling
157
+ if labels and len(labels[xi]):
158
+ lb = labels[xi]
159
+ v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
160
+ v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
161
+ v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
162
+ x = torch.cat((x, v), 0)
163
+
164
+ # If none remain process next image
165
+ if not x.shape[0]:
166
+ continue
167
+
168
+ # Detections matrix nx6 (xyxy, conf, cls)
169
+ box, cls, mask = x.split((4, nc, nm), 1)
170
+
171
+ if multi_label:
172
+ i, j = torch.where(cls > conf_thres)
173
+ x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
174
+ else: # best class only
175
+ conf, j = cls.max(1, keepdim=True)
176
+ x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
177
+
178
+ # Filter by class
179
+ if classes is not None:
180
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
181
+
182
+ # Check shape
183
+ n = x.shape[0] # number of boxes
184
+ if not n: # no boxes
185
+ continue
186
+ if n > max_nms: # excess boxes
187
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
188
+
189
+ # Batched NMS
190
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
191
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
192
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
193
+ i = i[:max_det] # limit detections
194
+
195
+ # # Experimental
196
+ # merge = False # use merge-NMS
197
+ # if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
198
+ # # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
199
+ # from .metrics import box_iou
200
+ # iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
201
+ # weights = iou * scores[None] # box weights
202
+ # x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
203
+ # redundant = True # require redundant detections
204
+ # if redundant:
205
+ # i = i[iou.sum(1) > 1] # require redundancy
206
+
207
+ output[xi] = x[i]
208
+ if mps:
209
+ output[xi] = output[xi].to(device)
210
+ # if (time.time() - t) > time_limit:
211
+ # LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
212
+ # break # time limit exceeded
213
+
214
+ return output
215
+
216
+
217
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
218
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
219
+ Args:
220
+ boxes: (n, 4)
221
+ image_shape: (height, width)
222
+ threshold: pixel threshold
223
+ Returns:
224
+ adjusted_boxes: adjusted bounding boxes
225
+ '''
226
+
227
+ # Image dimensions
228
+ h, w = image_shape
229
+
230
+ # Adjust boxes
231
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
232
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
233
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
234
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
235
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
236
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
237
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
238
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
239
+
240
+ return boxes
241
+
242
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
243
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
244
+ Args:
245
+ box1: (4, )
246
+ boxes: (n, 4)
247
+ Returns:
248
+ high_iou_indices: Indices of boxes with IoU > thres
249
+ '''
250
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
251
+ # obtain coordinates for intersections
252
+ x1 = torch.max(box1[0], boxes[:, 0])
253
+ y1 = torch.max(box1[1], boxes[:, 1])
254
+ x2 = torch.min(box1[2], boxes[:, 2])
255
+ y2 = torch.min(box1[3], boxes[:, 3])
256
+
257
+ # compute the area of intersection
258
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
259
+
260
+ # compute the area of both individual boxes
261
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
262
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
263
+
264
+ # compute the area of union
265
+ union = box1_area + box2_area - intersection
266
+
267
+ # compute the IoU
268
+ iou = intersection / union # Should be shape (n, )
269
+ if raw_output:
270
+ if iou.numel() == 0:
271
+ return 0
272
+ return iou
273
+
274
+ # get indices of boxes with IoU > thres
275
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
276
+
277
+ return high_iou_indices
278
+
279
+
280
+ def scale_masks(masks, shape, padding=True):
281
+ """
282
+ Rescale segment masks to shape.
283
+
284
+ Args:
285
+ masks (torch.Tensor): (N, C, H, W).
286
+ shape (tuple): Height and width.
287
+ padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
288
+ rescaling.
289
+ """
290
+ mh, mw = masks.shape[2:]
291
+ gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
292
+ pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
293
+ if padding:
294
+ pad[0] /= 2
295
+ pad[1] /= 2
296
+ top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
297
+ bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
298
+ masks = masks[..., top:bottom, left:right]
299
+
300
+ masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
301
+ return masks
302
+
303
+
304
+ def process_mask_native(protos, masks_in, bboxes, shape):
305
+ """
306
+ It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
307
+
308
+ Args:
309
+ protos (torch.Tensor): [mask_dim, mask_h, mask_w]
310
+ masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
311
+ bboxes (torch.Tensor): [n, 4], n is number of masks after nms
312
+ shape (tuple): the size of the input image (h,w)
313
+
314
+ Returns:
315
+ masks (torch.Tensor): The returned masks with dimensions [h, w, n]
316
+ """
317
+ c, mh, mw = protos.shape # CHW
318
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
319
+ masks = scale_masks(masks[None], shape)[0] # CHW
320
+ masks = crop_mask(masks, bboxes) # CHW
321
+ return masks.gt_(0.5)
322
+
323
+ def crop_mask(masks, boxes):
324
+ """
325
+ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
326
+
327
+ Args:
328
+ masks (torch.Tensor): [n, h, w] tensor of masks
329
+ boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
330
+
331
+ Returns:
332
+ (torch.Tensor): The masks are being cropped to the bounding box.
333
+ """
334
+ _, h, w = masks.shape
335
+ x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
336
+ r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
337
+ c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
338
+
339
+ return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
340
+
341
+ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
342
+ """
343
+ Apply masks to bounding boxes using the output of the mask head.
344
+
345
+ Args:
346
+ protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
347
+ masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
348
+ bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
349
+ shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
350
+ upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
351
+
352
+ Returns:
353
+ (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
354
+ are the height and width of the input image. The mask is applied to the bounding boxes.
355
+ """
356
+
357
+ c, mh, mw = protos.shape # CHW
358
+ ih, iw = shape
359
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
360
+
361
+ downsampled_bboxes = bboxes.clone()
362
+ downsampled_bboxes[:, 0] *= mw / iw
363
+ downsampled_bboxes[:, 2] *= mw / iw
364
+ downsampled_bboxes[:, 3] *= mh / ih
365
+ downsampled_bboxes[:, 1] *= mh / ih
366
+
367
+ masks = crop_mask(masks, downsampled_bboxes) # CHW
368
+ if upsample:
369
+ masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
370
+ return masks.gt_(0.5)
371
+
372
+
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/utils.py ADDED
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+
5
+
6
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
7
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
8
+ Args:
9
+ boxes: (n, 4)
10
+ image_shape: (height, width)
11
+ threshold: pixel threshold
12
+ Returns:
13
+ adjusted_boxes: adjusted bounding boxes
14
+ '''
15
+
16
+ # Image dimensions
17
+ h, w = image_shape
18
+
19
+ # Adjust boxes
20
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
21
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
22
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
23
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
24
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
25
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
26
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
27
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
28
+
29
+ return boxes
30
+
31
+
32
+
33
+ def convert_box_xywh_to_xyxy(box):
34
+ x1 = box[0]
35
+ y1 = box[1]
36
+ x2 = box[0] + box[2]
37
+ y2 = box[1] + box[3]
38
+ return [x1, y1, x2, y2]
39
+
40
+
41
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
42
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
43
+ Args:
44
+ box1: (4, )
45
+ boxes: (n, 4)
46
+ Returns:
47
+ high_iou_indices: Indices of boxes with IoU > thres
48
+ '''
49
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
50
+ # obtain coordinates for intersections
51
+ x1 = torch.max(box1[0], boxes[:, 0])
52
+ y1 = torch.max(box1[1], boxes[:, 1])
53
+ x2 = torch.min(box1[2], boxes[:, 2])
54
+ y2 = torch.min(box1[3], boxes[:, 3])
55
+
56
+ # compute the area of intersection
57
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
58
+
59
+ # compute the area of both individual boxes
60
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
61
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
62
+
63
+ # compute the area of union
64
+ union = box1_area + box2_area - intersection
65
+
66
+ # compute the IoU
67
+ iou = intersection / union # Should be shape (n, )
68
+ if raw_output:
69
+ if iou.numel() == 0:
70
+ return 0
71
+ return iou
72
+
73
+ # get indices of boxes with IoU > thres
74
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
75
+
76
+ return high_iou_indices
77
+
78
+
79
+ def image_to_np_ndarray(image):
80
+ if type(image) is str:
81
+ return np.array(Image.open(image))
82
+ elif issubclass(type(image), Image.Image):
83
+ return np.array(image)
84
+ elif type(image) is np.ndarray:
85
+ return image
86
+ return None
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/README.md ADDED
@@ -0,0 +1,48 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: 640x640
4
+ - Number of parameters: 68.89M
5
+ - Model size: 277.39M
6
+ - Output shape: 1x37x8400,1x32x160x160
7
+
8
+ Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: INT8
13
+ - Backend: QNN2.16
14
+ - Target Device: SNM972 QCS8550
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - Install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Download the QNN version that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # aidlite sdk c++ check
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # aidlite sdk python check
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ ```bash
43
+ cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite
44
+ export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0
45
+
46
+ python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
47
+ ```
48
+
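+ To segment around a specific point instead of running the everything mode, `run_test.py` also accepts a `--point_prompt` argument. A minimal sketch (the coordinates below are arbitrary example values given in the input image's pixel coordinates):
+ 
+ ```bash
+ python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10 --point_prompt "[[320,320]]"
+ ```
+ 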
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b41dab35e31041716c09ed9bb079e180a7f0d0e7ae8a34596507b55260efe6a
3
+ size 74562824
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/dogs.jpg ADDED
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/onnx_export.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import cv2
3
+ import os
4
+ import sys
5
+
6
+ from ultralytics.models.fastsam import FastSAM
7
+
8
+ class Fast_SAM(torch.nn.Module):
9
+ """Exportable FastSAM model, end-to-end."""
10
+
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+ pt_name ='./models/FastSAM-s.pt'
14
+ self.model =FastSAM(pt_name).model
15
+
16
+ def forward(self, image: torch.Tensor):
17
+ """
18
+ Run FastSAM on `image`, and produce high quality segmentation masks.
19
+ Faster than SAM as it is based on YOLOv8.
20
+
21
+ Parameters:
22
+ image: Pixel values pre-processed for encoder consumption.
23
+ Range: float[0, 1]
24
+ 3-channel Color Space: BGR
25
+ Returns:
26
+
27
+ """
28
+ predictions = self.model(image)
29
+ # Return predictions as a tuple instead of nested tuple.
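+ # predictions[0]: detection head (1x37x8400, exported as 'boxes'); predictions[1][2]: mask prototypes (1x32x160x160, exported as 'mask').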
30
+ return (predictions[0], predictions[1][2])
31
+
32
+
33
+ model = Fast_SAM()
34
+ num_params = sum(p.numel() for p in model.parameters())
35
+ print(f'Number of FastSAM-s parameters: {num_params}')
36
+ dummy_input = torch.randn( [1,3,640,640],dtype=torch.float32 )
37
+ source_model = torch.jit.trace(
38
+ model.to("cpu"), dummy_input, check_trace=False
39
+ )
40
+ torch.onnx.export(model, # model being run
41
+ dummy_input, # model input (or a tuple for multiple inputs)
42
+ "./models/fastsam_s.onnx", # where to save the model
43
+ export_params=True, # store the trained parameter weights inside the model file
44
+ opset_version=12, # the ONNX version to export the model to
45
+ do_constant_folding=True, # whether to execute constant folding for optimization
46
+ input_names = ['input'], # the model's input names
47
+ output_names = ['boxes','mask'],
48
+ verbose=True,
49
+ )
50
+ print("Convert to onnx successfully!")
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/prompt.py ADDED
@@ -0,0 +1,456 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import torch
7
+ from utils import image_to_np_ndarray
8
+ from PIL import Image
9
+
10
+
11
+ class FastSAMPrompt:
12
+
13
+ def __init__(self, image, results, device='cpu'):
14
+ if isinstance(image, str) or isinstance(image, Image.Image):
15
+ image = image_to_np_ndarray(image)
16
+ self.device = device
17
+ self.results = results
18
+ self.img = image
19
+
20
+ def _segment_image(self, image, bbox):
21
+ if isinstance(image, Image.Image):
22
+ image_array = np.array(image)
23
+ else:
24
+ image_array = image
25
+ segmented_image_array = np.zeros_like(image_array)
26
+ x1, y1, x2, y2 = bbox
27
+ segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
28
+ segmented_image = Image.fromarray(segmented_image_array)
29
+ black_image = Image.new('RGB', image.size, (255, 255, 255))
30
+ # transparency_mask = np.zeros_like((), dtype=np.uint8)
31
+ transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
32
+ transparency_mask[y1:y2, x1:x2] = 255
33
+ transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
34
+ black_image.paste(segmented_image, mask=transparency_mask_image)
35
+ return black_image
36
+
37
+ def _format_results(self, result, filter=0):
38
+ annotations = []
39
+ n = len(result.masks.data)
40
+ for i in range(n):
41
+ annotation = {}
42
+ mask = result.masks.data[i] == 1.0
43
+
44
+ if torch.sum(mask) < filter:
45
+ continue
46
+ annotation['id'] = i
47
+ annotation['segmentation'] = mask.cpu().numpy()
48
+ annotation['bbox'] = result.boxes.data[i]
49
+ annotation['score'] = result.boxes.conf[i]
50
+ annotation['area'] = annotation['segmentation'].sum()
51
+ annotations.append(annotation)
52
+ return annotations
53
+
54
+ def filter_masks(self, annotations): # filter out overlapping masks
55
+ annotations.sort(key=lambda x: x['area'], reverse=True)
56
+ to_remove = set()
57
+ for i in range(0, len(annotations)):
58
+ a = annotations[i]
59
+ for j in range(i + 1, len(annotations)):
60
+ b = annotations[j]
61
+ if i != j and j not in to_remove:
62
+ # check if
63
+ if b['area'] < a['area']:
64
+ if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
65
+ to_remove.add(j)
66
+
67
+ return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
68
+
69
+ def _get_bbox_from_mask(self, mask):
70
+ mask = mask.astype(np.uint8)
71
+ contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
72
+ x1, y1, w, h = cv2.boundingRect(contours[0])
73
+ x2, y2 = x1 + w, y1 + h
74
+ if len(contours) > 1:
75
+ for b in contours:
76
+ x_t, y_t, w_t, h_t = cv2.boundingRect(b)
77
+ # Merge multiple bounding boxes into one.
78
+ x1 = min(x1, x_t)
79
+ y1 = min(y1, y_t)
80
+ x2 = max(x2, x_t + w_t)
81
+ y2 = max(y2, y_t + h_t)
82
+ h = y2 - y1
83
+ w = x2 - x1
84
+ return [x1, y1, x2, y2]
85
+
86
+ def plot_to_result(self,
87
+ annotations,
88
+ bboxes=None,
89
+ points=None,
90
+ point_label=None,
91
+ mask_random_color=True,
92
+ better_quality=True,
93
+ retina=False,
94
+ withContours=True) -> np.ndarray:
95
+ if isinstance(annotations[0], dict):
96
+ annotations = [annotation['segmentation'] for annotation in annotations]
97
+ image = self.img
98
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
99
+ original_h = image.shape[0]
100
+ original_w = image.shape[1]
101
+ if sys.platform == "darwin":
102
+ plt.switch_backend("TkAgg")
103
+ plt.figure(figsize=(original_w / 100, original_h / 100))
104
+ # Add subplot with no margin.
105
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
106
+ plt.margins(0, 0)
107
+ plt.gca().xaxis.set_major_locator(plt.NullLocator())
108
+ plt.gca().yaxis.set_major_locator(plt.NullLocator())
109
+
110
+ plt.imshow(image)
111
+ if better_quality:
112
+ if isinstance(annotations[0], torch.Tensor):
113
+ annotations = np.array(annotations.cpu())
114
+ for i, mask in enumerate(annotations):
115
+ mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
116
+ annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
117
+ if self.device == 'cpu':
118
+ annotations = np.array(annotations)
119
+ self.fast_show_mask(
120
+ annotations,
121
+ plt.gca(),
122
+ random_color=mask_random_color,
123
+ bboxes=bboxes,
124
+ points=points,
125
+ pointlabel=point_label,
126
+ retinamask=retina,
127
+ target_height=original_h,
128
+ target_width=original_w,
129
+ )
130
+ else:
131
+ if isinstance(annotations[0], np.ndarray):
132
+ annotations = torch.from_numpy(annotations)
133
+ self.fast_show_mask_gpu(
134
+ annotations,
135
+ plt.gca(),
136
+ random_color=mask_random_color,
137
+ bboxes=bboxes,
138
+ points=points,
139
+ pointlabel=point_label,
140
+ retinamask=retina,
141
+ target_height=original_h,
142
+ target_width=original_w,
143
+ )
144
+ if isinstance(annotations, torch.Tensor):
145
+ annotations = annotations.cpu().numpy()
146
+ if withContours:
147
+ contour_all = []
148
+ temp = np.zeros((original_h, original_w, 1))
149
+ for i, mask in enumerate(annotations):
150
+ if type(mask) == dict:
151
+ mask = mask['segmentation']
152
+ annotation = mask.astype(np.uint8)
153
+ if not retina:
154
+ annotation = cv2.resize(
155
+ annotation,
156
+ (original_w, original_h),
157
+ interpolation=cv2.INTER_NEAREST,
158
+ )
159
+ contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
160
+ for contour in contours:
161
+ contour_all.append(contour)
162
+ cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
163
+ color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
164
+ contour_mask = temp / 255 * color.reshape(1, 1, -1)
165
+ plt.imshow(contour_mask)
166
+
167
+ plt.axis('off')
168
+ fig = plt.gcf()
169
+ plt.draw()
170
+
171
+ try:
172
+ buf = fig.canvas.tostring_rgb()
173
+ except AttributeError:
174
+ fig.canvas.draw()
175
+ buf = fig.canvas.tostring_rgb()
176
+ cols, rows = fig.canvas.get_width_height()
177
+ img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
178
+ result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
179
+ plt.close()
180
+ return result
181
+
182
+ # Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be separated and do not necessarily need to be class methods, but standalone utility functions that the user can chain in their scripts for more fine-grained control.
183
+ def plot(self,
184
+ annotations,
185
+ output_path,
186
+ bboxes=None,
187
+ points=None,
188
+ point_label=None,
189
+ mask_random_color=True,
190
+ better_quality=True,
191
+ retina=False,
192
+ withContours=True):
193
+ if len(annotations) == 0:
194
+ return None
195
+ result = self.plot_to_result(
196
+ annotations,
197
+ bboxes,
198
+ points,
199
+ point_label,
200
+ mask_random_color,
201
+ better_quality,
202
+ retina,
203
+ withContours,
204
+ )
205
+
206
+ path = os.path.dirname(os.path.abspath(output_path))
207
+ if not os.path.exists(path):
208
+ os.makedirs(path)
209
+ result = result[:, :, ::-1]
210
+ cv2.imwrite(output_path, result)
211
+
212
+ # CPU post process
213
+ def fast_show_mask(
214
+ self,
215
+ annotation,
216
+ ax,
217
+ random_color=False,
218
+ bboxes=None,
219
+ points=None,
220
+ pointlabel=None,
221
+ retinamask=True,
222
+ target_height=960,
223
+ target_width=960,
224
+ ):
225
+ msak_sum = annotation.shape[0]
226
+ height = annotation.shape[1]
227
+ weight = annotation.shape[2]
228
+ #Sort annotations based on area.
229
+ areas = np.sum(annotation, axis=(1, 2))
230
+ sorted_indices = np.argsort(areas)
231
+ annotation = annotation[sorted_indices]
232
+
233
+ index = (annotation != 0).argmax(axis=0)
234
+ if random_color:
235
+ color = np.random.random((msak_sum, 1, 1, 3))
236
+ else:
237
+ color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
238
+ transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
239
+ visual = np.concatenate([color, transparency], axis=-1)
240
+ mask_image = np.expand_dims(annotation, -1) * visual
241
+
242
+ show = np.zeros((height, weight, 4))
243
+ h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
244
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
245
+ # Use vectorized indexing to update the values of 'show'.
246
+ show[h_indices, w_indices, :] = mask_image[indices]
247
+ if bboxes is not None:
248
+ for bbox in bboxes:
249
+ x1, y1, x2, y2 = bbox
250
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
251
+ # draw point
252
+ if points is not None:
253
+ plt.scatter(
254
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
255
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
256
+ s=20,
257
+ c='y',
258
+ )
259
+ plt.scatter(
260
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
261
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
262
+ s=20,
263
+ c='m',
264
+ )
265
+
266
+ if not retinamask:
267
+ show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
268
+ ax.imshow(show)
269
+
270
+ def fast_show_mask_gpu(
271
+ self,
272
+ annotation,
273
+ ax,
274
+ random_color=False,
275
+ bboxes=None,
276
+ points=None,
277
+ pointlabel=None,
278
+ retinamask=True,
279
+ target_height=960,
280
+ target_width=960,
281
+ ):
282
+ msak_sum = annotation.shape[0]
283
+ height = annotation.shape[1]
284
+ weight = annotation.shape[2]
285
+ areas = torch.sum(annotation, dim=(1, 2))
286
+ sorted_indices = torch.argsort(areas, descending=False)
287
+ annotation = annotation[sorted_indices]
288
+ # Find the index of the first non-zero value at each position.
289
+ index = (annotation != 0).to(torch.long).argmax(dim=0)
290
+ if random_color:
291
+ color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
292
+ else:
293
+ color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
294
+ 30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
295
+ transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
296
+ visual = torch.cat([color, transparency], dim=-1)
297
+ mask_image = torch.unsqueeze(annotation, -1) * visual
298
+ # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
299
+ show = torch.zeros((height, weight, 4)).to(annotation.device)
300
+ try:
301
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
302
+ except:
303
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
304
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
305
+ # Use vectorized indexing to update the values of 'show'.
306
+ show[h_indices, w_indices, :] = mask_image[indices]
307
+ show_cpu = show.cpu().numpy()
308
+ if bboxes is not None:
309
+ for bbox in bboxes:
310
+ x1, y1, x2, y2 = bbox
311
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
312
+ # draw point
313
+ if points is not None:
314
+ plt.scatter(
315
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
316
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
317
+ s=20,
318
+ c='y',
319
+ )
320
+ plt.scatter(
321
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
322
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
323
+ s=20,
324
+ c='m',
325
+ )
326
+ if not retinamask:
327
+ show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
328
+ ax.imshow(show_cpu)
329
+
330
+ # clip
331
+ @torch.no_grad()
332
+ def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
333
+ preprocessed_images = [preprocess(image).to(device) for image in elements]
334
+ try:
335
+ import clip # CLIP is required for text prompts
336
+
337
+ except (ImportError, AssertionError, AttributeError):
338
+ from ultralytics.yolo.utils.checks import check_requirements
339
+
340
+ check_requirements('git+https://github.com/openai/CLIP.git') # install CLIP from source if it is missing
341
+ import clip
342
+
343
+
344
+ tokenized_text = clip.tokenize([search_text]).to(device)
345
+ stacked_images = torch.stack(preprocessed_images)
346
+ image_features = model.encode_image(stacked_images)
347
+ text_features = model.encode_text(tokenized_text)
348
+ image_features /= image_features.norm(dim=-1, keepdim=True)
349
+ text_features /= text_features.norm(dim=-1, keepdim=True)
350
+ probs = 100.0 * image_features @ text_features.T
351
+ return probs[:, 0].softmax(dim=0)
352
+
353
+ def _crop_image(self, format_results):
354
+
355
+ image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
356
+ ori_w, ori_h = image.size
357
+ annotations = format_results
358
+ mask_h, mask_w = annotations[0]['segmentation'].shape
359
+ if ori_w != mask_w or ori_h != mask_h:
360
+ image = image.resize((mask_w, mask_h))
361
+ cropped_boxes = []
362
+ cropped_images = []
363
+ not_crop = []
364
+ filter_id = []
365
+ # annotations, _ = filter_masks(annotations)
366
+ # filter_id = list(_)
367
+ for _, mask in enumerate(annotations):
368
+ if np.sum(mask['segmentation']) <= 100:
369
+ filter_id.append(_)
370
+ continue
371
+ bbox = self._get_bbox_from_mask(mask['segmentation']) # bounding box of the mask
372
+ cropped_boxes.append(self._segment_image(image, bbox))
373
+ # cropped_boxes.append(segment_image(image,mask["segmentation"]))
374
+ cropped_images.append(bbox) # Save the bounding box of the cropped image.
375
+
376
+ return cropped_boxes, cropped_images, not_crop, filter_id, annotations
377
+
378
+ def box_prompt(self, bbox=None, bboxes=None):
379
+ if self.results == None:
380
+ return []
381
+ assert bbox or bboxes
382
+ if bboxes is None:
383
+ bboxes = [bbox]
384
+ max_iou_index = []
385
+ for bbox in bboxes:
386
+ assert (bbox[2] != 0 and bbox[3] != 0)
387
+ masks = self.results[0].masks.data
388
+ target_height = self.img.shape[0]
389
+ target_width = self.img.shape[1]
390
+ h = masks.shape[1]
391
+ w = masks.shape[2]
392
+ if h != target_height or w != target_width:
393
+ bbox = [
394
+ int(bbox[0] * w / target_width),
395
+ int(bbox[1] * h / target_height),
396
+ int(bbox[2] * w / target_width),
397
+ int(bbox[3] * h / target_height), ]
398
+ bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
399
+ bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
400
+ bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
401
+ bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
402
+
403
+ # IoUs = torch.zeros(len(masks), dtype=torch.float32)
404
+ bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
405
+
406
+ masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
407
+ orig_masks_area = torch.sum(masks, dim=(1, 2))
408
+
409
+ union = bbox_area + orig_masks_area - masks_area
410
+ IoUs = masks_area / union
411
+ max_iou_index.append(int(torch.argmax(IoUs)))
412
+ max_iou_index = list(set(max_iou_index))
413
+ return np.array(masks[max_iou_index].cpu().numpy())
414
+
415
+ def point_prompt(self, points, pointlabel): # numpy
416
+ if self.results == None:
417
+ return []
418
+ masks = self._format_results(self.results[0], 0)
419
+ target_height = self.img.shape[0]
420
+ target_width = self.img.shape[1]
421
+ h = masks[0]['segmentation'].shape[0]
422
+ w = masks[0]['segmentation'].shape[1]
423
+ if h != target_height or w != target_width:
424
+ points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
425
+ onemask = np.zeros((h, w))
426
+ masks = sorted(masks, key=lambda x: x['area'], reverse=True)
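+ # Masks are visited from largest to smallest, so later (smaller) masks can refine the selection,
+ # e.g. background points carve regions out of a larger foreground mask.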
427
+ for i, annotation in enumerate(masks):
428
+ if type(annotation) == dict:
429
+ mask = annotation['segmentation']
430
+ else:
431
+ mask = annotation
432
+ for i, point in enumerate(points):
433
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
434
+ onemask[mask] = 1
435
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
436
+ onemask[mask] = 0
437
+ onemask = onemask >= 1
438
+ return np.array([onemask])
439
+
440
+ def text_prompt(self, text):
441
+ if self.results == None:
442
+ return []
443
+ format_results = self._format_results(self.results[0], 0)
444
+ cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
445
+ clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
446
+ scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
447
+ max_idx = scores.argsort()
448
+ max_idx = max_idx[-1]
449
+ max_idx += sum(np.array(filter_id) <= int(max_idx))
450
+ return np.array([annotations[max_idx]['segmentation']])
451
+
452
+ def everything_prompt(self):
453
+ if self.results == None:
454
+ return []
455
+ return self.results[0].masks.data
456
+
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/run_test.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime
6
+ import time
7
+ import matplotlib.pyplot as plt
8
+ import torch
9
+ from ultralytics.engine.results import Results
10
+ from tools_pt import *
11
+ from prompt import FastSAMPrompt
12
+ import aidlite
13
+ import argparse
14
+ import ast
15
+
16
+ # Define the cosine-similarity function used to compare model outputs
17
+ def get_acc(onnx_out,other_out):
18
+ cosine_similarity=np.dot(np.array(onnx_out),np.array(other_out))/(np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
19
+ return cosine_similarity
20
+
21
+ def cal_sigmoid(x):
22
+ return 1 / (1 + np.exp(-x))
23
+
24
+ class qnn_predict(object):
25
+ def __init__(self,inputshape,outputshape,args) -> None:
26
+ aidlite.set_log_level(aidlite.LogLevel.INFO)
27
+ aidlite.log_to_stderr()
28
+ print(f"Aidlite library version : {aidlite.get_library_version()}")
29
+ print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
30
+ config = aidlite.Config.create_instance()
31
+ if config is None:
32
+ print("Create model failed !")
33
+ config.implement_type = aidlite.ImplementType.TYPE_LOCAL
34
+ config.framework_type = aidlite.FrameworkType.TYPE_QNN
35
+ config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
36
+ config.is_quantify_model = 1
37
+
38
+ model = aidlite.Model.create_instance(args.target_model)
39
+ if model is None:
40
+ print("Create model failed !")
41
+
42
+ self.input_shape=inputshape
43
+ self.out_shape = outputshape
44
+ model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
45
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
46
+ if self.interpreter is None:
47
+ print("build_interpretper_from_model_and_config failed !")
48
+ result = self.interpreter.init()
49
+ if result != 0:
50
+ print(f"interpreter init failed !")
51
+ result = self.interpreter.load_model()
52
+ if result != 0:
53
+ print("interpreter load model failed !")
54
+ print("detect model load success!")
55
+
56
+ self.conf = 0.4
57
+ self.iou=0.9
58
+ self.size = 640
59
+ self.agnostic_nms=False
60
+ self.max_det = 300
61
+ self.names=['object']
62
+ self.classes =None
63
+ self.retina_masks=True
64
+
65
+ def pretreat_img(self,img):
66
+ scale = 1/255.
67
+ img_size = cv2.resize(img, (self.size,self.size), interpolation=cv2.INTER_LINEAR)
68
+ float_img = img_size.astype('float32')
69
+ float_img = float_img* scale
70
+ float_img = float_img[:, :, ::-1]
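+ # pixels are scaled to [0, 1] and the channel order is flipped from the BGR layout produced by cv2.imread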
71
+ return float_img
72
+
73
+ def postprocess(self, preds, img, orig_imgs):
74
+ """TODO: filter by classes."""
75
+ p = non_max_suppression(torch.from_numpy(preds[0]),
76
+ self.conf,
77
+ self.iou,
78
+ agnostic=self.agnostic_nms,
79
+ max_det=self.max_det,
80
+ nc=len(self.names),
81
+ classes=self.classes)
82
+
83
+ results = []
84
+ if len(p) == 0 or len(p[0]) == 0:
85
+ print("No object detected.")
86
+ return results
87
+
88
+ full_box = torch.zeros_like(p[0][0])
89
+ full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
90
+ full_box = full_box.view(1, -1)
91
+ critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
92
+ if critical_iou_index.numel() != 0:
93
+ full_box[0][4] = p[0][critical_iou_index][:,4]
94
+ full_box[0][6:] = p[0][critical_iou_index][:,6:]
95
+ p[0][critical_iou_index] = full_box
96
+
97
+ #proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
98
+ proto=torch.from_numpy(preds[-1])
99
+ for i, pred in enumerate(p):
100
+ orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
101
+ path =img[0] #self.batch[0]
102
+ img_path = path[i] if isinstance(path, list) else path
103
+ if not len(pred): # save empty boxes
104
+ results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
105
+ continue
106
+ if self.retina_masks:
107
+ if not isinstance(orig_imgs, torch.Tensor):
108
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
109
+ masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
110
+ else:
111
+ masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
112
+ if not isinstance(orig_imgs, torch.Tensor):
113
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
114
+ results.append(
115
+ Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
116
+ return results
117
+
118
+ def qnn_run(self, orig_imgs,img_path,args):
119
+ input_img_f = self.pretreat_img(orig_imgs) # resize image to model input size (HWC)
120
+ # print("qnn_input:",input_img_f)
121
+ # encoder texts
122
+ input_img = np.expand_dims(input_img_f, 0)
123
+
124
+ invoke_time=[]
125
+ for i in range(args.invoke_nums):
126
+ result = self.interpreter.set_input_tensor(0, input_img.data)
127
+ t0 = time.time()
128
+ result = self.interpreter.invoke()
129
+ t1 = time.time()
130
+ cost_time=(t1-t0)*1000
131
+ invoke_time.append(cost_time)
132
+ mask_ = self.interpreter.get_output_tensor(0)
133
+ concat_ = self.interpreter.get_output_tensor(1)
134
+ mul_ = self.interpreter.get_output_tensor(3)
135
+ split_ = self.interpreter.get_output_tensor(2)
136
+ mask_ = mask_.reshape( * self.out_shape[3])
137
+ mask_=mask_.transpose((0, 3, 1,2))
138
+ concat_ = concat_.reshape( *self.out_shape[2])
139
+ mul_ = mul_.reshape( *self.out_shape[1])
140
+ split_ = split_.reshape( *self.out_shape[0])
141
+ sig_ = cal_sigmoid(split_)
142
+
143
+ output_concat = np.concatenate((mul_,sig_),axis=1)
144
+ output_concat = np.concatenate((output_concat,concat_),axis=1)
145
+
146
+ # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
147
+ ## timing statistics
148
+ max_invoke_time = max(invoke_time)
149
+ min_invoke_time = min(invoke_time)
150
+ mean_invoke_time = sum(invoke_time)/args.invoke_nums
151
+ var_invoketime=np.var(invoke_time)
152
+ print("========================================")
153
+ print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
154
+ print("========================================")
155
+
156
+ qnn_out = [np.array(output_concat),np.array(mask_)]
157
+ # print("qnn predict out:",qnn_out)
158
+
159
+ nchw_img = input_img.transpose(0,3,1,2)
160
+ everything_results = self.postprocess( qnn_out, nchw_img, [orig_imgs])
161
+ # print("everything_results: ",everything_results)
162
+
163
+ prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")
164
+
165
+ # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
166
+ try:
167
+ if args.point_prompt ==[[0,0]]:
168
+ ann = prompt_process.everything_prompt()
169
+ else:
170
+ ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
171
+ out_name = os.path.basename(img_path).split(".")[0]
172
+ if True: # savepic
173
+ outpath = "python/"
174
+ if not os.path.exists(outpath):
175
+ os.mkdir(outpath)
176
+ prompt_process.plot(
177
+ annotations=ann,
178
+ output_path=os.path.join(outpath,out_name+"_result.jpg"),
179
+ mask_random_color=True,
180
+ better_quality=True,
181
+ retina=False,
182
+ withContours=True,
183
+ )
184
+ else:
185
+ plt.figure()
186
+ prompt_process.fast_show_mask(annotation=ann,
187
+ ax = plt)
188
+ except Exception as e:
189
+ print(f"Waning : An error occurred in the picture {img_path} prediction -{e}")
190
+ return [mask_.reshape(-1),output_concat.reshape(-1)]
191
+
192
+
193
+
194
+ def parser_args():
195
+ parser = argparse.ArgumentParser(description="Run model benchmarks")
196
+ parser.add_argument('--target_model',type=str,default='models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin.aidem',help="inference model path")
197
+ parser.add_argument('--source_model',type=str,default='models/fastsam_x.onnx',help="original model path")
198
+ parser.add_argument('--imgs',type=str,default='python/dogs.jpg',help="Predict images path")
199
+ parser.add_argument('--invoke_nums',type=int,default=10,help="Inference nums")
200
+ parser.add_argument('--point_prompt',type=str,default="[[0,0]]",help="example:[[x1,y1],[x2,y2]]")
201
+ args = parser.parse_args()
202
+ return args
203
+
204
+
205
+ if __name__ == "__main__":
206
+ args = parser_args()
207
+ inputshape=[[1,640,640,3]]
208
+ outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
209
+ args.point_prompt = ast.literal_eval(args.point_prompt)
210
+
211
+ predict = qnn_predict(inputshape,outputshape,args)
212
+ if os.path.isdir(args.imgs):
213
+ img_files = os.listdir(args.imgs)
214
+ for fi in img_files:
215
+ img_path = os.path.join(args.imgs,fi)
216
+ im0s = cv2.imread(img_path) # BGR
217
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
218
+ predict.qnn_run(im0s,img_path,args)
219
+ else:
220
+ img_path = args.imgs
221
+ im0s = cv2.imread(img_path) # BGR
222
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
223
+ qnn_result = predict.qnn_run(im0s,img_path,args)
224
+ print("Prediction completion and the results are saved !")
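For reference, the tensor reassembly that `qnn_run` performs after each invoke can be reproduced in isolation. The sketch below is a minimal, hedged illustration that uses random arrays in place of the interpreter outputs; the shapes and the concatenation order are taken from `outputshape` and the code above, producing the 1x37x8400 prediction and the 1x32x160x160 mask prototypes that `postprocess` consumes.

```python
import numpy as np

def cal_sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Stand-ins for the four raw QNN output tensors (see outputshape in run_test.py).
split_ = np.random.randn(1, 1, 8400).astype(np.float32)      # raw score logits (1 class)
mul_ = np.random.randn(1, 4, 8400).astype(np.float32)        # xywh box regressions
concat_ = np.random.randn(1, 32, 8400).astype(np.float32)    # mask coefficients
mask_ = np.random.randn(1, 160, 160, 32).astype(np.float32)  # mask prototypes, NHWC

sig_ = cal_sigmoid(split_)                            # confidences in [0, 1]
pred = np.concatenate((mul_, sig_, concat_), axis=1)  # (1, 37, 8400): boxes + score + coeffs
proto = mask_.transpose(0, 3, 1, 2)                   # (1, 32, 160, 160), NCHW

print(pred.shape, proto.shape)
```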
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/tools_pt.py ADDED
@@ -0,0 +1,372 @@
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.functional as F
6
+
7
+
8
+
9
+ def clip_boxes(boxes, shape):
10
+ """
11
+ Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
12
+
13
+ Args:
14
+ boxes (torch.Tensor): the bounding boxes to clip
15
+ shape (tuple): the shape of the image
16
+ """
17
+ if isinstance(boxes, torch.Tensor): # faster individually
18
+ boxes[..., 0].clamp_(0, shape[1]) # x1
19
+ boxes[..., 1].clamp_(0, shape[0]) # y1
20
+ boxes[..., 2].clamp_(0, shape[1]) # x2
21
+ boxes[..., 3].clamp_(0, shape[0]) # y2
22
+ else: # np.array (faster grouped)
23
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
24
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
25
+
26
+ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
27
+ """
28
+ Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
29
+ (img1_shape) to the shape of a different image (img0_shape).
30
+
31
+ Args:
32
+ img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
33
+ boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
34
+ img0_shape (tuple): the shape of the target image, in the format of (height, width).
35
+ ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
36
+ calculated based on the size difference between the two images.
37
+ padding (bool): If True, assume the boxes are based on an image augmented in YOLO style. If False, do regular
38
+ rescaling.
39
+
40
+ Returns:
41
+ boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
42
+ """
43
+ if ratio_pad is None: # calculate from img0_shape
44
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
45
+ pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
46
+ (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
47
+ else:
48
+ gain = ratio_pad[0][0]
49
+ pad = ratio_pad[1]
50
+
51
+ if padding:
52
+ boxes[..., [0, 2]] -= pad[0] # x padding
53
+ boxes[..., [1, 3]] -= pad[1] # y padding
54
+ boxes[..., :4] /= gain
55
+ clip_boxes(boxes, img0_shape)
56
+ return boxes
57
+
58
+
59
+ def xywh2xyxy(x):
60
+ """
61
+ Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
62
+ top-left corner and (x2, y2) is the bottom-right corner.
63
+
64
+ Args:
65
+ x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
66
+
67
+ Returns:
68
+ y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
69
+ """
70
+ assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
71
+ y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
72
+ dw = x[..., 2] / 2 # half-width
73
+ dh = x[..., 3] / 2 # half-height
74
+ y[..., 0] = x[..., 0] - dw # top left x
75
+ y[..., 1] = x[..., 1] - dh # top left y
76
+ y[..., 2] = x[..., 0] + dw # bottom right x
77
+ y[..., 3] = x[..., 1] + dh # bottom right y
78
+ return y
79
+
80
+
81
+ def non_max_suppression(
82
+ prediction,
83
+ conf_thres=0.25,
84
+ iou_thres=0.45,
85
+ classes=None,
86
+ agnostic=False,
87
+ multi_label=False,
88
+ labels=(),
89
+ max_det=300,
90
+ nc=0, # number of classes (optional)
91
+ max_time_img=0.05,
92
+ max_nms=30000,
93
+ max_wh=7680,
94
+ ):
95
+ """
96
+ Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
97
+
98
+ Args:
99
+ prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
100
+ containing the predicted boxes, classes, and masks. The tensor should be in the format
101
+ output by a model, such as YOLO.
102
+ conf_thres (float): The confidence threshold below which boxes will be filtered out.
103
+ Valid values are between 0.0 and 1.0.
104
+ iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
105
+ Valid values are between 0.0 and 1.0.
106
+ classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
107
+ agnostic (bool): If True, the model is agnostic to the number of classes, and all
108
+ classes will be considered as one.
109
+ multi_label (bool): If True, each box may have multiple labels.
110
+ labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
111
+ list contains the apriori labels for a given image. The list should be in the format
112
+ output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
113
+ max_det (int): The maximum number of boxes to keep after NMS.
114
+ nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
115
+ max_time_img (float): The maximum time (seconds) for processing one image.
116
+ max_nms (int): The maximum number of boxes into torchvision.ops.nms().
117
+ max_wh (int): The maximum box width and height in pixels
118
+
119
+ Returns:
120
+ (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
121
+ shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
122
+ (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
123
+ """
124
+
125
+ # Checks
126
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
127
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
128
+ if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation mode, output = (inference_out, loss_out)
129
+ prediction = prediction[0] # select only inference output
130
+
131
+ device = prediction.device
132
+ mps = 'mps' in device.type # Apple MPS
133
+ if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
134
+ prediction = prediction.cpu()
135
+ bs = prediction.shape[0] # batch size
136
+ nc = nc or (prediction.shape[1] - 4) # number of classes
137
+ nm = prediction.shape[1] - nc - 4
138
+ mi = 4 + nc # mask start index
139
+ xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
140
+
141
+ # Settings
142
+ # min_wh = 2 # (pixels) minimum box width and height
143
+ time_limit = 0.5 + max_time_img * bs # seconds to quit after
144
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
145
+
146
+ prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
147
+ prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
148
+
149
+ t = time.time()
150
+ output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
151
+ for xi, x in enumerate(prediction): # image index, image inference
152
+ # Apply constraints
153
+ # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
154
+ x = x[xc[xi]] # confidence
155
+
156
+ # Cat apriori labels if autolabelling
157
+ if labels and len(labels[xi]):
158
+ lb = labels[xi]
159
+ v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
160
+ v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
161
+ v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
162
+ x = torch.cat((x, v), 0)
163
+
164
+ # If none remain process next image
165
+ if not x.shape[0]:
166
+ continue
167
+
168
+ # Detections matrix nx6 (xyxy, conf, cls)
169
+ box, cls, mask = x.split((4, nc, nm), 1)
170
+
171
+ if multi_label:
172
+ i, j = torch.where(cls > conf_thres)
173
+ x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
174
+ else: # best class only
175
+ conf, j = cls.max(1, keepdim=True)
176
+ x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
177
+
178
+ # Filter by class
179
+ if classes is not None:
180
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
181
+
182
+ # Check shape
183
+ n = x.shape[0] # number of boxes
184
+ if not n: # no boxes
185
+ continue
186
+ if n > max_nms: # excess boxes
187
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
188
+
189
+ # Batched NMS
190
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
191
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
192
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
193
+ i = i[:max_det] # limit detections
194
+
195
+ # # Experimental
196
+ # merge = False # use merge-NMS
197
+ # if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
198
+ # # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
199
+ # from .metrics import box_iou
200
+ # iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
201
+ # weights = iou * scores[None] # box weights
202
+ # x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
203
+ # redundant = True # require redundant detections
204
+ # if redundant:
205
+ # i = i[iou.sum(1) > 1] # require redundancy
206
+
207
+ output[xi] = x[i]
208
+ if mps:
209
+ output[xi] = output[xi].to(device)
210
+ # if (time.time() - t) > time_limit:
211
+ # LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
212
+ # break # time limit exceeded
213
+
214
+ return output
215
+
216
+
217
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
218
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
219
+ Args:
220
+ boxes: (n, 4)
221
+ image_shape: (height, width)
222
+ threshold: pixel threshold
223
+ Returns:
224
+ adjusted_boxes: adjusted bounding boxes
225
+ '''
226
+
227
+ # Image dimensions
228
+ h, w = image_shape
229
+
230
+ # Adjust boxes
231
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
232
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
233
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
234
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
235
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
236
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
237
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
238
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
239
+
240
+ return boxes
241
+
242
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
243
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
244
+ Args:
245
+ box1: (4, )
246
+ boxes: (n, 4)
247
+ Returns:
248
+ high_iou_indices: Indices of boxes with IoU > thres
249
+ '''
250
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
251
+ # obtain coordinates for intersections
252
+ x1 = torch.max(box1[0], boxes[:, 0])
253
+ y1 = torch.max(box1[1], boxes[:, 1])
254
+ x2 = torch.min(box1[2], boxes[:, 2])
255
+ y2 = torch.min(box1[3], boxes[:, 3])
256
+
257
+ # compute the area of intersection
258
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
259
+
260
+ # compute the area of both individual boxes
261
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
262
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
263
+
264
+ # compute the area of union
265
+ union = box1_area + box2_area - intersection
266
+
267
+ # compute the IoU
268
+ iou = intersection / union # Should be shape (n, )
269
+ if raw_output:
270
+ if iou.numel() == 0:
271
+ return 0
272
+ return iou
273
+
274
+ # get indices of boxes with IoU > thres
275
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
276
+
277
+ return high_iou_indices
278
+
279
+
280
+ def scale_masks(masks, shape, padding=True):
281
+ """
282
+ Rescale segment masks to shape.
283
+
284
+ Args:
285
+ masks (torch.Tensor): (N, C, H, W).
286
+ shape (tuple): Height and width.
287
+ padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
288
+ rescaling.
289
+ """
290
+ mh, mw = masks.shape[2:]
291
+ gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
292
+ pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
293
+ if padding:
294
+ pad[0] /= 2
295
+ pad[1] /= 2
296
+ top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
297
+ bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
298
+ masks = masks[..., top:bottom, left:right]
299
+
300
+ masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
301
+ return masks
302
+
303
+
304
+ def process_mask_native(protos, masks_in, bboxes, shape):
305
+ """
306
+ It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
307
+
308
+ Args:
309
+ protos (torch.Tensor): [mask_dim, mask_h, mask_w]
310
+ masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
311
+ bboxes (torch.Tensor): [n, 4], n is number of masks after nms
312
+ shape (tuple): the size of the input image (h,w)
313
+
314
+ Returns:
315
+ masks (torch.Tensor): The returned masks with dimensions [h, w, n]
316
+ """
317
+ c, mh, mw = protos.shape # CHW
318
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
319
+ masks = scale_masks(masks[None], shape)[0] # CHW
320
+ masks = crop_mask(masks, bboxes) # CHW
321
+ return masks.gt_(0.5)
322
+
323
+ def crop_mask(masks, boxes):
324
+ """
325
+ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
326
+
327
+ Args:
328
+ masks (torch.Tensor): [n, h, w] tensor of masks
329
+ boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
330
+
331
+ Returns:
332
+ (torch.Tensor): The masks are being cropped to the bounding box.
333
+ """
334
+ _, h, w = masks.shape
335
+ x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
336
+ r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
337
+ c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
338
+
339
+ return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
340
+
341
+ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
342
+ """
343
+ Apply masks to bounding boxes using the output of the mask head.
344
+
345
+ Args:
346
+ protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
347
+ masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
348
+ bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
349
+ shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
350
+ upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
351
+
352
+ Returns:
353
+ (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
354
+ are the height and width of the input image. The mask is applied to the bounding boxes.
355
+ """
356
+
357
+ c, mh, mw = protos.shape # CHW
358
+ ih, iw = shape
359
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
360
+
361
+ downsampled_bboxes = bboxes.clone()
362
+ downsampled_bboxes[:, 0] *= mw / iw
363
+ downsampled_bboxes[:, 2] *= mw / iw
364
+ downsampled_bboxes[:, 3] *= mh / ih
365
+ downsampled_bboxes[:, 1] *= mh / ih
366
+
367
+ masks = crop_mask(masks, downsampled_bboxes) # CHW
368
+ if upsample:
369
+ masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
370
+ return masks.gt_(0.5)
371
+
372
+
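A quick, self-contained sanity check of two helpers above (dummy tensors only, not part of the pipeline): `xywh2xyxy` converts a center-format box to corner format, and `crop_mask` zeroes a mask outside that box at the 160x160 prototype resolution used elsewhere in this repo. It assumes `python/` is on `PYTHONPATH` so this file imports as `tools_pt`.

```python
import torch
from tools_pt import xywh2xyxy, crop_mask  # assumes python/ is on PYTHONPATH

boxes_xywh = torch.tensor([[80.0, 80.0, 40.0, 60.0]])  # one box: center (80, 80), 40x60
boxes_xyxy = xywh2xyxy(boxes_xywh)                      # -> [[60., 50., 100., 110.]]

masks = torch.rand(1, 160, 160)                         # one mask at prototype resolution
cropped = crop_mask(masks, boxes_xyxy)                  # values outside the box become 0
print(boxes_xyxy, cropped.shape)                        # torch.Size([1, 160, 160])
```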
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/utils.py ADDED
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+
5
+
6
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
7
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
8
+ Args:
9
+ boxes: (n, 4)
10
+ image_shape: (height, width)
11
+ threshold: pixel threshold
12
+ Returns:
13
+ adjusted_boxes: adjusted bounding boxes
14
+ '''
15
+
16
+ # Image dimensions
17
+ h, w = image_shape
18
+
19
+ # Adjust boxes
20
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
21
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
22
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
23
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
24
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
25
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
26
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
27
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
28
+
29
+ return boxes
30
+
31
+
32
+
33
+ def convert_box_xywh_to_xyxy(box):
34
+ x1 = box[0]
35
+ y1 = box[1]
36
+ x2 = box[0] + box[2]
37
+ y2 = box[1] + box[3]
38
+ return [x1, y1, x2, y2]
39
+
40
+
41
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
42
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
43
+ Args:
44
+ box1: (4, )
45
+ boxes: (n, 4)
46
+ Returns:
47
+ high_iou_indices: Indices of boxes with IoU > thres
48
+ '''
49
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
50
+ # obtain coordinates for intersections
51
+ x1 = torch.max(box1[0], boxes[:, 0])
52
+ y1 = torch.max(box1[1], boxes[:, 1])
53
+ x2 = torch.min(box1[2], boxes[:, 2])
54
+ y2 = torch.min(box1[3], boxes[:, 3])
55
+
56
+ # compute the area of intersection
57
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
58
+
59
+ # compute the area of both individual boxes
60
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
61
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
62
+
63
+ # compute the area of union
64
+ union = box1_area + box2_area - intersection
65
+
66
+ # compute the IoU
67
+ iou = intersection / union # Should be shape (n, )
68
+ if raw_output:
69
+ if iou.numel() == 0:
70
+ return 0
71
+ return iou
72
+
73
+ # get indices of boxes with IoU > thres
74
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
75
+
76
+ return high_iou_indices
77
+
78
+
79
+ def image_to_np_ndarray(image):
80
+ if type(image) is str:
81
+ return np.array(Image.open(image))
82
+ elif issubclass(type(image), Image.Image):
83
+ return np.array(image)
84
+ elif type(image) is np.ndarray:
85
+ return image
86
+ return None
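A minimal usage sketch of the box helpers above (dummy values; assumes `python/` is on `PYTHONPATH` so this module imports as `utils`): a top-left xywh box is converted to xyxy and scored against two candidate boxes with `bbox_iou` in raw-output mode.

```python
import torch
from utils import convert_box_xywh_to_xyxy, bbox_iou

box = convert_box_xywh_to_xyxy([100, 120, 200, 150])       # -> [100, 120, 300, 270]
box1 = torch.tensor(box, dtype=torch.float)

candidates = torch.tensor([[110.0, 130.0, 290.0, 260.0],   # overlaps box1
                           [400.0, 400.0, 500.0, 500.0]])  # no overlap
ious = bbox_iou(box1, candidates, image_shape=(640, 640), raw_output=True)
print(ious)  # roughly [0.78, 0.00]
```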
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/README.md ADDED
@@ -0,0 +1,48 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: 640x640
4
+ - Number of parameters: 68.89M
5
+ - Model size: 277.39M
6
+ - Output shape: 1x37x8400,1x32x160x160
7
+
8
+ Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: W8A16
13
+ - Backend: QNN2.16
14
+ - Target Device: SNM972 QCS8550
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - Install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Download the QNN version that matches the backend above, e.g. to install the QNN2.23 AidLite package: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # aidlite sdk c++ check
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # aidlite sdk python check
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ ```bash
43
+ cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite
44
+ export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0
45
+
46
+ python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
47
+ ```
48
+
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e08622accfab5d0dac489e898818d66b717f8b4ac820ed213585dfe0a37023b
3
+ size 75750664
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/dogs.jpg ADDED
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/onnx_export.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import cv2
3
+ import os
4
+ import sys
5
+
6
+ from ultralytics.models.fastsam import FastSAM
7
+
8
+ class Fast_SAM(torch.nn.Module):
9
+ """Exportable FastSAM model, end-to-end."""
10
+
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+ pt_name ='./models/FastSAM-s.pt'
14
+ self.model =FastSAM(pt_name).model
15
+
16
+ def forward(self, image: torch.Tensor):
17
+ """
18
+ Run FastSAM on `image`, and produce high quality segmentation masks.
19
+ Faster than SAM as it is based on YOLOv8.
20
+
21
+ Parameters:
22
+ image: Pixel values pre-processed for encoder consumption.
23
+ Range: float[0, 1]
24
+ 3-channel Color Space: BGR
25
+ Returns:
26
+
27
+ """
28
+ predictions = self.model(image)
29
+ # Return predictions as a tuple instead of nested tuple.
30
+ return (predictions[0], predictions[1][2])
31
+
32
+
33
+ model = Fast_SAM()
34
+ num_params = sum(p.numel() for p in model.parameters())
35
+ print(f'Number of FastSAM-s parameters: {num_params}')
36
+ dummy_input = torch.randn( [1,3,640,640],dtype=torch.float32 )
37
+ source_model = torch.jit.trace(
38
+ model.to("cpu"), dummy_input, check_trace=False
39
+ )
40
+ torch.onnx.export(model, # model being run
41
+ dummy_input, # model input (or a tuple for multiple inputs)
42
+ "./models/fastsam_s.onnx", # where to save the model
43
+ export_params=True, # store the trained parameter weights inside the model file
44
+ opset_version=12, # the ONNX version to export the model to
45
+ do_constant_folding=True, # whether to execute constant folding for optimization
46
+ input_names = ['input'], # the model's input names
47
+ output_names = ['boxes','mask'],
48
+ verbose=True,
49
+ )
50
+ print("Convert to onnx successfully!")
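Once the export above has finished, the ONNX graph can be sanity-checked on CPU with onnxruntime (already imported by run_test.py) before QNN conversion. A minimal sketch, assuming ./models/fastsam_s.onnx was written by the script:

```python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("./models/fastsam_s.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)  # NCHW, values in [0, 1]

boxes, mask = session.run(["boxes", "mask"], {"input": dummy})
print(boxes.shape, mask.shape)  # expected: (1, 37, 8400) and (1, 32, 160, 160)
```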
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/prompt.py ADDED
@@ -0,0 +1,456 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import torch
7
+ from utils import image_to_np_ndarray
8
+ from PIL import Image
9
+
10
+
11
+ class FastSAMPrompt:
12
+
13
+ def __init__(self, image, results, device='cpu'):
14
+ if isinstance(image, str) or isinstance(image, Image.Image):
15
+ image = image_to_np_ndarray(image)
16
+ self.device = device
17
+ self.results = results
18
+ self.img = image
19
+
20
+ def _segment_image(self, image, bbox):
21
+ if isinstance(image, Image.Image):
22
+ image_array = np.array(image)
23
+ else:
24
+ image_array = image
25
+ segmented_image_array = np.zeros_like(image_array)
26
+ x1, y1, x2, y2 = bbox
27
+ segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
28
+ segmented_image = Image.fromarray(segmented_image_array)
29
+ black_image = Image.new('RGB', image.size, (255, 255, 255))
30
+ # transparency_mask = np.zeros_like((), dtype=np.uint8)
31
+ transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
32
+ transparency_mask[y1:y2, x1:x2] = 255
33
+ transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
34
+ black_image.paste(segmented_image, mask=transparency_mask_image)
35
+ return black_image
36
+
37
+ def _format_results(self, result, filter=0):
38
+ annotations = []
39
+ n = len(result.masks.data)
40
+ for i in range(n):
41
+ annotation = {}
42
+ mask = result.masks.data[i] == 1.0
43
+
44
+ if torch.sum(mask) < filter:
45
+ continue
46
+ annotation['id'] = i
47
+ annotation['segmentation'] = mask.cpu().numpy()
48
+ annotation['bbox'] = result.boxes.data[i]
49
+ annotation['score'] = result.boxes.conf[i]
50
+ annotation['area'] = annotation['segmentation'].sum()
51
+ annotations.append(annotation)
52
+ return annotations
53
+
54
+ def filter_masks(annotations): # filter out overlapping masks
55
+ annotations.sort(key=lambda x: x['area'], reverse=True)
56
+ to_remove = set()
57
+ for i in range(0, len(annotations)):
58
+ a = annotations[i]
59
+ for j in range(i + 1, len(annotations)):
60
+ b = annotations[j]
61
+ if i != j and j not in to_remove:
62
+ # check if
63
+ if b['area'] < a['area']:
64
+ if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
65
+ to_remove.add(j)
66
+
67
+ return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
68
+
69
+ def _get_bbox_from_mask(self, mask):
70
+ mask = mask.astype(np.uint8)
71
+ contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
72
+ x1, y1, w, h = cv2.boundingRect(contours[0])
73
+ x2, y2 = x1 + w, y1 + h
74
+ if len(contours) > 1:
75
+ for b in contours:
76
+ x_t, y_t, w_t, h_t = cv2.boundingRect(b)
77
+ # Merge multiple bounding boxes into one.
78
+ x1 = min(x1, x_t)
79
+ y1 = min(y1, y_t)
80
+ x2 = max(x2, x_t + w_t)
81
+ y2 = max(y2, y_t + h_t)
82
+ h = y2 - y1
83
+ w = x2 - x1
84
+ return [x1, y1, x2, y2]
85
+
86
+ def plot_to_result(self,
87
+ annotations,
88
+ bboxes=None,
89
+ points=None,
90
+ point_label=None,
91
+ mask_random_color=True,
92
+ better_quality=True,
93
+ retina=False,
94
+ withContours=True) -> np.ndarray:
95
+ if isinstance(annotations[0], dict):
96
+ annotations = [annotation['segmentation'] for annotation in annotations]
97
+ image = self.img
98
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
99
+ original_h = image.shape[0]
100
+ original_w = image.shape[1]
101
+ if sys.platform == "darwin":
102
+ plt.switch_backend("TkAgg")
103
+ plt.figure(figsize=(original_w / 100, original_h / 100))
104
+ # Add subplot with no margin.
105
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
106
+ plt.margins(0, 0)
107
+ plt.gca().xaxis.set_major_locator(plt.NullLocator())
108
+ plt.gca().yaxis.set_major_locator(plt.NullLocator())
109
+
110
+ plt.imshow(image)
111
+ if better_quality:
112
+ if isinstance(annotations[0], torch.Tensor):
113
+ annotations = np.array(annotations.cpu())
114
+ for i, mask in enumerate(annotations):
115
+ mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
116
+ annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
117
+ if self.device == 'cpu':
118
+ annotations = np.array(annotations)
119
+ self.fast_show_mask(
120
+ annotations,
121
+ plt.gca(),
122
+ random_color=mask_random_color,
123
+ bboxes=bboxes,
124
+ points=points,
125
+ pointlabel=point_label,
126
+ retinamask=retina,
127
+ target_height=original_h,
128
+ target_width=original_w,
129
+ )
130
+ else:
131
+ if isinstance(annotations[0], np.ndarray):
132
+ annotations = torch.from_numpy(annotations)
133
+ self.fast_show_mask_gpu(
134
+ annotations,
135
+ plt.gca(),
136
+ random_color=mask_random_color,
137
+ bboxes=bboxes,
138
+ points=points,
139
+ pointlabel=point_label,
140
+ retinamask=retina,
141
+ target_height=original_h,
142
+ target_width=original_w,
143
+ )
144
+ if isinstance(annotations, torch.Tensor):
145
+ annotations = annotations.cpu().numpy()
146
+ if withContours:
147
+ contour_all = []
148
+ temp = np.zeros((original_h, original_w, 1))
149
+ for i, mask in enumerate(annotations):
150
+ if type(mask) == dict:
151
+ mask = mask['segmentation']
152
+ annotation = mask.astype(np.uint8)
153
+ if not retina:
154
+ annotation = cv2.resize(
155
+ annotation,
156
+ (original_w, original_h),
157
+ interpolation=cv2.INTER_NEAREST,
158
+ )
159
+ contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
160
+ for contour in contours:
161
+ contour_all.append(contour)
162
+ cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
163
+ color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
164
+ contour_mask = temp / 255 * color.reshape(1, 1, -1)
165
+ plt.imshow(contour_mask)
166
+
167
+ plt.axis('off')
168
+ fig = plt.gcf()
169
+ plt.draw()
170
+
171
+ try:
172
+ buf = fig.canvas.tostring_rgb()
173
+ except AttributeError:
174
+ fig.canvas.draw()
175
+ buf = fig.canvas.tostring_rgb()
176
+ cols, rows = fig.canvas.get_width_height()
177
+ img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
178
+ result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
179
+ plt.close()
180
+ return result
181
+
182
+ # Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be separated, and they do not necessarily need to be class methods but could be standalone utility functions that the user chains in his scripts for more fine-grained control.
183
+ def plot(self,
184
+ annotations,
185
+ output_path,
186
+ bboxes=None,
187
+ points=None,
188
+ point_label=None,
189
+ mask_random_color=True,
190
+ better_quality=True,
191
+ retina=False,
192
+ withContours=True):
193
+ if len(annotations) == 0:
194
+ return None
195
+ result = self.plot_to_result(
196
+ annotations,
197
+ bboxes,
198
+ points,
199
+ point_label,
200
+ mask_random_color,
201
+ better_quality,
202
+ retina,
203
+ withContours,
204
+ )
205
+
206
+ path = os.path.dirname(os.path.abspath(output_path))
207
+ if not os.path.exists(path):
208
+ os.makedirs(path)
209
+ result = result[:, :, ::-1]
210
+ cv2.imwrite(output_path, result)
211
+
212
+ # CPU post process
213
+ def fast_show_mask(
214
+ self,
215
+ annotation,
216
+ ax,
217
+ random_color=False,
218
+ bboxes=None,
219
+ points=None,
220
+ pointlabel=None,
221
+ retinamask=True,
222
+ target_height=960,
223
+ target_width=960,
224
+ ):
225
+ msak_sum = annotation.shape[0]
226
+ height = annotation.shape[1]
227
+ weight = annotation.shape[2]
228
+ #Sort annotations based on area.
229
+ areas = np.sum(annotation, axis=(1, 2))
230
+ sorted_indices = np.argsort(areas)
231
+ annotation = annotation[sorted_indices]
232
+
233
+ index = (annotation != 0).argmax(axis=0)
234
+ if random_color:
235
+ color = np.random.random((msak_sum, 1, 1, 3))
236
+ else:
237
+ color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
238
+ transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
239
+ visual = np.concatenate([color, transparency], axis=-1)
240
+ mask_image = np.expand_dims(annotation, -1) * visual
241
+
242
+ show = np.zeros((height, weight, 4))
243
+ h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
244
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
245
+ # Use vectorized indexing to update the values of 'show'.
246
+ show[h_indices, w_indices, :] = mask_image[indices]
247
+ if bboxes is not None:
248
+ for bbox in bboxes:
249
+ x1, y1, x2, y2 = bbox
250
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
251
+ # draw point
252
+ if points is not None:
253
+ plt.scatter(
254
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
255
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
256
+ s=20,
257
+ c='y',
258
+ )
259
+ plt.scatter(
260
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
261
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
262
+ s=20,
263
+ c='m',
264
+ )
265
+
266
+ if not retinamask:
267
+ show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
268
+ ax.imshow(show)
269
+
270
+ def fast_show_mask_gpu(
271
+ self,
272
+ annotation,
273
+ ax,
274
+ random_color=False,
275
+ bboxes=None,
276
+ points=None,
277
+ pointlabel=None,
278
+ retinamask=True,
279
+ target_height=960,
280
+ target_width=960,
281
+ ):
282
+ msak_sum = annotation.shape[0]
283
+ height = annotation.shape[1]
284
+ weight = annotation.shape[2]
285
+ areas = torch.sum(annotation, dim=(1, 2))
286
+ sorted_indices = torch.argsort(areas, descending=False)
287
+ annotation = annotation[sorted_indices]
288
+ # Find the index of the first non-zero value at each position.
289
+ index = (annotation != 0).to(torch.long).argmax(dim=0)
290
+ if random_color:
291
+ color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
292
+ else:
293
+ color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
294
+ 30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
295
+ transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
296
+ visual = torch.cat([color, transparency], dim=-1)
297
+ mask_image = torch.unsqueeze(annotation, -1) * visual
298
+ # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
299
+ show = torch.zeros((height, weight, 4)).to(annotation.device)
300
+ try:
301
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
302
+ except:
303
+ h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
304
+ indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
305
+ # Use vectorized indexing to update the values of 'show'.
306
+ show[h_indices, w_indices, :] = mask_image[indices]
307
+ show_cpu = show.cpu().numpy()
308
+ if bboxes is not None:
309
+ for bbox in bboxes:
310
+ x1, y1, x2, y2 = bbox
311
+ ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
312
+ # draw point
313
+ if points is not None:
314
+ plt.scatter(
315
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
316
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
317
+ s=20,
318
+ c='y',
319
+ )
320
+ plt.scatter(
321
+ [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
322
+ [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
323
+ s=20,
324
+ c='m',
325
+ )
326
+ if not retinamask:
327
+ show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
328
+ ax.imshow(show_cpu)
329
+
330
+ # clip
331
+ @torch.no_grad()
332
+ def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
333
+ preprocessed_images = [preprocess(image).to(device) for image in elements]
334
+ try:
335
+ import clip # for linear_assignment
336
+
337
+ except (ImportError, AssertionError, AttributeError):
338
+ from ultralytics.yolo.utils.checks import check_requirements
339
+
340
+ check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
341
+ import clip
342
+
343
+
344
+ tokenized_text = clip.tokenize([search_text]).to(device)
345
+ stacked_images = torch.stack(preprocessed_images)
346
+ image_features = model.encode_image(stacked_images)
347
+ text_features = model.encode_text(tokenized_text)
348
+ image_features /= image_features.norm(dim=-1, keepdim=True)
349
+ text_features /= text_features.norm(dim=-1, keepdim=True)
350
+ probs = 100.0 * image_features @ text_features.T
351
+ return probs[:, 0].softmax(dim=0)
352
+
353
+ def _crop_image(self, format_results):
354
+
355
+ image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
356
+ ori_w, ori_h = image.size
357
+ annotations = format_results
358
+ mask_h, mask_w = annotations[0]['segmentation'].shape
359
+ if ori_w != mask_w or ori_h != mask_h:
360
+ image = image.resize((mask_w, mask_h))
361
+ cropped_boxes = []
362
+ cropped_images = []
363
+ not_crop = []
364
+ filter_id = []
365
+ # annotations, _ = filter_masks(annotations)
366
+ # filter_id = list(_)
367
+ for _, mask in enumerate(annotations):
368
+ if np.sum(mask['segmentation']) <= 100:
369
+ filter_id.append(_)
370
+ continue
371
+ bbox = self._get_bbox_from_mask(mask['segmentation']) # bbox of the mask
372
+ cropped_boxes.append(self._segment_image(image, bbox))
373
+ # cropped_boxes.append(segment_image(image,mask["segmentation"]))
374
+ cropped_images.append(bbox) # Save the bounding box of the cropped image.
375
+
376
+ return cropped_boxes, cropped_images, not_crop, filter_id, annotations
377
+
378
+ def box_prompt(self, bbox=None, bboxes=None):
379
+ if self.results == None:
380
+ return []
381
+ assert bbox or bboxes
382
+ if bboxes is None:
383
+ bboxes = [bbox]
384
+ max_iou_index = []
385
+ for bbox in bboxes:
386
+ assert (bbox[2] != 0 and bbox[3] != 0)
387
+ masks = self.results[0].masks.data
388
+ target_height = self.img.shape[0]
389
+ target_width = self.img.shape[1]
390
+ h = masks.shape[1]
391
+ w = masks.shape[2]
392
+ if h != target_height or w != target_width:
393
+ bbox = [
394
+ int(bbox[0] * w / target_width),
395
+ int(bbox[1] * h / target_height),
396
+ int(bbox[2] * w / target_width),
397
+ int(bbox[3] * h / target_height), ]
398
+ bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
399
+ bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
400
+ bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
401
+ bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
402
+
403
+ # IoUs = torch.zeros(len(masks), dtype=torch.float32)
404
+ bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
405
+
406
+ masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
407
+ orig_masks_area = torch.sum(masks, dim=(1, 2))
408
+
409
+ union = bbox_area + orig_masks_area - masks_area
410
+ IoUs = masks_area / union
411
+ max_iou_index.append(int(torch.argmax(IoUs)))
412
+ max_iou_index = list(set(max_iou_index))
413
+ return np.array(masks[max_iou_index].cpu().numpy())
414
+
415
+ def point_prompt(self, points, pointlabel): # numpy
416
+ if self.results == None:
417
+ return []
418
+ masks = self._format_results(self.results[0], 0)
419
+ target_height = self.img.shape[0]
420
+ target_width = self.img.shape[1]
421
+ h = masks[0]['segmentation'].shape[0]
422
+ w = masks[0]['segmentation'].shape[1]
423
+ if h != target_height or w != target_width:
424
+ points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
425
+ onemask = np.zeros((h, w))
426
+ masks = sorted(masks, key=lambda x: x['area'], reverse=True)
427
+ for i, annotation in enumerate(masks):
428
+ if type(annotation) == dict:
429
+ mask = annotation['segmentation']
430
+ else:
431
+ mask = annotation
432
+ for i, point in enumerate(points):
433
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
434
+ onemask[mask] = 1
435
+ if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
436
+ onemask[mask] = 0
437
+ onemask = onemask >= 1
438
+ return np.array([onemask])
439
+
440
+ def text_prompt(self, text):
441
+ if self.results == None:
442
+ return []
443
+ format_results = self._format_results(self.results[0], 0)
444
+ cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
445
+ clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
446
+ scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
447
+ max_idx = scores.argsort()
448
+ max_idx = max_idx[-1]
449
+ max_idx += sum(np.array(filter_id) <= int(max_idx))
450
+ return np.array([annotations[max_idx]['segmentation']])
451
+
452
+ def everything_prompt(self):
453
+ if self.results == None:
454
+ return []
455
+ return self.results[0].masks.data
456
+
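The core of `point_prompt` above is a simple selection rule: every mask that contains a positive prompt point is merged into one binary mask, and masks containing a negative point are cleared. The standalone sketch below illustrates that rule on synthetic numpy masks (it does not use the class, so no `Results` object is needed):

```python
import numpy as np

h, w = 160, 160
masks = np.zeros((3, h, w), dtype=bool)
masks[0, 20:80, 20:80] = True      # contains the prompt point
masks[1, 60:120, 60:120] = True    # also contains the prompt point
masks[2, 130:150, 130:150] = True  # does not contain it

points, pointlabel = [[70, 70]], [1]   # (x, y); 1 = positive, 0 = negative
onemask = np.zeros((h, w))
for mask in masks:
    for i, point in enumerate(points):
        if mask[point[1], point[0]] and pointlabel[i] == 1:
            onemask[mask] = 1
        elif mask[point[1], point[0]] and pointlabel[i] == 0:
            onemask[mask] = 0
onemask = onemask >= 1
print(onemask.sum())  # 6800: union of the first two masks
```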
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/run_test.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import sys
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime
6
+ import time
7
+ import matplotlib.pyplot as plt
8
+ import torch
9
+ from ultralytics.engine.results import Results
10
+ from tools_pt import *
11
+ from prompt import FastSAMPrompt
12
+ import aidlite
13
+ import argparse
14
+ import ast
15
+
16
+ # cosine-similarity helper
17
+ def get_acc(onnx_out,other_out):
18
+ cosine_similarity=np.dot(np.array(onnx_out),np.array(other_out))/(np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
19
+ return cosine_similarity
20
+
21
+ def cal_sigmoid(x):
22
+ return 1 / (1 + np.exp(-x))
23
+
24
+ class qnn_predict(object):
25
+ def __init__(self,inputshape,outputshape,args) -> None:
26
+ aidlite.set_log_level(aidlite.LogLevel.INFO)
27
+ aidlite.log_to_stderr()
28
+ print(f"Aidlite library version : {aidlite.get_library_version()}")
29
+ print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
30
+ config = aidlite.Config.create_instance()
31
+ if config is None:
32
+ print("Create model failed !")
33
+ config.implement_type = aidlite.ImplementType.TYPE_LOCAL
34
+ config.framework_type = aidlite.FrameworkType.TYPE_QNN
35
+ config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
36
+ config.is_quantify_model = 1
37
+
38
+ model = aidlite.Model.create_instance(args.target_model)
39
+ if model is None:
40
+ print("Create model failed !")
41
+
42
+ self.input_shape=inputshape
43
+ self.out_shape = outputshape
44
+ model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
45
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
46
+ if self.interpreter is None:
47
+ print("build_interpretper_from_model_and_config failed !")
48
+ result = self.interpreter.init()
49
+ if result != 0:
50
+ print(f"interpreter init failed !")
51
+ result = self.interpreter.load_model()
52
+ if result != 0:
53
+ print("interpreter load model failed !")
54
+ print("detect model load success!")
55
+
56
+ self.conf = 0.4
57
+ self.iou=0.9
58
+ self.size = 640
59
+ self.agnostic_nms=False
60
+ self.max_det = 300
61
+ self.names=['object']
62
+ self.classes =None
63
+ self.retina_masks=True
64
+
65
+ def pretreat_img(self,img):
66
+ scale = 1/255.
67
+ img_size = cv2.resize(img, (self.size,self.size), interpolation=cv2.INTER_LINEAR)
68
+ float_img = img_size.astype('float32')
69
+ float_img = float_img* scale
70
+ float_img = float_img[:, :, ::-1]
71
+ return float_img
72
+
73
+ def postprocess(self, preds, img, orig_imgs):
74
+ """TODO: filter by classes."""
75
+ p = non_max_suppression(torch.from_numpy(preds[0]),
76
+ self.conf,
77
+ self.iou,
78
+ agnostic=self.agnostic_nms,
79
+ max_det=self.max_det,
80
+ nc=len(self.names),
81
+ classes=self.classes)
82
+
83
+ results = []
84
+ if len(p) == 0 or len(p[0]) == 0:
85
+ print("No object detected.")
86
+ return results
87
+
88
+ full_box = torch.zeros_like(p[0][0])
89
+ full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
90
+ full_box = full_box.view(1, -1)
91
+ critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
92
+ if critical_iou_index.numel() != 0:
93
+ full_box[0][4] = p[0][critical_iou_index][:,4]
94
+ full_box[0][6:] = p[0][critical_iou_index][:,6:]
95
+ p[0][critical_iou_index] = full_box
96
+
97
+ #proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported
98
+ proto=torch.from_numpy(preds[-1])
99
+ for i, pred in enumerate(p):
100
+ orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
101
+ path =img[0] #self.batch[0]
102
+ img_path = path[i] if isinstance(path, list) else path
103
+ if not len(pred): # save empty boxes
104
+ results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
105
+ continue
106
+ if self.retina_masks:
107
+ if not isinstance(orig_imgs, torch.Tensor):
108
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
109
+ masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC
110
+ else:
111
+ masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC
112
+ if not isinstance(orig_imgs, torch.Tensor):
113
+ pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
114
+ results.append(
115
+ Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
116
+ return results
117
+
118
+ def qnn_run(self, orig_imgs,img_path,args):
119
+ input_img_f = self.pretreat_img(orig_imgs) # resize image to model input (HWC)
120
+ # print("qnn_input:",input_img_f)
121
+ # encoder texts
122
+ input_img = np.expand_dims(input_img_f, 0)
123
+
124
+ invoke_time=[]
125
+ for i in range(args.invoke_nums):
126
+ result = self.interpreter.set_input_tensor(0, input_img.data)
127
+ t0 = time.time()
128
+ result = self.interpreter.invoke()
129
+ t1 = time.time()
130
+ cost_time=(t1-t0)*1000
131
+ invoke_time.append(cost_time)
132
+ mask_ = self.interpreter.get_output_tensor(0)
133
+ concat_ = self.interpreter.get_output_tensor(1)
134
+ mul_ = self.interpreter.get_output_tensor(3)
135
+ split_ = self.interpreter.get_output_tensor(2)
136
+ mask_ = mask_.reshape( * self.out_shape[3])
137
+ mask_=mask_.transpose((0, 3, 1,2))
138
+ concat_ = concat_.reshape( *self.out_shape[2])
139
+ mul_ = mul_.reshape( *self.out_shape[1])
140
+ split_ = split_.reshape( *self.out_shape[0])
141
+ sig_ = cal_sigmoid(split_)
142
+
143
+ output_concat = np.concatenate((mul_,sig_),axis=1)
144
+ output_concat = np.concatenate((output_concat,concat_),axis=1)
145
+
146
+ # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
147
+ ## timing statistics
148
+ max_invoke_time = max(invoke_time)
149
+ min_invoke_time = min(invoke_time)
150
+ mean_invoke_time = sum(invoke_time)/args.invoke_nums
151
+ var_invoketime=np.var(invoke_time)
152
+ print("========================================")
153
+ print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
154
+ print("========================================")
155
+
156
+ qnn_out = [np.array(output_concat),np.array(mask_)]
157
+ # print("qnn predict out:",qnn_out)
158
+
159
+ nchw_img = input_img.transpose(0,3,1,2)
160
+ everything_results = self.postprocess( qnn_out, nchw_img, [orig_imgs])
161
+ # print("everything_results: ",everything_results)
162
+
163
+ prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")
164
+
165
+ # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
166
+ try:
167
+ if args.point_prompt ==[[0,0]]:
168
+ ann = prompt_process.everything_prompt()
169
+ else:
170
+ ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
171
+ out_name = os.path.basename(img_path).split(".")[0]
172
+ if True: # savepic
173
+ outpath = "python/"
174
+ if not os.path.exists(outpath):
175
+ os.mkdir(outpath)
176
+ prompt_process.plot(
177
+ annotations=ann,
178
+ output_path=os.path.join(outpath,out_name+"_result.jpg"),
179
+ mask_random_color=True,
180
+ better_quality=True,
181
+ retina=False,
182
+ withContours=True,
183
+ )
184
+ else:
185
+ plt.figure()
186
+ prompt_process.fast_show_mask(annotation=ann,
187
+ ax = plt)
188
+ except Exception as e:
189
+ print(f"Warning: an error occurred while predicting the picture {img_path} - {e}")
190
+ return [mask_.reshape(-1),output_concat.reshape(-1)]
191
+
192
+
193
+
194
+ def parser_args():
195
+ parser = argparse.ArgumentParser(description="Run model benchmarks")
196
+ parser.add_argument('--target_model',type=str,default='models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin.aidem',help="inference model path")
197
+ parser.add_argument('--source_model',type=str,default='models/fastsam_x.onnx',help="original model path")
198
+ parser.add_argument('--imgs',type=str,default='python/dogs.jpg',help="Predict images path")
199
+ parser.add_argument('--invoke_nums',type=int,default=10,help="Inference nums")
200
+ parser.add_argument('--point_prompt',type=str,default="[[0,0]]",help="example:[[x1,y1],[x2,y2]]")
201
+ args = parser.parse_args()
202
+ return args
203
+
204
+
205
+ if __name__ == "__main__":
206
+ args = parser_args()
207
+ inputshape=[[1,640,640,3]]
208
+ outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
209
+ args.point_prompt = ast.literal_eval(args.point_prompt)
210
+
211
+ predict = qnn_predict(inputshape,outputshape,args)
212
+ if os.path.isdir(args.imgs):
213
+ img_files = os.listdir(args.imgs)
214
+ for fi in img_files:
215
+ img_path = os.path.join(args.imgs,fi)
216
+ im0s = cv2.imread(img_path) # BGR
217
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
218
+ predict.qnn_run(im0s,img_path,args)
219
+ else:
220
+ img_path = args.imgs
221
+ im0s = cv2.imread(img_path) # BGR
222
+ im0s = cv2.resize(im0s, (640,640), interpolation=cv2.INTER_LINEAR)
223
+ qnn_result = predict.qnn_run(im0s,img_path,args)
224
+ print("Prediction completion and the results are saved !")
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/tools_pt.py ADDED
@@ -0,0 +1,372 @@
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.functional as F
6
+
7
+
8
+
9
+ def clip_boxes(boxes, shape):
10
+ """
11
+ Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
12
+
13
+ Args:
14
+ boxes (torch.Tensor): the bounding boxes to clip
15
+ shape (tuple): the shape of the image
16
+ """
17
+ if isinstance(boxes, torch.Tensor): # faster individually
18
+ boxes[..., 0].clamp_(0, shape[1]) # x1
19
+ boxes[..., 1].clamp_(0, shape[0]) # y1
20
+ boxes[..., 2].clamp_(0, shape[1]) # x2
21
+ boxes[..., 3].clamp_(0, shape[0]) # y2
22
+ else: # np.array (faster grouped)
23
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
24
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
25
+
26
+ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
27
+ """
28
+ Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
29
+ (img1_shape) to the shape of a different image (img0_shape).
30
+
31
+ Args:
32
+ img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
33
+ boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
34
+ img0_shape (tuple): the shape of the target image, in the format of (height, width).
35
+ ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
36
+ calculated based on the size difference between the two images.
37
+ padding (bool): If True, assume the boxes are based on an image augmented in YOLO style. If False, do regular
38
+ rescaling.
39
+
40
+ Returns:
41
+ boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
42
+ """
43
+ if ratio_pad is None: # calculate from img0_shape
44
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
45
+ pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
46
+ (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
47
+ else:
48
+ gain = ratio_pad[0][0]
49
+ pad = ratio_pad[1]
50
+
51
+ if padding:
52
+ boxes[..., [0, 2]] -= pad[0] # x padding
53
+ boxes[..., [1, 3]] -= pad[1] # y padding
54
+ boxes[..., :4] /= gain
55
+ clip_boxes(boxes, img0_shape)
56
+ return boxes
57
+
58
+
59
+ def xywh2xyxy(x):
60
+ """
61
+ Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
62
+ top-left corner and (x2, y2) is the bottom-right corner.
63
+
64
+ Args:
65
+ x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
66
+
67
+ Returns:
68
+ y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
69
+ """
70
+ assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
71
+ y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
72
+ dw = x[..., 2] / 2 # half-width
73
+ dh = x[..., 3] / 2 # half-height
74
+ y[..., 0] = x[..., 0] - dw # top left x
75
+ y[..., 1] = x[..., 1] - dh # top left y
76
+ y[..., 2] = x[..., 0] + dw # bottom right x
77
+ y[..., 3] = x[..., 1] + dh # bottom right y
78
+ return y
79
+
80
+
81
+ def non_max_suppression(
82
+ prediction,
83
+ conf_thres=0.25,
84
+ iou_thres=0.45,
85
+ classes=None,
86
+ agnostic=False,
87
+ multi_label=False,
88
+ labels=(),
89
+ max_det=300,
90
+ nc=0, # number of classes (optional)
91
+ max_time_img=0.05,
92
+ max_nms=30000,
93
+ max_wh=7680,
94
+ ):
95
+ """
96
+ Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
97
+
98
+ Args:
99
+ prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
100
+ containing the predicted boxes, classes, and masks. The tensor should be in the format
101
+ output by a model, such as YOLO.
102
+ conf_thres (float): The confidence threshold below which boxes will be filtered out.
103
+ Valid values are between 0.0 and 1.0.
104
+ iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
105
+ Valid values are between 0.0 and 1.0.
106
+ classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
107
+ agnostic (bool): If True, the model is agnostic to the number of classes, and all
108
+ classes will be considered as one.
109
+ multi_label (bool): If True, each box may have multiple labels.
110
+ labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
111
+ list contains the apriori labels for a given image. The list should be in the format
112
+ output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
113
+ max_det (int): The maximum number of boxes to keep after NMS.
114
+ nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
115
+ max_time_img (float): The maximum time (seconds) for processing one image.
116
+ max_nms (int): The maximum number of boxes into torchvision.ops.nms().
117
+ max_wh (int): The maximum box width and height in pixels
118
+
119
+ Returns:
120
+ (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
121
+ shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
122
+ (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
123
+ """
124
+
125
+ # Checks
126
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
127
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
128
+ if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation model, output = (inference_out, loss_out)
129
+ prediction = prediction[0] # select only inference output
130
+
131
+ device = prediction.device
132
+ mps = 'mps' in device.type # Apple MPS
133
+ if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
134
+ prediction = prediction.cpu()
135
+ bs = prediction.shape[0] # batch size
136
+ nc = nc or (prediction.shape[1] - 4) # number of classes
137
+ nm = prediction.shape[1] - nc - 4
138
+ mi = 4 + nc # mask start index
139
+ xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
140
+
141
+ # Settings
142
+ # min_wh = 2 # (pixels) minimum box width and height
143
+ time_limit = 0.5 + max_time_img * bs # seconds to quit after
144
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
145
+
146
+ prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
147
+ prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
148
+
149
+ t = time.time()
150
+ output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
151
+ for xi, x in enumerate(prediction): # image index, image inference
152
+ # Apply constraints
153
+ # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
154
+ x = x[xc[xi]] # confidence
155
+
156
+ # Cat apriori labels if autolabelling
157
+ if labels and len(labels[xi]):
158
+ lb = labels[xi]
159
+ v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
160
+ v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
161
+ v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
162
+ x = torch.cat((x, v), 0)
163
+
164
+ # If none remain process next image
165
+ if not x.shape[0]:
166
+ continue
167
+
168
+ # Detections matrix nx6 (xyxy, conf, cls)
169
+ box, cls, mask = x.split((4, nc, nm), 1)
170
+
171
+ if multi_label:
172
+ i, j = torch.where(cls > conf_thres)
173
+ x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
174
+ else: # best class only
175
+ conf, j = cls.max(1, keepdim=True)
176
+ x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
177
+
178
+ # Filter by class
179
+ if classes is not None:
180
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
181
+
182
+ # Check shape
183
+ n = x.shape[0] # number of boxes
184
+ if not n: # no boxes
185
+ continue
186
+ if n > max_nms: # excess boxes
187
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
188
+
189
+ # Batched NMS
190
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
191
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
192
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
193
+ i = i[:max_det] # limit detections
194
+
195
+ # # Experimental
196
+ # merge = False # use merge-NMS
197
+ # if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
198
+ # # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
199
+ # from .metrics import box_iou
200
+ # iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
201
+ # weights = iou * scores[None] # box weights
202
+ # x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
203
+ # redundant = True # require redundant detections
204
+ # if redundant:
205
+ # i = i[iou.sum(1) > 1] # require redundancy
206
+
207
+ output[xi] = x[i]
208
+ if mps:
209
+ output[xi] = output[xi].to(device)
210
+ # if (time.time() - t) > time_limit:
211
+ # LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
212
+ # break # time limit exceeded
213
+
214
+ return output
215
+
216
+
217
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
218
+ '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
219
+ Args:
220
+ boxes: (n, 4)
221
+ image_shape: (height, width)
222
+ threshold: pixel threshold
223
+ Returns:
224
+ adjusted_boxes: adjusted bounding boxes
225
+ '''
226
+
227
+ # Image dimensions
228
+ h, w = image_shape
229
+
230
+ # Adjust boxes
231
+ boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
232
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
233
+ boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
234
+ 0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
235
+ boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
236
+ w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
237
+ boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
238
+ h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
239
+
240
+ return boxes
241
+
242
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
243
+ '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
244
+ Args:
245
+ box1: (4, )
246
+ boxes: (n, 4)
247
+ Returns:
248
+ high_iou_indices: Indices of boxes with IoU > thres
249
+ '''
250
+ boxes = adjust_bboxes_to_image_border(boxes, image_shape)
251
+ # obtain coordinates for intersections
252
+ x1 = torch.max(box1[0], boxes[:, 0])
253
+ y1 = torch.max(box1[1], boxes[:, 1])
254
+ x2 = torch.min(box1[2], boxes[:, 2])
255
+ y2 = torch.min(box1[3], boxes[:, 3])
256
+
257
+ # compute the area of intersection
258
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
259
+
260
+ # compute the area of both individual boxes
261
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
262
+ box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
263
+
264
+ # compute the area of union
265
+ union = box1_area + box2_area - intersection
266
+
267
+ # compute the IoU
268
+ iou = intersection / union # Should be shape (n, )
269
+ if raw_output:
270
+ if iou.numel() == 0:
271
+ return 0
272
+ return iou
273
+
274
+ # get indices of boxes with IoU > thres
275
+ high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
276
+
277
+ return high_iou_indices
278
+
279
+
280
+ def scale_masks(masks, shape, padding=True):
281
+ """
282
+ Rescale segment masks to shape.
283
+
284
+ Args:
285
+ masks (torch.Tensor): (N, C, H, W).
286
+ shape (tuple): Height and width.
287
+ padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
288
+ rescaling.
289
+ """
290
+ mh, mw = masks.shape[2:]
291
+ gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
292
+ pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
293
+ if padding:
294
+ pad[0] /= 2
295
+ pad[1] /= 2
296
+ top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
297
+ bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
298
+ masks = masks[..., top:bottom, left:right]
299
+
300
+ masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
301
+ return masks
302
+
303
+
304
+ def process_mask_native(protos, masks_in, bboxes, shape):
305
+ """
306
+ It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
307
+
308
+ Args:
309
+ protos (torch.Tensor): [mask_dim, mask_h, mask_w]
310
+ masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
311
+ bboxes (torch.Tensor): [n, 4], n is number of masks after nms
312
+ shape (tuple): the size of the input image (h,w)
313
+
314
+ Returns:
315
+ masks (torch.Tensor): The returned masks with dimensions [h, w, n]
316
+ """
317
+ c, mh, mw = protos.shape # CHW
318
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
319
+ masks = scale_masks(masks[None], shape)[0] # CHW
320
+ masks = crop_mask(masks, bboxes) # CHW
321
+ return masks.gt_(0.5)
322
+
323
+ def crop_mask(masks, boxes):
324
+ """
325
+ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
326
+
327
+ Args:
328
+ masks (torch.Tensor): [n, h, w] tensor of masks
329
+ boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
330
+
331
+ Returns:
332
+ (torch.Tensor): The masks are being cropped to the bounding box.
333
+ """
334
+ _, h, w = masks.shape
335
+ x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
336
+ r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
337
+ c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
338
+
339
+ return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
340
+
341
+ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
342
+ """
343
+ Apply masks to bounding boxes using the output of the mask head.
344
+
345
+ Args:
346
+ protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
347
+ masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
348
+ bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
349
+ shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
350
+ upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
351
+
352
+ Returns:
353
+ (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
354
+ are the height and width of the input image. The mask is applied to the bounding boxes.
355
+ """
356
+
357
+ c, mh, mw = protos.shape # CHW
358
+ ih, iw = shape
359
+ masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
360
+
361
+ downsampled_bboxes = bboxes.clone()
362
+ downsampled_bboxes[:, 0] *= mw / iw
363
+ downsampled_bboxes[:, 2] *= mw / iw
364
+ downsampled_bboxes[:, 3] *= mh / ih
365
+ downsampled_bboxes[:, 1] *= mh / ih
366
+
367
+ masks = crop_mask(masks, downsampled_bboxes) # CHW
368
+ if upsample:
369
+ masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
370
+ return masks.gt_(0.5)
371
+
372
+
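The file above is the detection post-processing path for the converted model's raw outputs (a 1x37x8400 detection tensor and 1x32x160x160 mask prototypes). The uploaded run_test.py implements the actual inference flow; the snippet below is only a condensed, hedged sketch of how these helpers chain together. The `postprocess` wrapper, the array names `pred_np` / `proto_np`, and the original-image size are illustrative assumptions, not names from the upload.

```python
# Illustrative sketch only: pred_np, proto_np and orig_hw are assumed inputs,
# not identifiers taken from the uploaded scripts.
import torch
from tools_pt import non_max_suppression, process_mask_native, scale_boxes

def postprocess(pred_np, proto_np, orig_hw, input_hw=(640, 640),
                conf_thres=0.4, iou_thres=0.9):
    pred = torch.from_numpy(pred_np)         # (1, 37, 8400): 4 box + 1 score + 32 mask coeffs
    protos = torch.from_numpy(proto_np)[0]   # (32, 160, 160) mask prototypes
    det = non_max_suppression(pred, conf_thres, iou_thres, nc=1)[0]  # (n, 38)
    if det.shape[0] == 0:
        return det, None
    det[:, :4] = scale_boxes(input_hw, det[:, :4], orig_hw)          # undo 640x640 letterbox
    masks = process_mask_native(protos, det[:, 6:], det[:, :4], orig_hw)
    return det, masks  # boxes in original-image coords, per-box masks at original resolution
```

Note the order: boxes are rescaled to the original image first, so `process_mask_native` can rebuild and crop the masks directly at the original resolution.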
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/utils.py ADDED
@@ -0,0 +1,86 @@
+ import numpy as np
+ import torch
+ from PIL import Image
+
+
+ def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
+     '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
+     Args:
+         boxes: (n, 4)
+         image_shape: (height, width)
+         threshold: pixel threshold
+     Returns:
+         adjusted_boxes: adjusted bounding boxes
+     '''
+
+     # Image dimensions
+     h, w = image_shape
+
+     # Adjust boxes
+     boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
+         0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
+     boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
+         0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
+     boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
+         w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
+     boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
+         h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2
+
+     return boxes
+
+
+
+ def convert_box_xywh_to_xyxy(box):
+     x1 = box[0]
+     y1 = box[1]
+     x2 = box[0] + box[2]
+     y2 = box[1] + box[3]
+     return [x1, y1, x2, y2]
+
+
+ def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
+     '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
+     Args:
+         box1: (4, )
+         boxes: (n, 4)
+     Returns:
+         high_iou_indices: Indices of boxes with IoU > thres
+     '''
+     boxes = adjust_bboxes_to_image_border(boxes, image_shape)
+     # obtain coordinates for intersections
+     x1 = torch.max(box1[0], boxes[:, 0])
+     y1 = torch.max(box1[1], boxes[:, 1])
+     x2 = torch.min(box1[2], boxes[:, 2])
+     y2 = torch.min(box1[3], boxes[:, 3])
+
+     # compute the area of intersection
+     intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
+
+     # compute the area of both individual boxes
+     box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+     # compute the area of union
+     union = box1_area + box2_area - intersection
+
+     # compute the IoU
+     iou = intersection / union  # Should be shape (n, )
+     if raw_output:
+         if iou.numel() == 0:
+             return 0
+         return iou
+
+     # get indices of boxes with IoU > thres
+     high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
+
+     return high_iou_indices
+
+
+ def image_to_np_ndarray(image):
+     if type(image) is str:
+         return np.array(Image.open(image))
+     elif issubclass(type(image), Image.Image):
+         return np.array(image)
+     elif type(image) is np.ndarray:
+         return image
+     return None
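`convert_box_xywh_to_xyxy` and `bbox_iou` are the building blocks the accompanying prompt.py relies on when a user supplies a box prompt: the prompt is converted to xyxy and matched against the predicted boxes by IoU. A minimal, self-contained example is shown below; the prompt box and the two detection boxes are made-up values chosen only to illustrate the call.

```python
# Made-up example values; only the two utils.py helpers are real.
import torch
from utils import bbox_iou, convert_box_xywh_to_xyxy

user_box_xywh = [120, 80, 200, 150]  # user box prompt as x, y, w, h
prompt_xyxy = torch.tensor(convert_box_xywh_to_xyxy(user_box_xywh), dtype=torch.float)

pred_boxes = torch.tensor([[100., 70., 330., 240.],   # candidate detections (xyxy, 640x640 space)
                           [400., 300., 500., 380.]])
keep = bbox_iou(prompt_xyxy, pred_boxes, iou_thres=0.5, image_shape=(640, 640))
print(keep)  # tensor([0]) -> only the first detection overlaps the prompt enough
```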