Upload 35 files
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/README.md +48 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt +32 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/dogs.jpg +0 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp +899 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin +3 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/dogs.jpg +0 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/onnx_export.py +50 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/prompt.py +456 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/run_test.py +224 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/tools_pt.py +372 -0
- model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/utils.py +86 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/README.md +48 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/models/cutoff_fastsam_x_fp16.qnn216.ctx.bin +3 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/dogs.jpg +0 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/onnx_export.py +50 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/prompt.py +456 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/run_test.py +224 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/tools_pt.py +372 -0
- model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/utils.py +86 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/README.md +48 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin +3 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/dogs.jpg +0 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/onnx_export.py +50 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/prompt.py +456 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/run_test.py +224 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/tools_pt.py +372 -0
- model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/utils.py +86 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/README.md +48 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin +3 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/dogs.jpg +0 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/onnx_export.py +50 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/prompt.py +456 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/run_test.py +224 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/tools_pt.py +372 -0
- model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/utils.py +86 -0
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/README.md
ADDED
@@ -0,0 +1,48 @@
## Model Information

### Source model

- Input shape: 640x640
- Number of parameters: 68.89M
- Model size: 277.39 MB
- Output shape: 1x37x8400, 1x32x160x160

Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)

### Converted model

- Precision: INT8
- Backend: QNN2.16
- Target Device: FV01 QCS6490

## Inference with AidLite SDK

### SDK installation

Model Farm uses the AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/).

- Install AidLite SDK

```bash
# Install the appropriate version of the AidLite SDK
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Install the QNN variant that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# AidLite SDK C++ library check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# AidLite SDK Python library check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo

```bash
cd fastsam_x/model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite
export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0

python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
```
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,32 @@
cmake_minimum_required (VERSION 3.5)
project("run_test")

find_package(OpenCV REQUIRED)

message(STATUS "OpenCV library status:")
message(STATUS ">version:${OpenCV_VERSION}")
message(STATUS "Include:${OpenCV_INCLUDE_DIRS}")

set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")

include_directories(
    /usr/local/include
    /usr/include/opencv4
)

link_directories(
    /usr/local/lib/
)

file(GLOB SRC_LISTS
    ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
)

add_executable(run_test ${SRC_LISTS})

target_link_libraries(run_test
    aidlite
    ${OpenCV_LIBS}
    pthread
    jsoncpp
)
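For reference, a minimal sketch of building and running the C++ demo with this CMakeLists. The build-directory layout below is an assumption; the binary name `run_test` and the command-line flags come from `run_test.cpp`, and the model/image paths are its compiled-in defaults.

```bash
# Build (assumes the AidLite C++ SDK, OpenCV and jsoncpp dev packages are installed at the paths above)
cd model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp
mkdir -p build && cd build
cmake ..
make

# Run with the defaults used in run_test.cpp (paths relative to cpp/build)
./run_test --target_model ../../models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ../dogs.jpg --invoke_nums 10
```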
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/dogs.jpg
ADDED
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp
ADDED
@@ -0,0 +1,899 @@
#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <aidlux/aidlite/aidlite.hpp>
#include <vector>
#include <numeric>
#include <cmath>
#include <jsoncpp/json/json.h>
#include <cstring> // for memcpy
#include <algorithm>
#include <opencv2/dnn.hpp>
#include <random>

using namespace cv;
using namespace std;
using namespace Aidlux::Aidlite;

struct Tensor {
    float* data;
    std::vector<int> shape; // holds the dimensions, e.g. [1, 37, 8400]
};

struct Detection {
    float x1, y1, x2, y2;        // bbox
    float conf;                  // confidence
    int anchor_idx;              // index of the source anchor
    std::vector<float> mask_vec; // 32-dim mask coefficient vector
};

struct Args {
    std::string target_model = "../../models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin";
    std::string imgs = "../dogs.jpg";
    int invoke_nums = 10;
    std::string model_type = "QNN";
};

Args parse_args(int argc, char* argv[]) {
    Args args;
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg == "--target_model" && i + 1 < argc) {
            args.target_model = argv[++i];
        } else if (arg == "--imgs" && i + 1 < argc) {
            args.imgs = argv[++i];
        } else if (arg == "--invoke_nums" && i + 1 < argc) {
            args.invoke_nums = std::stoi(argv[++i]);
        } else if (arg == "--model_type" && i + 1 < argc) {
            args.model_type = argv[++i];
        }
    }
    return args;
}

std::string to_lower(const std::string& str) {
    std::string lower_str = str;
    std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
        return std::tolower(c);
    });
    return lower_str;
}

std::vector<float> convert_to_NCHW(const cv::Mat& float_img) {
    int H = float_img.rows;
    int W = float_img.cols;

    std::vector<float> nchw_data(1 * 3 * H * W); // 1 = batch size

    // Convert HWC to NCHW (copy channel by channel)
    int channel_size = H * W;
    for (int c = 0; c < 3; ++c) {
        for (int h = 0; h < H; ++h) {
            for (int w = 0; w < W; ++w) {
                float val = float_img.at<cv::Vec3f>(h, w)[c];
                nchw_data[c * channel_size + h * W + w] = val;
            }
        }
    }

    return nchw_data;
}

int transpose(float* src, unsigned int* src_dims, unsigned int* tsp_dims, float* dest){
    int current_coordinate[4] = {0, 0, 0, 0};
    for(int a = 0; a < src_dims[0]; ++a){
        current_coordinate[0] = a;
        for(int b = 0; b < src_dims[1]; ++b){
            current_coordinate[1] = b;
            for(int c = 0; c < src_dims[2]; ++c){
                current_coordinate[2] = c;
                for(int d = 0; d < src_dims[3]; ++d){
                    current_coordinate[3] = d;

                    int old_index = current_coordinate[0]*src_dims[1]*src_dims[2]*src_dims[3] +
                                    current_coordinate[1]*src_dims[2]*src_dims[3] +
                                    current_coordinate[2]*src_dims[3] +
                                    current_coordinate[3];

                    int new_index = current_coordinate[tsp_dims[0]]*src_dims[tsp_dims[1]]*src_dims[tsp_dims[2]]*src_dims[tsp_dims[3]] +
                                    current_coordinate[tsp_dims[1]]*src_dims[tsp_dims[2]]*src_dims[tsp_dims[3]] +
                                    current_coordinate[tsp_dims[2]]*src_dims[tsp_dims[3]] +
                                    current_coordinate[tsp_dims[3]];

                    dest[new_index] = src[old_index];
                }
            }
        }
    }

    return EXIT_SUCCESS;
}

// Concatenate three float* tensors along axis=1
void concat_along_axis1(
    const float* in1, int C1,
    const float* in2, int C2,
    const float* in3, int C3,
    int N, int W,   // N=1, W=8400
    float* out      // output size: N * (C1+C2+C3) * W
) {
    int offset = 0;
    int block = W; // number of elements per channel = W

    // Copy in1: [N, C1, W]
    for (int c = 0; c < C1; ++c) {
        std::memcpy(out + offset, in1 + c * block, sizeof(float) * block);
        offset += block;
    }

    // Copy in2: [N, C2, W]
    for (int c = 0; c < C2; ++c) {
        std::memcpy(out + offset, in2 + c * block, sizeof(float) * block);
        offset += block;
    }

    // Copy in3: [N, C3, W]
    for (int c = 0; c < C3; ++c) {
        std::memcpy(out + offset, in3 + c * block, sizeof(float) * block);
        offset += block;
    }
}

// Build qnn_out, the equivalent of the Python output list
std::vector<Tensor> create_qnn_out(float* output_concat1, float* outdata0) {
    std::vector<Tensor> qnn_out;

    // First tensor: [1, 37, 8400]
    Tensor t1;
    t1.data = output_concat1;
    t1.shape = {1, 37, 8400};
    qnn_out.push_back(t1);

    // Second tensor: [1, 32, 160, 160]
    Tensor t2;
    t2.data = outdata0;
    t2.shape = {1, 32, 160, 160};
    qnn_out.push_back(t2);

    return qnn_out;
}

void xywh2xyxy(const std::vector<std::vector<float>>& boxes_xywh, std::vector<std::vector<float>>& boxes_xyxy) {
    boxes_xyxy.clear();
    for (const auto& box : boxes_xywh) {
        float x = box[0], y = box[1], w = box[2], h = box[3];
        float x1 = x - w / 2.0f;
        float y1 = y - h / 2.0f;
        float x2 = x + w / 2.0f;
        float y2 = y + h / 2.0f;
        boxes_xyxy.push_back({x1, y1, x2, y2});
    }
}

std::vector<Detection> non_max_suppression_qnn(
    const Tensor& det_output,
    float conf_thres,
    float iou_thres,
    int max_det,
    int mask_dim = 32
) {
    std::vector<Detection> candidates;
    int num_outputs = det_output.shape[1]; // 37 = 4 + 1 + 32
    int num_anchors = det_output.shape[2];
    float* data = det_output.data;

    for (int i = 0; i < num_anchors; ++i) {
        float x = data[0 * num_outputs * num_anchors + 0 * num_anchors + i];
        float y = data[0 * num_outputs * num_anchors + 1 * num_anchors + i];
        float w = data[0 * num_outputs * num_anchors + 2 * num_anchors + i];
        float h = data[0 * num_outputs * num_anchors + 3 * num_anchors + i];
        float conf = data[0 * num_outputs * num_anchors + 4 * num_anchors + i];

        if (conf < conf_thres)
            continue;

        Detection det;
        det.x1 = x - w / 2.0f;
        det.y1 = y - h / 2.0f;
        det.x2 = x + w / 2.0f;
        det.y2 = y + h / 2.0f;
        det.conf = conf;
        det.anchor_idx = i;

        // Extract the 32-dim mask coefficient vector
        det.mask_vec.resize(mask_dim);
        for (int m = 0; m < mask_dim; ++m) {
            det.mask_vec[m] = data[0 * num_outputs * num_anchors + (5 + m) * num_anchors + i];
        }

        candidates.push_back(det);
    }

    // OpenCV NMS
    std::vector<cv::Rect> boxes;
    std::vector<float> scores;
    for (const auto& d : candidates) {
        boxes.emplace_back(cv::Rect(cv::Point(d.x1, d.y1), cv::Point(d.x2, d.y2)));
        scores.push_back(d.conf);
    }

    std::vector<int> keep;
    cv::dnn::NMSBoxes(boxes, scores, conf_thres, iou_thres, keep, 1.f, max_det);

    std::vector<Detection> result;
    for (int idx : keep) {
        result.push_back(candidates[idx]);
    }

    return result;
}

void adjust_bboxes_to_image_border(std::vector<Detection>& boxes, int image_h, int image_w, int threshold = 20) {
    for (auto& box : boxes) {
        if (box.x1 < threshold) box.x1 = 0;
        if (box.y1 < threshold) box.y1 = 0;
        if (box.x2 > image_w - threshold) box.x2 = image_w;
        if (box.y2 > image_h - threshold) box.y2 = image_h;
    }
}

std::vector<int> bbox_iou(const Detection& box1,
                          const std::vector<Detection>& boxes,
                          float iou_thres,
                          int image_h,
                          int image_w) {
    std::vector<Detection> adjusted_boxes = boxes;
    adjust_bboxes_to_image_border(adjusted_boxes, image_h, image_w);

    std::vector<int> indices;
    float area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);

    for (size_t i = 0; i < adjusted_boxes.size(); ++i) {
        const auto& box2 = adjusted_boxes[i];

        float inter_x1 = std::max(box1.x1, box2.x1);
        float inter_y1 = std::max(box1.y1, box2.y1);
        float inter_x2 = std::min(box1.x2, box2.x2);
        float inter_y2 = std::min(box1.y2, box2.y2);

        float inter_w = std::max(0.0f, inter_x2 - inter_x1);
        float inter_h = std::max(0.0f, inter_y2 - inter_y1);
        float inter_area = inter_w * inter_h;

        float area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
        float union_area = area1 + area2 - inter_area;

        float iou = union_area > 0 ? inter_area / union_area : 0.0f;

        if (iou > iou_thres) {
            indices.push_back(i);
        }
    }

    return indices;
}

void inject_full_box(std::vector<Detection>& detections, int image_h, int image_w, float iou_thres = 0.9f) {
    if (detections.empty()) {
        std::cout << "No object detected." << std::endl;
        return;
    }

    // Build a full-image box (x1=0, y1=0, x2=w, y2=h, conf=1.0)
    Detection full_box;
    full_box.x1 = 0.0f;
    full_box.y1 = 0.0f;
    full_box.x2 = static_cast<float>(image_w);
    full_box.y2 = static_cast<float>(image_h);
    full_box.conf = 1.0f;
    // full_box.class_id = -1; // undefined class, customize as needed

    // Find boxes with high IoU against the full-image box
    std::vector<int> matched = bbox_iou(full_box, detections, iou_thres, image_h, image_w);
    if (!matched.empty()) {
        for (int idx : matched) {
            detections[idx] = full_box;
        }
    }
}

// ----------- clip_boxes -----------
void clip_boxes(std::vector<Detection>& boxes, const cv::Size& shape) {
    int width = shape.width;
    int height = shape.height;
    for (auto& box : boxes) {
        box.x1 = std::max(0.f, std::min(box.x1, static_cast<float>(width)));
        box.y1 = std::max(0.f, std::min(box.y1, static_cast<float>(height)));
        box.x2 = std::max(0.f, std::min(box.x2, static_cast<float>(width)));
        box.y2 = std::max(0.f, std::min(box.y2, static_cast<float>(height)));
    }
}

// ----------- scale_boxes -----------
// Input: inference image size img1_shape (h, w) and detection boxes (vector<Detection>)
// Output: boxes mapped into the coordinate system of the original image size img0_shape (h0, w0)
// Approximate port of the Python version; modifies the boxes in place
void scale_boxes(const cv::Size& img1_shape, std::vector<Detection>& boxes, const cv::Size& img0_shape) {
    // Compute scale factor and padding
    float gain = std::min(img1_shape.height / (float)img0_shape.height, img1_shape.width / (float)img0_shape.width);
    float pad_w = (img1_shape.width - img0_shape.width * gain) / 2.0f;
    float pad_h = (img1_shape.height - img0_shape.height * gain) / 2.0f;

    // Undo padding and scaling
    for (auto& box : boxes) {
        box.x1 = (box.x1 - pad_w) / gain;
        box.x2 = (box.x2 - pad_w) / gain;
        box.y1 = (box.y1 - pad_h) / gain;
        box.y2 = (box.y2 - pad_h) / gain;
    }

    // Clip to image bounds
    clip_boxes(boxes, img0_shape);
}

// ----------- crop_masks -----------
// masks: vector<cv::Mat>, each a single-channel float mask of size HxW
// boxes: corresponding detection boxes
// Returns each mask cropped to its bbox
std::vector<cv::Mat> crop_masks(const std::vector<cv::Mat>& masks, const std::vector<Detection>& boxes) {
    assert(masks.size() == boxes.size());
    if (masks.empty()) return {};

    int H = masks[0].rows;
    int W = masks[0].cols;

    // Build row/column coordinate matrices
    cv::Mat r(1, W, CV_32F);
    for (int i = 0; i < W; ++i) r.at<float>(0, i) = float(i);
    cv::Mat c(H, 1, CV_32F);
    for (int i = 0; i < H; ++i) c.at<float>(i, 0) = float(i);

    std::vector<cv::Mat> cropped_masks;
    cropped_masks.reserve(masks.size());

    for (size_t i = 0; i < masks.size(); ++i) {
        const cv::Mat& mask = masks[i];
        const Detection& box = boxes[i];

        cv::Mat r_mat, c_mat;
        cv::repeat(r, H, 1, r_mat); // (H, W)
        cv::repeat(c, 1, W, c_mat); // (H, W)

        cv::Mat mask_r1, mask_r2, mask_c1, mask_c2;
        cv::compare(r_mat, box.x1, mask_r1, cv::CMP_GE);
        cv::compare(r_mat, box.x2, mask_r2, cv::CMP_LT);
        cv::compare(c_mat, box.y1, mask_c1, cv::CMP_GE);
        cv::compare(c_mat, box.y2, mask_c2, cv::CMP_LT);

        cv::Mat region_mask = (mask_r1 & mask_r2 & mask_c1 & mask_c2);
        region_mask.convertTo(region_mask, CV_32F, 1.0 / 255.0);

        cv::Mat cropped = mask.mul(region_mask);
        cropped_masks.push_back(cropped);
    }
    return cropped_masks;
}

// ----------- process_mask_native -----------
// protos: Tensor c x mh x mw
// masks_in: N mask coefficient rows (N x mask_channels)
// bboxes: vector<Detection>
// shape: original image (h, w)
// Computes the masks as (masks_in @ protos) -> sigmoid -> resize -> crop -> threshold
// cv::Mat is used for the matrix multiplication
std::vector<cv::Mat> process_mask_native(const Tensor& protos,
                                         const std::vector<cv::Mat>& masks_in,
                                         const std::vector<Detection>& bboxes,
                                         const cv::Size& shape)
{
    int c = protos.shape[1]; // number of proto channels
    int mh = protos.shape[2];
    int mw = protos.shape[3];
    int N = static_cast<int>(masks_in.size());

    // Wrap protos.data as a cv::Mat of shape (c, mh*mw)
    cv::Mat proto_mat(c, mh * mw, CV_32F, protos.data);

    std::vector<cv::Mat> results;
    results.reserve(N);

    for (int i = 0; i < N; ++i) {
        // masks_in[i] is a single 1 x c coefficient row
        // Matrix multiply: (1 x c) * (c x mh*mw) => 1 x (mh*mw)
        std::cout << "proto_mat size: " << proto_mat.rows << " x " << proto_mat.cols << std::endl;
        std::cout << "masks_in[" << i << "] size: " << masks_in[i].rows << " x " << masks_in[i].cols << std::endl;
        cv::Mat mask_flat = masks_in[i] * proto_mat; // 1 x (mh*mw)

        mask_flat = mask_flat.reshape(1, mh); // reshape to (mh, mw)

        // sigmoid
        cv::Mat mask_sigmoid;
        cv::exp(-mask_flat, mask_sigmoid);
        mask_sigmoid = 1.0 / (1.0 + mask_sigmoid);

        // Resize to the original image size shape (h, w)
        cv::Mat mask_resized;
        cv::resize(mask_sigmoid, mask_resized, shape, 0, 0, cv::INTER_LINEAR);

        // Crop the mask to its bbox
        std::vector<cv::Mat> temp_vec = { mask_resized };
        auto mask_cropped = crop_masks(temp_vec, std::vector<Detection>{bboxes[i]});
        assert(mask_cropped.size() == 1);

        // threshold 0.5
        cv::Mat mask_bin;
        cv::threshold(mask_cropped[0], mask_bin, 0.5, 1.0, cv::THRESH_BINARY);

        results.push_back(mask_bin);
    }
    return results;
}

// ----------- process_mask -----------
// Like process_mask_native, but rescales the bboxes to the proto resolution and optionally upsamples
// masks_in: N x mask_channels (columns [:,6:] of pred)
// bboxes are already-scaled boxes; image_shape is the original image size
std::vector<cv::Mat> process_mask(const Tensor& protos,
                                  const std::vector<cv::Mat>& masks_in,
                                  std::vector<Detection> bboxes,
                                  const cv::Size& image_shape,
                                  bool upsample = true)
{
    int c = protos.shape[1];
    int mh = protos.shape[2];
    int mw = protos.shape[3];
    int N = static_cast<int>(masks_in.size());

    // Wrap protos.data as a cv::Mat of shape (c, mh*mw)
    cv::Mat proto_mat(c, mh * mw, CV_32F, protos.data);

    // Rescale bboxes to the downsampled proto resolution
    for (auto& box : bboxes) {
        box.x1 *= (float)mw / image_shape.width;
        box.x2 *= (float)mw / image_shape.width;
        box.y1 *= (float)mh / image_shape.height;
        box.y2 *= (float)mh / image_shape.height;
    }

    std::vector<cv::Mat> results;
    results.reserve(N);

    for (int i = 0; i < N; ++i) {
        // Matrix multiply
        cv::Mat mask_flat = masks_in[i] * proto_mat;
        mask_flat = mask_flat.reshape(1, mh);

        // sigmoid
        cv::Mat mask_sigmoid;
        cv::exp(-mask_flat, mask_sigmoid);
        mask_sigmoid = 1.0 / (1.0 + mask_sigmoid);

        // Crop the mask
        std::vector<cv::Mat> temp_vec = { mask_sigmoid };
        auto mask_cropped = crop_masks(temp_vec, std::vector<Detection>{bboxes[i]});
        assert(mask_cropped.size() == 1);
        cv::Mat mask = mask_cropped[0];

        // Upsample back to the original image size
        if (upsample) {
            cv::Mat mask_upsampled;
            cv::resize(mask, mask_upsampled, image_shape, 0, 0, cv::INTER_LINEAR);
            mask = mask_upsampled;
        }

        // threshold
        cv::Mat mask_bin;
        cv::threshold(mask, mask_bin, 0.5, 1.0, cv::THRESH_BINARY);

        results.push_back(mask_bin);
    }
    return results;
}

cv::Mat plot_to_result(const cv::Mat& image,
                       const std::vector<cv::Mat>& masks,
                       bool mask_random_color = true,
                       bool withContours = true,
                       bool retina = false) {
    // Convert color space
    cv::Mat rgb_img;
    cv::cvtColor(image, rgb_img, cv::COLOR_BGR2RGB);
    int original_h = rgb_img.rows;
    int original_w = rgb_img.cols;

    // Prepare an RGBA canvas
    cv::Mat rgba(rgb_img.size(), CV_8UC4);
    for (int y = 0; y < rgb_img.rows; ++y) {
        for (int x = 0; x < rgb_img.cols; ++x) {
            cv::Vec3b pix = rgb_img.at<cv::Vec3b>(y, x);
            rgba.at<cv::Vec4b>(y, x) = cv::Vec4b(pix[0], pix[1], pix[2], 255); // BGR -> BGRA
        }
    }

    // Overlay each mask with a translucent color
    for (size_t i = 0; i < masks.size(); ++i) {
        cv::Mat mask = masks[i];
        if (mask.type() != CV_8UC1)
            mask.convertTo(mask, CV_8UC1);

        // Morphological cleanup
        cv::morphologyEx(mask, mask, cv::MORPH_CLOSE, cv::Mat::ones(3, 3, CV_8U));
        cv::morphologyEx(mask, mask, cv::MORPH_OPEN, cv::Mat::ones(8, 8, CV_8U));

        if (!retina) {
            cv::resize(mask, mask, cv::Size(original_w, original_h), 0, 0, cv::INTER_NEAREST);
        }

        // Pick a color
        cv::Scalar color;
        if (mask_random_color) {
            std::mt19937 rng(i + 42);
            color = cv::Scalar(rng() % 255, rng() % 255, rng() % 255); // BGR
        } else {
            color = cv::Scalar(255, 144, 30); // blue tone
        }

        // Blend into the RGBA canvas
        for (int y = 0; y < mask.rows; ++y) {
            for (int x = 0; x < mask.cols; ++x) {
                if (mask.at<uchar>(y, x) > 0) {
                    cv::Vec4b& pix = rgba.at<cv::Vec4b>(y, x);
                    for (int c = 0; c < 3; ++c) {
                        pix[c] = static_cast<uchar>(0.6 * color[c] + 0.4 * pix[c]); // blend colors
                    }
                    pix[3] = 255;
                }
            }
        }
    }

    // Draw contours
    if (withContours) {
        for (const auto& mask : masks) {
            cv::Mat bin;
            mask.convertTo(bin, CV_8UC1);
            if (!retina)
                cv::resize(bin, bin, cv::Size(original_w, original_h), 0, 0, cv::INTER_NEAREST);

            std::vector<std::vector<cv::Point>> contours;
            cv::findContours(bin, contours, cv::RETR_TREE, cv::CHAIN_APPROX_SIMPLE);
            cv::drawContours(rgba, contours, -1, cv::Scalar(255, 255, 255, 255), 2);
        }
    }

    // Convert back to 3-channel BGR for output
    cv::Mat result;
    cv::cvtColor(rgba, result, cv::COLOR_RGBA2BGR);
    return result;
}

int invoke(const Args& args) {
    std::cout << "Start main ... ... Model Path: " << args.target_model << "\n"
              << "Image Path: " << args.imgs << "\n"
              << "Inference Nums: " << args.invoke_nums << "\n"
              << "Model Type: " << args.model_type << "\n";
    Model* model = Model::create_instance(args.target_model);
    if(model == nullptr){
        printf("Create model failed !\n");
        return EXIT_FAILURE;
    }
    Config* config = Config::create_instance();
    if(config == nullptr){
        printf("Create config failed !\n");
        return EXIT_FAILURE;
    }
    config->implement_type = ImplementType::TYPE_LOCAL;
    std::string model_type_lower = to_lower(args.model_type);
    if (model_type_lower == "qnn"){
        config->framework_type = FrameworkType::TYPE_QNN;
    } else if (model_type_lower == "snpe2" || model_type_lower == "snpe") {
        config->framework_type = FrameworkType::TYPE_SNPE2;
    }
    config->accelerate_type = AccelerateType::TYPE_DSP;
    config->is_quantify_model = 1;

    unsigned int model_h = 640;
    unsigned int model_w = 640;
    std::vector<std::vector<uint32_t>> input_shapes = {{1,model_h,model_w,3}};
    std::vector<std::vector<uint32_t>> output_shapes = {{1,1,8400},{1,4,8400},{1,32,8400},{1,160,160,32}};
    model->set_model_properties(input_shapes, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
    std::unique_ptr<Interpreter> fast_interpreter = InterpreterBuilder::build_interpretper_from_model_and_config(model, config);
    if(fast_interpreter == nullptr){
        printf("build_interpretper_from_model_and_config failed !\n");
        return EXIT_FAILURE;
    }
    int result = fast_interpreter->init();
    if(result != EXIT_SUCCESS){
        printf("interpreter->init() failed !\n");
        return EXIT_FAILURE;
    }
    // load model
    fast_interpreter->load_model();
    if(result != EXIT_SUCCESS){
        printf("interpreter->load_model() failed !\n");
        return EXIT_FAILURE;
    }
    printf("detect model load success!\n");

    cv::Mat frame = cv::imread(args.imgs);
    if (frame.empty()) {
        printf("detect image load failed!\n");
        return 1;
    }
    printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
    cv::Mat input_data;
    cv::Mat frame_clone = frame.clone();
    cv::cvtColor(frame_clone, frame_clone, cv::COLOR_BGR2RGB);
    cv::resize(frame_clone, frame_clone, cv::Size(model_w, model_h));
    cv::Mat float_img;
    frame_clone.convertTo(float_img, CV_32FC3, 1.0 / 255.0);
    std::vector<float> input_tensor = convert_to_NCHW(float_img);

    float *outdata0 = nullptr;
    float *outdata1 = nullptr;
    float *outdata2 = nullptr;
    float *outdata3 = nullptr;
    std::vector<float> invoke_time;
    for (int i = 0; i < args.invoke_nums; ++i) {
        result = fast_interpreter->set_input_tensor(0, input_tensor.data());
        if(result != EXIT_SUCCESS){
            printf("interpreter->set_input_tensor() failed !\n");
            return EXIT_FAILURE;
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        result = fast_interpreter->invoke();
        auto t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> cost_time = t2 - t1;
        invoke_time.push_back(cost_time.count() * 1000);
        if(result != EXIT_SUCCESS){
            printf("interpreter->invoke() failed !\n");
            return EXIT_FAILURE;
        }
        // [1,160,160,32]
        uint32_t out_data_0 = 0;
        result = fast_interpreter->get_output_tensor(0, (void**)&outdata0, &out_data_0);
        if(result != EXIT_SUCCESS){
            printf("interpreter->get_output_tensor() 0 failed !\n");
            return EXIT_FAILURE;
        }

        // [1,32,8400]
        uint32_t out_data_1 = 0;
        result = fast_interpreter->get_output_tensor(1, (void**)&outdata1, &out_data_1);
        if(result != EXIT_SUCCESS){
            printf("interpreter->get_output_tensor() 1 failed !\n");
            return EXIT_FAILURE;
        }

        // [1,1,8400]
        uint32_t out_data_2 = 0;
        result = fast_interpreter->get_output_tensor(2, (void**)&outdata2, &out_data_2);
        if(result != EXIT_SUCCESS){
            printf("interpreter->get_output_tensor() 2 failed !\n");
            return EXIT_FAILURE;
        }

        // [1,4,8400]
        uint32_t out_data_3 = 0;
        result = fast_interpreter->get_output_tensor(3, (void**)&outdata3, &out_data_3);
        if(result != EXIT_SUCCESS){
            printf("interpreter->get_output_tensor() 3 failed !\n");
            return EXIT_FAILURE;
        }
    }

    float max_invoke_time = *std::max_element(invoke_time.begin(), invoke_time.end());
    float min_invoke_time = *std::min_element(invoke_time.begin(), invoke_time.end());
    float mean_invoke_time = std::accumulate(invoke_time.begin(), invoke_time.end(), 0.0f) / args.invoke_nums;
    float var_invoketime = 0.0f;
    for (auto time : invoke_time) {
        var_invoketime += (time - mean_invoke_time) * (time - mean_invoke_time);
    }
    var_invoketime /= args.invoke_nums;
    printf("=======================================\n");
    printf("QNN inference %d times :\n --mean_invoke_time is %f \n --max_invoke_time is %f \n --min_invoke_time is %f \n --var_invoketime is %f\n",
           args.invoke_nums, mean_invoke_time, max_invoke_time, min_invoke_time, var_invoketime);
    printf("=======================================\n");

    // post process
    // output shapes = [[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]

    unsigned int src_dims[4] = {1, 160, 160, 32};
    unsigned int tsp_dims[4] = {0, 3, 1, 2};
    unsigned int stride_data_num = 1*160*160*32;
    float* format_data = new float[stride_data_num];
    transpose(outdata0, src_dims, tsp_dims, format_data);
    // cv::Mat proto_buffer(32, 160*160, CV_32F, format_data);
    // std::cout << "proto_buffer dims: " << proto_buffer.rows << "x" << proto_buffer.cols << std::endl;

    for (int i = 0; i < 8400; ++i) {
        outdata2[i] = 1.0f / (1.0f + std::exp(-outdata2[i]));
    }

    const int N = 1;
    const int W = 8400;
    const int C1 = 4, C2 = 1, C3 = 32;
    const int total_C = C1 + C2 + C3;
    // Allocate the output buffer
    float output_concat1[total_C * W]; // [1, 37, 8400]
    concat_along_axis1(outdata3, C1, outdata2, C2, outdata1, C3, N, W, output_concat1);

    std::vector<Tensor> qnn_out = create_qnn_out(output_concat1, format_data);

    float conf_thres = 0.25f;
    float iou_thres = 0.45f;
    int max_det = 300;
    int num_classes = 1;
    bool agnostic = false;

    int total_channels = qnn_out[0].shape[1]; // 37
    int mask_dim = 32; // a model exported from YOLOv8 typically uses mask_dim = 32
    num_classes = total_channels - 5 - mask_dim;
    std::cout << "num_classes = " << num_classes << ", mask_dim = " << mask_dim << std::endl;

    std::vector<Detection> dets = non_max_suppression_qnn(qnn_out[0], conf_thres, iou_thres, max_det, mask_dim);
    std::cout << "dets size: " << dets.size() << std::endl;
    inject_full_box(dets, 640, 640, 0.9f);

    std::vector<cv::Mat> orig_imgs;
    orig_imgs.push_back(frame.clone());

    const Tensor& proto_tensor = qnn_out.back();
    size_t batch_size = dets.size();
    std::vector<std::vector<cv::Mat>> results(batch_size);
    // scale_boxes maps the detection boxes back to the original image
    std::cout << "infer_img_shape: " << proto_tensor.shape[0] << "x" << proto_tensor.shape[1] << "x" << proto_tensor.shape[3] << " x " << proto_tensor.shape[2] << std::endl;
    cv::Size infer_img_shape(proto_tensor.shape[3], proto_tensor.shape[2]); // [1,c,h,w]
    scale_boxes(infer_img_shape, dets, orig_imgs[0].size());

    // Extract the mask coefficient vector for each detection from the raw output;
    // masks_in holds one 1 x mask_dim (float) cv::Mat row per detection
    std::vector<cv::Mat> masks_in;
    float* data = qnn_out[0].data;
    int C = qnn_out[0].shape[1];  // 37
    int W1 = qnn_out[0].shape[2]; // 8400
    // int mask_channels = 3;
    // int num_cls = 29;
    for (size_t i = 0; i < dets.size(); ++i) {
        int anchor_idx = dets[i].anchor_idx;
        cv::Mat mask_vec(1, mask_dim, CV_32F);
        std::cout << "Fixed mask_vec[" << i << "] = " << mask_vec << std::endl;
        for (int m = 0; m < mask_dim; ++m) {
            int ch = 5 + m;
            mask_vec.at<float>(0, m) = data[ch * W + anchor_idx];
        }
        masks_in.push_back(mask_vec);
    }
    // Each row of masks_in must be the 1 x mask_dim (float) coefficient vector of the corresponding box

    // Generate the final masks
    std::vector<cv::Mat> masks;
    masks = process_mask_native(proto_tensor, masks_in, dets, orig_imgs[0].size());

    // for (int i = 0; i < masks.size(); ++i) {
    //     // Save raw mask
    //     cv::imwrite("mask_raw_" + std::to_string(i) + ".png", masks[i] * 255);
    // }

    results[0] = masks;
    if (results.empty()) {
        return -1;
    }

    std::vector<cv::Mat> ann = results[0]; // only the first batch is processed

    cv::Mat result_img = plot_to_result(frame, ann);
    cv::imwrite("result_with_mask.png", result_img);

    fast_interpreter->destory();
    return 0;
}

int main(int argc, char* argv[]) {
    Args args = parse_args(argc, argv);
    return invoke(args);
}
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e0334b0766f80b127bf90f14831339b6c94e66e4bbf0767c40e1602028accd0
size 82550024
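The context binary above is stored as a Git LFS pointer, so the ~82 MB payload is not in the repository tree itself. A minimal sketch of fetching it, assuming `git-lfs` is installed:

```bash
git lfs install
git lfs pull --include="model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin"
```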
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/dogs.jpg
ADDED
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/onnx_export.py
ADDED
@@ -0,0 +1,50 @@
import torch
import cv2
import os
import sys

from ultralytics.models.fastsam import FastSAM


class Fast_SAM(torch.nn.Module):
    """Exportable FastSAM model, end-to-end."""

    def __init__(self) -> None:
        super().__init__()
        pt_name = './models/FastSAM-s.pt'
        self.model = FastSAM(pt_name).model

    def forward(self, image: torch.Tensor):
        """
        Run FastSAM on `image`, and produce high quality segmentation masks.
        Faster than SAM as it is based on YOLOv8.

        Parameters:
            image: Pixel values pre-processed for encoder consumption.
                   Range: float[0, 1]
                   3-channel Color Space: BGR
        Returns:

        """
        predictions = self.model(image)
        # Return predictions as a tuple instead of nested tuple.
        return (predictions[0], predictions[1][2])


model = Fast_SAM()
num_params = sum(p.numel() for p in model.parameters())
print(f'Number of FastSAM-s parameters: {num_params}')
dummy_input = torch.randn([1, 3, 640, 640], dtype=torch.float32)
source_model = torch.jit.trace(
    model.to("cpu"), dummy_input, check_trace=False
)
torch.onnx.export(model,                      # model being run
                  dummy_input,                # model input (or a tuple for multiple inputs)
                  "./models/fastsam_s.onnx",  # where to save the model
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=12,           # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names=['input'],      # the model's input names
                  output_names=['boxes', 'mask'],
                  verbose=True,
                  )
print("Convert to onnx successfully!")
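A minimal sketch of running the export script from the package root. It assumes the `ultralytics` package is installed and that the source checkpoint `./models/FastSAM-s.pt` referenced inside the script has already been downloaded.

```bash
pip3 install ultralytics
# Writes ./models/fastsam_s.onnx next to the source checkpoint
python3 ./python/onnx_export.py
```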
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/prompt.py
ADDED
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import cv2
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from utils import image_to_np_ndarray
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
class FastSAMPrompt:
|
12 |
+
|
13 |
+
def __init__(self, image, results, device='cpu'):
|
14 |
+
if isinstance(image, str) or isinstance(image, Image.Image):
|
15 |
+
image = image_to_np_ndarray(image)
|
16 |
+
self.device = device
|
17 |
+
self.results = results
|
18 |
+
self.img = image
|
19 |
+
|
20 |
+
def _segment_image(self, image, bbox):
|
21 |
+
if isinstance(image, Image.Image):
|
22 |
+
image_array = np.array(image)
|
23 |
+
else:
|
24 |
+
image_array = image
|
25 |
+
segmented_image_array = np.zeros_like(image_array)
|
26 |
+
x1, y1, x2, y2 = bbox
|
27 |
+
segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
|
28 |
+
segmented_image = Image.fromarray(segmented_image_array)
|
29 |
+
black_image = Image.new('RGB', image.size, (255, 255, 255))
|
30 |
+
# transparency_mask = np.zeros_like((), dtype=np.uint8)
|
31 |
+
transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
|
32 |
+
transparency_mask[y1:y2, x1:x2] = 255
|
33 |
+
transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
|
34 |
+
black_image.paste(segmented_image, mask=transparency_mask_image)
|
35 |
+
return black_image
|
36 |
+
|
37 |
+
def _format_results(self, result, filter=0):
|
38 |
+
annotations = []
|
39 |
+
n = len(result.masks.data)
|
40 |
+
for i in range(n):
|
41 |
+
annotation = {}
|
42 |
+
mask = result.masks.data[i] == 1.0
|
43 |
+
|
44 |
+
if torch.sum(mask) < filter:
|
45 |
+
continue
|
46 |
+
annotation['id'] = i
|
47 |
+
annotation['segmentation'] = mask.cpu().numpy()
|
48 |
+
annotation['bbox'] = result.boxes.data[i]
|
49 |
+
annotation['score'] = result.boxes.conf[i]
|
50 |
+
annotation['area'] = annotation['segmentation'].sum()
|
51 |
+
annotations.append(annotation)
|
52 |
+
return annotations
|
53 |
+
|
54 |
+
def filter_masks(annotations): # filte the overlap mask
|
55 |
+
annotations.sort(key=lambda x: x['area'], reverse=True)
|
56 |
+
to_remove = set()
|
57 |
+
for i in range(0, len(annotations)):
|
58 |
+
a = annotations[i]
|
59 |
+
for j in range(i + 1, len(annotations)):
|
60 |
+
b = annotations[j]
|
61 |
+
if i != j and j not in to_remove:
|
62 |
+
# check if
|
63 |
+
if b['area'] < a['area']:
|
64 |
+
if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
|
65 |
+
to_remove.add(j)
|
66 |
+
|
67 |
+
return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
|
68 |
+
|
69 |
+
def _get_bbox_from_mask(self, mask):
|
70 |
+
mask = mask.astype(np.uint8)
|
71 |
+
contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
72 |
+
x1, y1, w, h = cv2.boundingRect(contours[0])
|
73 |
+
x2, y2 = x1 + w, y1 + h
|
74 |
+
if len(contours) > 1:
|
75 |
+
for b in contours:
|
76 |
+
x_t, y_t, w_t, h_t = cv2.boundingRect(b)
|
77 |
+
# Merge multiple bounding boxes into one.
|
78 |
+
x1 = min(x1, x_t)
|
79 |
+
y1 = min(y1, y_t)
|
80 |
+
x2 = max(x2, x_t + w_t)
|
81 |
+
y2 = max(y2, y_t + h_t)
|
82 |
+
h = y2 - y1
|
83 |
+
w = x2 - x1
|
84 |
+
return [x1, y1, x2, y2]
|
85 |
+
|
86 |
+
def plot_to_result(self,
|
87 |
+
annotations,
|
88 |
+
bboxes=None,
|
89 |
+
points=None,
|
90 |
+
point_label=None,
|
91 |
+
mask_random_color=True,
|
92 |
+
better_quality=True,
|
93 |
+
retina=False,
|
94 |
+
withContours=True) -> np.ndarray:
|
95 |
+
if isinstance(annotations[0], dict):
|
96 |
+
annotations = [annotation['segmentation'] for annotation in annotations]
|
97 |
+
image = self.img
|
98 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
99 |
+
original_h = image.shape[0]
|
100 |
+
original_w = image.shape[1]
|
101 |
+
if sys.platform == "darwin":
|
102 |
+
plt.switch_backend("TkAgg")
|
103 |
+
plt.figure(figsize=(original_w / 100, original_h / 100))
|
104 |
+
# Add subplot with no margin.
|
105 |
+
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
|
106 |
+
plt.margins(0, 0)
|
107 |
+
plt.gca().xaxis.set_major_locator(plt.NullLocator())
|
108 |
+
plt.gca().yaxis.set_major_locator(plt.NullLocator())
|
109 |
+
|
110 |
+
plt.imshow(image)
|
111 |
+
if better_quality:
|
112 |
+
if isinstance(annotations[0], torch.Tensor):
|
113 |
+
annotations = np.array(annotations.cpu())
|
114 |
+
for i, mask in enumerate(annotations):
|
115 |
+
mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
|
116 |
+
annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
|
117 |
+
if self.device == 'cpu':
|
118 |
+
annotations = np.array(annotations)
|
119 |
+
self.fast_show_mask(
|
120 |
+
annotations,
|
121 |
+
plt.gca(),
|
122 |
+
random_color=mask_random_color,
|
123 |
+
bboxes=bboxes,
|
124 |
+
points=points,
|
125 |
+
pointlabel=point_label,
|
126 |
+
retinamask=retina,
|
127 |
+
target_height=original_h,
|
128 |
+
target_width=original_w,
|
129 |
+
)
|
130 |
+
else:
|
131 |
+
if isinstance(annotations[0], np.ndarray):
|
132 |
+
annotations = torch.from_numpy(annotations)
|
133 |
+
self.fast_show_mask_gpu(
|
134 |
+
annotations,
|
135 |
+
plt.gca(),
|
136 |
+
random_color=mask_random_color,
|
137 |
+
bboxes=bboxes,
|
138 |
+
points=points,
|
139 |
+
pointlabel=point_label,
|
140 |
+
retinamask=retina,
|
141 |
+
target_height=original_h,
|
142 |
+
target_width=original_w,
|
143 |
+
)
|
144 |
+
if isinstance(annotations, torch.Tensor):
|
145 |
+
annotations = annotations.cpu().numpy()
|
146 |
+
if withContours:
|
147 |
+
contour_all = []
|
148 |
+
temp = np.zeros((original_h, original_w, 1))
|
149 |
+
for i, mask in enumerate(annotations):
|
150 |
+
if type(mask) == dict:
|
151 |
+
mask = mask['segmentation']
|
152 |
+
annotation = mask.astype(np.uint8)
|
153 |
+
if not retina:
|
154 |
+
annotation = cv2.resize(
|
155 |
+
annotation,
|
156 |
+
(original_w, original_h),
|
157 |
+
interpolation=cv2.INTER_NEAREST,
|
158 |
+
)
|
159 |
+
contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
160 |
+
for contour in contours:
|
161 |
+
contour_all.append(contour)
|
162 |
+
cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
|
163 |
+
color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
|
164 |
+
contour_mask = temp / 255 * color.reshape(1, 1, -1)
|
165 |
+
plt.imshow(contour_mask)
|
166 |
+
|
167 |
+
plt.axis('off')
|
168 |
+
fig = plt.gcf()
|
169 |
+
plt.draw()
|
170 |
+
|
171 |
+
try:
|
172 |
+
buf = fig.canvas.tostring_rgb()
|
173 |
+
except AttributeError:
|
174 |
+
fig.canvas.draw()
|
175 |
+
buf = fig.canvas.tostring_rgb()
|
176 |
+
cols, rows = fig.canvas.get_width_height()
|
177 |
+
img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
|
178 |
+
result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
179 |
+
plt.close()
|
180 |
+
return result
|
181 |
+
|
182 |
+
# Remark for refactoring: IMO a function should do one thing only, storing the image and plotting should be seperated and do not necessarily need to be class functions but standalone utility functions that the user can chain in his scripts to have more fine-grained control.
|
183 |
+
def plot(self,
|
184 |
+
annotations,
|
185 |
+
output_path,
|
186 |
+
bboxes=None,
|
187 |
+
points=None,
|
188 |
+
point_label=None,
|
189 |
+
mask_random_color=True,
|
190 |
+
better_quality=True,
|
191 |
+
retina=False,
|
192 |
+
withContours=True):
|
193 |
+
if len(annotations) == 0:
|
194 |
+
return None
|
195 |
+
result = self.plot_to_result(
|
196 |
+
annotations,
|
197 |
+
bboxes,
|
198 |
+
points,
|
199 |
+
point_label,
|
200 |
+
mask_random_color,
|
201 |
+
better_quality,
|
202 |
+
retina,
|
203 |
+
withContours,
|
204 |
+
)
|
205 |
+
|
206 |
+
path = os.path.dirname(os.path.abspath(output_path))
|
207 |
+
if not os.path.exists(path):
|
208 |
+
os.makedirs(path)
|
209 |
+
result = result[:, :, ::-1]
|
210 |
+
cv2.imwrite(output_path, result)
|
211 |
+
|
212 |
+
# CPU post process
|
213 |
+
def fast_show_mask(
|
214 |
+
self,
|
215 |
+
annotation,
|
216 |
+
ax,
|
217 |
+
random_color=False,
|
218 |
+
bboxes=None,
|
219 |
+
points=None,
|
220 |
+
pointlabel=None,
|
221 |
+
retinamask=True,
|
222 |
+
target_height=960,
|
223 |
+
target_width=960,
|
224 |
+
):
|
225 |
+
msak_sum = annotation.shape[0]
|
226 |
+
height = annotation.shape[1]
|
227 |
+
weight = annotation.shape[2]
|
228 |
+
#Sort annotations based on area.
|
229 |
+
areas = np.sum(annotation, axis=(1, 2))
|
230 |
+
sorted_indices = np.argsort(areas)
|
231 |
+
annotation = annotation[sorted_indices]
|
232 |
+
|
233 |
+
index = (annotation != 0).argmax(axis=0)
|
234 |
+
if random_color:
|
235 |
+
color = np.random.random((msak_sum, 1, 1, 3))
|
236 |
+
else:
|
237 |
+
color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
|
238 |
+
transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
|
239 |
+
visual = np.concatenate([color, transparency], axis=-1)
|
240 |
+
mask_image = np.expand_dims(annotation, -1) * visual
|
241 |
+
|
242 |
+
show = np.zeros((height, weight, 4))
|
243 |
+
h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
|
244 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
245 |
+
# Use vectorized indexing to update the values of 'show'.
|
246 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
247 |
+
if bboxes is not None:
|
248 |
+
for bbox in bboxes:
|
249 |
+
x1, y1, x2, y2 = bbox
|
250 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
251 |
+
# draw point
|
252 |
+
if points is not None:
|
253 |
+
plt.scatter(
|
254 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
255 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
256 |
+
s=20,
|
257 |
+
c='y',
|
258 |
+
)
|
259 |
+
plt.scatter(
|
260 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
261 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
262 |
+
s=20,
|
263 |
+
c='m',
|
264 |
+
)
|
265 |
+
|
266 |
+
if not retinamask:
|
267 |
+
show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
268 |
+
ax.imshow(show)
|
269 |
+
|
270 |
+
def fast_show_mask_gpu(
|
271 |
+
self,
|
272 |
+
annotation,
|
273 |
+
ax,
|
274 |
+
random_color=False,
|
275 |
+
bboxes=None,
|
276 |
+
points=None,
|
277 |
+
pointlabel=None,
|
278 |
+
retinamask=True,
|
279 |
+
target_height=960,
|
280 |
+
target_width=960,
|
281 |
+
):
|
282 |
+
msak_sum = annotation.shape[0]
|
283 |
+
height = annotation.shape[1]
|
284 |
+
weight = annotation.shape[2]
|
285 |
+
areas = torch.sum(annotation, dim=(1, 2))
|
286 |
+
sorted_indices = torch.argsort(areas, descending=False)
|
287 |
+
annotation = annotation[sorted_indices]
|
288 |
+
# Find the index of the first non-zero value at each position.
|
289 |
+
index = (annotation != 0).to(torch.long).argmax(dim=0)
|
290 |
+
if random_color:
|
291 |
+
color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
|
292 |
+
else:
|
293 |
+
color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
|
294 |
+
30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
|
295 |
+
transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
|
296 |
+
visual = torch.cat([color, transparency], dim=-1)
|
297 |
+
mask_image = torch.unsqueeze(annotation, -1) * visual
|
298 |
+
# Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
|
299 |
+
show = torch.zeros((height, weight, 4)).to(annotation.device)
|
300 |
+
try:
|
301 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
|
302 |
+
except:
|
303 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
|
304 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
305 |
+
# Use vectorized indexing to update the values of 'show'.
|
306 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
307 |
+
show_cpu = show.cpu().numpy()
|
308 |
+
if bboxes is not None:
|
309 |
+
for bbox in bboxes:
|
310 |
+
x1, y1, x2, y2 = bbox
|
311 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
312 |
+
# draw point
|
313 |
+
if points is not None:
|
314 |
+
plt.scatter(
|
315 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
316 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
317 |
+
s=20,
|
318 |
+
c='y',
|
319 |
+
)
|
320 |
+
plt.scatter(
|
321 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
322 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
323 |
+
s=20,
|
324 |
+
c='m',
|
325 |
+
)
|
326 |
+
if not retinamask:
|
327 |
+
show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
328 |
+
ax.imshow(show_cpu)
|
329 |
+
|
330 |
+
# clip
|
331 |
+
@torch.no_grad()
|
332 |
+
def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
|
333 |
+
preprocessed_images = [preprocess(image).to(device) for image in elements]
|
334 |
+
try:
|
335 |
+
import clip # for linear_assignment
|
336 |
+
|
337 |
+
except (ImportError, AssertionError, AttributeError):
|
338 |
+
from ultralytics.yolo.utils.checks import check_requirements
|
339 |
+
|
340 |
+
check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
|
341 |
+
import clip
|
342 |
+
|
343 |
+
|
344 |
+
tokenized_text = clip.tokenize([search_text]).to(device)
|
345 |
+
stacked_images = torch.stack(preprocessed_images)
|
346 |
+
image_features = model.encode_image(stacked_images)
|
347 |
+
text_features = model.encode_text(tokenized_text)
|
348 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
349 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
350 |
+
probs = 100.0 * image_features @ text_features.T
|
351 |
+
return probs[:, 0].softmax(dim=0)
|
352 |
+
|
353 |
+
def _crop_image(self, format_results):
|
354 |
+
|
355 |
+
image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
|
356 |
+
ori_w, ori_h = image.size
|
357 |
+
annotations = format_results
|
358 |
+
mask_h, mask_w = annotations[0]['segmentation'].shape
|
359 |
+
if ori_w != mask_w or ori_h != mask_h:
|
360 |
+
image = image.resize((mask_w, mask_h))
|
361 |
+
cropped_boxes = []
|
362 |
+
cropped_images = []
|
363 |
+
not_crop = []
|
364 |
+
filter_id = []
|
365 |
+
# annotations, _ = filter_masks(annotations)
|
366 |
+
# filter_id = list(_)
|
367 |
+
for _, mask in enumerate(annotations):
|
368 |
+
if np.sum(mask['segmentation']) <= 100:
|
369 |
+
filter_id.append(_)
|
370 |
+
continue
|
371 |
+
bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
|
372 |
+
cropped_boxes.append(self._segment_image(image, bbox))
|
373 |
+
# cropped_boxes.append(segment_image(image,mask["segmentation"]))
|
374 |
+
cropped_images.append(bbox) # Save the bounding box of the cropped image.
|
375 |
+
|
376 |
+
return cropped_boxes, cropped_images, not_crop, filter_id, annotations
|
377 |
+
|
378 |
+
def box_prompt(self, bbox=None, bboxes=None):
|
379 |
+
if self.results == None:
|
380 |
+
return []
|
381 |
+
assert bbox or bboxes
|
382 |
+
if bboxes is None:
|
383 |
+
bboxes = [bbox]
|
384 |
+
max_iou_index = []
|
385 |
+
for bbox in bboxes:
|
386 |
+
assert (bbox[2] != 0 and bbox[3] != 0)
|
387 |
+
masks = self.results[0].masks.data
|
388 |
+
target_height = self.img.shape[0]
|
389 |
+
target_width = self.img.shape[1]
|
390 |
+
h = masks.shape[1]
|
391 |
+
w = masks.shape[2]
|
392 |
+
if h != target_height or w != target_width:
|
393 |
+
bbox = [
|
394 |
+
int(bbox[0] * w / target_width),
|
395 |
+
int(bbox[1] * h / target_height),
|
396 |
+
int(bbox[2] * w / target_width),
|
397 |
+
int(bbox[3] * h / target_height), ]
|
398 |
+
bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
|
399 |
+
bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
|
400 |
+
bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
|
401 |
+
bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
|
402 |
+
|
403 |
+
# IoUs = torch.zeros(len(masks), dtype=torch.float32)
|
404 |
+
bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
|
405 |
+
|
406 |
+
masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
|
407 |
+
orig_masks_area = torch.sum(masks, dim=(1, 2))
|
408 |
+
|
409 |
+
union = bbox_area + orig_masks_area - masks_area
|
410 |
+
IoUs = masks_area / union
|
411 |
+
max_iou_index.append(int(torch.argmax(IoUs)))
|
412 |
+
max_iou_index = list(set(max_iou_index))
|
413 |
+
return np.array(masks[max_iou_index].cpu().numpy())
|
414 |
+
|
415 |
+
def point_prompt(self, points, pointlabel): # numpy
|
416 |
+
if self.results == None:
|
417 |
+
return []
|
418 |
+
masks = self._format_results(self.results[0], 0)
|
419 |
+
target_height = self.img.shape[0]
|
420 |
+
target_width = self.img.shape[1]
|
421 |
+
h = masks[0]['segmentation'].shape[0]
|
422 |
+
w = masks[0]['segmentation'].shape[1]
|
423 |
+
if h != target_height or w != target_width:
|
424 |
+
points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
|
425 |
+
onemask = np.zeros((h, w))
|
426 |
+
masks = sorted(masks, key=lambda x: x['area'], reverse=True)
|
427 |
+
for i, annotation in enumerate(masks):
|
428 |
+
if type(annotation) == dict:
|
429 |
+
mask = annotation['segmentation']
|
430 |
+
else:
|
431 |
+
mask = annotation
|
432 |
+
for i, point in enumerate(points):
|
433 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
|
434 |
+
onemask[mask] = 1
|
435 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
|
436 |
+
onemask[mask] = 0
|
437 |
+
onemask = onemask >= 1
|
438 |
+
return np.array([onemask])
|
439 |
+
|
440 |
+
def text_prompt(self, text):
|
441 |
+
if self.results == None:
|
442 |
+
return []
|
443 |
+
format_results = self._format_results(self.results[0], 0)
|
444 |
+
cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
|
445 |
+
clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
|
446 |
+
scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
|
447 |
+
max_idx = scores.argsort()
|
448 |
+
max_idx = max_idx[-1]
|
449 |
+
max_idx += sum(np.array(filter_id) <= int(max_idx))
|
450 |
+
return np.array([annotations[max_idx]['segmentation']])
|
451 |
+
|
452 |
+
def everything_prompt(self):
|
453 |
+
if self.results == None:
|
454 |
+
return []
|
455 |
+
return self.results[0].masks.data
|
456 |
+
|
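The class above is driven by `run_test.py` further down in this upload. As a quick orientation, here is a minimal, hedged sketch of that flow; `everything_results` stands for the list of ultralytics `Results` produced by the model post-processing and is assumed to already exist.

```python
# Sketch only: assumes `everything_results` (list of ultralytics Results) has
# already been produced by the model's post-processing, as in run_test.py.
from prompt import FastSAMPrompt

img_path = "python/dogs.jpg"
prompt_process = FastSAMPrompt(img_path, everything_results, device="cpu")

ann = prompt_process.everything_prompt()                                   # all masks
# ann = prompt_process.point_prompt(points=[[320, 240]], pointlabel=[1])   # or one point
prompt_process.plot(annotations=ann, output_path="python/dogs_result.jpg")
```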
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/run_test.py
ADDED
@@ -0,0 +1,224 @@
import os
import sys
import cv2
import numpy as np
import onnxruntime
import time
import matplotlib.pyplot as plt
import torch
from ultralytics.engine.results import Results
from tools_pt import *
from prompt import FastSAMPrompt
import aidlite
import argparse
import ast

# Cosine-similarity metric between two flattened outputs
def get_acc(onnx_out, other_out):
    cosine_similarity = np.dot(np.array(onnx_out), np.array(other_out)) / (np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
    return cosine_similarity

def cal_sigmoid(x):
    return 1 / (1 + np.exp(-x))

class qnn_predict(object):
    def __init__(self, inputshape, outputshape, args) -> None:
        aidlite.set_log_level(aidlite.LogLevel.INFO)
        aidlite.log_to_stderr()
        print(f"Aidlite library version : {aidlite.get_library_version()}")
        print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
        config = aidlite.Config.create_instance()
        if config is None:
            print("Create config failed !")
        config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        config.framework_type = aidlite.FrameworkType.TYPE_QNN
        config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        config.is_quantify_model = 1

        model = aidlite.Model.create_instance(args.target_model)
        if model is None:
            print("Create model failed !")

        self.input_shape = inputshape
        self.out_shape = outputshape
        model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")
        print("detect model load success!")

        self.conf = 0.4
        self.iou = 0.9
        self.size = 640
        self.agnostic_nms = False
        self.max_det = 300
        self.names = ['object']
        self.classes = None
        self.retina_masks = True

    def pretreat_img(self, img):
        scale = 1 / 255.
        img_size = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_LINEAR)
        float_img = img_size.astype('float32')
        float_img = float_img * scale
        float_img = float_img[:, :, ::-1]
        return float_img

    def postprocess(self, preds, img, orig_imgs):
        """TODO: filter by classes."""
        p = non_max_suppression(torch.from_numpy(preds[0]),
                                self.conf,
                                self.iou,
                                agnostic=self.agnostic_nms,
                                max_det=self.max_det,
                                nc=len(self.names),
                                classes=self.classes)

        results = []
        if len(p) == 0 or len(p[0]) == 0:
            print("No object detected.")
            return results

        full_box = torch.zeros_like(p[0][0])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        # proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        proto = torch.from_numpy(preds[-1])
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = img[0]  # self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # save empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
                continue
            if self.retina_masks:
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
        return results

    def qnn_run(self, orig_imgs, img_path, args):
        input_img_f = self.pretreat_img(orig_imgs)  # resize image, HWC
        # print("qnn_input:",input_img_f)
        # encoder texts
        input_img = np.expand_dims(input_img_f, 0)

        invoke_time = []
        for i in range(args.invoke_nums):
            result = self.interpreter.set_input_tensor(0, input_img.data)
            t0 = time.time()
            result = self.interpreter.invoke()
            t1 = time.time()
            cost_time = (t1 - t0) * 1000
            invoke_time.append(cost_time)
        mask_ = self.interpreter.get_output_tensor(0)
        concat_ = self.interpreter.get_output_tensor(1)
        mul_ = self.interpreter.get_output_tensor(3)
        split_ = self.interpreter.get_output_tensor(2)
        mask_ = mask_.reshape(*self.out_shape[3])
        mask_ = mask_.transpose((0, 3, 1, 2))
        concat_ = concat_.reshape(*self.out_shape[2])
        mul_ = mul_.reshape(*self.out_shape[1])
        split_ = split_.reshape(*self.out_shape[0])
        sig_ = cal_sigmoid(split_)

        output_concat = np.concatenate((mul_, sig_), axis=1)
        output_concat = np.concatenate((output_concat, concat_), axis=1)

        # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
        ## timing statistics
        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / args.invoke_nums
        var_invoketime = np.var(invoke_time)
        print("========================================")
        print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("========================================")

        qnn_out = [np.array(output_concat), np.array(mask_)]
        # print("qnn predict out:",qnn_out)

        nchw_img = input_img.transpose(0, 3, 1, 2)
        everything_results = self.postprocess(qnn_out, nchw_img, [orig_imgs])
        # print("everything_results: ",everything_results)

        prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")

        # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
        try:
            if args.point_prompt == [[0, 0]]:
                ann = prompt_process.everything_prompt()
            else:
                ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
            out_name = os.path.basename(img_path).split(".")[0]
            if True:  # save picture
                outpath = "python/"
                if not os.path.exists(outpath):
                    os.mkdir(outpath)
                prompt_process.plot(
                    annotations=ann,
                    output_path=os.path.join(outpath, out_name + "_result_int8.jpg"),
                    mask_random_color=True,
                    better_quality=True,
                    retina=False,
                    withContours=True,
                )
            else:
                plt.figure()
                prompt_process.fast_show_mask(annotation=ann,
                                              ax=plt)
        except Exception as e:
            print(f"Warning : An error occurred in the picture {img_path} prediction -{e}")
        return [mask_.reshape(-1), output_concat.reshape(-1)]


def parser_args():
    parser = argparse.ArgumentParser(description="Run model benchmarks")
    parser.add_argument('--target_model', type=str, default='models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin.aidem', help="inference model path")
    parser.add_argument('--source_model', type=str, default='models/fastsam_x.onnx', help="original model path")
    parser.add_argument('--imgs', type=str, default='python/dogs.jpg', help="Predict images path")
    parser.add_argument('--invoke_nums', type=int, default=10, help="Inference nums")
    parser.add_argument('--point_prompt', type=str, default="[[0,0]]", help="example:[[x1,y1],[x2,y2]]")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parser_args()
    inputshape = [[1, 640, 640, 3]]
    outputshape = [[1, 1, 8400], [1, 4, 8400], [1, 32, 8400], [1, 160, 160, 32]]
    args.point_prompt = ast.literal_eval(args.point_prompt)

    predict = qnn_predict(inputshape, outputshape, args)
    if os.path.isdir(args.imgs):
        img_files = os.listdir(args.imgs)
        for fi in img_files:
            img_path = os.path.join(args.imgs, fi)
            im0s = cv2.imread(img_path)  # BGR
            im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
            predict.qnn_run(im0s, img_path, args)
    else:
        img_path = args.imgs
        im0s = cv2.imread(img_path)  # BGR
        im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
        qnn_result = predict.qnn_run(im0s, img_path, args)
    print("Prediction completion and the results are saved !")
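Note that `get_acc()` above is defined but never called. A hedged sketch of one plausible use, comparing the flattened QNN outputs returned by `qnn_run()` with an ONNX baseline loaded from `--source_model`, is shown below; the names `args`, `predict`, `im0s` and `qnn_result` are taken from the `__main__` block above, and the ONNX input/output names follow `onnx_export.py`.

```python
# Hedged sketch, not part of run_test.py: compare QNN vs. ONNX outputs with get_acc().
# Assumes the single-image branch of __main__ has just run (args, predict, im0s, qnn_result).
onnx_sess = onnxruntime.InferenceSession(args.source_model, providers=["CPUExecutionProvider"])
nchw = np.ascontiguousarray(np.expand_dims(predict.pretreat_img(im0s), 0).transpose(0, 3, 1, 2))
boxes_onnx, mask_onnx = onnx_sess.run(None, {"input": nchw})   # output order follows onnx_export.py

qnn_mask, qnn_boxes = qnn_result          # [mask_.reshape(-1), output_concat.reshape(-1)]
print("mask  cosine similarity:", get_acc(mask_onnx.reshape(-1), qnn_mask))
print("boxes cosine similarity:", get_acc(boxes_onnx.reshape(-1), qnn_boxes))
```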
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/tools_pt.py
ADDED
@@ -0,0 +1,372 @@
1 |
+
import numpy as np
|
2 |
+
import time
|
3 |
+
import torch
|
4 |
+
import torchvision
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
def clip_boxes(boxes, shape):
|
10 |
+
"""
|
11 |
+
Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
boxes (torch.Tensor): the bounding boxes to clip
|
15 |
+
shape (tuple): the shape of the image
|
16 |
+
"""
|
17 |
+
if isinstance(boxes, torch.Tensor): # faster individually
|
18 |
+
boxes[..., 0].clamp_(0, shape[1]) # x1
|
19 |
+
boxes[..., 1].clamp_(0, shape[0]) # y1
|
20 |
+
boxes[..., 2].clamp_(0, shape[1]) # x2
|
21 |
+
boxes[..., 3].clamp_(0, shape[0]) # y2
|
22 |
+
else: # np.array (faster grouped)
|
23 |
+
boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
|
24 |
+
boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
|
25 |
+
|
26 |
+
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
|
27 |
+
"""
|
28 |
+
Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
|
29 |
+
(img1_shape) to the shape of a different image (img0_shape).
|
30 |
+
|
31 |
+
Args:
|
32 |
+
img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
|
33 |
+
boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
|
34 |
+
img0_shape (tuple): the shape of the target image, in the format of (height, width).
|
35 |
+
ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
|
36 |
+
calculated based on the size difference between the two images.
|
37 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
38 |
+
rescaling.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
|
42 |
+
"""
|
43 |
+
if ratio_pad is None: # calculate from img0_shape
|
44 |
+
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
|
45 |
+
pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
|
46 |
+
(img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
|
47 |
+
else:
|
48 |
+
gain = ratio_pad[0][0]
|
49 |
+
pad = ratio_pad[1]
|
50 |
+
|
51 |
+
if padding:
|
52 |
+
boxes[..., [0, 2]] -= pad[0] # x padding
|
53 |
+
boxes[..., [1, 3]] -= pad[1] # y padding
|
54 |
+
boxes[..., :4] /= gain
|
55 |
+
clip_boxes(boxes, img0_shape)
|
56 |
+
return boxes
|
57 |
+
|
58 |
+
|
59 |
+
def xywh2xyxy(x):
|
60 |
+
"""
|
61 |
+
Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
|
62 |
+
top-left corner and (x2, y2) is the bottom-right corner.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
|
69 |
+
"""
|
70 |
+
assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
|
71 |
+
y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
|
72 |
+
dw = x[..., 2] / 2 # half-width
|
73 |
+
dh = x[..., 3] / 2 # half-height
|
74 |
+
y[..., 0] = x[..., 0] - dw # top left x
|
75 |
+
y[..., 1] = x[..., 1] - dh # top left y
|
76 |
+
y[..., 2] = x[..., 0] + dw # bottom right x
|
77 |
+
y[..., 3] = x[..., 1] + dh # bottom right y
|
78 |
+
return y
|
79 |
+
|
80 |
+
|
81 |
+
def non_max_suppression(
|
82 |
+
prediction,
|
83 |
+
conf_thres=0.25,
|
84 |
+
iou_thres=0.45,
|
85 |
+
classes=None,
|
86 |
+
agnostic=False,
|
87 |
+
multi_label=False,
|
88 |
+
labels=(),
|
89 |
+
max_det=300,
|
90 |
+
nc=0, # number of classes (optional)
|
91 |
+
max_time_img=0.05,
|
92 |
+
max_nms=30000,
|
93 |
+
max_wh=7680,
|
94 |
+
):
|
95 |
+
"""
|
96 |
+
Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
|
97 |
+
|
98 |
+
Args:
|
99 |
+
prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
|
100 |
+
containing the predicted boxes, classes, and masks. The tensor should be in the format
|
101 |
+
output by a model, such as YOLO.
|
102 |
+
conf_thres (float): The confidence threshold below which boxes will be filtered out.
|
103 |
+
Valid values are between 0.0 and 1.0.
|
104 |
+
iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
|
105 |
+
Valid values are between 0.0 and 1.0.
|
106 |
+
classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
|
107 |
+
agnostic (bool): If True, the model is agnostic to the number of classes, and all
|
108 |
+
classes will be considered as one.
|
109 |
+
multi_label (bool): If True, each box may have multiple labels.
|
110 |
+
labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
|
111 |
+
list contains the apriori labels for a given image. The list should be in the format
|
112 |
+
output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
|
113 |
+
max_det (int): The maximum number of boxes to keep after NMS.
|
114 |
+
nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
|
115 |
+
max_time_img (float): The maximum time (seconds) for processing one image.
|
116 |
+
max_nms (int): The maximum number of boxes into torchvision.ops.nms().
|
117 |
+
max_wh (int): The maximum box width and height in pixels
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
(List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
|
121 |
+
shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
|
122 |
+
(x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
|
123 |
+
"""
|
124 |
+
|
125 |
+
# Checks
|
126 |
+
assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
|
127 |
+
assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
|
128 |
+
if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation model, output = (inference_out, loss_out)
|
129 |
+
prediction = prediction[0] # select only inference output
|
130 |
+
|
131 |
+
device = prediction.device
|
132 |
+
mps = 'mps' in device.type # Apple MPS
|
133 |
+
if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
|
134 |
+
prediction = prediction.cpu()
|
135 |
+
bs = prediction.shape[0] # batch size
|
136 |
+
nc = nc or (prediction.shape[1] - 4) # number of classes
|
137 |
+
nm = prediction.shape[1] - nc - 4
|
138 |
+
mi = 4 + nc # mask start index
|
139 |
+
xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
|
140 |
+
|
141 |
+
# Settings
|
142 |
+
# min_wh = 2 # (pixels) minimum box width and height
|
143 |
+
time_limit = 0.5 + max_time_img * bs # seconds to quit after
|
144 |
+
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
|
145 |
+
|
146 |
+
prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
|
147 |
+
prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
|
148 |
+
|
149 |
+
t = time.time()
|
150 |
+
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
|
151 |
+
for xi, x in enumerate(prediction): # image index, image inference
|
152 |
+
# Apply constraints
|
153 |
+
# x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
|
154 |
+
x = x[xc[xi]] # confidence
|
155 |
+
|
156 |
+
# Cat apriori labels if autolabelling
|
157 |
+
if labels and len(labels[xi]):
|
158 |
+
lb = labels[xi]
|
159 |
+
v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
|
160 |
+
v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
|
161 |
+
v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
|
162 |
+
x = torch.cat((x, v), 0)
|
163 |
+
|
164 |
+
# If none remain process next image
|
165 |
+
if not x.shape[0]:
|
166 |
+
continue
|
167 |
+
|
168 |
+
# Detections matrix nx6 (xyxy, conf, cls)
|
169 |
+
box, cls, mask = x.split((4, nc, nm), 1)
|
170 |
+
|
171 |
+
if multi_label:
|
172 |
+
i, j = torch.where(cls > conf_thres)
|
173 |
+
x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
|
174 |
+
else: # best class only
|
175 |
+
conf, j = cls.max(1, keepdim=True)
|
176 |
+
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
|
177 |
+
|
178 |
+
# Filter by class
|
179 |
+
if classes is not None:
|
180 |
+
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
|
181 |
+
|
182 |
+
# Check shape
|
183 |
+
n = x.shape[0] # number of boxes
|
184 |
+
if not n: # no boxes
|
185 |
+
continue
|
186 |
+
if n > max_nms: # excess boxes
|
187 |
+
x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
|
188 |
+
|
189 |
+
# Batched NMS
|
190 |
+
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
|
191 |
+
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
|
192 |
+
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
|
193 |
+
i = i[:max_det] # limit detections
|
194 |
+
|
195 |
+
# # Experimental
|
196 |
+
# merge = False # use merge-NMS
|
197 |
+
# if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
|
198 |
+
# # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
|
199 |
+
# from .metrics import box_iou
|
200 |
+
# iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
|
201 |
+
# weights = iou * scores[None] # box weights
|
202 |
+
# x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
|
203 |
+
# redundant = True # require redundant detections
|
204 |
+
# if redundant:
|
205 |
+
# i = i[iou.sum(1) > 1] # require redundancy
|
206 |
+
|
207 |
+
output[xi] = x[i]
|
208 |
+
if mps:
|
209 |
+
output[xi] = output[xi].to(device)
|
210 |
+
# if (time.time() - t) > time_limit:
|
211 |
+
# LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
|
212 |
+
# break # time limit exceeded
|
213 |
+
|
214 |
+
return output
|
215 |
+
|
216 |
+
|
217 |
+
def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
|
218 |
+
'''Adjust bounding boxes to stick to image border if they are within a certain threshold.
|
219 |
+
Args:
|
220 |
+
boxes: (n, 4)
|
221 |
+
image_shape: (height, width)
|
222 |
+
threshold: pixel threshold
|
223 |
+
Returns:
|
224 |
+
adjusted_boxes: adjusted bounding boxes
|
225 |
+
'''
|
226 |
+
|
227 |
+
# Image dimensions
|
228 |
+
h, w = image_shape
|
229 |
+
|
230 |
+
# Adjust boxes
|
231 |
+
boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
|
232 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
|
233 |
+
boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
|
234 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
|
235 |
+
boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
|
236 |
+
w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
|
237 |
+
boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
|
238 |
+
h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
|
239 |
+
|
240 |
+
return boxes
|
241 |
+
|
242 |
+
def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
|
243 |
+
'''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
|
244 |
+
Args:
|
245 |
+
box1: (4, )
|
246 |
+
boxes: (n, 4)
|
247 |
+
Returns:
|
248 |
+
high_iou_indices: Indices of boxes with IoU > thres
|
249 |
+
'''
|
250 |
+
boxes = adjust_bboxes_to_image_border(boxes, image_shape)
|
251 |
+
# obtain coordinates for intersections
|
252 |
+
x1 = torch.max(box1[0], boxes[:, 0])
|
253 |
+
y1 = torch.max(box1[1], boxes[:, 1])
|
254 |
+
x2 = torch.min(box1[2], boxes[:, 2])
|
255 |
+
y2 = torch.min(box1[3], boxes[:, 3])
|
256 |
+
|
257 |
+
# compute the area of intersection
|
258 |
+
intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
|
259 |
+
|
260 |
+
# compute the area of both individual boxes
|
261 |
+
box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
262 |
+
box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
|
263 |
+
|
264 |
+
# compute the area of union
|
265 |
+
union = box1_area + box2_area - intersection
|
266 |
+
|
267 |
+
# compute the IoU
|
268 |
+
iou = intersection / union # Should be shape (n, )
|
269 |
+
if raw_output:
|
270 |
+
if iou.numel() == 0:
|
271 |
+
return 0
|
272 |
+
return iou
|
273 |
+
|
274 |
+
# get indices of boxes with IoU > thres
|
275 |
+
high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
|
276 |
+
|
277 |
+
return high_iou_indices
|
278 |
+
|
279 |
+
|
280 |
+
def scale_masks(masks, shape, padding=True):
|
281 |
+
"""
|
282 |
+
Rescale segment masks to shape.
|
283 |
+
|
284 |
+
Args:
|
285 |
+
masks (torch.Tensor): (N, C, H, W).
|
286 |
+
shape (tuple): Height and width.
|
287 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
288 |
+
rescaling.
|
289 |
+
"""
|
290 |
+
mh, mw = masks.shape[2:]
|
291 |
+
gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
|
292 |
+
pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
|
293 |
+
if padding:
|
294 |
+
pad[0] /= 2
|
295 |
+
pad[1] /= 2
|
296 |
+
top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
|
297 |
+
bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
|
298 |
+
masks = masks[..., top:bottom, left:right]
|
299 |
+
|
300 |
+
masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
|
301 |
+
return masks
|
302 |
+
|
303 |
+
|
304 |
+
def process_mask_native(protos, masks_in, bboxes, shape):
|
305 |
+
"""
|
306 |
+
It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
|
307 |
+
|
308 |
+
Args:
|
309 |
+
protos (torch.Tensor): [mask_dim, mask_h, mask_w]
|
310 |
+
masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
|
311 |
+
bboxes (torch.Tensor): [n, 4], n is number of masks after nms
|
312 |
+
shape (tuple): the size of the input image (h,w)
|
313 |
+
|
314 |
+
Returns:
|
315 |
+
masks (torch.Tensor): The returned masks with dimensions [h, w, n]
|
316 |
+
"""
|
317 |
+
c, mh, mw = protos.shape # CHW
|
318 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
|
319 |
+
masks = scale_masks(masks[None], shape)[0] # CHW
|
320 |
+
masks = crop_mask(masks, bboxes) # CHW
|
321 |
+
return masks.gt_(0.5)
|
322 |
+
|
323 |
+
def crop_mask(masks, boxes):
|
324 |
+
"""
|
325 |
+
It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
|
326 |
+
|
327 |
+
Args:
|
328 |
+
masks (torch.Tensor): [n, h, w] tensor of masks
|
329 |
+
boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
|
330 |
+
|
331 |
+
Returns:
|
332 |
+
(torch.Tensor): The masks are being cropped to the bounding box.
|
333 |
+
"""
|
334 |
+
_, h, w = masks.shape
|
335 |
+
x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
|
336 |
+
r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
|
337 |
+
c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
|
338 |
+
|
339 |
+
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
|
340 |
+
|
341 |
+
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
|
342 |
+
"""
|
343 |
+
Apply masks to bounding boxes using the output of the mask head.
|
344 |
+
|
345 |
+
Args:
|
346 |
+
protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
|
347 |
+
masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
|
348 |
+
bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
|
349 |
+
shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
|
350 |
+
upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
|
351 |
+
|
352 |
+
Returns:
|
353 |
+
(torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
|
354 |
+
are the height and width of the input image. The mask is applied to the bounding boxes.
|
355 |
+
"""
|
356 |
+
|
357 |
+
c, mh, mw = protos.shape # CHW
|
358 |
+
ih, iw = shape
|
359 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
|
360 |
+
|
361 |
+
downsampled_bboxes = bboxes.clone()
|
362 |
+
downsampled_bboxes[:, 0] *= mw / iw
|
363 |
+
downsampled_bboxes[:, 2] *= mw / iw
|
364 |
+
downsampled_bboxes[:, 3] *= mh / ih
|
365 |
+
downsampled_bboxes[:, 1] *= mh / ih
|
366 |
+
|
367 |
+
masks = crop_mask(masks, downsampled_bboxes) # CHW
|
368 |
+
if upsample:
|
369 |
+
masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
|
370 |
+
return masks.gt_(0.5)
|
371 |
+
|
372 |
+
|
model_farm_fastsamx_qcs6490_qnn2.16_int8_aidlite/python/utils.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np
import torch
from PIL import Image


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def image_to_np_ndarray(image):
    if type(image) is str:
        return np.array(Image.open(image))
    elif issubclass(type(image), Image.Image):
        return np.array(image)
    elif type(image) is np.ndarray:
        return image
    return None
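A small illustrative sketch of the two box helpers above (the coordinate values are made up):

```python
# Convert an xywh prompt box to xyxy and score it against candidate boxes; values are illustrative.
import torch
from utils import convert_box_xywh_to_xyxy, bbox_iou

prompt_box = convert_box_xywh_to_xyxy([100, 100, 200, 150])   # -> [100, 100, 300, 250]
candidates = torch.tensor([[110.0, 105.0, 290.0, 240.0],
                           [400.0, 300.0, 500.0, 380.0]])

iou = bbox_iou(torch.tensor(prompt_box, dtype=torch.float), candidates,
               image_shape=(640, 640), raw_output=True)
print(iou)   # per-candidate IoU: the first box overlaps strongly, the second not at all
```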
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/README.md
ADDED
@@ -0,0 +1,48 @@
## Model Information
### Source model
- Input shape: 640x640
- Number of parameters: 68.89M
- Model size: 277.39M
- Output shape: 1x37x8400, 1x32x160x160

Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)

### Converted model

- Precision: FP16
- Backend: QNN2.16
- Target Device: SNM972 QCS8550

## Inference with AidLite SDK

### SDK installation
Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)

- Install AidLite SDK

```bash
# Install the appropriate version of the aidlite sdk
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# aidlite sdk c++ check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# aidlite sdk python check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
```bash
cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite
export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0

python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_fp16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
```
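Beyond the default "everything" mode, `run_test.py` also accepts a point prompt via `--point_prompt` (its argparse default `"[[0,0]]"` keeps the everything prompt); the pixel coordinates below are only an example:

```bash
# Optional: segment only the object under a given pixel instead of everything
python3 ./python/run_test.py \
    --target_model ./models/cutoff_fastsam_x_fp16.qnn216.ctx.bin \
    --imgs ./python/dogs.jpg \
    --invoke_nums 10 \
    --point_prompt "[[320,320]]"
```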
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/models/cutoff_fastsam_x_fp16.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4dc4b65549ba459736b2fcfe9cd190c63d70108c3edd5c35f9af310af19e5871
size 148172000
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/dogs.jpg
ADDED
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/onnx_export.py
ADDED
@@ -0,0 +1,50 @@
import torch
import cv2
import os
import sys

from ultralytics.models.fastsam import FastSAM

class Fast_SAM(torch.nn.Module):
    """Exportable FastSAM model, end-to-end."""

    def __init__(self) -> None:
        super().__init__()
        pt_name = './models/FastSAM-s.pt'
        self.model = FastSAM(pt_name).model

    def forward(self, image: torch.Tensor):
        """
        Run FastSAM on `image`, and produce high quality segmentation masks.
        Faster than SAM as it is based on YOLOv8.

        Parameters:
            image: Pixel values pre-processed for encoder consumption.
                   Range: float[0, 1]
                   3-channel Color Space: BGR
        Returns:

        """
        predictions = self.model(image)
        # Return predictions as a tuple instead of nested tuple.
        return (predictions[0], predictions[1][2])


model = Fast_SAM()
num_params = sum(p.numel() for p in model.parameters())
print(f'Number of FastSAM-s parameters: {num_params}')
dummy_input = torch.randn([1, 3, 640, 640], dtype=torch.float32)
source_model = torch.jit.trace(
    model.to("cpu"), dummy_input, check_trace=False
)
torch.onnx.export(model,                      # model being run
                  dummy_input,                # model input (or a tuple for multiple inputs)
                  "./models/fastsam_s.onnx",  # where to save the model
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=12,           # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names=['input'],      # the model's input names
                  output_names=['boxes', 'mask'],
                  verbose=True,
                  )
print("Convert to onnx successfully!")
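As an optional sanity check (a sketch, not part of the upload), the exported graph can be loaded back with onnxruntime; the input name `'input'`, the output order and the file path follow the export call above:

```python
# Load the exported ONNX model and confirm the output shapes on a dummy input.
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession("./models/fastsam_s.onnx",
                                    providers=["CPUExecutionProvider"])
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
boxes, mask = sess.run(None, {"input": dummy})
print(boxes.shape, mask.shape)   # expected: (1, 37, 8400) (1, 32, 160, 160)
```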
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/prompt.py
ADDED
@@ -0,0 +1,456 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import cv2
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from utils import image_to_np_ndarray
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
class FastSAMPrompt:
|
12 |
+
|
13 |
+
def __init__(self, image, results, device='cpu'):
|
14 |
+
if isinstance(image, str) or isinstance(image, Image.Image):
|
15 |
+
image = image_to_np_ndarray(image)
|
16 |
+
self.device = device
|
17 |
+
self.results = results
|
18 |
+
self.img = image
|
19 |
+
|
20 |
+
def _segment_image(self, image, bbox):
|
21 |
+
if isinstance(image, Image.Image):
|
22 |
+
image_array = np.array(image)
|
23 |
+
else:
|
24 |
+
image_array = image
|
25 |
+
segmented_image_array = np.zeros_like(image_array)
|
26 |
+
x1, y1, x2, y2 = bbox
|
27 |
+
segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
|
28 |
+
segmented_image = Image.fromarray(segmented_image_array)
|
29 |
+
black_image = Image.new('RGB', image.size, (255, 255, 255))
|
30 |
+
# transparency_mask = np.zeros_like((), dtype=np.uint8)
|
31 |
+
transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
|
32 |
+
transparency_mask[y1:y2, x1:x2] = 255
|
33 |
+
transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
|
34 |
+
black_image.paste(segmented_image, mask=transparency_mask_image)
|
35 |
+
return black_image
|
36 |
+
|
37 |
+
def _format_results(self, result, filter=0):
|
38 |
+
annotations = []
|
39 |
+
n = len(result.masks.data)
|
40 |
+
for i in range(n):
|
41 |
+
annotation = {}
|
42 |
+
mask = result.masks.data[i] == 1.0
|
43 |
+
|
44 |
+
if torch.sum(mask) < filter:
|
45 |
+
continue
|
46 |
+
annotation['id'] = i
|
47 |
+
annotation['segmentation'] = mask.cpu().numpy()
|
48 |
+
annotation['bbox'] = result.boxes.data[i]
|
49 |
+
annotation['score'] = result.boxes.conf[i]
|
50 |
+
annotation['area'] = annotation['segmentation'].sum()
|
51 |
+
annotations.append(annotation)
|
52 |
+
return annotations
|
53 |
+
|
54 |
+
def filter_masks(annotations):  # filter overlapping masks
|
55 |
+
annotations.sort(key=lambda x: x['area'], reverse=True)
|
56 |
+
to_remove = set()
|
57 |
+
for i in range(0, len(annotations)):
|
58 |
+
a = annotations[i]
|
59 |
+
for j in range(i + 1, len(annotations)):
|
60 |
+
b = annotations[j]
|
61 |
+
if i != j and j not in to_remove:
|
62 |
+
# check if
|
63 |
+
if b['area'] < a['area']:
|
64 |
+
if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
|
65 |
+
to_remove.add(j)
|
66 |
+
|
67 |
+
return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
|
68 |
+
|
69 |
+
def _get_bbox_from_mask(self, mask):
|
70 |
+
mask = mask.astype(np.uint8)
|
71 |
+
contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
72 |
+
x1, y1, w, h = cv2.boundingRect(contours[0])
|
73 |
+
x2, y2 = x1 + w, y1 + h
|
74 |
+
if len(contours) > 1:
|
75 |
+
for b in contours:
|
76 |
+
x_t, y_t, w_t, h_t = cv2.boundingRect(b)
|
77 |
+
# Merge multiple bounding boxes into one.
|
78 |
+
x1 = min(x1, x_t)
|
79 |
+
y1 = min(y1, y_t)
|
80 |
+
x2 = max(x2, x_t + w_t)
|
81 |
+
y2 = max(y2, y_t + h_t)
|
82 |
+
h = y2 - y1
|
83 |
+
w = x2 - x1
|
84 |
+
return [x1, y1, x2, y2]
|
85 |
+
|
86 |
+
def plot_to_result(self,
|
87 |
+
annotations,
|
88 |
+
bboxes=None,
|
89 |
+
points=None,
|
90 |
+
point_label=None,
|
91 |
+
mask_random_color=True,
|
92 |
+
better_quality=True,
|
93 |
+
retina=False,
|
94 |
+
withContours=True) -> np.ndarray:
|
95 |
+
if isinstance(annotations[0], dict):
|
96 |
+
annotations = [annotation['segmentation'] for annotation in annotations]
|
97 |
+
image = self.img
|
98 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
99 |
+
original_h = image.shape[0]
|
100 |
+
original_w = image.shape[1]
|
101 |
+
if sys.platform == "darwin":
|
102 |
+
plt.switch_backend("TkAgg")
|
103 |
+
plt.figure(figsize=(original_w / 100, original_h / 100))
|
104 |
+
# Add subplot with no margin.
|
105 |
+
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
|
106 |
+
plt.margins(0, 0)
|
107 |
+
plt.gca().xaxis.set_major_locator(plt.NullLocator())
|
108 |
+
plt.gca().yaxis.set_major_locator(plt.NullLocator())
|
109 |
+
|
110 |
+
plt.imshow(image)
|
111 |
+
if better_quality:
|
112 |
+
if isinstance(annotations[0], torch.Tensor):
|
113 |
+
annotations = np.array(annotations.cpu())
|
114 |
+
for i, mask in enumerate(annotations):
|
115 |
+
mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
|
116 |
+
annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
|
117 |
+
if self.device == 'cpu':
|
118 |
+
annotations = np.array(annotations)
|
119 |
+
self.fast_show_mask(
|
120 |
+
annotations,
|
121 |
+
plt.gca(),
|
122 |
+
random_color=mask_random_color,
|
123 |
+
bboxes=bboxes,
|
124 |
+
points=points,
|
125 |
+
pointlabel=point_label,
|
126 |
+
retinamask=retina,
|
127 |
+
target_height=original_h,
|
128 |
+
target_width=original_w,
|
129 |
+
)
|
130 |
+
else:
|
131 |
+
if isinstance(annotations[0], np.ndarray):
|
132 |
+
annotations = torch.from_numpy(annotations)
|
133 |
+
self.fast_show_mask_gpu(
|
134 |
+
annotations,
|
135 |
+
plt.gca(),
|
136 |
+
random_color=mask_random_color,
|
137 |
+
bboxes=bboxes,
|
138 |
+
points=points,
|
139 |
+
pointlabel=point_label,
|
140 |
+
retinamask=retina,
|
141 |
+
target_height=original_h,
|
142 |
+
target_width=original_w,
|
143 |
+
)
|
144 |
+
if isinstance(annotations, torch.Tensor):
|
145 |
+
annotations = annotations.cpu().numpy()
|
146 |
+
if withContours:
|
147 |
+
contour_all = []
|
148 |
+
temp = np.zeros((original_h, original_w, 1))
|
149 |
+
for i, mask in enumerate(annotations):
|
150 |
+
if type(mask) == dict:
|
151 |
+
mask = mask['segmentation']
|
152 |
+
annotation = mask.astype(np.uint8)
|
153 |
+
if not retina:
|
154 |
+
annotation = cv2.resize(
|
155 |
+
annotation,
|
156 |
+
(original_w, original_h),
|
157 |
+
interpolation=cv2.INTER_NEAREST,
|
158 |
+
)
|
159 |
+
contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
160 |
+
for contour in contours:
|
161 |
+
contour_all.append(contour)
|
162 |
+
cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
|
163 |
+
color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
|
164 |
+
contour_mask = temp / 255 * color.reshape(1, 1, -1)
|
165 |
+
plt.imshow(contour_mask)
|
166 |
+
|
167 |
+
plt.axis('off')
|
168 |
+
fig = plt.gcf()
|
169 |
+
plt.draw()
|
170 |
+
|
171 |
+
try:
|
172 |
+
buf = fig.canvas.tostring_rgb()
|
173 |
+
except AttributeError:
|
174 |
+
fig.canvas.draw()
|
175 |
+
buf = fig.canvas.tostring_rgb()
|
176 |
+
cols, rows = fig.canvas.get_width_height()
|
177 |
+
img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
|
178 |
+
result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
179 |
+
plt.close()
|
180 |
+
return result
|
181 |
+
|
182 |
+
# Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be separated, and they do not necessarily need to be class methods but could be standalone utility functions that the user can chain in their scripts for more fine-grained control.
|
183 |
+
def plot(self,
|
184 |
+
annotations,
|
185 |
+
output_path,
|
186 |
+
bboxes=None,
|
187 |
+
points=None,
|
188 |
+
point_label=None,
|
189 |
+
mask_random_color=True,
|
190 |
+
better_quality=True,
|
191 |
+
retina=False,
|
192 |
+
withContours=True):
|
193 |
+
if len(annotations) == 0:
|
194 |
+
return None
|
195 |
+
result = self.plot_to_result(
|
196 |
+
annotations,
|
197 |
+
bboxes,
|
198 |
+
points,
|
199 |
+
point_label,
|
200 |
+
mask_random_color,
|
201 |
+
better_quality,
|
202 |
+
retina,
|
203 |
+
withContours,
|
204 |
+
)
|
205 |
+
|
206 |
+
path = os.path.dirname(os.path.abspath(output_path))
|
207 |
+
if not os.path.exists(path):
|
208 |
+
os.makedirs(path)
|
209 |
+
result = result[:, :, ::-1]
|
210 |
+
cv2.imwrite(output_path, result)
|
211 |
+
|
212 |
+
# CPU post process
|
213 |
+
def fast_show_mask(
|
214 |
+
self,
|
215 |
+
annotation,
|
216 |
+
ax,
|
217 |
+
random_color=False,
|
218 |
+
bboxes=None,
|
219 |
+
points=None,
|
220 |
+
pointlabel=None,
|
221 |
+
retinamask=True,
|
222 |
+
target_height=960,
|
223 |
+
target_width=960,
|
224 |
+
):
|
225 |
+
msak_sum = annotation.shape[0]
|
226 |
+
height = annotation.shape[1]
|
227 |
+
weight = annotation.shape[2]
|
228 |
+
#Sort annotations based on area.
|
229 |
+
areas = np.sum(annotation, axis=(1, 2))
|
230 |
+
sorted_indices = np.argsort(areas)
|
231 |
+
annotation = annotation[sorted_indices]
|
232 |
+
|
233 |
+
index = (annotation != 0).argmax(axis=0)
|
234 |
+
if random_color:
|
235 |
+
color = np.random.random((msak_sum, 1, 1, 3))
|
236 |
+
else:
|
237 |
+
color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
|
238 |
+
transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
|
239 |
+
visual = np.concatenate([color, transparency], axis=-1)
|
240 |
+
mask_image = np.expand_dims(annotation, -1) * visual
|
241 |
+
|
242 |
+
show = np.zeros((height, weight, 4))
|
243 |
+
h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
|
244 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
245 |
+
# Use vectorized indexing to update the values of 'show'.
|
246 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
247 |
+
if bboxes is not None:
|
248 |
+
for bbox in bboxes:
|
249 |
+
x1, y1, x2, y2 = bbox
|
250 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
251 |
+
# draw point
|
252 |
+
if points is not None:
|
253 |
+
plt.scatter(
|
254 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
255 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
256 |
+
s=20,
|
257 |
+
c='y',
|
258 |
+
)
|
259 |
+
plt.scatter(
|
260 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
261 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
262 |
+
s=20,
|
263 |
+
c='m',
|
264 |
+
)
|
265 |
+
|
266 |
+
if not retinamask:
|
267 |
+
show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
268 |
+
ax.imshow(show)
|
269 |
+
|
270 |
+
def fast_show_mask_gpu(
|
271 |
+
self,
|
272 |
+
annotation,
|
273 |
+
ax,
|
274 |
+
random_color=False,
|
275 |
+
bboxes=None,
|
276 |
+
points=None,
|
277 |
+
pointlabel=None,
|
278 |
+
retinamask=True,
|
279 |
+
target_height=960,
|
280 |
+
target_width=960,
|
281 |
+
):
|
282 |
+
msak_sum = annotation.shape[0]
|
283 |
+
height = annotation.shape[1]
|
284 |
+
weight = annotation.shape[2]
|
285 |
+
areas = torch.sum(annotation, dim=(1, 2))
|
286 |
+
sorted_indices = torch.argsort(areas, descending=False)
|
287 |
+
annotation = annotation[sorted_indices]
|
288 |
+
# Find the index of the first non-zero value at each position.
|
289 |
+
index = (annotation != 0).to(torch.long).argmax(dim=0)
|
290 |
+
if random_color:
|
291 |
+
color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
|
292 |
+
else:
|
293 |
+
color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
|
294 |
+
30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
|
295 |
+
transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
|
296 |
+
visual = torch.cat([color, transparency], dim=-1)
|
297 |
+
mask_image = torch.unsqueeze(annotation, -1) * visual
|
298 |
+
# Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
|
299 |
+
show = torch.zeros((height, weight, 4)).to(annotation.device)
|
300 |
+
try:
|
301 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
|
302 |
+
except:
|
303 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
|
304 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
305 |
+
# Use vectorized indexing to update the values of 'show'.
|
306 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
307 |
+
show_cpu = show.cpu().numpy()
|
308 |
+
if bboxes is not None:
|
309 |
+
for bbox in bboxes:
|
310 |
+
x1, y1, x2, y2 = bbox
|
311 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
312 |
+
# draw point
|
313 |
+
if points is not None:
|
314 |
+
plt.scatter(
|
315 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
316 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
317 |
+
s=20,
|
318 |
+
c='y',
|
319 |
+
)
|
320 |
+
plt.scatter(
|
321 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
322 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
323 |
+
s=20,
|
324 |
+
c='m',
|
325 |
+
)
|
326 |
+
if not retinamask:
|
327 |
+
show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
328 |
+
ax.imshow(show_cpu)
|
329 |
+
|
330 |
+
# clip
|
331 |
+
@torch.no_grad()
|
332 |
+
def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
|
333 |
+
preprocessed_images = [preprocess(image).to(device) for image in elements]
|
334 |
+
try:
|
335 |
+
import clip # for linear_assignment
|
336 |
+
|
337 |
+
except (ImportError, AssertionError, AttributeError):
|
338 |
+
from ultralytics.yolo.utils.checks import check_requirements
|
339 |
+
|
340 |
+
check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
|
341 |
+
import clip
|
342 |
+
|
343 |
+
|
344 |
+
tokenized_text = clip.tokenize([search_text]).to(device)
|
345 |
+
stacked_images = torch.stack(preprocessed_images)
|
346 |
+
image_features = model.encode_image(stacked_images)
|
347 |
+
text_features = model.encode_text(tokenized_text)
|
348 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
349 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
350 |
+
probs = 100.0 * image_features @ text_features.T
|
351 |
+
return probs[:, 0].softmax(dim=0)
|
352 |
+
|
353 |
+
def _crop_image(self, format_results):
|
354 |
+
|
355 |
+
image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
|
356 |
+
ori_w, ori_h = image.size
|
357 |
+
annotations = format_results
|
358 |
+
mask_h, mask_w = annotations[0]['segmentation'].shape
|
359 |
+
if ori_w != mask_w or ori_h != mask_h:
|
360 |
+
image = image.resize((mask_w, mask_h))
|
361 |
+
cropped_boxes = []
|
362 |
+
cropped_images = []
|
363 |
+
not_crop = []
|
364 |
+
filter_id = []
|
365 |
+
# annotations, _ = filter_masks(annotations)
|
366 |
+
# filter_id = list(_)
|
367 |
+
for _, mask in enumerate(annotations):
|
368 |
+
if np.sum(mask['segmentation']) <= 100:
|
369 |
+
filter_id.append(_)
|
370 |
+
continue
|
371 |
+
bbox = self._get_bbox_from_mask(mask['segmentation'])  # bounding box of the mask
|
372 |
+
cropped_boxes.append(self._segment_image(image, bbox))
|
373 |
+
# cropped_boxes.append(segment_image(image,mask["segmentation"]))
|
374 |
+
cropped_images.append(bbox) # Save the bounding box of the cropped image.
|
375 |
+
|
376 |
+
return cropped_boxes, cropped_images, not_crop, filter_id, annotations
|
377 |
+
|
378 |
+
def box_prompt(self, bbox=None, bboxes=None):
|
379 |
+
if self.results == None:
|
380 |
+
return []
|
381 |
+
assert bbox or bboxes
|
382 |
+
if bboxes is None:
|
383 |
+
bboxes = [bbox]
|
384 |
+
max_iou_index = []
|
385 |
+
for bbox in bboxes:
|
386 |
+
assert (bbox[2] != 0 and bbox[3] != 0)
|
387 |
+
masks = self.results[0].masks.data
|
388 |
+
target_height = self.img.shape[0]
|
389 |
+
target_width = self.img.shape[1]
|
390 |
+
h = masks.shape[1]
|
391 |
+
w = masks.shape[2]
|
392 |
+
if h != target_height or w != target_width:
|
393 |
+
bbox = [
|
394 |
+
int(bbox[0] * w / target_width),
|
395 |
+
int(bbox[1] * h / target_height),
|
396 |
+
int(bbox[2] * w / target_width),
|
397 |
+
int(bbox[3] * h / target_height), ]
|
398 |
+
bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
|
399 |
+
bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
|
400 |
+
bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
|
401 |
+
bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
|
402 |
+
|
403 |
+
# IoUs = torch.zeros(len(masks), dtype=torch.float32)
|
404 |
+
bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
|
405 |
+
|
406 |
+
masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
|
407 |
+
orig_masks_area = torch.sum(masks, dim=(1, 2))
|
408 |
+
|
409 |
+
union = bbox_area + orig_masks_area - masks_area
|
410 |
+
IoUs = masks_area / union
|
411 |
+
max_iou_index.append(int(torch.argmax(IoUs)))
|
412 |
+
max_iou_index = list(set(max_iou_index))
|
413 |
+
return np.array(masks[max_iou_index].cpu().numpy())
|
414 |
+
|
415 |
+
def point_prompt(self, points, pointlabel): # numpy
|
416 |
+
if self.results == None:
|
417 |
+
return []
|
418 |
+
masks = self._format_results(self.results[0], 0)
|
419 |
+
target_height = self.img.shape[0]
|
420 |
+
target_width = self.img.shape[1]
|
421 |
+
h = masks[0]['segmentation'].shape[0]
|
422 |
+
w = masks[0]['segmentation'].shape[1]
|
423 |
+
if h != target_height or w != target_width:
|
424 |
+
points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
|
425 |
+
onemask = np.zeros((h, w))
|
426 |
+
masks = sorted(masks, key=lambda x: x['area'], reverse=True)
|
427 |
+
for i, annotation in enumerate(masks):
|
428 |
+
if type(annotation) == dict:
|
429 |
+
mask = annotation['segmentation']
|
430 |
+
else:
|
431 |
+
mask = annotation
|
432 |
+
for i, point in enumerate(points):
|
433 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
|
434 |
+
onemask[mask] = 1
|
435 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
|
436 |
+
onemask[mask] = 0
|
437 |
+
onemask = onemask >= 1
|
438 |
+
return np.array([onemask])
|
439 |
+
|
440 |
+
def text_prompt(self, text):
|
441 |
+
if self.results == None:
|
442 |
+
return []
|
443 |
+
format_results = self._format_results(self.results[0], 0)
|
444 |
+
cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
|
445 |
+
clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
|
446 |
+
scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
|
447 |
+
max_idx = scores.argsort()
|
448 |
+
max_idx = max_idx[-1]
|
449 |
+
max_idx += sum(np.array(filter_id) <= int(max_idx))
|
450 |
+
return np.array([annotations[max_idx]['segmentation']])
|
451 |
+
|
452 |
+
def everything_prompt(self):
|
453 |
+
if self.results == None:
|
454 |
+
return []
|
455 |
+
return self.results[0].masks.data
|
456 |
+
|
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/run_test.py
ADDED
@@ -0,0 +1,224 @@
import os
import sys
import cv2
import numpy as np
import onnxruntime
import time
import matplotlib.pyplot as plt
import torch
from ultralytics.engine.results import Results
from tools_pt import *
from prompt import FastSAMPrompt
import aidlite
import argparse
import ast

# Cosine-similarity metric for comparing outputs of different runtimes
def get_acc(onnx_out, other_out):
    cosine_similarity = np.dot(np.array(onnx_out), np.array(other_out)) / (np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
    return cosine_similarity

def cal_sigmoid(x):
    return 1 / (1 + np.exp(-x))

class qnn_predict(object):
    def __init__(self, inputshape, outputshape, args) -> None:
        aidlite.set_log_level(aidlite.LogLevel.INFO)
        aidlite.log_to_stderr()
        print(f"Aidlite library version : {aidlite.get_library_version()}")
        print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
        config = aidlite.Config.create_instance()
        if config is None:
            print("Create config failed !")
        config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        config.framework_type = aidlite.FrameworkType.TYPE_QNN
        config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        config.is_quantify_model = 1

        model = aidlite.Model.create_instance(args.target_model)
        if model is None:
            print("Create model failed !")

        self.input_shape = inputshape
        self.out_shape = outputshape
        model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print(f"interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")
        print("detect model load success!")

        self.conf = 0.4
        self.iou = 0.9
        self.size = 640
        self.agnostic_nms = False
        self.max_det = 300
        self.names = ['object']
        self.classes = None
        self.retina_masks = True

    def pretreat_img(self, img):
        scale = 1 / 255.
        img_size = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_LINEAR)
        float_img = img_size.astype('float32')
        float_img = float_img * scale
        float_img = float_img[:, :, ::-1]
        return float_img

    def postprocess(self, preds, img, orig_imgs):
        """TODO: filter by classes."""
        p = non_max_suppression(torch.from_numpy(preds[0]),
                                self.conf,
                                self.iou,
                                agnostic=self.agnostic_nms,
                                max_det=self.max_det,
                                nc=len(self.names),
                                classes=self.classes)

        results = []
        if len(p) == 0 or len(p[0]) == 0:
            print("No object detected.")
            return results

        full_box = torch.zeros_like(p[0][0])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        # proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        proto = torch.from_numpy(preds[-1])
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = img[0]  # self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # save empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
                continue
            if self.retina_masks:
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
        return results

    def qnn_run(self, orig_imgs, img_path, args):
        input_img_f = self.pretreat_img(orig_imgs)  # resize the image, HWC
        # print("qnn_input:",input_img_f)
        # encoder texts
        input_img = np.expand_dims(input_img_f, 0)

        invoke_time = []
        for i in range(args.invoke_nums):
            result = self.interpreter.set_input_tensor(0, input_img.data)
            t0 = time.time()
            result = self.interpreter.invoke()
            t1 = time.time()
            cost_time = (t1 - t0) * 1000
            invoke_time.append(cost_time)
        mask_ = self.interpreter.get_output_tensor(0)
        concat_ = self.interpreter.get_output_tensor(1)
        mul_ = self.interpreter.get_output_tensor(3)
        split_ = self.interpreter.get_output_tensor(2)
        mask_ = mask_.reshape(*self.out_shape[3])
        mask_ = mask_.transpose((0, 3, 1, 2))
        concat_ = concat_.reshape(*self.out_shape[2])
        mul_ = mul_.reshape(*self.out_shape[1])
        split_ = split_.reshape(*self.out_shape[0])
        sig_ = cal_sigmoid(split_)

        output_concat = np.concatenate((mul_, sig_), axis=1)
        output_concat = np.concatenate((output_concat, concat_), axis=1)

        # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
        ## timing statistics
        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / args.invoke_nums
        var_invoketime = np.var(invoke_time)
        print("========================================")
        print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("========================================")

        qnn_out = [np.array(output_concat), np.array(mask_)]
        # print("qnn predict out:",qnn_out)

        nchw_img = input_img.transpose(0, 3, 1, 2)
        everything_results = self.postprocess(qnn_out, nchw_img, [orig_imgs])
        # print("everything_results: ",everything_results)

        prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")

        # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
        try:
            if args.point_prompt == [[0, 0]]:
                ann = prompt_process.everything_prompt()
            else:
                ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
            out_name = os.path.basename(img_path).split(".")[0]
            if True:  # save the annotated picture
                outpath = "python/"
                if not os.path.exists(outpath):
                    os.mkdir(outpath)
                prompt_process.plot(
                    annotations=ann,
                    output_path=os.path.join(outpath, out_name + "_result.jpg"),
                    mask_random_color=True,
                    better_quality=True,
                    retina=False,
                    withContours=True,
                )
            else:
                plt.figure()
                prompt_process.fast_show_mask(annotation=ann,
                                              ax=plt)
        except Exception as e:
            print(f"Warning : An error occurred in the picture {img_path} prediction -{e}")
        return [mask_.reshape(-1), output_concat.reshape(-1)]


def parser_args():
    parser = argparse.ArgumentParser(description="Run model benchmarks")
    parser.add_argument('--target_model', type=str, default='models/cutoff_fastsam_x_fp16.qnn216.ctx.bin.aidem', help="inference model path")
    parser.add_argument('--source_model', type=str, default='models/fastsam_x.onnx', help="original model path")
    parser.add_argument('--imgs', type=str, default='python/dogs.jpg', help="Predict images path")
    parser.add_argument('--invoke_nums', type=int, default=10, help="Inference nums")
    parser.add_argument('--point_prompt', type=str, default="[[0,0]]", help="example:[[x1,y1],[x2,y2]]")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parser_args()
    inputshape = [[1, 640, 640, 3]]
    outputshape = [[1, 1, 8400], [1, 4, 8400], [1, 32, 8400], [1, 160, 160, 32]]
    args.point_prompt = ast.literal_eval(args.point_prompt)

    predict = qnn_predict(inputshape, outputshape, args)
    if os.path.isdir(args.imgs):
        img_files = os.listdir(args.imgs)
        for fi in img_files:
            img_path = os.path.join(args.imgs, fi)
            im0s = cv2.imread(img_path)  # BGR
            im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
            predict.qnn_run(im0s, img_path, args)
    else:
        img_path = args.imgs
        im0s = cv2.imread(img_path)  # BGR
        im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
        qnn_result = predict.qnn_run(im0s, img_path, args)
    print("Prediction completion and the results are saved !")
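For readers following `qnn_run` above: the cut-off context binary returns four tensors that the script stitches back into the source model's two outputs. The sketch below illustrates that reshape/concatenate step with plain NumPy only; the shapes are taken from the `outputshape` list in run_test.py, and random data stands in for real interpreter outputs.

```python
import numpy as np

def cal_sigmoid(x):
    # Same objectness squashing that run_test.py applies to the split_ tensor
    return 1 / (1 + np.exp(-x))

# Stand-ins for the interpreter.get_output_tensor(...) results
split_ = np.random.randn(1, 1, 8400).astype(np.float32)       # objectness logits
mul_ = np.random.randn(1, 4, 8400).astype(np.float32)         # xywh boxes
concat_ = np.random.randn(1, 32, 8400).astype(np.float32)     # mask coefficients
mask_ = np.random.randn(1, 160, 160, 32).astype(np.float32)   # mask prototypes, NHWC

# Boxes + sigmoid(objectness) + coefficients -> the 1x37x8400 detection head output
output_concat = np.concatenate((mul_, cal_sigmoid(split_), concat_), axis=1)
# Prototypes back to NCHW -> the 1x32x160x160 prototype output
proto = mask_.transpose(0, 3, 1, 2)

print(output_concat.shape, proto.shape)  # (1, 37, 8400) (1, 32, 160, 160)
```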
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/tools_pt.py
ADDED
@@ -0,0 +1,372 @@
1 |
+
import numpy as np
|
2 |
+
import time
|
3 |
+
import torch
|
4 |
+
import torchvision
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
def clip_boxes(boxes, shape):
|
10 |
+
"""
|
11 |
+
Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
boxes (torch.Tensor): the bounding boxes to clip
|
15 |
+
shape (tuple): the shape of the image
|
16 |
+
"""
|
17 |
+
if isinstance(boxes, torch.Tensor): # faster individually
|
18 |
+
boxes[..., 0].clamp_(0, shape[1]) # x1
|
19 |
+
boxes[..., 1].clamp_(0, shape[0]) # y1
|
20 |
+
boxes[..., 2].clamp_(0, shape[1]) # x2
|
21 |
+
boxes[..., 3].clamp_(0, shape[0]) # y2
|
22 |
+
else: # np.array (faster grouped)
|
23 |
+
boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
|
24 |
+
boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
|
25 |
+
|
26 |
+
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
|
27 |
+
"""
|
28 |
+
Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
|
29 |
+
(img1_shape) to the shape of a different image (img0_shape).
|
30 |
+
|
31 |
+
Args:
|
32 |
+
img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
|
33 |
+
boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
|
34 |
+
img0_shape (tuple): the shape of the target image, in the format of (height, width).
|
35 |
+
ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
|
36 |
+
calculated based on the size difference between the two images.
|
37 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
38 |
+
rescaling.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
|
42 |
+
"""
|
43 |
+
if ratio_pad is None: # calculate from img0_shape
|
44 |
+
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
|
45 |
+
pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
|
46 |
+
(img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
|
47 |
+
else:
|
48 |
+
gain = ratio_pad[0][0]
|
49 |
+
pad = ratio_pad[1]
|
50 |
+
|
51 |
+
if padding:
|
52 |
+
boxes[..., [0, 2]] -= pad[0] # x padding
|
53 |
+
boxes[..., [1, 3]] -= pad[1] # y padding
|
54 |
+
boxes[..., :4] /= gain
|
55 |
+
clip_boxes(boxes, img0_shape)
|
56 |
+
return boxes
|
57 |
+
|
58 |
+
|
59 |
+
def xywh2xyxy(x):
|
60 |
+
"""
|
61 |
+
Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
|
62 |
+
top-left corner and (x2, y2) is the bottom-right corner.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
|
69 |
+
"""
|
70 |
+
assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
|
71 |
+
y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
|
72 |
+
dw = x[..., 2] / 2 # half-width
|
73 |
+
dh = x[..., 3] / 2 # half-height
|
74 |
+
y[..., 0] = x[..., 0] - dw # top left x
|
75 |
+
y[..., 1] = x[..., 1] - dh # top left y
|
76 |
+
y[..., 2] = x[..., 0] + dw # bottom right x
|
77 |
+
y[..., 3] = x[..., 1] + dh # bottom right y
|
78 |
+
return y
|
79 |
+
|
80 |
+
|
81 |
+
def non_max_suppression(
|
82 |
+
prediction,
|
83 |
+
conf_thres=0.25,
|
84 |
+
iou_thres=0.45,
|
85 |
+
classes=None,
|
86 |
+
agnostic=False,
|
87 |
+
multi_label=False,
|
88 |
+
labels=(),
|
89 |
+
max_det=300,
|
90 |
+
nc=0, # number of classes (optional)
|
91 |
+
max_time_img=0.05,
|
92 |
+
max_nms=30000,
|
93 |
+
max_wh=7680,
|
94 |
+
):
|
95 |
+
"""
|
96 |
+
Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
|
97 |
+
|
98 |
+
Args:
|
99 |
+
prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
|
100 |
+
containing the predicted boxes, classes, and masks. The tensor should be in the format
|
101 |
+
output by a model, such as YOLO.
|
102 |
+
conf_thres (float): The confidence threshold below which boxes will be filtered out.
|
103 |
+
Valid values are between 0.0 and 1.0.
|
104 |
+
iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
|
105 |
+
Valid values are between 0.0 and 1.0.
|
106 |
+
classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
|
107 |
+
agnostic (bool): If True, the model is agnostic to the number of classes, and all
|
108 |
+
classes will be considered as one.
|
109 |
+
multi_label (bool): If True, each box may have multiple labels.
|
110 |
+
labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
|
111 |
+
list contains the apriori labels for a given image. The list should be in the format
|
112 |
+
output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
|
113 |
+
max_det (int): The maximum number of boxes to keep after NMS.
|
114 |
+
nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
|
115 |
+
max_time_img (float): The maximum time (seconds) for processing one image.
|
116 |
+
max_nms (int): The maximum number of boxes into torchvision.ops.nms().
|
117 |
+
max_wh (int): The maximum box width and height in pixels
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
(List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
|
121 |
+
shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
|
122 |
+
(x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
|
123 |
+
"""
|
124 |
+
|
125 |
+
# Checks
|
126 |
+
assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
|
127 |
+
assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
|
128 |
+
if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation mode, output = (inference_out, loss_out)
|
129 |
+
prediction = prediction[0] # select only inference output
|
130 |
+
|
131 |
+
device = prediction.device
|
132 |
+
mps = 'mps' in device.type # Apple MPS
|
133 |
+
if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
|
134 |
+
prediction = prediction.cpu()
|
135 |
+
bs = prediction.shape[0] # batch size
|
136 |
+
nc = nc or (prediction.shape[1] - 4) # number of classes
|
137 |
+
nm = prediction.shape[1] - nc - 4
|
138 |
+
mi = 4 + nc # mask start index
|
139 |
+
xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
|
140 |
+
|
141 |
+
# Settings
|
142 |
+
# min_wh = 2 # (pixels) minimum box width and height
|
143 |
+
time_limit = 0.5 + max_time_img * bs # seconds to quit after
|
144 |
+
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
|
145 |
+
|
146 |
+
prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
|
147 |
+
prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
|
148 |
+
|
149 |
+
t = time.time()
|
150 |
+
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
|
151 |
+
for xi, x in enumerate(prediction): # image index, image inference
|
152 |
+
# Apply constraints
|
153 |
+
# x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
|
154 |
+
x = x[xc[xi]] # confidence
|
155 |
+
|
156 |
+
# Cat apriori labels if autolabelling
|
157 |
+
if labels and len(labels[xi]):
|
158 |
+
lb = labels[xi]
|
159 |
+
v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
|
160 |
+
v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
|
161 |
+
v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
|
162 |
+
x = torch.cat((x, v), 0)
|
163 |
+
|
164 |
+
# If none remain process next image
|
165 |
+
if not x.shape[0]:
|
166 |
+
continue
|
167 |
+
|
168 |
+
# Detections matrix nx6 (xyxy, conf, cls)
|
169 |
+
box, cls, mask = x.split((4, nc, nm), 1)
|
170 |
+
|
171 |
+
if multi_label:
|
172 |
+
i, j = torch.where(cls > conf_thres)
|
173 |
+
x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
|
174 |
+
else: # best class only
|
175 |
+
conf, j = cls.max(1, keepdim=True)
|
176 |
+
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
|
177 |
+
|
178 |
+
# Filter by class
|
179 |
+
if classes is not None:
|
180 |
+
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
|
181 |
+
|
182 |
+
# Check shape
|
183 |
+
n = x.shape[0] # number of boxes
|
184 |
+
if not n: # no boxes
|
185 |
+
continue
|
186 |
+
if n > max_nms: # excess boxes
|
187 |
+
x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
|
188 |
+
|
189 |
+
# Batched NMS
|
190 |
+
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
|
191 |
+
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
|
192 |
+
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
|
193 |
+
i = i[:max_det] # limit detections
|
194 |
+
|
195 |
+
# # Experimental
|
196 |
+
# merge = False # use merge-NMS
|
197 |
+
# if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
|
198 |
+
# # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
|
199 |
+
# from .metrics import box_iou
|
200 |
+
# iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
|
201 |
+
# weights = iou * scores[None] # box weights
|
202 |
+
# x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
|
203 |
+
# redundant = True # require redundant detections
|
204 |
+
# if redundant:
|
205 |
+
# i = i[iou.sum(1) > 1] # require redundancy
|
206 |
+
|
207 |
+
output[xi] = x[i]
|
208 |
+
if mps:
|
209 |
+
output[xi] = output[xi].to(device)
|
210 |
+
# if (time.time() - t) > time_limit:
|
211 |
+
# LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
|
212 |
+
# break # time limit exceeded
|
213 |
+
|
214 |
+
return output
|
215 |
+
|
216 |
+
|
217 |
+
def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
|
218 |
+
'''Adjust bounding boxes to stick to image border if they are within a certain threshold.
|
219 |
+
Args:
|
220 |
+
boxes: (n, 4)
|
221 |
+
image_shape: (height, width)
|
222 |
+
threshold: pixel threshold
|
223 |
+
Returns:
|
224 |
+
adjusted_boxes: adjusted bounding boxes
|
225 |
+
'''
|
226 |
+
|
227 |
+
# Image dimensions
|
228 |
+
h, w = image_shape
|
229 |
+
|
230 |
+
# Adjust boxes
|
231 |
+
boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
|
232 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
|
233 |
+
boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
|
234 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
|
235 |
+
boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
|
236 |
+
w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
|
237 |
+
boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
|
238 |
+
h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
|
239 |
+
|
240 |
+
return boxes
|
241 |
+
|
242 |
+
def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
|
243 |
+
'''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
|
244 |
+
Args:
|
245 |
+
box1: (4, )
|
246 |
+
boxes: (n, 4)
|
247 |
+
Returns:
|
248 |
+
high_iou_indices: Indices of boxes with IoU > thres
|
249 |
+
'''
|
250 |
+
boxes = adjust_bboxes_to_image_border(boxes, image_shape)
|
251 |
+
# obtain coordinates for intersections
|
252 |
+
x1 = torch.max(box1[0], boxes[:, 0])
|
253 |
+
y1 = torch.max(box1[1], boxes[:, 1])
|
254 |
+
x2 = torch.min(box1[2], boxes[:, 2])
|
255 |
+
y2 = torch.min(box1[3], boxes[:, 3])
|
256 |
+
|
257 |
+
# compute the area of intersection
|
258 |
+
intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
|
259 |
+
|
260 |
+
# compute the area of both individual boxes
|
261 |
+
box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
262 |
+
box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
|
263 |
+
|
264 |
+
# compute the area of union
|
265 |
+
union = box1_area + box2_area - intersection
|
266 |
+
|
267 |
+
# compute the IoU
|
268 |
+
iou = intersection / union # Should be shape (n, )
|
269 |
+
if raw_output:
|
270 |
+
if iou.numel() == 0:
|
271 |
+
return 0
|
272 |
+
return iou
|
273 |
+
|
274 |
+
# get indices of boxes with IoU > thres
|
275 |
+
high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
|
276 |
+
|
277 |
+
return high_iou_indices
|
278 |
+
|
279 |
+
|
280 |
+
def scale_masks(masks, shape, padding=True):
|
281 |
+
"""
|
282 |
+
Rescale segment masks to shape.
|
283 |
+
|
284 |
+
Args:
|
285 |
+
masks (torch.Tensor): (N, C, H, W).
|
286 |
+
shape (tuple): Height and width.
|
287 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
288 |
+
rescaling.
|
289 |
+
"""
|
290 |
+
mh, mw = masks.shape[2:]
|
291 |
+
gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
|
292 |
+
pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
|
293 |
+
if padding:
|
294 |
+
pad[0] /= 2
|
295 |
+
pad[1] /= 2
|
296 |
+
top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
|
297 |
+
bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
|
298 |
+
masks = masks[..., top:bottom, left:right]
|
299 |
+
|
300 |
+
masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
|
301 |
+
return masks
|
302 |
+
|
303 |
+
|
304 |
+
def process_mask_native(protos, masks_in, bboxes, shape):
|
305 |
+
"""
|
306 |
+
It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
|
307 |
+
|
308 |
+
Args:
|
309 |
+
protos (torch.Tensor): [mask_dim, mask_h, mask_w]
|
310 |
+
masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
|
311 |
+
bboxes (torch.Tensor): [n, 4], n is number of masks after nms
|
312 |
+
shape (tuple): the size of the input image (h,w)
|
313 |
+
|
314 |
+
Returns:
|
315 |
+
masks (torch.Tensor): The returned masks with dimensions [h, w, n]
|
316 |
+
"""
|
317 |
+
c, mh, mw = protos.shape # CHW
|
318 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
|
319 |
+
masks = scale_masks(masks[None], shape)[0] # CHW
|
320 |
+
masks = crop_mask(masks, bboxes) # CHW
|
321 |
+
return masks.gt_(0.5)
|
322 |
+
|
323 |
+
def crop_mask(masks, boxes):
|
324 |
+
"""
|
325 |
+
It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
|
326 |
+
|
327 |
+
Args:
|
328 |
+
masks (torch.Tensor): [n, h, w] tensor of masks
|
329 |
+
boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
|
330 |
+
|
331 |
+
Returns:
|
332 |
+
(torch.Tensor): The masks are being cropped to the bounding box.
|
333 |
+
"""
|
334 |
+
_, h, w = masks.shape
|
335 |
+
x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
|
336 |
+
r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
|
337 |
+
c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
|
338 |
+
|
339 |
+
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
|
340 |
+
|
341 |
+
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
|
342 |
+
"""
|
343 |
+
Apply masks to bounding boxes using the output of the mask head.
|
344 |
+
|
345 |
+
Args:
|
346 |
+
protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
|
347 |
+
masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
|
348 |
+
bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
|
349 |
+
shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
|
350 |
+
upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
|
351 |
+
|
352 |
+
Returns:
|
353 |
+
(torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
|
354 |
+
are the height and width of the input image. The mask is applied to the bounding boxes.
|
355 |
+
"""
|
356 |
+
|
357 |
+
c, mh, mw = protos.shape # CHW
|
358 |
+
ih, iw = shape
|
359 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
|
360 |
+
|
361 |
+
downsampled_bboxes = bboxes.clone()
|
362 |
+
downsampled_bboxes[:, 0] *= mw / iw
|
363 |
+
downsampled_bboxes[:, 2] *= mw / iw
|
364 |
+
downsampled_bboxes[:, 3] *= mh / ih
|
365 |
+
downsampled_bboxes[:, 1] *= mh / ih
|
366 |
+
|
367 |
+
masks = crop_mask(masks, downsampled_bboxes) # CHW
|
368 |
+
if upsample:
|
369 |
+
masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
|
370 |
+
return masks.gt_(0.5)
|
371 |
+
|
372 |
+
|
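As a quick illustration of how `postprocess` in run_test.py uses `bbox_iou` from this module: it builds a box covering the whole 640x640 input and asks for the indices of predictions that nearly coincide with it (IoU above 0.9). A minimal, self-contained sketch with made-up boxes:

```python
import torch
from tools_pt import bbox_iou

# A box spanning the full 640x640 input (x1, y1, x2, y2)
full_box = torch.tensor([0.0, 0.0, 640.0, 640.0])

# Hypothetical predicted boxes; only the second one nearly covers the image
pred_boxes = torch.tensor([
    [100.0, 120.0, 300.0, 360.0],
    [2.0, 3.0, 639.0, 638.0],
])

# Indices of predictions whose IoU with the full-image box exceeds 0.9
# (clone because adjust_bboxes_to_image_border modifies the boxes in place)
idx = bbox_iou(full_box, pred_boxes.clone(), iou_thres=0.9, image_shape=(640, 640))
print(idx)  # tensor([1])
```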
model_farm_fastsamx_qcs8550_qnn2.16_fp16_aidlite/python/utils.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np
import torch
from PIL import Image


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def image_to_np_ndarray(image):
    if type(image) is str:
        return np.array(Image.open(image))
    elif issubclass(type(image), Image.Image):
        return np.array(image)
    elif type(image) is np.ndarray:
        return image
    return None
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/README.md
ADDED
@@ -0,0 +1,48 @@
## Model Information
### Source model
- Input shape: 640x640
- Number of parameters: 68.89M
- Model size: 277.39M
- Output shape: 1x37x8400,1x32x160x160

Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)

### Converted model

- Precision: INT8
- Backend: QNN2.16
- Target Device: SNM972 QCS8550

## Inference with AidLite SDK

### SDK installation
Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)

- Install AidLite SDK

```bash
# Install the appropriate version of the aidlite sdk
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Install the aidlite build that matches the QNN backend above. E.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# Check the AidLite SDK C++ library version
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# Check the AidLite SDK Python library version
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
```bash
cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite
export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0

python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
```
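The demo above runs in "everything" mode: the default `--point_prompt "[[0,0]]"` is treated as no point prompt. To segment around a specific point instead, `run_test.py` also accepts a list of `[x, y]` coordinates; the coordinates below are only an example, and the annotated output is written next to the script as `./python/dogs_result.jpg`.

```bash
# Point-prompt variant of the demo above (coordinates are just an example)
python3 ./python/run_test.py \
    --target_model ./models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin \
    --imgs ./python/dogs.jpg \
    --invoke_nums 10 \
    --point_prompt "[[320,320]]"
```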
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4b41dab35e31041716c09ed9bb079e180a7f0d0e7ae8a34596507b55260efe6a
size 74562824
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/dogs.jpg
ADDED
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/onnx_export.py
ADDED
@@ -0,0 +1,50 @@
import torch
import cv2
import os
import sys

from ultralytics.models.fastsam import FastSAM

class Fast_SAM(torch.nn.Module):
    """Exportable FastSAM model, end-to-end."""

    def __init__(self) -> None:
        super().__init__()
        pt_name = './models/FastSAM-s.pt'
        self.model = FastSAM(pt_name).model

    def forward(self, image: torch.Tensor):
        """
        Run FastSAM on `image`, and produce high quality segmentation masks.
        Faster than SAM as it is based on YOLOv8.

        Parameters:
            image: Pixel values pre-processed for encoder consumption.
                Range: float[0, 1]
                3-channel Color Space: BGR
        Returns:

        """
        predictions = self.model(image)
        # Return predictions as a tuple instead of nested tuple.
        return (predictions[0], predictions[1][2])


model = Fast_SAM()
num_params = sum(p.numel() for p in model.parameters())
print(f'Number of FastSAM-s parameters: {num_params}')
dummy_input = torch.randn([1, 3, 640, 640], dtype=torch.float32)
source_model = torch.jit.trace(
    model.to("cpu"), dummy_input, check_trace=False
)
torch.onnx.export(model,                      # model being run
                  dummy_input,                # model input (or a tuple for multiple inputs)
                  "./models/fastsam_s.onnx",  # where to save the model
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=12,           # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names=['input'],      # the model's input names
                  output_names=['boxes', 'mask'],
                  verbose=True,
                  )
print("Convert to onnx successfully!")
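A quick way to sanity-check the exported graph before handing it to the QNN converter is to run it once with onnxruntime (already used elsewhere in this repo). This is a minimal sketch, assuming the export above succeeded and wrote `./models/fastsam_s.onnx`.

```python
import numpy as np
import onnxruntime as ort

# Load the ONNX file written by onnx_export.py
session = ort.InferenceSession("./models/fastsam_s.onnx", providers=["CPUExecutionProvider"])

# The export declared a single float32 input named 'input' with shape 1x3x640x640
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
boxes, mask = session.run(None, {"input": dummy})

# Inspect the shapes of the 'boxes' and 'mask' heads
print(boxes.shape, mask.shape)
```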
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/prompt.py
ADDED
@@ -0,0 +1,456 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import cv2
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from utils import image_to_np_ndarray
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
class FastSAMPrompt:
|
12 |
+
|
13 |
+
def __init__(self, image, results, device='cpu'):
|
14 |
+
if isinstance(image, str) or isinstance(image, Image.Image):
|
15 |
+
image = image_to_np_ndarray(image)
|
16 |
+
self.device = device
|
17 |
+
self.results = results
|
18 |
+
self.img = image
|
19 |
+
|
20 |
+
def _segment_image(self, image, bbox):
|
21 |
+
if isinstance(image, Image.Image):
|
22 |
+
image_array = np.array(image)
|
23 |
+
else:
|
24 |
+
image_array = image
|
25 |
+
segmented_image_array = np.zeros_like(image_array)
|
26 |
+
x1, y1, x2, y2 = bbox
|
27 |
+
segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
|
28 |
+
segmented_image = Image.fromarray(segmented_image_array)
|
29 |
+
black_image = Image.new('RGB', image.size, (255, 255, 255))
|
30 |
+
# transparency_mask = np.zeros_like((), dtype=np.uint8)
|
31 |
+
transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
|
32 |
+
transparency_mask[y1:y2, x1:x2] = 255
|
33 |
+
transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
|
34 |
+
black_image.paste(segmented_image, mask=transparency_mask_image)
|
35 |
+
return black_image
|
36 |
+
|
37 |
+
def _format_results(self, result, filter=0):
|
38 |
+
annotations = []
|
39 |
+
n = len(result.masks.data)
|
40 |
+
for i in range(n):
|
41 |
+
annotation = {}
|
42 |
+
mask = result.masks.data[i] == 1.0
|
43 |
+
|
44 |
+
if torch.sum(mask) < filter:
|
45 |
+
continue
|
46 |
+
annotation['id'] = i
|
47 |
+
annotation['segmentation'] = mask.cpu().numpy()
|
48 |
+
annotation['bbox'] = result.boxes.data[i]
|
49 |
+
annotation['score'] = result.boxes.conf[i]
|
50 |
+
annotation['area'] = annotation['segmentation'].sum()
|
51 |
+
annotations.append(annotation)
|
52 |
+
return annotations
|
53 |
+
|
54 |
+
def filter_masks(annotations):  # filter out overlapping masks
|
55 |
+
annotations.sort(key=lambda x: x['area'], reverse=True)
|
56 |
+
to_remove = set()
|
57 |
+
for i in range(0, len(annotations)):
|
58 |
+
a = annotations[i]
|
59 |
+
for j in range(i + 1, len(annotations)):
|
60 |
+
b = annotations[j]
|
61 |
+
if i != j and j not in to_remove:
|
62 |
+
# check if
|
63 |
+
if b['area'] < a['area']:
|
64 |
+
if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
|
65 |
+
to_remove.add(j)
|
66 |
+
|
67 |
+
return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove
|
68 |
+
|
69 |
+
def _get_bbox_from_mask(self, mask):
|
70 |
+
mask = mask.astype(np.uint8)
|
71 |
+
contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
72 |
+
x1, y1, w, h = cv2.boundingRect(contours[0])
|
73 |
+
x2, y2 = x1 + w, y1 + h
|
74 |
+
if len(contours) > 1:
|
75 |
+
for b in contours:
|
76 |
+
x_t, y_t, w_t, h_t = cv2.boundingRect(b)
|
77 |
+
# Merge multiple bounding boxes into one.
|
78 |
+
x1 = min(x1, x_t)
|
79 |
+
y1 = min(y1, y_t)
|
80 |
+
x2 = max(x2, x_t + w_t)
|
81 |
+
y2 = max(y2, y_t + h_t)
|
82 |
+
h = y2 - y1
|
83 |
+
w = x2 - x1
|
84 |
+
return [x1, y1, x2, y2]
|
85 |
+
|
86 |
+
def plot_to_result(self,
|
87 |
+
annotations,
|
88 |
+
bboxes=None,
|
89 |
+
points=None,
|
90 |
+
point_label=None,
|
91 |
+
mask_random_color=True,
|
92 |
+
better_quality=True,
|
93 |
+
retina=False,
|
94 |
+
withContours=True) -> np.ndarray:
|
95 |
+
if isinstance(annotations[0], dict):
|
96 |
+
annotations = [annotation['segmentation'] for annotation in annotations]
|
97 |
+
image = self.img
|
98 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
99 |
+
original_h = image.shape[0]
|
100 |
+
original_w = image.shape[1]
|
101 |
+
if sys.platform == "darwin":
|
102 |
+
plt.switch_backend("TkAgg")
|
103 |
+
plt.figure(figsize=(original_w / 100, original_h / 100))
|
104 |
+
# Add subplot with no margin.
|
105 |
+
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
|
106 |
+
plt.margins(0, 0)
|
107 |
+
plt.gca().xaxis.set_major_locator(plt.NullLocator())
|
108 |
+
plt.gca().yaxis.set_major_locator(plt.NullLocator())
|
109 |
+
|
110 |
+
plt.imshow(image)
|
111 |
+
if better_quality:
|
112 |
+
if isinstance(annotations[0], torch.Tensor):
|
113 |
+
annotations = np.array(annotations.cpu())
|
114 |
+
for i, mask in enumerate(annotations):
|
115 |
+
mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
|
116 |
+
annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
|
117 |
+
if self.device == 'cpu':
|
118 |
+
annotations = np.array(annotations)
|
119 |
+
self.fast_show_mask(
|
120 |
+
annotations,
|
121 |
+
plt.gca(),
|
122 |
+
random_color=mask_random_color,
|
123 |
+
bboxes=bboxes,
|
124 |
+
points=points,
|
125 |
+
pointlabel=point_label,
|
126 |
+
retinamask=retina,
|
127 |
+
target_height=original_h,
|
128 |
+
target_width=original_w,
|
129 |
+
)
|
130 |
+
else:
|
131 |
+
if isinstance(annotations[0], np.ndarray):
|
132 |
+
annotations = torch.from_numpy(annotations)
|
133 |
+
self.fast_show_mask_gpu(
|
134 |
+
annotations,
|
135 |
+
plt.gca(),
|
136 |
+
random_color=mask_random_color,
|
137 |
+
bboxes=bboxes,
|
138 |
+
points=points,
|
139 |
+
pointlabel=point_label,
|
140 |
+
retinamask=retina,
|
141 |
+
target_height=original_h,
|
142 |
+
target_width=original_w,
|
143 |
+
)
|
144 |
+
if isinstance(annotations, torch.Tensor):
|
145 |
+
annotations = annotations.cpu().numpy()
|
146 |
+
if withContours:
|
147 |
+
contour_all = []
|
148 |
+
temp = np.zeros((original_h, original_w, 1))
|
149 |
+
for i, mask in enumerate(annotations):
|
150 |
+
if type(mask) == dict:
|
151 |
+
mask = mask['segmentation']
|
152 |
+
annotation = mask.astype(np.uint8)
|
153 |
+
if not retina:
|
154 |
+
annotation = cv2.resize(
|
155 |
+
annotation,
|
156 |
+
(original_w, original_h),
|
157 |
+
interpolation=cv2.INTER_NEAREST,
|
158 |
+
)
|
159 |
+
contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
160 |
+
for contour in contours:
|
161 |
+
contour_all.append(contour)
|
162 |
+
cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
|
163 |
+
color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
|
164 |
+
contour_mask = temp / 255 * color.reshape(1, 1, -1)
|
165 |
+
plt.imshow(contour_mask)
|
166 |
+
|
167 |
+
plt.axis('off')
|
168 |
+
fig = plt.gcf()
|
169 |
+
plt.draw()
|
170 |
+
|
171 |
+
try:
|
172 |
+
buf = fig.canvas.tostring_rgb()
|
173 |
+
except AttributeError:
|
174 |
+
fig.canvas.draw()
|
175 |
+
buf = fig.canvas.tostring_rgb()
|
176 |
+
cols, rows = fig.canvas.get_width_height()
|
177 |
+
img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
|
178 |
+
result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
179 |
+
plt.close()
|
180 |
+
return result
|
181 |
+
|
182 |
+
# Remark for refactoring: IMO a function should do one thing only. Storing the image and plotting should be separated, and they do not need to be class methods; standalone utility functions that the user can chain in their own scripts would give more fine-grained control.
|
183 |
+
def plot(self,
|
184 |
+
annotations,
|
185 |
+
output_path,
|
186 |
+
bboxes=None,
|
187 |
+
points=None,
|
188 |
+
point_label=None,
|
189 |
+
mask_random_color=True,
|
190 |
+
better_quality=True,
|
191 |
+
retina=False,
|
192 |
+
withContours=True):
|
193 |
+
if len(annotations) == 0:
|
194 |
+
return None
|
195 |
+
result = self.plot_to_result(
|
196 |
+
annotations,
|
197 |
+
bboxes,
|
198 |
+
points,
|
199 |
+
point_label,
|
200 |
+
mask_random_color,
|
201 |
+
better_quality,
|
202 |
+
retina,
|
203 |
+
withContours,
|
204 |
+
)
|
205 |
+
|
206 |
+
path = os.path.dirname(os.path.abspath(output_path))
|
207 |
+
if not os.path.exists(path):
|
208 |
+
os.makedirs(path)
|
209 |
+
result = result[:, :, ::-1]
|
210 |
+
cv2.imwrite(output_path, result)
|
211 |
+
|
212 |
+
# CPU post process
|
213 |
+
def fast_show_mask(
|
214 |
+
self,
|
215 |
+
annotation,
|
216 |
+
ax,
|
217 |
+
random_color=False,
|
218 |
+
bboxes=None,
|
219 |
+
points=None,
|
220 |
+
pointlabel=None,
|
221 |
+
retinamask=True,
|
222 |
+
target_height=960,
|
223 |
+
target_width=960,
|
224 |
+
):
|
225 |
+
msak_sum = annotation.shape[0]
|
226 |
+
height = annotation.shape[1]
|
227 |
+
weight = annotation.shape[2]
|
228 |
+
#Sort annotations based on area.
|
229 |
+
areas = np.sum(annotation, axis=(1, 2))
|
230 |
+
sorted_indices = np.argsort(areas)
|
231 |
+
annotation = annotation[sorted_indices]
|
232 |
+
|
233 |
+
index = (annotation != 0).argmax(axis=0)
|
234 |
+
if random_color:
|
235 |
+
color = np.random.random((msak_sum, 1, 1, 3))
|
236 |
+
else:
|
237 |
+
color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
|
238 |
+
transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
|
239 |
+
visual = np.concatenate([color, transparency], axis=-1)
|
240 |
+
mask_image = np.expand_dims(annotation, -1) * visual
|
241 |
+
|
242 |
+
show = np.zeros((height, weight, 4))
|
243 |
+
h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
|
244 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
245 |
+
# Use vectorized indexing to update the values of 'show'.
|
246 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
247 |
+
if bboxes is not None:
|
248 |
+
for bbox in bboxes:
|
249 |
+
x1, y1, x2, y2 = bbox
|
250 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
251 |
+
# draw point
|
252 |
+
if points is not None:
|
253 |
+
plt.scatter(
|
254 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
255 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
256 |
+
s=20,
|
257 |
+
c='y',
|
258 |
+
)
|
259 |
+
plt.scatter(
|
260 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
261 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
262 |
+
s=20,
|
263 |
+
c='m',
|
264 |
+
)
|
265 |
+
|
266 |
+
if not retinamask:
|
267 |
+
show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
268 |
+
ax.imshow(show)
|
269 |
+
|
270 |
+
def fast_show_mask_gpu(
|
271 |
+
self,
|
272 |
+
annotation,
|
273 |
+
ax,
|
274 |
+
random_color=False,
|
275 |
+
bboxes=None,
|
276 |
+
points=None,
|
277 |
+
pointlabel=None,
|
278 |
+
retinamask=True,
|
279 |
+
target_height=960,
|
280 |
+
target_width=960,
|
281 |
+
):
|
282 |
+
msak_sum = annotation.shape[0]
|
283 |
+
height = annotation.shape[1]
|
284 |
+
weight = annotation.shape[2]
|
285 |
+
areas = torch.sum(annotation, dim=(1, 2))
|
286 |
+
sorted_indices = torch.argsort(areas, descending=False)
|
287 |
+
annotation = annotation[sorted_indices]
|
288 |
+
# Find the index of the first non-zero value at each position.
|
289 |
+
index = (annotation != 0).to(torch.long).argmax(dim=0)
|
290 |
+
if random_color:
|
291 |
+
color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
|
292 |
+
else:
|
293 |
+
color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
|
294 |
+
30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
|
295 |
+
transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
|
296 |
+
visual = torch.cat([color, transparency], dim=-1)
|
297 |
+
mask_image = torch.unsqueeze(annotation, -1) * visual
|
298 |
+
# Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
|
299 |
+
show = torch.zeros((height, weight, 4)).to(annotation.device)
|
300 |
+
try:
|
301 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
|
302 |
+
except:
|
303 |
+
h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
|
304 |
+
indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
|
305 |
+
# Use vectorized indexing to update the values of 'show'.
|
306 |
+
show[h_indices, w_indices, :] = mask_image[indices]
|
307 |
+
show_cpu = show.cpu().numpy()
|
308 |
+
if bboxes is not None:
|
309 |
+
for bbox in bboxes:
|
310 |
+
x1, y1, x2, y2 = bbox
|
311 |
+
ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
|
312 |
+
# draw point
|
313 |
+
if points is not None:
|
314 |
+
plt.scatter(
|
315 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
|
316 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
|
317 |
+
s=20,
|
318 |
+
c='y',
|
319 |
+
)
|
320 |
+
plt.scatter(
|
321 |
+
[point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
|
322 |
+
[point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
|
323 |
+
s=20,
|
324 |
+
c='m',
|
325 |
+
)
|
326 |
+
if not retinamask:
|
327 |
+
show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
|
328 |
+
ax.imshow(show_cpu)
|
329 |
+
|
330 |
+
# clip
|
331 |
+
@torch.no_grad()
|
332 |
+
def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
|
333 |
+
preprocessed_images = [preprocess(image).to(device) for image in elements]
|
334 |
+
try:
|
335 |
+
import clip # for linear_assignment
|
336 |
+
|
337 |
+
except (ImportError, AssertionError, AttributeError):
|
338 |
+
from ultralytics.yolo.utils.checks import check_requirements
|
339 |
+
|
340 |
+
check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source
|
341 |
+
import clip
|
342 |
+
|
343 |
+
|
344 |
+
tokenized_text = clip.tokenize([search_text]).to(device)
|
345 |
+
stacked_images = torch.stack(preprocessed_images)
|
346 |
+
image_features = model.encode_image(stacked_images)
|
347 |
+
text_features = model.encode_text(tokenized_text)
|
348 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
349 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
350 |
+
probs = 100.0 * image_features @ text_features.T
|
351 |
+
return probs[:, 0].softmax(dim=0)
|
352 |
+
|
353 |
+
def _crop_image(self, format_results):
|
354 |
+
|
355 |
+
image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
|
356 |
+
ori_w, ori_h = image.size
|
357 |
+
annotations = format_results
|
358 |
+
mask_h, mask_w = annotations[0]['segmentation'].shape
|
359 |
+
if ori_w != mask_w or ori_h != mask_h:
|
360 |
+
image = image.resize((mask_w, mask_h))
|
361 |
+
cropped_boxes = []
|
362 |
+
cropped_images = []
|
363 |
+
not_crop = []
|
364 |
+
filter_id = []
|
365 |
+
# annotations, _ = filter_masks(annotations)
|
366 |
+
# filter_id = list(_)
|
367 |
+
for _, mask in enumerate(annotations):
|
368 |
+
if np.sum(mask['segmentation']) <= 100:
|
369 |
+
filter_id.append(_)
|
370 |
+
continue
|
371 |
+
bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
|
372 |
+
cropped_boxes.append(self._segment_image(image, bbox))
|
373 |
+
# cropped_boxes.append(segment_image(image,mask["segmentation"]))
|
374 |
+
cropped_images.append(bbox) # Save the bounding box of the cropped image.
|
375 |
+
|
376 |
+
return cropped_boxes, cropped_images, not_crop, filter_id, annotations
|
377 |
+
|
378 |
+
def box_prompt(self, bbox=None, bboxes=None):
|
379 |
+
if self.results == None:
|
380 |
+
return []
|
381 |
+
assert bbox or bboxes
|
382 |
+
if bboxes is None:
|
383 |
+
bboxes = [bbox]
|
384 |
+
max_iou_index = []
|
385 |
+
for bbox in bboxes:
|
386 |
+
assert (bbox[2] != 0 and bbox[3] != 0)
|
387 |
+
masks = self.results[0].masks.data
|
388 |
+
target_height = self.img.shape[0]
|
389 |
+
target_width = self.img.shape[1]
|
390 |
+
h = masks.shape[1]
|
391 |
+
w = masks.shape[2]
|
392 |
+
if h != target_height or w != target_width:
|
393 |
+
bbox = [
|
394 |
+
int(bbox[0] * w / target_width),
|
395 |
+
int(bbox[1] * h / target_height),
|
396 |
+
int(bbox[2] * w / target_width),
|
397 |
+
int(bbox[3] * h / target_height), ]
|
398 |
+
bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
|
399 |
+
bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
|
400 |
+
bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
|
401 |
+
bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h
|
402 |
+
|
403 |
+
# IoUs = torch.zeros(len(masks), dtype=torch.float32)
|
404 |
+
bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
|
405 |
+
|
406 |
+
masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
|
407 |
+
orig_masks_area = torch.sum(masks, dim=(1, 2))
|
408 |
+
|
409 |
+
union = bbox_area + orig_masks_area - masks_area
|
410 |
+
IoUs = masks_area / union
|
411 |
+
max_iou_index.append(int(torch.argmax(IoUs)))
|
412 |
+
max_iou_index = list(set(max_iou_index))
|
413 |
+
return np.array(masks[max_iou_index].cpu().numpy())
|
414 |
+
|
415 |
+
def point_prompt(self, points, pointlabel): # numpy
|
416 |
+
if self.results == None:
|
417 |
+
return []
|
418 |
+
masks = self._format_results(self.results[0], 0)
|
419 |
+
target_height = self.img.shape[0]
|
420 |
+
target_width = self.img.shape[1]
|
421 |
+
h = masks[0]['segmentation'].shape[0]
|
422 |
+
w = masks[0]['segmentation'].shape[1]
|
423 |
+
if h != target_height or w != target_width:
|
424 |
+
points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
|
425 |
+
onemask = np.zeros((h, w))
|
426 |
+
masks = sorted(masks, key=lambda x: x['area'], reverse=True)
|
427 |
+
for i, annotation in enumerate(masks):
|
428 |
+
if type(annotation) == dict:
|
429 |
+
mask = annotation['segmentation']
|
430 |
+
else:
|
431 |
+
mask = annotation
|
432 |
+
for i, point in enumerate(points):
|
433 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
|
434 |
+
onemask[mask] = 1
|
435 |
+
if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
|
436 |
+
onemask[mask] = 0
|
437 |
+
onemask = onemask >= 1
|
438 |
+
return np.array([onemask])
|
439 |
+
|
440 |
+
def text_prompt(self, text):
|
441 |
+
if self.results == None:
|
442 |
+
return []
|
443 |
+
format_results = self._format_results(self.results[0], 0)
|
444 |
+
cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
|
445 |
+
clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
|
446 |
+
scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
|
447 |
+
max_idx = scores.argsort()
|
448 |
+
max_idx = max_idx[-1]
|
449 |
+
max_idx += sum(np.array(filter_id) <= int(max_idx))
|
450 |
+
return np.array([annotations[max_idx]['segmentation']])
|
451 |
+
|
452 |
+
def everything_prompt(self):
|
453 |
+
if self.results == None:
|
454 |
+
return []
|
455 |
+
return self.results[0].masks.data
|
456 |
+
|
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/run_test.py
ADDED
@@ -0,0 +1,224 @@
import os
import sys
import cv2
import numpy as np
import onnxruntime
import time
import matplotlib.pyplot as plt
import torch
from ultralytics.engine.results import Results
from tools_pt import *
from prompt import FastSAMPrompt
import aidlite
import argparse
import ast

# Cosine-similarity helper, used to compare outputs from different runtimes.
def get_acc(onnx_out, other_out):
    cosine_similarity = np.dot(np.array(onnx_out), np.array(other_out)) / (np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
    return cosine_similarity

def cal_sigmoid(x):
    return 1 / (1 + np.exp(-x))

class qnn_predict(object):
    def __init__(self, inputshape, outputshape, args) -> None:
        aidlite.set_log_level(aidlite.LogLevel.INFO)
        aidlite.log_to_stderr()
        print(f"Aidlite library version : {aidlite.get_library_version()}")
        print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
        config = aidlite.Config.create_instance()
        if config is None:
            print("Create config failed !")
        config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        config.framework_type = aidlite.FrameworkType.TYPE_QNN
        config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        config.is_quantify_model = 1

        model = aidlite.Model.create_instance(args.target_model)
        if model is None:
            print("Create model failed !")

        self.input_shape = inputshape
        self.out_shape = outputshape
        model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print(f"interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")
        print("detect model load success!")

        self.conf = 0.4
        self.iou = 0.9
        self.size = 640
        self.agnostic_nms = False
        self.max_det = 300
        self.names = ['object']
        self.classes = None
        self.retina_masks = True

    def pretreat_img(self, img):
        scale = 1 / 255.
        img_size = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_LINEAR)
        float_img = img_size.astype('float32')
        float_img = float_img * scale
        float_img = float_img[:, :, ::-1]
        return float_img

    def postprocess(self, preds, img, orig_imgs):
        """TODO: filter by classes."""
        p = non_max_suppression(torch.from_numpy(preds[0]),
                                self.conf,
                                self.iou,
                                agnostic=self.agnostic_nms,
                                max_det=self.max_det,
                                nc=len(self.names),
                                classes=self.classes)

        results = []
        if len(p) == 0 or len(p[0]) == 0:
            print("No object detected.")
            return results

        full_box = torch.zeros_like(p[0][0])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        # proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        proto = torch.from_numpy(preds[-1])
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = img[0]  # self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # save empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
                continue
            if self.retina_masks:
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
        return results

    def qnn_run(self, orig_imgs, img_path, args):
        input_img_f = self.pretreat_img(orig_imgs)  # resize the image, HWC
        # print("qnn_input:",input_img_f)
        # encoder texts
        input_img = np.expand_dims(input_img_f, 0)

        invoke_time = []
        for i in range(args.invoke_nums):
            result = self.interpreter.set_input_tensor(0, input_img.data)
            t0 = time.time()
            result = self.interpreter.invoke()
            t1 = time.time()
            cost_time = (t1 - t0) * 1000
            invoke_time.append(cost_time)
        mask_ = self.interpreter.get_output_tensor(0)
        concat_ = self.interpreter.get_output_tensor(1)
        mul_ = self.interpreter.get_output_tensor(3)
        split_ = self.interpreter.get_output_tensor(2)
        mask_ = mask_.reshape(*self.out_shape[3])
        mask_ = mask_.transpose((0, 3, 1, 2))
        concat_ = concat_.reshape(*self.out_shape[2])
        mul_ = mul_.reshape(*self.out_shape[1])
        split_ = split_.reshape(*self.out_shape[0])
        sig_ = cal_sigmoid(split_)

        output_concat = np.concatenate((mul_, sig_), axis=1)
        output_concat = np.concatenate((output_concat, concat_), axis=1)

        # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
        ## timing statistics
        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / args.invoke_nums
        var_invoketime = np.var(invoke_time)
        print("========================================")
        print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("========================================")

        qnn_out = [np.array(output_concat), np.array(mask_)]
        # print("qnn predict out:",qnn_out)

        nchw_img = input_img.transpose(0, 3, 1, 2)
        everything_results = self.postprocess(qnn_out, nchw_img, [orig_imgs])
        # print("everything_results: ",everything_results)

        prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")

        # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
        try:
            if args.point_prompt == [[0, 0]]:
                ann = prompt_process.everything_prompt()
            else:
                ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
            out_name = os.path.basename(img_path).split(".")[0]
            if True:  # savepic
                outpath = "python/"
                if not os.path.exists(outpath):
                    os.mkdir(outpath)
                prompt_process.plot(
                    annotations=ann,
                    output_path=os.path.join(outpath, out_name + "_result.jpg"),
                    mask_random_color=True,
                    better_quality=True,
                    retina=False,
                    withContours=True,
                )
            else:
                plt.figure()
                prompt_process.fast_show_mask(annotation=ann,
                                              ax=plt)
        except Exception as e:
            print(f"Warning: an error occurred while predicting image {img_path} - {e}")
        return [mask_.reshape(-1), output_concat.reshape(-1)]


def parser_args():
    parser = argparse.ArgumentParser(description="Run model benchmarks")
    parser.add_argument('--target_model', type=str, default='models/cutoff_fastsam_x_w8a8.qnn216.ctx.bin.aidem', help="inference model path")
    parser.add_argument('--source_model', type=str, default='models/fastsam_x.onnx', help="original model path")
    parser.add_argument('--imgs', type=str, default='python/dogs.jpg', help="Predict images path")
    parser.add_argument('--invoke_nums', type=int, default=10, help="Inference nums")
    parser.add_argument('--point_prompt', type=str, default="[[0,0]]", help="example:[[x1,y1],[x2,y2]]")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parser_args()
    inputshape = [[1, 640, 640, 3]]
    outputshape = [[1, 1, 8400], [1, 4, 8400], [1, 32, 8400], [1, 160, 160, 32]]
    args.point_prompt = ast.literal_eval(args.point_prompt)

    predict = qnn_predict(inputshape, outputshape, args)
    if os.path.isdir(args.imgs):
        img_files = os.listdir(args.imgs)
        for fi in img_files:
            img_path = os.path.join(args.imgs, fi)
            im0s = cv2.imread(img_path)  # BGR
            im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
            predict.qnn_run(im0s, img_path, args)
    else:
        img_path = args.imgs
        im0s = cv2.imread(img_path)  # BGR
        im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
        qnn_result = predict.qnn_run(im0s, img_path, args)
    print("Prediction completion and the results are saved !")
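
The script imports `onnxruntime` and exposes `--source_model` but never actually compares the two runtimes. A hedged sketch of how the `get_acc` helper could be used for such a check follows; the ONNX output order, input name, and the layout of the returned `qnn_result` are assumptions based on `onnx_export.py` and `qnn_run` above, not verified output of this repository.

```python
# Sketch only: cross-check QNN outputs against the ONNX source model using get_acc().
import cv2
import numpy as np
import onnxruntime
from run_test import get_acc

session = onnxruntime.InferenceSession("models/fastsam_x.onnx")  # --source_model default
img = cv2.resize(cv2.imread("python/dogs.jpg"), (640, 640), interpolation=cv2.INTER_LINEAR)
inp = np.ascontiguousarray(img[:, :, ::-1], dtype=np.float32) / 255.0   # BGR -> RGB, 0..1
inp = np.ascontiguousarray(inp[None].transpose(0, 3, 1, 2))             # NHWC -> NCHW
onnx_boxes, onnx_proto = session.run(None, {"input": inp})              # names from onnx_export.py

# qnn_result = [mask_.reshape(-1), output_concat.reshape(-1)] as returned by qnn_run()
print("proto cosine similarity:", get_acc(onnx_proto.reshape(-1), qnn_result[0]))
print("boxes cosine similarity:", get_acc(onnx_boxes.reshape(-1), qnn_result[1]))
```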
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/tools_pt.py
ADDED
@@ -0,0 +1,372 @@
1 |
+
import numpy as np
|
2 |
+
import time
|
3 |
+
import torch
|
4 |
+
import torchvision
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
def clip_boxes(boxes, shape):
|
10 |
+
"""
|
11 |
+
Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
boxes (torch.Tensor): the bounding boxes to clip
|
15 |
+
shape (tuple): the shape of the image
|
16 |
+
"""
|
17 |
+
if isinstance(boxes, torch.Tensor): # faster individually
|
18 |
+
boxes[..., 0].clamp_(0, shape[1]) # x1
|
19 |
+
boxes[..., 1].clamp_(0, shape[0]) # y1
|
20 |
+
boxes[..., 2].clamp_(0, shape[1]) # x2
|
21 |
+
boxes[..., 3].clamp_(0, shape[0]) # y2
|
22 |
+
else: # np.array (faster grouped)
|
23 |
+
boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
|
24 |
+
boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
|
25 |
+
|
26 |
+
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
|
27 |
+
"""
|
28 |
+
Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
|
29 |
+
(img1_shape) to the shape of a different image (img0_shape).
|
30 |
+
|
31 |
+
Args:
|
32 |
+
img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
|
33 |
+
boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
|
34 |
+
img0_shape (tuple): the shape of the target image, in the format of (height, width).
|
35 |
+
ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
|
36 |
+
calculated based on the size difference between the two images.
|
37 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
38 |
+
rescaling.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
|
42 |
+
"""
|
43 |
+
if ratio_pad is None: # calculate from img0_shape
|
44 |
+
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
|
45 |
+
pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
|
46 |
+
(img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) # wh padding
|
47 |
+
else:
|
48 |
+
gain = ratio_pad[0][0]
|
49 |
+
pad = ratio_pad[1]
|
50 |
+
|
51 |
+
if padding:
|
52 |
+
boxes[..., [0, 2]] -= pad[0] # x padding
|
53 |
+
boxes[..., [1, 3]] -= pad[1] # y padding
|
54 |
+
boxes[..., :4] /= gain
|
55 |
+
clip_boxes(boxes, img0_shape)
|
56 |
+
return boxes
|
57 |
+
|
58 |
+
|
59 |
+
def xywh2xyxy(x):
|
60 |
+
"""
|
61 |
+
Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
|
62 |
+
top-left corner and (x2, y2) is the bottom-right corner.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
|
69 |
+
"""
|
70 |
+
assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
|
71 |
+
y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
|
72 |
+
dw = x[..., 2] / 2 # half-width
|
73 |
+
dh = x[..., 3] / 2 # half-height
|
74 |
+
y[..., 0] = x[..., 0] - dw # top left x
|
75 |
+
y[..., 1] = x[..., 1] - dh # top left y
|
76 |
+
y[..., 2] = x[..., 0] + dw # bottom right x
|
77 |
+
y[..., 3] = x[..., 1] + dh # bottom right y
|
78 |
+
return y
|
79 |
+
|
80 |
+
|
81 |
+
def non_max_suppression(
|
82 |
+
prediction,
|
83 |
+
conf_thres=0.25,
|
84 |
+
iou_thres=0.45,
|
85 |
+
classes=None,
|
86 |
+
agnostic=False,
|
87 |
+
multi_label=False,
|
88 |
+
labels=(),
|
89 |
+
max_det=300,
|
90 |
+
nc=0, # number of classes (optional)
|
91 |
+
max_time_img=0.05,
|
92 |
+
max_nms=30000,
|
93 |
+
max_wh=7680,
|
94 |
+
):
|
95 |
+
"""
|
96 |
+
Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.
|
97 |
+
|
98 |
+
Args:
|
99 |
+
prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
|
100 |
+
containing the predicted boxes, classes, and masks. The tensor should be in the format
|
101 |
+
output by a model, such as YOLO.
|
102 |
+
conf_thres (float): The confidence threshold below which boxes will be filtered out.
|
103 |
+
Valid values are between 0.0 and 1.0.
|
104 |
+
iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
|
105 |
+
Valid values are between 0.0 and 1.0.
|
106 |
+
classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
|
107 |
+
agnostic (bool): If True, the model is agnostic to the number of classes, and all
|
108 |
+
classes will be considered as one.
|
109 |
+
multi_label (bool): If True, each box may have multiple labels.
|
110 |
+
labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
|
111 |
+
list contains the apriori labels for a given image. The list should be in the format
|
112 |
+
output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
|
113 |
+
max_det (int): The maximum number of boxes to keep after NMS.
|
114 |
+
nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
|
115 |
+
max_time_img (float): The maximum time (seconds) for processing one image.
|
116 |
+
max_nms (int): The maximum number of boxes into torchvision.ops.nms().
|
117 |
+
max_wh (int): The maximum box width and height in pixels
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
(List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
|
121 |
+
shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
|
122 |
+
(x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
|
123 |
+
"""
|
124 |
+
|
125 |
+
# Checks
|
126 |
+
assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
|
127 |
+
assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
|
128 |
+
if isinstance(prediction, (list, tuple)): # YOLOv8 model in validation mode, output = (inference_out, loss_out)
|
129 |
+
prediction = prediction[0] # select only inference output
|
130 |
+
|
131 |
+
device = prediction.device
|
132 |
+
mps = 'mps' in device.type # Apple MPS
|
133 |
+
if mps: # MPS not fully supported yet, convert tensors to CPU before NMS
|
134 |
+
prediction = prediction.cpu()
|
135 |
+
bs = prediction.shape[0] # batch size
|
136 |
+
nc = nc or (prediction.shape[1] - 4) # number of classes
|
137 |
+
nm = prediction.shape[1] - nc - 4
|
138 |
+
mi = 4 + nc # mask start index
|
139 |
+
xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
|
140 |
+
|
141 |
+
# Settings
|
142 |
+
# min_wh = 2 # (pixels) minimum box width and height
|
143 |
+
time_limit = 0.5 + max_time_img * bs # seconds to quit after
|
144 |
+
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
|
145 |
+
|
146 |
+
prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
|
147 |
+
prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
|
148 |
+
|
149 |
+
t = time.time()
|
150 |
+
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
|
151 |
+
for xi, x in enumerate(prediction): # image index, image inference
|
152 |
+
# Apply constraints
|
153 |
+
# x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
|
154 |
+
x = x[xc[xi]] # confidence
|
155 |
+
|
156 |
+
# Cat apriori labels if autolabelling
|
157 |
+
if labels and len(labels[xi]):
|
158 |
+
lb = labels[xi]
|
159 |
+
v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
|
160 |
+
v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
|
161 |
+
v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
|
162 |
+
x = torch.cat((x, v), 0)
|
163 |
+
|
164 |
+
# If none remain process next image
|
165 |
+
if not x.shape[0]:
|
166 |
+
continue
|
167 |
+
|
168 |
+
# Detections matrix nx6 (xyxy, conf, cls)
|
169 |
+
box, cls, mask = x.split((4, nc, nm), 1)
|
170 |
+
|
171 |
+
if multi_label:
|
172 |
+
i, j = torch.where(cls > conf_thres)
|
173 |
+
x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
|
174 |
+
else: # best class only
|
175 |
+
conf, j = cls.max(1, keepdim=True)
|
176 |
+
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
|
177 |
+
|
178 |
+
# Filter by class
|
179 |
+
if classes is not None:
|
180 |
+
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
|
181 |
+
|
182 |
+
# Check shape
|
183 |
+
n = x.shape[0] # number of boxes
|
184 |
+
if not n: # no boxes
|
185 |
+
continue
|
186 |
+
if n > max_nms: # excess boxes
|
187 |
+
x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence and remove excess boxes
|
188 |
+
|
189 |
+
# Batched NMS
|
190 |
+
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
|
191 |
+
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
|
192 |
+
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
|
193 |
+
i = i[:max_det] # limit detections
|
194 |
+
|
195 |
+
# # Experimental
|
196 |
+
# merge = False # use merge-NMS
|
197 |
+
# if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
|
198 |
+
# # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
|
199 |
+
# from .metrics import box_iou
|
200 |
+
# iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
|
201 |
+
# weights = iou * scores[None] # box weights
|
202 |
+
# x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
|
203 |
+
# redundant = True # require redundant detections
|
204 |
+
# if redundant:
|
205 |
+
# i = i[iou.sum(1) > 1] # require redundancy
|
206 |
+
|
207 |
+
output[xi] = x[i]
|
208 |
+
if mps:
|
209 |
+
output[xi] = output[xi].to(device)
|
210 |
+
# if (time.time() - t) > time_limit:
|
211 |
+
# LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
|
212 |
+
# break # time limit exceeded
|
213 |
+
|
214 |
+
return output
|
215 |
+
|
216 |
+
|
217 |
+
def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
|
218 |
+
'''Adjust bounding boxes to stick to image border if they are within a certain threshold.
|
219 |
+
Args:
|
220 |
+
boxes: (n, 4)
|
221 |
+
image_shape: (height, width)
|
222 |
+
threshold: pixel threshold
|
223 |
+
Returns:
|
224 |
+
adjusted_boxes: adjusted bounding boxes
|
225 |
+
'''
|
226 |
+
|
227 |
+
# Image dimensions
|
228 |
+
h, w = image_shape
|
229 |
+
|
230 |
+
# Adjust boxes
|
231 |
+
boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
|
232 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 0]) # x1
|
233 |
+
boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
|
234 |
+
0, dtype=torch.float, device=boxes.device), boxes[:, 1]) # y1
|
235 |
+
boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
|
236 |
+
w, dtype=torch.float, device=boxes.device), boxes[:, 2]) # x2
|
237 |
+
boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
|
238 |
+
h, dtype=torch.float, device=boxes.device), boxes[:, 3]) # y2
|
239 |
+
|
240 |
+
return boxes
|
241 |
+
|
242 |
+
def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
|
243 |
+
'''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
|
244 |
+
Args:
|
245 |
+
box1: (4, )
|
246 |
+
boxes: (n, 4)
|
247 |
+
Returns:
|
248 |
+
high_iou_indices: Indices of boxes with IoU > thres
|
249 |
+
'''
|
250 |
+
boxes = adjust_bboxes_to_image_border(boxes, image_shape)
|
251 |
+
# obtain coordinates for intersections
|
252 |
+
x1 = torch.max(box1[0], boxes[:, 0])
|
253 |
+
y1 = torch.max(box1[1], boxes[:, 1])
|
254 |
+
x2 = torch.min(box1[2], boxes[:, 2])
|
255 |
+
y2 = torch.min(box1[3], boxes[:, 3])
|
256 |
+
|
257 |
+
# compute the area of intersection
|
258 |
+
intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
|
259 |
+
|
260 |
+
# compute the area of both individual boxes
|
261 |
+
box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
262 |
+
box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
|
263 |
+
|
264 |
+
# compute the area of union
|
265 |
+
union = box1_area + box2_area - intersection
|
266 |
+
|
267 |
+
# compute the IoU
|
268 |
+
iou = intersection / union # Should be shape (n, )
|
269 |
+
if raw_output:
|
270 |
+
if iou.numel() == 0:
|
271 |
+
return 0
|
272 |
+
return iou
|
273 |
+
|
274 |
+
# get indices of boxes with IoU > thres
|
275 |
+
high_iou_indices = torch.nonzero(iou > iou_thres).flatten()
|
276 |
+
|
277 |
+
return high_iou_indices
|
278 |
+
|
279 |
+
|
280 |
+
def scale_masks(masks, shape, padding=True):
|
281 |
+
"""
|
282 |
+
Rescale segment masks to shape.
|
283 |
+
|
284 |
+
Args:
|
285 |
+
masks (torch.Tensor): (N, C, H, W).
|
286 |
+
shape (tuple): Height and width.
|
287 |
+
padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
|
288 |
+
rescaling.
|
289 |
+
"""
|
290 |
+
mh, mw = masks.shape[2:]
|
291 |
+
gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
|
292 |
+
pad = [mw - shape[1] * gain, mh - shape[0] * gain] # wh padding
|
293 |
+
if padding:
|
294 |
+
pad[0] /= 2
|
295 |
+
pad[1] /= 2
|
296 |
+
top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0) # y, x
|
297 |
+
bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
|
298 |
+
masks = masks[..., top:bottom, left:right]
|
299 |
+
|
300 |
+
masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False) # NCHW
|
301 |
+
return masks
|
302 |
+
|
303 |
+
|
304 |
+
def process_mask_native(protos, masks_in, bboxes, shape):
|
305 |
+
"""
|
306 |
+
It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
|
307 |
+
|
308 |
+
Args:
|
309 |
+
protos (torch.Tensor): [mask_dim, mask_h, mask_w]
|
310 |
+
masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
|
311 |
+
bboxes (torch.Tensor): [n, 4], n is number of masks after nms
|
312 |
+
shape (tuple): the size of the input image (h,w)
|
313 |
+
|
314 |
+
Returns:
|
315 |
+
masks (torch.Tensor): The returned masks with dimensions [h, w, n]
|
316 |
+
"""
|
317 |
+
c, mh, mw = protos.shape # CHW
|
318 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
|
319 |
+
masks = scale_masks(masks[None], shape)[0] # CHW
|
320 |
+
masks = crop_mask(masks, bboxes) # CHW
|
321 |
+
return masks.gt_(0.5)
|
322 |
+
|
323 |
+
def crop_mask(masks, boxes):
|
324 |
+
"""
|
325 |
+
It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
|
326 |
+
|
327 |
+
Args:
|
328 |
+
masks (torch.Tensor): [n, h, w] tensor of masks
|
329 |
+
boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
|
330 |
+
|
331 |
+
Returns:
|
332 |
+
(torch.Tensor): The masks are being cropped to the bounding box.
|
333 |
+
"""
|
334 |
+
_, h, w = masks.shape
|
335 |
+
x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
|
336 |
+
r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
|
337 |
+
c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
|
338 |
+
|
339 |
+
return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
|
340 |
+
|
341 |
+
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
|
342 |
+
"""
|
343 |
+
Apply masks to bounding boxes using the output of the mask head.
|
344 |
+
|
345 |
+
Args:
|
346 |
+
protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
|
347 |
+
masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
|
348 |
+
bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
|
349 |
+
shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
|
350 |
+
upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
|
351 |
+
|
352 |
+
Returns:
|
353 |
+
(torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
|
354 |
+
are the height and width of the input image. The mask is applied to the bounding boxes.
|
355 |
+
"""
|
356 |
+
|
357 |
+
c, mh, mw = protos.shape # CHW
|
358 |
+
ih, iw = shape
|
359 |
+
masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) # CHW
|
360 |
+
|
361 |
+
downsampled_bboxes = bboxes.clone()
|
362 |
+
downsampled_bboxes[:, 0] *= mw / iw
|
363 |
+
downsampled_bboxes[:, 2] *= mw / iw
|
364 |
+
downsampled_bboxes[:, 3] *= mh / ih
|
365 |
+
downsampled_bboxes[:, 1] *= mh / ih
|
366 |
+
|
367 |
+
masks = crop_mask(masks, downsampled_bboxes) # CHW
|
368 |
+
if upsample:
|
369 |
+
masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW
|
370 |
+
return masks.gt_(0.5)
|
371 |
+
|
372 |
+
|
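
A small self-contained check of `bbox_iou` from `tools_pt.py` (values are illustrative): the first box sits within the 20-pixel border threshold, so `adjust_bboxes_to_image_border` snaps it to the frame and it exceeds the 0.9 IoU threshold against the full frame.

```python
# Illustrative check of bbox_iou(); expected output: tensor([0]), then the raw IoU values.
import torch
from tools_pt import bbox_iou

full_box = torch.tensor([0.0, 0.0, 640.0, 640.0])
boxes = torch.tensor([[10.0, 10.0, 630.0, 630.0],      # near-full-frame box -> snapped to the border
                      [100.0, 100.0, 200.0, 200.0]])   # small box -> low IoU
print(bbox_iou(full_box, boxes, iou_thres=0.9, image_shape=(640, 640)))
print(bbox_iou(full_box, boxes, image_shape=(640, 640), raw_output=True))
```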
model_farm_fastsamx_qcs8550_qnn2.16_int8_aidlite/python/utils.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np
import torch
from PIL import Image


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def image_to_np_ndarray(image):
    if type(image) is str:
        return np.array(Image.open(image))
    elif issubclass(type(image), Image.Image):
        return np.array(image)
    elif type(image) is np.ndarray:
        return image
    return None
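
A short usage sketch for the helpers above (the image path is illustrative):

```python
# Sketch: box format conversion and flexible image loading from utils.py.
from utils import convert_box_xywh_to_xyxy, image_to_np_ndarray

print(convert_box_xywh_to_xyxy([100, 150, 50, 80]))   # -> [100, 150, 150, 230]
img = image_to_np_ndarray("python/dogs.jpg")          # accepts str, PIL.Image or np.ndarray
print(None if img is None else img.shape)
```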
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/README.md
ADDED
@@ -0,0 +1,48 @@
## Model Information
### Source model
- Input shape: 640x640
- Number of parameters: 68.89M
- Model size: 277.39M
- Output shape: 1x37x8400,1x32x160x160

Source model repository: [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)

### Converted model

- Precision: W8A16
- Backend: QNN2.16
- Target Device: SNM972 QCS8550

## Inference with AidLite SDK

### SDK installation
Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)

- Install AidLite SDK

```bash
# Install the appropriate version of the aidlite sdk
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# aidlite sdk c++ check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# aidlite sdk python check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
```bash
cd fastsam_x/model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite
export LD_PRELOAD=/home/aidlux/.local/lib/python3.8/site-packages/torch/lib/../../torch.libs/libgomp-804f19d4.so.1.0.0

python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10
```
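
The demo above segments everything by default. `run_test.py` also exposes a `--point_prompt` flag (parsed with `ast.literal_eval`), so a single-point variant might look like the following; the point coordinates refer to the 640x640 resized input and are illustrative only.

```bash
python3 ./python/run_test.py --target_model ./models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin --imgs ./python/dogs.jpg --invoke_nums 10 --point_prompt "[[320,320]]"
```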
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e08622accfab5d0dac489e898818d66b717f8b4ac820ed213585dfe0a37023b
size 75750664
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/dogs.jpg
ADDED
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/onnx_export.py
ADDED
@@ -0,0 +1,50 @@
import torch
import cv2
import os
import sys

from ultralytics.models.fastsam import FastSAM

class Fast_SAM(torch.nn.Module):
    """Exportable FastSAM model, end-to-end."""

    def __init__(self) -> None:
        super().__init__()
        pt_name = './models/FastSAM-s.pt'
        self.model = FastSAM(pt_name).model

    def forward(self, image: torch.Tensor):
        """
        Run FastSAM on `image`, and produce high quality segmentation masks.
        Faster than SAM as it is based on YOLOv8.

        Parameters:
            image: Pixel values pre-processed for encoder consumption.
                   Range: float[0, 1]
                   3-channel Color Space: BGR
        Returns:

        """
        predictions = self.model(image)
        # Return predictions as a tuple instead of nested tuple.
        return (predictions[0], predictions[1][2])


model = Fast_SAM()
num_params = sum(p.numel() for p in model.parameters())
print(f'Number of FastSAM-s parameters: {num_params}')
dummy_input = torch.randn([1, 3, 640, 640], dtype=torch.float32)
source_model = torch.jit.trace(
    model.to("cpu"), dummy_input, check_trace=False
)
torch.onnx.export(model,                      # model being run
                  dummy_input,                # model input (or a tuple for multiple inputs)
                  "./models/fastsam_s.onnx",  # where to save the model
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=12,           # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names=['input'],      # the model's input names
                  output_names=['boxes', 'mask'],
                  verbose=True,
                  )
print("Convert to onnx successfully!")
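
An optional sanity check of the exported graph with ONNX Runtime (a sketch: the path and the input/output names come from the export call above, and the expected shapes follow the README):

```python
# Sketch: load the exported model and confirm both heads produce tensors.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("./models/fastsam_s.onnx")
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
boxes, mask = session.run(None, {"input": dummy})
print(boxes.shape, mask.shape)  # expected: (1, 37, 8400) and (1, 32, 160, 160)
```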
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/prompt.py
ADDED
@@ -0,0 +1,456 @@
import os
import sys
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from utils import image_to_np_ndarray
from PIL import Image


class FastSAMPrompt:

    def __init__(self, image, results, device='cpu'):
        if isinstance(image, str) or isinstance(image, Image.Image):
            image = image_to_np_ndarray(image)
        self.device = device
        self.results = results
        self.img = image

    def _segment_image(self, image, bbox):
        if isinstance(image, Image.Image):
            image_array = np.array(image)
        else:
            image_array = image
        segmented_image_array = np.zeros_like(image_array)
        x1, y1, x2, y2 = bbox
        segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
        segmented_image = Image.fromarray(segmented_image_array)
        black_image = Image.new('RGB', image.size, (255, 255, 255))
        # transparency_mask = np.zeros_like((), dtype=np.uint8)
        transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
        transparency_mask[y1:y2, x1:x2] = 255
        transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
        black_image.paste(segmented_image, mask=transparency_mask_image)
        return black_image

    def _format_results(self, result, filter=0):
        annotations = []
        n = len(result.masks.data)
        for i in range(n):
            annotation = {}
            mask = result.masks.data[i] == 1.0

            if torch.sum(mask) < filter:
                continue
            annotation['id'] = i
            annotation['segmentation'] = mask.cpu().numpy()
            annotation['bbox'] = result.boxes.data[i]
            annotation['score'] = result.boxes.conf[i]
            annotation['area'] = annotation['segmentation'].sum()
            annotations.append(annotation)
        return annotations

    @staticmethod
    def filter_masks(annotations):  # filter out overlapping masks
        annotations.sort(key=lambda x: x['area'], reverse=True)
        to_remove = set()
        for i in range(0, len(annotations)):
            a = annotations[i]
            for j in range(i + 1, len(annotations)):
                b = annotations[j]
                if i != j and j not in to_remove:
                    # drop a smaller mask that is mostly covered by a larger one
                    if b['area'] < a['area']:
                        if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
                            to_remove.add(j)

        return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove

    def _get_bbox_from_mask(self, mask):
        mask = mask.astype(np.uint8)
        contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        x1, y1, w, h = cv2.boundingRect(contours[0])
        x2, y2 = x1 + w, y1 + h
        if len(contours) > 1:
            for b in contours:
                x_t, y_t, w_t, h_t = cv2.boundingRect(b)
                # Merge multiple bounding boxes into one.
                x1 = min(x1, x_t)
                y1 = min(y1, y_t)
                x2 = max(x2, x_t + w_t)
                y2 = max(y2, y_t + h_t)
            h = y2 - y1
            w = x2 - x1
        return [x1, y1, x2, y2]

    def plot_to_result(self,
                       annotations,
                       bboxes=None,
                       points=None,
                       point_label=None,
                       mask_random_color=True,
                       better_quality=True,
                       retina=False,
                       withContours=True) -> np.ndarray:
        if isinstance(annotations[0], dict):
            annotations = [annotation['segmentation'] for annotation in annotations]
        image = self.img
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        original_h = image.shape[0]
        original_w = image.shape[1]
        if sys.platform == "darwin":
            plt.switch_backend("TkAgg")
        plt.figure(figsize=(original_w / 100, original_h / 100))
        # Add subplot with no margin.
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
        plt.margins(0, 0)
        plt.gca().xaxis.set_major_locator(plt.NullLocator())
        plt.gca().yaxis.set_major_locator(plt.NullLocator())

        plt.imshow(image)
        if better_quality:
            if isinstance(annotations[0], torch.Tensor):
                annotations = np.array(annotations.cpu())
            for i, mask in enumerate(annotations):
                mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
                annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
        if self.device == 'cpu':
            annotations = np.array(annotations)
            self.fast_show_mask(
                annotations,
                plt.gca(),
                random_color=mask_random_color,
                bboxes=bboxes,
                points=points,
                pointlabel=point_label,
                retinamask=retina,
                target_height=original_h,
                target_width=original_w,
            )
        else:
            if isinstance(annotations[0], np.ndarray):
                annotations = torch.from_numpy(annotations)
            self.fast_show_mask_gpu(
                annotations,
                plt.gca(),
                random_color=mask_random_color,
                bboxes=bboxes,
                points=points,
                pointlabel=point_label,
                retinamask=retina,
                target_height=original_h,
                target_width=original_w,
            )
        if isinstance(annotations, torch.Tensor):
            annotations = annotations.cpu().numpy()
        if withContours:
            contour_all = []
            temp = np.zeros((original_h, original_w, 1))
            for i, mask in enumerate(annotations):
                if type(mask) == dict:
                    mask = mask['segmentation']
                annotation = mask.astype(np.uint8)
                if not retina:
                    annotation = cv2.resize(
                        annotation,
                        (original_w, original_h),
                        interpolation=cv2.INTER_NEAREST,
                    )
                contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    contour_all.append(contour)
            cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
            color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
            contour_mask = temp / 255 * color.reshape(1, 1, -1)
            plt.imshow(contour_mask)

        plt.axis('off')
        fig = plt.gcf()
        plt.draw()

        try:
            buf = fig.canvas.tostring_rgb()
        except AttributeError:
            fig.canvas.draw()
            buf = fig.canvas.tostring_rgb()
        cols, rows = fig.canvas.get_width_height()
        img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
        result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        plt.close()
        return result

    # Remark for refactoring: IMO a function should do one thing only; storing the image and plotting should be
    # separated and do not necessarily need to be class methods, but standalone utility functions that the user
    # can chain in their scripts for more fine-grained control.
    def plot(self,
             annotations,
             output_path,
             bboxes=None,
             points=None,
             point_label=None,
             mask_random_color=True,
             better_quality=True,
             retina=False,
             withContours=True):
        if len(annotations) == 0:
            return None
        result = self.plot_to_result(
            annotations,
            bboxes,
            points,
            point_label,
            mask_random_color,
            better_quality,
            retina,
            withContours,
        )

        path = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(path):
            os.makedirs(path)
        result = result[:, :, ::-1]
        cv2.imwrite(output_path, result)

    # CPU post-processing
    def fast_show_mask(
        self,
        annotation,
        ax,
        random_color=False,
        bboxes=None,
        points=None,
        pointlabel=None,
        retinamask=True,
        target_height=960,
        target_width=960,
    ):
        mask_sum = annotation.shape[0]
        height = annotation.shape[1]
        width = annotation.shape[2]
        # Sort annotations based on area.
        areas = np.sum(annotation, axis=(1, 2))
        sorted_indices = np.argsort(areas)
        annotation = annotation[sorted_indices]

        index = (annotation != 0).argmax(axis=0)
        if random_color:
            color = np.random.random((mask_sum, 1, 1, 3))
        else:
            color = np.ones((mask_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
        transparency = np.ones((mask_sum, 1, 1, 1)) * 0.6
        visual = np.concatenate([color, transparency], axis=-1)
        mask_image = np.expand_dims(annotation, -1) * visual

        show = np.zeros((height, width, 4))
        h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(width), indexing='ij')
        indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
        # Use vectorized indexing to update the values of 'show'.
        show[h_indices, w_indices, :] = mask_image[indices]
        if bboxes is not None:
            for bbox in bboxes:
                x1, y1, x2, y2 = bbox
                ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
        # Draw the prompt points.
        if points is not None:
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
                s=20,
                c='y',
            )
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
                s=20,
                c='m',
            )

        if not retinamask:
            show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
        ax.imshow(show)

    def fast_show_mask_gpu(
        self,
        annotation,
        ax,
        random_color=False,
        bboxes=None,
        points=None,
        pointlabel=None,
        retinamask=True,
        target_height=960,
        target_width=960,
    ):
        mask_sum = annotation.shape[0]
        height = annotation.shape[1]
        width = annotation.shape[2]
        areas = torch.sum(annotation, dim=(1, 2))
        sorted_indices = torch.argsort(areas, descending=False)
        annotation = annotation[sorted_indices]
        # Find the index of the first non-zero value at each position.
        index = (annotation != 0).to(torch.long).argmax(dim=0)
        if random_color:
            color = torch.rand((mask_sum, 1, 1, 3)).to(annotation.device)
        else:
            color = torch.ones((mask_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
                30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
        transparency = torch.ones((mask_sum, 1, 1, 1)).to(annotation.device) * 0.6
        visual = torch.cat([color, transparency], dim=-1)
        mask_image = torch.unsqueeze(annotation, -1) * visual
        # Select data according to the index. The index indicates which mask to choose at each
        # position, converting mask_image into a single-batch form.
        show = torch.zeros((height, width, 4)).to(annotation.device)
        try:
            h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(width), indexing='ij')
        except TypeError:  # older torch without the `indexing` keyword
            h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(width))
        indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
        # Use vectorized indexing to update the values of 'show'.
        show[h_indices, w_indices, :] = mask_image[indices]
        show_cpu = show.cpu().numpy()
        if bboxes is not None:
            for bbox in bboxes:
                x1, y1, x2, y2 = bbox
                ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
        # Draw the prompt points.
        if points is not None:
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
                s=20,
                c='y',
            )
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
                s=20,
                c='m',
            )
        if not retinamask:
            show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
        ax.imshow(show_cpu)

    # CLIP-based text retrieval
    @torch.no_grad()
    def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
        preprocessed_images = [preprocess(image).to(device) for image in elements]
        try:
            import clip  # for linear_assignment
        except (ImportError, AssertionError, AttributeError):
            from ultralytics.yolo.utils.checks import check_requirements

            check_requirements('git+https://github.com/openai/CLIP.git')  # required before installing lap from source
            import clip

        tokenized_text = clip.tokenize([search_text]).to(device)
        stacked_images = torch.stack(preprocessed_images)
        image_features = model.encode_image(stacked_images)
        text_features = model.encode_text(tokenized_text)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        probs = 100.0 * image_features @ text_features.T
        return probs[:, 0].softmax(dim=0)

    def _crop_image(self, format_results):

        image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
        ori_w, ori_h = image.size
        annotations = format_results
        mask_h, mask_w = annotations[0]['segmentation'].shape
        if ori_w != mask_w or ori_h != mask_h:
            image = image.resize((mask_w, mask_h))
        cropped_boxes = []
        cropped_images = []
        not_crop = []
        filter_id = []
        # annotations, _ = filter_masks(annotations)
        # filter_id = list(_)
        for _, mask in enumerate(annotations):
            if np.sum(mask['segmentation']) <= 100:
                filter_id.append(_)
                continue
            bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
            cropped_boxes.append(self._segment_image(image, bbox))
            # cropped_boxes.append(segment_image(image, mask["segmentation"]))
            cropped_images.append(bbox)  # Save the bounding box of the cropped image.

        return cropped_boxes, cropped_images, not_crop, filter_id, annotations

    def box_prompt(self, bbox=None, bboxes=None):
        if self.results is None:
            return []
        assert bbox or bboxes
        if bboxes is None:
            bboxes = [bbox]
        max_iou_index = []
        for bbox in bboxes:
            assert (bbox[2] != 0 and bbox[3] != 0)
            masks = self.results[0].masks.data
            target_height = self.img.shape[0]
            target_width = self.img.shape[1]
            h = masks.shape[1]
            w = masks.shape[2]
            if h != target_height or w != target_width:
                bbox = [
                    int(bbox[0] * w / target_width),
                    int(bbox[1] * h / target_height),
                    int(bbox[2] * w / target_width),
                    int(bbox[3] * h / target_height), ]
            bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
            bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
            bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
            bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h

            # IoUs = torch.zeros(len(masks), dtype=torch.float32)
            bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])

            masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
            orig_masks_area = torch.sum(masks, dim=(1, 2))

            union = bbox_area + orig_masks_area - masks_area
            IoUs = masks_area / union
            max_iou_index.append(int(torch.argmax(IoUs)))
        max_iou_index = list(set(max_iou_index))
        return np.array(masks[max_iou_index].cpu().numpy())

    def point_prompt(self, points, pointlabel):  # numpy
        if self.results is None:
            return []
        masks = self._format_results(self.results[0], 0)
        target_height = self.img.shape[0]
        target_width = self.img.shape[1]
        h = masks[0]['segmentation'].shape[0]
        w = masks[0]['segmentation'].shape[1]
        if h != target_height or w != target_width:
            points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
        onemask = np.zeros((h, w))
        masks = sorted(masks, key=lambda x: x['area'], reverse=True)
        for i, annotation in enumerate(masks):
            if type(annotation) == dict:
                mask = annotation['segmentation']
            else:
                mask = annotation
            for i, point in enumerate(points):
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
                    onemask[mask] = 1
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
                    onemask[mask] = 0
        onemask = onemask >= 1
        return np.array([onemask])

    def text_prompt(self, text):
        if self.results is None:
            return []
        format_results = self._format_results(self.results[0], 0)
        cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
        clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
        scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
        max_idx = scores.argsort()
        max_idx = max_idx[-1]
        max_idx += sum(np.array(filter_id) <= int(max_idx))
        return np.array([annotations[max_idx]['segmentation']])

    def everything_prompt(self):
        if self.results is None:
            return []
        return self.results[0].masks.data
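`FastSAMPrompt` takes the original image (path, `PIL.Image`, or numpy array) plus the post-processed `Results` list, and each `*_prompt` method selects masks from those results. A minimal usage sketch, assuming `everything_results` has already been produced (in this upload it comes from `qnn_predict.postprocess` in `run_test.py`):

```python
# Minimal sketch of the prompt API; `everything_results` is assumed to be the
# post-processed Results list built elsewhere (e.g. by run_test.py).
from prompt import FastSAMPrompt

prompt_process = FastSAMPrompt("python/dogs.jpg", everything_results, device="cpu")

ann = prompt_process.everything_prompt()                                   # all masks
# ann = prompt_process.point_prompt(points=[[320, 320]], pointlabel=[1])   # mask under a point
# ann = prompt_process.box_prompt(bbox=[100, 100, 400, 400])               # mask best matching a box

prompt_process.plot(annotations=ann, output_path="python/dogs_result.jpg")
```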
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/run_test.py
ADDED
@@ -0,0 +1,224 @@
import os
import sys
import cv2
import numpy as np
import onnxruntime
import time
import matplotlib.pyplot as plt
import torch
from ultralytics.engine.results import Results
from tools_pt import *
from prompt import FastSAMPrompt
import aidlite
import argparse
import ast

# Cosine-similarity helper
def get_acc(onnx_out, other_out):
    cosine_similarity = np.dot(np.array(onnx_out), np.array(other_out)) / (np.linalg.norm(np.array(onnx_out)) * np.linalg.norm(np.array(other_out)))
    return cosine_similarity

def cal_sigmoid(x):
    return 1 / (1 + np.exp(-x))

class qnn_predict(object):
    def __init__(self, inputshape, outputshape, args) -> None:
        aidlite.set_log_level(aidlite.LogLevel.INFO)
        aidlite.log_to_stderr()
        print(f"Aidlite library version : {aidlite.get_library_version()}")
        print(f"Aidlite python library version : {aidlite.get_py_library_version()}")
        config = aidlite.Config.create_instance()
        if config is None:
            print("Create config failed !")
        config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        config.framework_type = aidlite.FrameworkType.TYPE_QNN
        config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        config.is_quantify_model = 1

        model = aidlite.Model.create_instance(args.target_model)
        if model is None:
            print("Create model failed !")

        self.input_shape = inputshape
        self.out_shape = outputshape
        model.set_model_properties(self.input_shape, aidlite.DataType.TYPE_FLOAT32, self.out_shape, aidlite.DataType.TYPE_FLOAT32)
        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(model, config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")
        print("detect model load success!")

        self.conf = 0.4
        self.iou = 0.9
        self.size = 640
        self.agnostic_nms = False
        self.max_det = 300
        self.names = ['object']
        self.classes = None
        self.retina_masks = True

    def pretreat_img(self, img):
        scale = 1 / 255.
        img_size = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_LINEAR)
        float_img = img_size.astype('float32')
        float_img = float_img * scale
        float_img = float_img[:, :, ::-1]
        return float_img

    def postprocess(self, preds, img, orig_imgs):
        """TODO: filter by classes."""
        p = non_max_suppression(torch.from_numpy(preds[0]),
                                self.conf,
                                self.iou,
                                agnostic=self.agnostic_nms,
                                max_det=self.max_det,
                                nc=len(self.names),
                                classes=self.classes)

        results = []
        if len(p) == 0 or len(p[0]) == 0:
            print("No object detected.")
            return results

        full_box = torch.zeros_like(p[0][0])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        # proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        proto = torch.from_numpy(preds[-1])
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = img[0]  # self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # save empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6]))
                continue
            if self.retina_masks:
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                masks = process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.names, boxes=pred[:, :6], masks=masks))
        return results

    def qnn_run(self, orig_imgs, img_path, args):
        input_img_f = self.pretreat_img(orig_imgs)  # resize the image, HWC
        # print("qnn_input:", input_img_f)
        # encoder texts
        input_img = np.expand_dims(input_img_f, 0)

        invoke_time = []
        for i in range(args.invoke_nums):
            result = self.interpreter.set_input_tensor(0, input_img.data)
            t0 = time.time()
            result = self.interpreter.invoke()
            t1 = time.time()
            cost_time = (t1 - t0) * 1000
            invoke_time.append(cost_time)
        mask_ = self.interpreter.get_output_tensor(0)
        concat_ = self.interpreter.get_output_tensor(1)
        mul_ = self.interpreter.get_output_tensor(3)
        split_ = self.interpreter.get_output_tensor(2)
        mask_ = mask_.reshape(*self.out_shape[3])
        mask_ = mask_.transpose((0, 3, 1, 2))
        concat_ = concat_.reshape(*self.out_shape[2])
        mul_ = mul_.reshape(*self.out_shape[1])
        split_ = split_.reshape(*self.out_shape[0])
        sig_ = cal_sigmoid(split_)

        output_concat = np.concatenate((mul_, sig_), axis=1)
        output_concat = np.concatenate((output_concat, concat_), axis=1)

        # outputshape=[[1,1,8400],[1,4,8400],[1,32,8400],[1,160,160,32]]
        # Timing statistics
        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / args.invoke_nums
        var_invoketime = np.var(invoke_time)
        print("========================================")
        print(f"QNN inference {args.invoke_nums} times :\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("========================================")

        qnn_out = [np.array(output_concat), np.array(mask_)]
        # print("qnn predict out:", qnn_out)

        nchw_img = input_img.transpose(0, 3, 1, 2)
        everything_results = self.postprocess(qnn_out, nchw_img, [orig_imgs])
        # print("everything_results: ", everything_results)

        prompt_process = FastSAMPrompt(args.imgs, everything_results, device="cpu")

        # ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])
        try:
            if args.point_prompt == [[0, 0]]:
                ann = prompt_process.everything_prompt()
            else:
                ann = prompt_process.point_prompt(points=args.point_prompt, pointlabel=[1])
            out_name = os.path.basename(img_path).split(".")[0]
            if True:  # save the result image
                outpath = "python/"
                if not os.path.exists(outpath):
                    os.mkdir(outpath)
                prompt_process.plot(
                    annotations=ann,
                    output_path=os.path.join(outpath, out_name + "_result.jpg"),
                    mask_random_color=True,
                    better_quality=True,
                    retina=False,
                    withContours=True,
                )
            else:
                plt.figure()
                prompt_process.fast_show_mask(annotation=ann,
                                              ax=plt)
        except Exception as e:
            print(f"Warning : An error occurred in the picture {img_path} prediction -{e}")
        return [mask_.reshape(-1), output_concat.reshape(-1)]


def parser_args():
    parser = argparse.ArgumentParser(description="Run model benchmarks")
    parser.add_argument('--target_model', type=str, default='models/cutoff_fastsam_x_w8a16.qnn216.ctx.bin.aidem', help="inference model path")
    parser.add_argument('--source_model', type=str, default='models/fastsam_x.onnx', help="original model path")
    parser.add_argument('--imgs', type=str, default='python/dogs.jpg', help="Predict images path")
    parser.add_argument('--invoke_nums', type=int, default=10, help="Number of inference runs")
    parser.add_argument('--point_prompt', type=str, default="[[0,0]]", help="example:[[x1,y1],[x2,y2]]")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parser_args()
    inputshape = [[1, 640, 640, 3]]
    outputshape = [[1, 1, 8400], [1, 4, 8400], [1, 32, 8400], [1, 160, 160, 32]]
    args.point_prompt = ast.literal_eval(args.point_prompt)

    predict = qnn_predict(inputshape, outputshape, args)
    if os.path.isdir(args.imgs):
        img_files = os.listdir(args.imgs)
        for fi in img_files:
            img_path = os.path.join(args.imgs, fi)
            im0s = cv2.imread(img_path)  # BGR
            im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
            predict.qnn_run(im0s, img_path, args)
    else:
        img_path = args.imgs
        im0s = cv2.imread(img_path)  # BGR
        im0s = cv2.resize(im0s, (640, 640), interpolation=cv2.INTER_LINEAR)
        qnn_result = predict.qnn_run(im0s, img_path, args)
    print("Prediction completed and the results are saved!")
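`get_acc` is defined above but never called; it appears intended for comparing the quantized QNN outputs against a float ONNX reference. The following is a hypothetical wiring of that check, appended to the `__main__` block, assuming the ONNX model at `--source_model` accepts an NCHW input named `input` and returns its outputs in the `['boxes', 'mask']` order used at export time.

```python
# Hypothetical accuracy check (not part of run_test.py): cosine similarity between
# the flattened QNN outputs returned by qnn_run() and a float ONNX reference.
import onnxruntime as ort

qnn_mask, qnn_boxes = predict.qnn_run(im0s, img_path, args)      # [mask_, output_concat], flattened

sess = ort.InferenceSession(args.source_model, providers=["CPUExecutionProvider"])
nchw = np.expand_dims(predict.pretreat_img(im0s), 0).transpose(0, 3, 1, 2).astype(np.float32)
onnx_boxes, onnx_mask = sess.run(None, {"input": nchw})          # assumed output order: boxes, mask

print("mask cosine similarity :", get_acc(onnx_mask.reshape(-1), qnn_mask))
print("boxes cosine similarity:", get_acc(onnx_boxes.reshape(-1), qnn_boxes))
```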
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/tools_pt.py
ADDED
@@ -0,0 +1,372 @@
import numpy as np
import time
import torch
import torchvision
import torch.nn.functional as F


def clip_boxes(boxes, shape):
    """
    Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.

    Args:
        boxes (torch.Tensor): the bounding boxes to clip
        shape (tuple): the shape of the image
    """
    if isinstance(boxes, torch.Tensor):  # faster individually
        boxes[..., 0].clamp_(0, shape[1])  # x1
        boxes[..., 1].clamp_(0, shape[0])  # y1
        boxes[..., 2].clamp_(0, shape[1])  # x2
        boxes[..., 3].clamp_(0, shape[0])  # y2
    else:  # np.array (faster grouped)
        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2


def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
    """
    Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
    (img1_shape) to the shape of a different image (img0_shape).

    Args:
        img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
        boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
        img0_shape (tuple): the shape of the target image, in the format of (height, width).
        ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
            calculated based on the size difference between the two images.
        padding (bool): If True, assume the boxes are based on an image augmented in YOLO style. If False, do regular
            rescaling.

    Returns:
        boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
    """
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
            (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    if padding:
        boxes[..., [0, 2]] -= pad[0]  # x padding
        boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes


def xywh2xyxy(x):
    """
    Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
    top-left corner and (x2, y2) is the bottom-right corner.

    Args:
        x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
    """
    assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
    y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x)  # faster than clone/copy
    dw = x[..., 2] / 2  # half-width
    dh = x[..., 3] / 2  # half-height
    y[..., 0] = x[..., 0] - dw  # top left x
    y[..., 1] = x[..., 1] - dh  # top left y
    y[..., 2] = x[..., 0] + dw  # bottom right x
    y[..., 3] = x[..., 1] + dh  # bottom right y
    return y


def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nc=0,  # number of classes (optional)
        max_time_img=0.05,
        max_nms=30000,
        max_wh=7680,
):
    """
    Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.

    Args:
        prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
            containing the predicted boxes, classes, and masks. The tensor should be in the format
            output by a model, such as YOLO.
        conf_thres (float): The confidence threshold below which boxes will be filtered out.
            Valid values are between 0.0 and 1.0.
        iou_thres (float): The IoU threshold below which boxes will be filtered out during NMS.
            Valid values are between 0.0 and 1.0.
        classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
        agnostic (bool): If True, the model is agnostic to the number of classes, and all
            classes will be considered as one.
        multi_label (bool): If True, each box may have multiple labels.
        labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
            list contains the apriori labels for a given image. The list should be in the format
            output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
        max_det (int): The maximum number of boxes to keep after NMS.
        nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
        max_time_img (float): The maximum time (seconds) for processing one image.
        max_nms (int): The maximum number of boxes into torchvision.ops.nms().
        max_wh (int): The maximum box width and height in pixels.

    Returns:
        (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
            shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
            (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
    """

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation mode, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = nc or (prediction.shape[1] - 4)  # number of classes
    nm = prediction.shape[1] - nc - 4
    mi = 4 + nc  # mask start index
    xc = prediction[:, 4:mi].amax(1) > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    time_limit = 0.5 + max_time_img * bs  # seconds to quit after
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)

    prediction = prediction.transpose(-1, -2)  # shape(1,84,6300) to shape(1,6300,84)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # xywh to xyxy

    t = time.time()
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
            v[:, :4] = xywh2xyxy(lb[:, 1:5])  # box
            v[range(len(lb)), lb[:, 0].long() + 4] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain, process next image
        if not x.shape[0]:
            continue

        # Detections matrix nx6 (xyxy, conf, cls)
        box, cls, mask = x.split((4, nc, nm), 1)

        if multi_label:
            i, j = torch.where(cls > conf_thres)
            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = cls.max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        if n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections

        # # Experimental
        # merge = False  # use merge-NMS
        # if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
        #     # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
        #     from .metrics import box_iou
        #     iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
        #     weights = iou * scores[None]  # box weights
        #     x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
        #     redundant = True  # require redundant detections
        #     if redundant:
        #         i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)
        # if (time.time() - t) > time_limit:
        #     LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
        #     break  # time limit exceeded

    return output


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to the image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def scale_masks(masks, shape, padding=True):
    """
    Rescale segment masks to shape.

    Args:
        masks (torch.Tensor): (N, C, H, W).
        shape (tuple): Height and width.
        padding (bool): If True, assume the boxes are based on an image augmented in YOLO style. If False, do regular
            rescaling.
    """
    mh, mw = masks.shape[2:]
    gain = min(mh / shape[0], mw / shape[1])  # gain = old / new
    pad = [mw - shape[1] * gain, mh - shape[0] * gain]  # wh padding
    if padding:
        pad[0] /= 2
        pad[1] /= 2
    top, left = (int(pad[1]), int(pad[0])) if padding else (0, 0)  # y, x
    bottom, right = (int(mh - pad[1]), int(mw - pad[0]))
    masks = masks[..., top:bottom, left:right]

    masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False)  # NCHW
    return masks


def process_mask_native(protos, masks_in, bboxes, shape):
    """
    It takes the output of the mask head, and crops it after upsampling to the bounding boxes.

    Args:
        protos (torch.Tensor): [mask_dim, mask_h, mask_w]
        masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
        bboxes (torch.Tensor): [n, 4], n is number of masks after nms
        shape (tuple): the size of the input image (h,w)

    Returns:
        masks (torch.Tensor): The returned masks with dimensions [h, w, n]
    """
    c, mh, mw = protos.shape  # CHW
    masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
    masks = scale_masks(masks[None], shape)[0]  # CHW
    masks = crop_mask(masks, bboxes)  # CHW
    return masks.gt_(0.5)


def crop_mask(masks, boxes):
    """
    It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.

    Args:
        masks (torch.Tensor): [n, h, w] tensor of masks
        boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form

    Returns:
        (torch.Tensor): The masks are being cropped to the bounding box.
    """
    _, h, w = masks.shape
    x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # x1 shape(n,1,1)
    r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :]  # rows shape(1,1,w)
    c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None]  # cols shape(1,h,1)

    return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))


def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Apply masks to bounding boxes using the output of the mask head.

    Args:
        protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
        masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
        bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
        shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
        upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.

    Returns:
        (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
            are the height and width of the input image. The mask is applied to the bounding boxes.
    """

    c, mh, mw = protos.shape  # CHW
    ih, iw = shape
    masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)  # CHW

    downsampled_bboxes = bboxes.clone()
    downsampled_bboxes[:, 0] *= mw / iw
    downsampled_bboxes[:, 2] *= mw / iw
    downsampled_bboxes[:, 3] *= mh / ih
    downsampled_bboxes[:, 1] *= mh / ih

    masks = crop_mask(masks, downsampled_bboxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
    return masks.gt_(0.5)
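The helpers above follow the ultralytics conventions, which makes them easy to exercise in isolation. Below is a small self-contained check with made-up numbers (not from the repository) showing `xywh2xyxy` and `non_max_suppression` on a fake "1 class + 32 mask coefficients" prediction tensor of the shape this model produces.

```python
# Self-contained smoke test for the box utilities (made-up numbers).
import torch
from tools_pt import xywh2xyxy, non_max_suppression

# Two heavily overlapping boxes in (cx, cy, w, h) form.
xywh = torch.tensor([[100., 100., 50., 50.],
                     [102., 102., 50., 50.]])
print(xywh2xyxy(xywh))        # -> [[75, 75, 125, 125], [77, 77, 127, 127]]

# Fake prediction in the (batch, 4 + nc + nm, num_boxes) layout expected by NMS,
# matching this repo's 1 class and 32 mask coefficients; only two boxes are confident.
pred = torch.zeros((1, 4 + 1 + 32, 8400))
pred[0, :4, :2] = xywh.T                       # box geometry
pred[0, 4, :2] = torch.tensor([0.9, 0.8])      # single-class confidences
kept = non_max_suppression(pred, conf_thres=0.4, iou_thres=0.5, nc=1)
print(kept[0].shape)          # torch.Size([1, 38]): the lower-scoring duplicate is suppressed
```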
model_farm_fastsamx_qcs8550_qnn2.16_w8a16_aidlite/python/utils.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np
import torch
from PIL import Image


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to the image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def image_to_np_ndarray(image):
    if type(image) is str:
        return np.array(Image.open(image))
    elif issubclass(type(image), Image.Image):
        return np.array(image)
    elif type(image) is np.ndarray:
        return image
    return None
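A short illustration of two of the helpers above; the blank test image is only a stand-in for something like `python/dogs.jpg`.

```python
# Quick illustration of the conversion helpers (the blank image is a stand-in).
from PIL import Image
from utils import convert_box_xywh_to_xyxy, image_to_np_ndarray

print(convert_box_xywh_to_xyxy([100, 150, 50, 80]))   # -> [100, 150, 150, 230]

img = Image.new("RGB", (640, 640))                    # stands in for e.g. python/dogs.jpg
arr = image_to_np_ndarray(img)
print(type(arr), arr.shape)                           # <class 'numpy.ndarray'> (640, 640, 3)
```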