Upload 121 files
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete set.
- .gitattributes +11 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md +63 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt +34 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp +909 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py +513 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py +182 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py +74 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py +389 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py +57 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py +125 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md +63 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt +34 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp +909 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py +513 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py +182 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py +74 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg +3 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py +424 -0
- model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py +125 -0
- model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md +63 -0
- model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc +0 -0
- model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
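The sample images added in this upload are tracked through Git LFS, which is what the new `.gitattributes` filters above declare, so they arrive as pointer files unless LFS is set up locally. A minimal sketch of fetching them after cloning, assuming `git-lfs` is installed and using a placeholder repository URL:

```bash
# One-time Git LFS setup for the local machine
git lfs install

# Clone the repository (placeholder URL) and fetch the LFS-tracked blobs
git clone <repository-url> mediapipe-face-detection
cd mediapipe-face-detection
git lfs pull
```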
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md
ADDED
@@ -0,0 +1,63 @@
## Model Information
### Source model
- Input shape: 256x256
- Number of parameters: 0.13M, 0.6M
- Model size: 0.58MB, 2.32MB
- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]

Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)

### Converted model

- Precision: INT16
- Backend: QNN2.16
- Target Device: FV01 QCS6490

## Inference with AidLite SDK

### SDK installation
Model Farm uses the AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/).

- Install the AidLite SDK

```bash
# Install the appropriate version of the AidLite SDK
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Install the QNN runtime that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify the AidLite SDK

```bash
# AidLite SDK C++ library version check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# AidLite SDK Python library version check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
#### Python
```bash
cd python
python3 demo_qnn.py
```

#### C++
```bash
# The cnpy library is required to load .npy files (run these commands from the terminal's default directory)
git clone https://github.com/rogersce/cnpy.git
cd cnpy
mkdir build && cd build
cmake ..
make
sudo make install

cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp
mkdir build && cd build
cmake ..
make
./run_test
```
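Beyond the defaults above, `cpp/run_test.cpp` parses a few command-line flags (`--faceDetector_model`, `--faceLandmark_model`, `--imgs`, `--invoke_nums`, `--model_type`). A usage sketch, assuming the binary was built in `cpp/build` as shown and using the default relative paths from `run_test.cpp`:

```bash
# Run the C++ demo with explicit model paths and a custom test image;
# paths are relative to cpp/build, matching the defaults in run_test.cpp
./run_test \
  --faceDetector_model ../../models/m_faceDetctor_w8a16.qnn216.ctx.bin \
  --faceLandmark_model ../../models/m_faceLandmark_w8a16.qnn216.ctx.bin \
  --imgs ../coco.jpg \
  --invoke_nums 10 \
  --model_type QNN
```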
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc
ADDED
Binary file (16.5 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc
ADDED
Binary file (16.5 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc
ADDED
Binary file (3.95 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc
ADDED
Binary file (2.07 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc
ADDED
Binary file (4.54 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc
ADDED
Binary file (3.92 kB)
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,34 @@
cmake_minimum_required (VERSION 3.5)
project("run_test")

find_package(OpenCV REQUIRED)
find_library(CNPY_LIB cnpy REQUIRED)

message(STATUS "OpenCV library status:")
message(STATUS "> version: ${OpenCV_VERSION}")
message(STATUS "> include: ${OpenCV_INCLUDE_DIRS}")

set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")

include_directories(
    /usr/local/include
    /usr/include/opencv4
)

link_directories(
    /usr/local/lib/
)

file(GLOB SRC_LISTS
    ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
)

add_executable(run_test ${SRC_LISTS})

target_link_libraries(run_test
    aidlite
    ${OpenCV_LIBS}
    pthread
    jsoncpp
    ${CNPY_LIB}
)
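If OpenCV or cnpy live outside the default prefixes hard-coded above, the usual CMake overrides apply. A hedged sketch with placeholder paths (not part of the shipped build instructions):

```bash
# Point CMake at a non-default OpenCV build and at an alternate cnpy install prefix
cmake .. \
  -DOpenCV_DIR=/path/to/opencv/lib/cmake/opencv4 \
  -DCMAKE_PREFIX_PATH=/path/to/cnpy/install
make
```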
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94
size 14464

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg
ADDED
Git LFS Details
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp
ADDED
@@ -0,0 +1,909 @@
#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <aidlux/aidlite/aidlite.hpp>
#include <vector>
#include <numeric>
#include <cmath>
#include <jsoncpp/json/json.h>
#include <tuple>
#include <algorithm>
#include <sstream>
#include <string>
#include <chrono>
#include <cassert>
#include "cnpy.h"

using namespace cv;
using namespace std;
using namespace Aidlux::Aidlite;


// Face landmark connection indices (from MediaPipe Face Mesh)
const std::vector<std::pair<int, int>> FACE_CONNECTIONS = {
    {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17},
    {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291},
    {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0},
    {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291},
    {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14},
    {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308},
    {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13},
    {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308},
    {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380},
    {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388},
    {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398},
    {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285},
    {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7},
    {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154},
    {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160},
    {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133},
    {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105},
    {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332},
    {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454},
    {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365},
    {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152},
    {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136},
    {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234},
    {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103},
    {103, 67}, {67, 109}, {109, 10}
};

struct Args {
    std::string faceDetector_model = "../../models/m_faceDetctor_w8a16.qnn216.ctx.bin";
    std::string faceLandmark_model = "../../models/m_faceLandmark_w8a16.qnn216.ctx.bin";
    std::string imgs = "../coco.jpg";
    int invoke_nums = 10;
    std::string model_type = "QNN";
};


Args parse_args(int argc, char* argv[]) {
    Args args;
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg == "--faceDetector_model" && i + 1 < argc) {
            args.faceDetector_model = argv[++i];
        } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
            args.faceLandmark_model = argv[++i];
        } else if (arg == "--imgs" && i + 1 < argc) {
            args.imgs = argv[++i];
        } else if (arg == "--invoke_nums" && i + 1 < argc) {
            args.invoke_nums = std::stoi(argv[++i]);
        } else if (arg == "--model_type" && i + 1 < argc) {
            args.model_type = argv[++i];
        }
    }
    return args;
}

std::string to_lower(const std::string& str) {
    std::string lower_str = str;
    std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
        return std::tolower(c);
    });
    return lower_str;
}

std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
    cnpy::NpyArray arr = cnpy::npy_load(path);
    float* data_ptr = arr.data<float>();

    size_t num_rows = arr.shape[0];  // 896
    size_t num_cols = arr.shape[1];  // 4

    std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
    for (size_t i = 0; i < num_rows; ++i) {
        for (size_t j = 0; j < num_cols; ++j) {
            anchors[i][j] = data_ptr[i * num_cols + j];
        }
    }

    return anchors;
}


// Draw the face keypoints and connection lines
void draw_landmarks(
    cv::Mat& img,
    const std::vector<cv::Point2f>& points,
    const std::vector<float>& flags,
    const std::vector<std::pair<int, int>>& connections,
    float threshold = 0.4f,
    cv::Scalar point_color = cv::Scalar(0, 255, 0),
    cv::Scalar line_color = cv::Scalar(0, 0, 0),
    int size = 2)
{
    // Draw keypoints
    for (size_t i = 0; i < points.size(); ++i) {
        // if (i < flags.size() && flags[i] > threshold) {
        int x = static_cast<int>(points[i].x);
        int y = static_cast<int>(points[i].y);
        cv::circle(img, cv::Point(x, y), size, point_color, size);
        // }
    }

    // Draw connection lines (both endpoints must be visible)
    for (const auto& conn : connections) {
        int i0 = conn.first;
        int i1 = conn.second;
        // if (i0 < points.size() && i1 < points.size() &&
        //     i0 < flags.size() && i1 < flags.size() &&
        //     flags[i0] > threshold && flags[i1] > threshold)
        // {
        cv::line(img, points[i0], points[i1], line_color, size);
        // }
    }
}


std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
    int orig_h = img.rows;  // 480
    int orig_w = img.cols;  // 640

    // Step 1: resize width to 256, keep aspect ratio
    int w1 = 256;
    int h1 = w1 * orig_h / orig_w;  // equivalent to int(256 * h / w)

    // Step 2: compute padding in the height direction
    int padh = 256 - h1;
    int padw = 0;

    int padh1 = padh / 2;
    int padh2 = padh1 + (padh % 2);
    int padw1 = padw / 2;
    int padw2 = padw1 + (padw % 2);

    // Step 3: resize to (w1, h1)
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(w1, h1));  // (256, h1)

    // Step 4: pad to (256, 256)
    cv::Mat padded;
    cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));

    // Step 5: resize padded to 128x128
    cv::Mat resized128;
    cv::resize(padded, resized128, cv::Size(128, 128));

    // Step 6: compute scale and pad in original image space
    float scale = static_cast<float>(orig_h) / h1;  // h / h1
    cv::Point pad_point(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));

    return std::make_tuple(padded, resized128, scale, pad_point);
}


// Convert the image to 1xCxHxW layout and normalize (divide by 255)
std::vector<float> preprocess_image(const cv::Mat& img) {
    int H = img.rows;
    int W = img.cols;
    int C = img.channels();  // should be 3

    std::vector<float> chw(H * W * C);       // CHW
    std::vector<float> nchw(1 * C * H * W);  // NCHW

    // 1. HWC -> CHW + normalize (float32 / 255.0)
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int c = 0; c < C; ++c) {
                // OpenCV uses BGR order
                float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
                chw[c * H * W + h * W + w] = value;
            }
        }
    }

    // 2. CHW -> NCHW (add batch dimension, actually just copy)
    for (int i = 0; i < C * H * W; ++i) {
        nchw[i] = chw[i];
    }

    return nchw;  // shape: [1, 3, H, W]
}


// Compute IoU using only the first 4 coordinates (the box corners occupy the first 4 values)
float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
    float x1 = std::max(box1[0], box2[0]);
    float y1 = std::max(box1[1], box2[1]);
    float x2 = std::min(box1[2], box2[2]);
    float y2 = std::min(box1[3], box2[3]);

    float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
    float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
    float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
    float union_area = box1_area + box2_area - inter_area;

    return union_area > 0 ? inter_area / union_area : 0.0f;
}

std::vector<std::vector<float>> weighted_non_max_suppression(
    std::vector<std::vector<float>>& detections,
    int num_coords = 16,
    float min_suppression_threshold = 0.3f)
{
    if (detections.empty()) return {};

    std::vector<int> indices(detections.size());
    std::iota(indices.begin(), indices.end(), 0);

    // Sort by confidence, descending
    std::sort(indices.begin(), indices.end(), [&](int a, int b) {
        return detections[a][num_coords] > detections[b][num_coords];
    });

    std::vector<std::vector<float>> output;

    while (!indices.empty()) {
        int best_idx = indices.front();
        const auto& best_det = detections[best_idx];
        std::vector<int> overlapping = { best_idx };

        for (size_t i = 1; i < indices.size(); ++i) {
            float iou = IoU(best_det, detections[indices[i]]);
            if (iou > min_suppression_threshold) {
                overlapping.push_back(indices[i]);
            }
        }

        // Update the remaining indices
        std::vector<int> new_indices;
        for (int idx : indices) {
            if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
                new_indices.push_back(idx);
            }
        }
        indices = new_indices;

        // Weighted average: coordinates * confidence
        if (overlapping.size() == 1) {
            output.push_back(best_det);
        } else {
            std::vector<float> weighted(num_coords + 1, 0.0f);
            float total_score = 0.0f;

            for (int idx : overlapping) {
                float score = detections[idx][num_coords];
                total_score += score;
                for (int k = 0; k < num_coords; ++k) {
                    weighted[k] += detections[idx][k] * score;
                }
            }

            for (int k = 0; k < num_coords; ++k) {
                weighted[k] /= total_score;
            }
            weighted[num_coords] = total_score / overlapping.size();  // average score

            // std::cout << "Weighted box: ";
            // for (float v : weighted) std::cout << v << " ";
            // std::cout << "\n";

            output.push_back(weighted);
        }
    }

    // TODO
    auto x = output[0];
    output.clear();
    output.push_back(x);

    return output;
}


std::vector<std::vector<float>> denormalize_detections(
    const std::vector<std::vector<float>>& detections,
    float scale,
    const cv::Point& pad
) {
    std::vector<std::vector<float>> result = detections;

    for (size_t i = 0; i < result.size(); ++i) {
        std::vector<float>& det = result[i];

        // bbox coords: x1, y1, x2, y2
        det[0] = det[0] * scale * 256.0f - pad.x;  // x1
        det[1] = det[1] * scale * 256.0f - pad.y;  // y1
        det[2] = det[2] * scale * 256.0f - pad.x;  // x2
        det[3] = det[3] * scale * 256.0f - pad.y;  // y2

        // keypoints (starting from index 4): format [y, x, y, x, ...]
        for (size_t k = 4; k + 1 < det.size(); k += 2) {
            det[k] = det[k] * scale * 256.0f - pad.y;          // y
            det[k + 1] = det[k + 1] * scale * 256.0f - pad.x;  // x
        }
    }

    return result;
}


void detection2roi(
    const std::vector<std::vector<float>>& detections,
    std::vector<float>& xc,
    std::vector<float>& yc,
    std::vector<float>& scale,
    std::vector<float>& theta,
    int kp1, int kp2,  // keypoint indices
    float dy, float dscale, float theta0
) {
    size_t N = detections.size();
    xc.resize(N);
    yc.resize(N);
    scale.resize(N);
    theta.resize(N);

    for (size_t i = 0; i < N; ++i) {
        const std::vector<float>& det = detections[i];

        float x1 = det[1];
        float x2 = det[3];
        float y1 = det[0];
        float y2 = det[2];

        float x_center = (x1 + x2) / 2.0f;
        float y_center = (y1 + y2) / 2.0f;
        float box_scale = (x2 - x1);  // assumes square box

        // shift the y center
        y_center += dy * box_scale;
        box_scale *= dscale;

        // positions of the two reference keypoints
        int base = 4;
        int idx_y0 = base + 2 * kp1;
        int idx_x0 = base + 2 * kp1 + 1;
        int idx_y1 = base + 2 * kp2;
        int idx_x1 = base + 2 * kp2 + 1;

        float x0 = det[idx_x0];
        float y0 = det[idx_y0];
        float x1_kp = det[idx_x1];
        float y1_kp = det[idx_y1];

        float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;

        // write the outputs
        xc[i] = x_center;
        yc[i] = y_center;
        scale[i] = box_scale;
        // TODO: theta should be adjusted for the actual use case
        // theta[i] = angle;  // use the computed angle if needed
        theta[i] = -0.0094;
    }
}


void extract_roi(
    const cv::Mat& frame,
    const std::vector<float>& xc,
    const std::vector<float>& yc,
    const std::vector<float>& theta,
    const std::vector<float>& scale,
    std::vector<cv::Mat>& cropped_rois,
    std::vector<cv::Mat>& affine_matrices,
    std::vector<std::vector<cv::Point2f>>& roi_boxes,  // also return the box corner points
    int resolution = 192
) {
    cropped_rois.clear();
    affine_matrices.clear();
    roi_boxes.clear();

    for (size_t i = 0; i < xc.size(); ++i) {
        float s = scale[i] / 2.0f;
        float cos_t = std::cos(theta[i]);
        float sin_t = std::sin(theta[i]);

        // The four unit-square corners after the transform (same order as the Python reference)
        std::vector<cv::Point2f> points(4);
        // [-1, -1]
        points[0].x = xc[i] + (-s * cos_t + s * sin_t);
        points[0].y = yc[i] + (-s * sin_t - s * cos_t);
        // [1, -1]
        points[1].x = xc[i] + ( s * cos_t + s * sin_t);
        points[1].y = yc[i] + ( s * sin_t - s * cos_t);
        // [-1, 1]
        points[2].x = xc[i] + (-s * cos_t - s * sin_t);
        points[2].y = yc[i] + (-s * sin_t + s * cos_t);
        // [1, 1]
        points[3].x = xc[i] + ( s * cos_t - s * sin_t);
        points[3].y = yc[i] + ( s * sin_t + s * cos_t);

        // Compute the affine transform from the first three points
        std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
        std::vector<cv::Point2f> dst_pts = {
            cv::Point2f(0, 0),
            cv::Point2f(resolution - 1, 0),
            cv::Point2f(0, resolution - 1)
        };

        cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
        cv::Mat M_inv;
        cv::invertAffineTransform(M, M_inv);

        cv::Mat cropped;
        cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
        cropped_rois.push_back(cropped);
        affine_matrices.push_back(M_inv);
        roi_boxes.push_back(points);  // store the transformed box corners
    }
}

std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
    int N = imgs.size();
    if (N == 0) return {};

    int H = 192;
    int W = 192;
    int C = 3;  // assume 3 channels (BGR)

    std::vector<float> output;
    output.reserve(N * C * H * W);

    for (int n = 0; n < N; ++n) {
        cv::Mat img_float;
        imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0);  // Normalize to [0,1]

        // Split channels (HWC -> CHW)
        std::vector<cv::Mat> channels(3);
        cv::split(img_float, channels);  // channels[0] = B, [1] = G, [2] = R

        for (int c = 0; c < C; ++c) {
            for (int i = 0; i < H; ++i) {
                for (int j = 0; j < W; ++j) {
                    output.push_back(channels[c].at<float>(i, j));
                }
            }
        }
    }

    return output;  // shape: N x C x H x W
}

// resolution is typically 192
std::vector<cv::Point2f> denormalize_landmarks(
    const std::vector<float>& normalized_landmarks,
    const std::vector<cv::Mat>& affines,
    int resolution = 192)
{
    std::vector<cv::Point2f> output;

    // Validate input sizes
    const int num_faces = 1;  // assume a single face
    const int num_landmarks = 468;
    if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
        std::cerr << "Error: Input size mismatch. Expected "
                  << num_faces * num_landmarks * 3 << " landmarks and "
                  << num_faces << " affine matrices." << std::endl;
        throw std::runtime_error("Input size mismatch");
    }

    for (int i = 0; i < num_faces; ++i) {
        const cv::Mat& affine = affines[i];  // 2x3 CV_32F
        for (int j = 0; j < num_landmarks; ++j) {
            int idx = i * num_landmarks * 3 + j * 3;
            float x = normalized_landmarks[idx + 0] * resolution;
            float y = normalized_landmarks[idx + 1] * resolution;
            // float z = normalized_landmarks[idx + 2];  // optional

            // 2x1 input vector
            cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);

            // Extract the rotation and translation parts of the affine matrix
            cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
            cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
            M2x2.convertTo(M2x2, CV_32F);
            t2x1.convertTo(t2x1, CV_32F);

            // Apply the inverse affine transform
            cv::Mat out = M2x2 * pt + t2x1;

            // Store as Point2f
            output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
        }
    }

    return output;  // denormalized landmarks: 468 Point2f values
}


void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
    for (const auto& roi : boxes) {
        if (roi.size() < 4) continue;

        const cv::Point2f& p1 = roi[0];
        const cv::Point2f& p2 = roi[1];
        const cv::Point2f& p3 = roi[2];
        const cv::Point2f& p4 = roi[3];

        cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2);    // black
        cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2);  // green
        cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2);    // black
        cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2);    // black
    }
}


void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
    for (const auto& det : detections) {
        if (det.size() < 4) continue;

        float ymin = det[0];
        float xmin = det[1];
        float ymax = det[2];
        float xmax = det[3];

        cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);

        if (with_keypoints && det.size() > 4) {
            int n_keypoints = (det.size() - 4) / 2;
            for (int k = 0; k < n_keypoints; ++k) {
                int kp_x = int(det[4 + k * 2]);
                int kp_y = int(det[4 + k * 2 + 1]);
                cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
            }
        }
    }
}


std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
    std::ifstream in(filename);
    std::vector<std::vector<float>> anchors;

    if (!in.is_open()) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return anchors;
    }

    std::string line;
    while (std::getline(in, line)) {
        std::istringstream ss(line);
        std::vector<float> anchor;
        float value;
        while (ss >> value) {
            anchor.push_back(value);
        }
        if (!anchor.empty()) {
            anchors.push_back(anchor);
        }
    }

    in.close();
    return anchors;
}

// sigmoid function
float sigmoid(float x) {
    return 1.0f / (1.0f + std::exp(-x));
}

// clamp function
float clamp(float x, float min_val, float max_val) {
    return std::max(min_val, std::min(max_val, x));
}

// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
std::vector<std::vector<std::vector<float>>> decode_boxes(
    const std::vector<float>& raw_boxes,
    const std::vector<std::vector<float>>& anchors,
    int batch, int num_anchors, int num_coords,
    float x_scale, float y_scale, float w_scale, float h_scale,
    int num_keypoints)
{
    std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
        std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));

    for (int b = 0; b < batch; ++b) {
        for (int i = 0; i < num_anchors; ++i) {
            int base = b * num_anchors * num_coords + i * num_coords;

            float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
            float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
            float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
            float h = raw_boxes[base + 3] / h_scale * anchors[i][3];

            decoded_boxes[b][i][0] = y_center - h / 2.0f;  // ymin
            decoded_boxes[b][i][1] = x_center - w / 2.0f;  // xmin
            decoded_boxes[b][i][2] = y_center + h / 2.0f;  // ymax
            decoded_boxes[b][i][3] = x_center + w / 2.0f;  // xmax

            for (int k = 0; k < num_keypoints; ++k) {
                int offset = 4 + k * 2;
                float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
                float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
                decoded_boxes[b][i][offset] = keypoint_x;
                decoded_boxes[b][i][offset + 1] = keypoint_y;
            }
        }
    }

    return decoded_boxes;
}

std::vector<std::vector<std::vector<float>>> tensors_to_detections(
    const std::vector<float>& raw_box_tensor,
    const std::vector<float>& raw_score_tensor,
    const std::vector<std::vector<float>>& anchors,
    int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
    float x_scale, float y_scale, float w_scale, float h_scale,
    float score_clipping_thresh, float min_score_thresh)
{
    assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
    assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
    assert(anchors.size() == size_t(num_anchors));

    auto detection_boxes = decode_boxes(
        raw_box_tensor, anchors, batch, num_anchors, num_coords,
        x_scale, y_scale, w_scale, h_scale, num_keypoints);

    std::vector<std::vector<std::vector<float>>> output_detections;

    for (int b = 0; b < batch; ++b) {
        std::vector<std::vector<float>> detections;

        for (int i = 0; i < num_anchors; ++i) {
            int score_index = b * num_anchors * num_classes + i * num_classes;

            // Single-class case: take class 0
            float score_raw = raw_score_tensor[score_index];
            float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));

            if (score >= min_score_thresh) {
                std::vector<float> det = detection_boxes[b][i];  // shape [num_coords]
                det.push_back(score);       // append the confidence
                detections.push_back(det);  // shape [num_coords+1]
            }
        }

        output_detections.push_back(detections);  // one vector per batch
    }

    return output_detections;
}


int invoke(const Args& args) {
    std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
              << args.faceLandmark_model << "\n"
              << "Image Path: " << args.imgs << "\n"
              << "Inference Nums: " << args.invoke_nums << "\n"
              << "Model Type: " << args.model_type << "\n";
    // =============================================================faceDetector_model start
    Model* model1 = Model::create_instance(args.faceDetector_model);
    if(model1 == nullptr){
        printf("Create model1 failed !\n");
        return EXIT_FAILURE;
    }
    Config* config1 = Config::create_instance();
    if(config1 == nullptr){
        printf("Create config1 failed !\n");
        return EXIT_FAILURE;
    }
    config1->implement_type = ImplementType::TYPE_LOCAL;
    std::string model_type_lower1 = to_lower(args.model_type);
    if (model_type_lower1 == "qnn"){
        config1->framework_type = FrameworkType::TYPE_QNN;
    } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
        config1->framework_type = FrameworkType::TYPE_SNPE2;
    }
    config1->accelerate_type = AccelerateType::TYPE_DSP;
    config1->is_quantify_model = 1;

    std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
    std::vector<std::vector<uint32_t>> output_shapes1 = {{1,896,16},{1,896,1}};
    model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
    std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
    if(fast_interpreter1 == nullptr){
        printf("build_interpretper_from_model_and_config failed !\n");
        return EXIT_FAILURE;
    }
    int result = fast_interpreter1->init();
    if(result != EXIT_SUCCESS){
        printf("interpreter->init() failed !\n");
        return EXIT_FAILURE;
    }
    // load model
    result = fast_interpreter1->load_model();
    if(result != EXIT_SUCCESS){
        printf("interpreter->load_model() failed !\n");
        return EXIT_FAILURE;
    }
    printf("detect model load success!\n");
    // =============================================================faceDetector_model over

    // =============================================================faceLandmark_model start
    Model* model2 = Model::create_instance(args.faceLandmark_model);
    if(model2 == nullptr){
        printf("Create model2 failed !\n");
        return EXIT_FAILURE;
    }
    Config* config2 = Config::create_instance();
    if(config2 == nullptr){
        printf("Create config2 failed !\n");
        return EXIT_FAILURE;
    }
    config2->implement_type = ImplementType::TYPE_LOCAL;
    std::string model_type_lower2 = to_lower(args.model_type);
    if (model_type_lower2 == "qnn"){
        config2->framework_type = FrameworkType::TYPE_QNN;
    } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
        config2->framework_type = FrameworkType::TYPE_SNPE2;
    }
    config2->accelerate_type = AccelerateType::TYPE_DSP;
    config2->is_quantify_model = 1;

    std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,192,192}};
    std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1,468,3}};
    model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
    std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
    if(fast_interpreter2 == nullptr){
        printf("build_interpretper_from_model_and_config2 failed !\n");
        return EXIT_FAILURE;
    }
    result = fast_interpreter2->init();
    if(result != EXIT_SUCCESS){
        printf("interpreter2->init() failed !\n");
        return EXIT_FAILURE;
    }
    // load model
    result = fast_interpreter2->load_model();
    if(result != EXIT_SUCCESS){
        printf("interpreter2->load_model() failed !\n");
        return EXIT_FAILURE;
    }
    printf("detect model2 load success!\n");
    // =============================================================faceLandmark_model over


    auto anchors = load_anchors_from_npy("../anchors_float32.npy");
    cv::Mat frame = cv::imread(args.imgs);
    if (frame.empty()) {
        printf("detect image load failed!\n");
        return 1;
    }
    // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
    cv::Mat input_data;
    cv::Mat frame_clone1 = frame.clone();
    cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
    cv::Mat frame_clone = frame.clone();


    cv::Mat img1, img2;
    float scale;
    cv::Point pad;
    std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
    std::vector<float> input_tensor = preprocess_image(img1);

    float *outdata0 = nullptr;
    float *outdata1 = nullptr;
    std::vector<float> invoke_time;
    for (int i = 0; i < args.invoke_nums; ++i) {
        result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
        if(result != EXIT_SUCCESS){
            printf("interpreter->set_input_tensor() failed !\n");
            return EXIT_FAILURE;
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        result = fast_interpreter1->invoke();
        auto t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> cost_time = t2 - t1;
        invoke_time.push_back(cost_time.count() * 1000);
        if(result != EXIT_SUCCESS){
            printf("interpreter->invoke() failed !\n");
            return EXIT_FAILURE;
        }
        uint32_t out_data_0 = 0;
        result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
        if(result != EXIT_SUCCESS){
            printf("interpreter1->get_output_tensor() 0 failed !\n");
            return EXIT_FAILURE;
        }

        uint32_t out_data_1 = 0;
        result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
        if(result != EXIT_SUCCESS){
            printf("interpreter1->get_output_tensor() 1 failed !\n");
            return EXIT_FAILURE;
        }

    }

    std::vector<float> tensor_1_896_16(outdata0, outdata0 + 896*16);
    std::vector<float> tensor_1_896_1(outdata1, outdata1 + 896*1);

    std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
        tensor_1_896_16, tensor_1_896_1, anchors,
        1, 896, 16, 1, 6,
        256.0f, 256.0f, 256.0f, 256.0f,
        100.0f, 0.4f);


    std::vector<std::vector<std::vector<float>>> filtered_detections;
    for (size_t i = 0; i < detections.size(); ++i) {
        std::vector<std::vector<float>>& dets = detections[i];
        std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
        filtered_detections.push_back(faces);
    }


    // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
    // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
    std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);

    // std::cout << "face_detections size: " << face_detections.size() << "\n";
    std::vector<float> xc, yc, scales, theta;
    int kp1 = 0, kp2 = 1;  // keypoint indices
    float dy = 0.0f;       // set according to the model definition
    float dscale = 1.5f;   // scale factor
    float theta0 = 0.0f;   // reference angle

    detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
    std::vector<cv::Mat> rois;
    std::vector<cv::Mat> affines;
    std::vector<std::vector<cv::Point2f>> boxes;

    extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
    if (!boxes.empty()) {
        std::cout << "Detected " << boxes.size() << " faces.\n";
        // A face was detected; continue processing boxes[0] ...
        std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);

        float *outdata1_0 = nullptr;
        float *outdata1_1 = nullptr;

        result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
        if(result != EXIT_SUCCESS){
            printf("interpreter2->set_input_tensor() failed !\n");
            return EXIT_FAILURE;
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        result = fast_interpreter2->invoke();
        auto t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> cost_time = t2 - t1;
        invoke_time.push_back(cost_time.count() * 1000);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->invoke() failed !\n");
            return EXIT_FAILURE;
        }
        uint32_t out_data_1_0 = 0;
        result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->get_output_tensor() 0 failed !\n");
            return EXIT_FAILURE;
        }

        uint32_t out_data_1_1 = 0;
        result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->get_output_tensor() 1 failed !\n");
            return EXIT_FAILURE;
        }

        std::vector<float> flags(outdata1_0, outdata1_0 + 1);
        std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 468*3);

        std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
        draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS);
    } else {
        std::cout << "not detect face!" << std::endl;
    }


    draw_roi(frame_clone1, boxes);
    draw_detections(frame_clone1, face_detections);
    cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
    cv::imwrite("vis_result.jpg", frame_clone1);


    fast_interpreter1->destory();
    fast_interpreter2->destory();
    return 0;

}


int main(int argc, char* argv[]) {
    Args args = parse_args(argc, argv);
    return invoke(args);
}
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
size 28800

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c529987e67f82e58a608a394aabf245a3afa19ac2f761981894f70b4df9fdca
size 2439235

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9658c6459c5d5450d7da9d5fbb74b3beca11157f4cdb35e4d948aa6b4efc0ded
size 594825

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:56336b04831d9f9f41bdcddcd4598e5660a2925451ee50da634fea6598ce6620
size 855238

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06f7e7016506a415bb7e02aaf9469a5fd406d31bb7349d3ae0fe97f1a0cb3b9a
size 728616

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96374d173e67c5c3690b75d030b729e23e41de6b1a1ebd5daef7ff3992118c54
size 2643322

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:61316298a6690650feea876b64b2efe520940d753af3264202689b12dd1c779e
size 1096800

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg
ADDED
Git LFS Details

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc
ADDED
Binary file (16.5 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc
ADDED
Binary file (16.6 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc
ADDED
Binary file (4.03 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc
ADDED
Binary file (2.14 kB)

model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc
ADDED
Binary file (4.6 kB)
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py
ADDED
@@ -0,0 +1,513 @@
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


def resize_pad(img):
    """ resize and pad images to be input to the detectors

    The face and palm detector networks take 256x256 and 128x128 images
    as input. As such the input image is padded and resized to fit the
    size while maintaining the aspect ratio.

    Returns:
        img1: 256x256
        img2: 128x128
        scale: scale factor between original image and 256x256 image
        pad: pixels of padding in the original image
    """

    size0 = img.shape
    if size0[0] >= size0[1]:
        h1 = 256
        w1 = 256 * size0[1] // size0[0]
        padh = 0
        padw = 256 - w1
        scale = size0[1] / w1
    else:
        h1 = 256 * size0[0] // size0[1]
        w1 = 256
        padh = 256 - h1
        padw = 0
        scale = size0[0] / h1
    padh1 = padh // 2
    padh2 = padh // 2 + padh % 2
    padw1 = padw // 2
    padw2 = padw // 2 + padw % 2
    img1 = cv2.resize(img, (w1, h1))
    img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0, 0)))
    pad = (int(padh1 * scale), int(padw1 * scale))
    img2 = cv2.resize(img1, (128, 128))
    return img1, img2, scale, pad


def denormalize_detections(detections, scale, pad):
    """ maps detection coordinates from [0,1] to image coordinates

    The face and palm detector networks take 256x256 and 128x128 images
    as input. As such the input image is padded and resized to fit the
    size while maintaining the aspect ratio. This function maps the
    normalized coordinates back to the original image coordinates.

    Inputs:
        detections: nxm tensor. n is the number of detections.
            m is 4+2*k where the first 4 values are the bounding
            box coordinates and k is the number of additional
            keypoints output by the detector.
        scale: scalar that was used to resize the image
        pad: padding in the x and y dimensions

    """
    detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
    detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
    detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
    detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]

    detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
    detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
    return detections


class BlazeBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
        super(BlazeBlock, self).__init__()

        self.stride = stride
        self.kernel_size = kernel_size
        self.channel_pad = out_channels - in_channels

        # TFLite uses slightly different padding than PyTorch
        # on the depthwise conv layer when the stride is 2.
        if stride == 2:
            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
            padding = 0
        else:
            padding = (kernel_size - 1) // 2

        self.convs = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                      kernel_size=kernel_size, stride=stride, padding=padding,
                      groups=in_channels, bias=True),
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=1, stride=1, padding=0, bias=True),
        )

        if skip_proj:
            self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                       kernel_size=1, stride=1, padding=0, bias=True)
        else:
            self.skip_proj = None

        if act == 'relu':
            self.act = nn.ReLU(inplace=True)
        elif act == 'prelu':
            self.act = nn.PReLU(out_channels)
        else:
            raise NotImplementedError("unknown activation %s" % act)

    def forward(self, x):
        if self.stride == 2:
            if self.kernel_size == 3:
                h = F.pad(x, (0, 2, 0, 2), "constant", 0)
            else:
                h = F.pad(x, (1, 2, 1, 2), "constant", 0)
            x = self.max_pool(x)
        else:
            h = x

        if self.skip_proj is not None:
            x = self.skip_proj(x)
        elif self.channel_pad > 0:
            x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)

        return self.act(self.convs(h) + x)


class FinalBlazeBlock(nn.Module):
    def __init__(self, channels, kernel_size=3):
        super(FinalBlazeBlock, self).__init__()

        # TFLite uses slightly different padding than PyTorch
        # on the depthwise conv layer when the stride is 2.
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels=channels, out_channels=channels,
                      kernel_size=kernel_size, stride=2, padding=0,
                      groups=channels, bias=True),
            nn.Conv2d(in_channels=channels, out_channels=channels,
                      kernel_size=1, stride=1, padding=0, bias=True),
        )

        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        h = F.pad(x, (0, 2, 0, 2), "constant", 0)

        return self.act(self.convs(h))


class BlazeBase(nn.Module):
    """ Base class for media pipe models. """

    def _device(self):
        """Which device (CPU or GPU) is being used by this model?"""
        return self.classifier_8.weight.device

    def load_weights(self, path):
        self.load_state_dict(torch.load(path))
        self.eval()


class BlazeLandmark(BlazeBase):
    """ Base class for landmark models. """

    def extract_roi(self, frame, xc, yc, theta, scale):

        # take points on unit square and transform them according to the roi
        points = torch.tensor([[-1, -1, 1, 1],
                               [-1, 1, -1, 1]], device=scale.device).view(1, 2, 4)
        points = points * scale.view(-1, 1, 1) / 2
        theta = theta.view(-1, 1, 1)
        R = torch.cat((
            torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
            torch.cat((torch.sin(theta), torch.cos(theta)), 2),
            ), 1)
        center = torch.cat((xc.view(-1, 1, 1), yc.view(-1, 1, 1)), 1)
        points = R @ points + center

        # use the points to compute the affine transform that maps
        # these points back to the output square
        res = self.resolution
        points1 = np.array([[0, 0, res-1],
                            [0, res-1, 0]], dtype=np.float32).T
        affines = []
        imgs = []
        for i in range(points.shape[0]):
            pts = points[i, :, :3].cpu().numpy().T
            M = cv2.getAffineTransform(pts, points1)
            img = cv2.warpAffine(frame, M, (res, res))  # , borderValue=127.5)
            img = torch.tensor(img, device=scale.device)
            imgs.append(img)
            affine = cv2.invertAffineTransform(M).astype('float32')
            affine = torch.tensor(affine, device=scale.device)
            affines.append(affine)
        if imgs:
            imgs = torch.stack(imgs).permute(0, 3, 1, 2).float() / 255.  # / 127.5 - 1.0
            affines = torch.stack(affines)
        else:
            imgs = torch.zeros((0, 3, res, res), device=scale.device)
            affines = torch.zeros((0, 2, 3), device=scale.device)

        return imgs, affines, points

    def denormalize_landmarks(self, landmarks, affines):
        landmarks[:, :, :2] *= self.resolution
        for i in range(len(landmarks)):
            landmark, affine = landmarks[i], affines[i]
            landmark = (affine[:, :2] @ landmark[:, :2].T + affine[:, 2:]).T
            landmarks[i, :, :2] = landmark
        return landmarks


class BlazeDetector(BlazeBase):
    """ Base class for detector models.

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/hollance/BlazeFace-PyTorch and
    https://github.com/google/mediapipe/
    """
    def load_anchors(self, path):
        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
        assert(self.anchors.ndimension() == 2)
        assert(self.anchors.shape[0] == self.num_anchors)
        assert(self.anchors.shape[1] == 4)

    def _preprocess(self, x):
        """Converts the image pixels to the range [-1, 1]."""
        return x.float() / 255.  # 127.5 - 1.0

    def predict_on_image(self, img):
        """Makes a prediction on a single image.

        Arguments:
            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
                 shape (3, H, W). The image's height and width should be
                 128 pixels.

        Returns:
            A tensor with face detections.
        """
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img).permute((2, 0, 1))

        return self.predict_on_batch(img.unsqueeze(0))[0]

    def predict_on_batch(self, x):
        """Makes a prediction on a batch of images.

        Arguments:
            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
               shape (b, 3, H, W). The height and width should be 128 pixels.

        Returns:
            A list containing a tensor of face detections for each image in
            the batch. If no faces are found for an image, returns a tensor
            of shape (0, 17).

        Each face detection is a PyTorch tensor consisting of 17 numbers:
            - ymin, xmin, ymax, xmax
            - x,y-coordinates for the 6 keypoints
            - confidence score
        """
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).permute((0, 3, 1, 2))

        assert x.shape[1] == 3
        assert x.shape[2] == self.y_scale
        assert x.shape[3] == self.x_scale

        # 1. Preprocess the images into tensors:
        x = x.to(self._device())
        x = self._preprocess(x)

        # 2. Run the neural network:
        with torch.no_grad():
            out = self.__call__(x)

        # 3. Postprocess the raw predictions:
        detections = self._tensors_to_detections(out[0], out[1], self.anchors)

        # 4. Non-maximum suppression to remove overlapping detections:
        filtered_detections = []
        for i in range(len(detections)):
            faces = self._weighted_non_max_suppression(detections[i])
            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
            filtered_detections.append(faces)

        return filtered_detections

    def detection2roi(self, detection):
        """ Convert detections from detector to an oriented bounding box.

        Adapted from:
        # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt

        The center and size of the box is calculated from the center
        of the detected box. Rotation is calculated from the vector
        between kp1 and kp2 relative to theta0. The box is scaled
        and shifted by dscale and dy.

        """
        if self.detection2roi_method == 'box':
            # compute box center and scale
            # use mediapipe/calculators/util/detections_to_rects_calculator.cc
            xc = (detection[:, 1] + detection[:, 3]) / 2
            yc = (detection[:, 0] + detection[:, 2]) / 2
            scale = (detection[:, 3] - detection[:, 1])  # assumes square boxes

        elif self.detection2roi_method == 'alignment':
            # compute box center and scale
            # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
            xc = detection[:, 4+2*self.kp1]
            yc = detection[:, 4+2*self.kp1+1]
            x1 = detection[:, 4+2*self.kp2]
            y1 = detection[:, 4+2*self.kp2+1]
            scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
        else:
            raise NotImplementedError(
                "detection2roi_method [%s] not supported" % self.detection2roi_method)

        yc += self.dy * scale
        scale *= self.dscale

        # compute box rotation
        x0 = detection[:, 4+2*self.kp1]
        y0 = detection[:, 4+2*self.kp1+1]
        x1 = detection[:, 4+2*self.kp2]
        y1 = detection[:, 4+2*self.kp2+1]
        # theta = np.arctan2(y0-y1, x0-x1) - self.theta0
        theta = torch.atan2(y0-y1, x0-x1) - self.theta0
        return xc, yc, scale, theta

    def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
        """The output of the neural network is a tensor of shape (b, 896, 16)
        containing the bounding box regressor predictions, as well as a tensor
        of shape (b, 896, 1) with the classification confidences.

        This function converts these two "raw" tensors into proper detections.
        Returns a list of (num_detections, 17) tensors, one for each image in
        the batch.

        This is based on the source code from:
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
        """
        assert raw_box_tensor.ndimension() == 3
        assert raw_box_tensor.shape[1] == self.num_anchors
        assert raw_box_tensor.shape[2] == self.num_coords

        assert raw_score_tensor.ndimension() == 3
        assert raw_score_tensor.shape[1] == self.num_anchors
        assert raw_score_tensor.shape[2] == self.num_classes

        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]

        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)

        thresh = self.score_clipping_thresh
        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)

        # Note: we stripped off the last dimension from the scores tensor
        # because there is only one class. Now we can simply use a mask
        # to filter out the boxes with too low confidence.
        mask = detection_scores >= self.min_score_thresh

        # Because each image from the batch can have a different number of
        # detections, process them one at a time using a loop.
        output_detections = []
        for i in range(raw_box_tensor.shape[0]):
            boxes = detection_boxes[i, mask[i]]
            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
            output_detections.append(torch.cat((boxes, scores), dim=-1))

        return output_detections

    def _decode_boxes(self, raw_boxes, anchors):
        """Converts the predictions into actual coordinates using
        the anchor boxes. Processes the entire batch at once.
        """
        boxes = torch.zeros_like(raw_boxes)

        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]

        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]

        boxes[..., 0] = y_center - h / 2.  # ymin
        boxes[..., 1] = x_center - w / 2.  # xmin
        boxes[..., 2] = y_center + h / 2.  # ymax
        boxes[..., 3] = x_center + w / 2.  # xmax

        for k in range(self.num_keypoints):
            offset = 4 + k*2
            keypoint_x = raw_boxes[..., offset    ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
            boxes[..., offset    ] = keypoint_x
            boxes[..., offset + 1] = keypoint_y

        return boxes

    def _weighted_non_max_suppression(self, detections):
        """The alternative NMS method as mentioned in the BlazeFace paper:

        "We replace the suppression algorithm with a blending strategy that
        estimates the regression parameters of a bounding box as a weighted
        mean between the overlapping predictions."

        The original MediaPipe code assigns the score of the most confident
        detection to the weighted detection, but we take the average score
        of the overlapping detections.

        The input detections should be a Tensor of shape (count, 17).

        Returns a list of PyTorch tensors, one for each detected face.

        This is based on the source code from:
        mediapipe/calculators/util/non_max_suppression_calculator.cc
        mediapipe/calculators/util/non_max_suppression_calculator.proto
        """
        if len(detections) == 0: return []

        output_detections = []

        # Sort the detections from highest to lowest score.
        remaining = torch.argsort(detections[:, self.num_coords], descending=True)

        while len(remaining) > 0:
            detection = detections[remaining[0]]

            # Compute the overlap between the first box and the other
            # remaining boxes. (Note that the other_boxes also include
            # the first_box.)
            first_box = detection[:4]
            other_boxes = detections[remaining, :4]
            ious = overlap_similarity(first_box, other_boxes)

            # If two detections don't overlap enough, they are considered
            # to be from different faces.
            mask = ious > self.min_suppression_threshold
            overlapping = remaining[mask]
            remaining = remaining[~mask]

            # Take an average of the coordinates from the overlapping
            # detections, weighted by their confidence scores.
            weighted_detection = detection.clone()
            if len(overlapping) > 1:
                coordinates = detections[overlapping, :self.num_coords]
                scores = detections[overlapping, self.num_coords:self.num_coords+1]
                total_score = scores.sum()
                weighted = (coordinates * scores).sum(dim=0) / total_score
                weighted_detection[:self.num_coords] = weighted
                weighted_detection[self.num_coords] = total_score / len(overlapping)

            output_detections.append(weighted_detection)

        return output_detections


# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py

def intersect(box_a, box_b):
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    A = box_a.size(0)
    B = box_b.size(0)
    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]


def jaccard(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
    is simply the intersection over union of two boxes. Here we operate on
    ground truth boxes and default boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2]-box_a[:, 0]) *
              (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = ((box_b[:, 2]-box_b[:, 0]) *
              (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
    union = area_a + area_b - inter
    return inter / union  # [A,B]


def overlap_similarity(box, other_boxes):
    """Computes the IOU between a bounding box and set of other boxes."""
    return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
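A minimal sketch of how the two standalone helpers above fit together; the image file name `test.jpg` is only a placeholder and the fabricated detection row is synthetic, so treat this as an illustration rather than part of the shipped demo:

```python
import cv2
import numpy as np
import torch
from blazebase import resize_pad, denormalize_detections

# Letterbox an arbitrary BGR frame to the detector resolutions.
frame = np.ascontiguousarray(cv2.imread("test.jpg")[:, :, ::-1])  # BGR -> RGB
img256, img128, scale, pad = resize_pad(frame)
print(img256.shape, img128.shape)  # (256, 256, 3) (128, 128, 3)

# Map one normalized detection (ymin, xmin, ymax, xmax, 6 keypoints, score)
# from the 256x256 letterboxed frame back to original pixel coordinates.
det = torch.zeros((1, 17))
det[0, :4] = torch.tensor([0.25, 0.25, 0.75, 0.75])
det_px = denormalize_detections(det.clone(), scale, pad)
```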
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py
ADDED
@@ -0,0 +1,182 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock


class BlazeFace(BlazeDetector):
    """The BlazeFace face detection model from MediaPipe.

    The version from MediaPipe is simpler than the one in the paper;
    it does not use the "double" BlazeBlocks.

    Because we won't be training this model, it doesn't need to have
    batchnorm layers. These have already been "folded" into the conv
    weights by TFLite.

    The conversion to PyTorch is fairly straightforward, but there are
    some small differences between TFLite and PyTorch in how they handle
    padding on conv layers with stride 2.

    This version works on batches, while the MediaPipe version can only
    handle a single image at a time.

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/hollance/BlazeFace-PyTorch and
    https://github.com/google/mediapipe/

    """
    def __init__(self, back_model=False):
        super(BlazeFace, self).__init__()

        # These are the settings from the MediaPipe example graph
        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.score_clipping_thresh = 100.0
        self.back_model = back_model
        if back_model:
            self.x_scale = 256.0
            self.y_scale = 256.0
            self.h_scale = 256.0
            self.w_scale = 256.0
            self.min_score_thresh = 0.65
        else:
            self.x_scale = 128.0
            self.y_scale = 128.0
            self.h_scale = 128.0
            self.w_scale = 128.0
            self.min_score_thresh = 0.75
        self.min_suppression_threshold = 0.3
        self.num_keypoints = 6

        # These settings are for converting detections to ROIs which can then
        # be extracted and fed into the landmark network
        # use mediapipe/calculators/util/detections_to_rects_calculator.cc
        self.detection2roi_method = 'box'
        # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
        self.kp1 = 1
        self.kp2 = 0
        self.theta0 = 0.
        self.dscale = 1.5
        self.dy = 0.

        self._define_layers()

    def _define_layers(self):
        if self.back_model:
            self.backbone = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
                nn.ReLU(inplace=True),

                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24, stride=2),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 48, stride=2),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 96, stride=2),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
            )
            self.final = FinalBlazeBlock(96)
            self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True)
            self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

            self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True)
            self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)
        else:
            self.backbone1 = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
                nn.ReLU(inplace=True),

                BlazeBlock(24, 24),
                BlazeBlock(24, 28),
                BlazeBlock(28, 32, stride=2),
                BlazeBlock(32, 36),
                BlazeBlock(36, 42),
                BlazeBlock(42, 48, stride=2),
                BlazeBlock(48, 56),
                BlazeBlock(56, 64),
                BlazeBlock(64, 72),
                BlazeBlock(72, 80),
                BlazeBlock(80, 88),
            )

            self.backbone2 = nn.Sequential(
                BlazeBlock(88, 96, stride=2),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
            )

            self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
            self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

            self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
            self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)

    def forward(self, x):
        # TFLite uses slightly different padding on the first conv layer
        # than PyTorch, so do it manually.
        x = F.pad(x, (1, 2, 1, 2), "constant", 0)

        b = x.shape[0]      # batch size, needed for reshaping later

        if self.back_model:
            x = self.backbone(x)            # (b, 16, 16, 96)
            h = self.final(x)               # (b, 8, 8, 96)
        else:
            x = self.backbone1(x)           # (b, 88, 16, 16)
            h = self.backbone2(x)           # (b, 96, 8, 8)

        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
        # permute the output from the conv layers before reshaping it.

        c1 = self.classifier_8(x)           # (b, 2, 16, 16)
        c1 = c1.permute(0, 2, 3, 1)         # (b, 16, 16, 2)
        c1 = c1.reshape(b, -1, 1)           # (b, 512, 1)

        c2 = self.classifier_16(h)          # (b, 6, 8, 8)
        c2 = c2.permute(0, 2, 3, 1)         # (b, 8, 8, 6)
        c2 = c2.reshape(b, -1, 1)           # (b, 384, 1)

        c = torch.cat((c1, c2), dim=1)      # (b, 896, 1)

        r1 = self.regressor_8(x)            # (b, 32, 16, 16)
        r1 = r1.permute(0, 2, 3, 1)         # (b, 16, 16, 32)
        r1 = r1.reshape(b, -1, 16)          # (b, 512, 16)

        r2 = self.regressor_16(h)           # (b, 96, 8, 8)
        r2 = r2.permute(0, 2, 3, 1)         # (b, 8, 8, 96)
        r2 = r2.reshape(b, -1, 16)          # (b, 384, 16)

        r = torch.cat((r1, r2), dim=1)      # (b, 896, 16)
        return [r, c]
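A quick smoke test of the class above, assuming the checkpoint and anchor files referenced by export_jit.py in this folder (`../models/blazefaceback.pth` and `../models/anchors_face_back.npy`) are present; on a blank input the detection list will simply be empty:

```python
import numpy as np
from blazeface import BlazeFace

detector = BlazeFace(back_model=True)   # 256x256 "back" variant used by this model farm
detector.load_weights("../models/blazefaceback.pth")
detector.load_anchors("../models/anchors_face_back.npy")

# predict_on_image accepts an HxWx3 array at the network resolution (256 for the back model).
dummy = np.zeros((256, 256, 3), dtype=np.float32)
detections = detector.predict_on_image(dummy)
print(detections.shape)                 # (N, 17): box coords, 6 keypoints, confidence
```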
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py
ADDED
@@ -0,0 +1,74 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from blazebase import BlazeLandmark, BlazeBlock

class BlazeFaceLandmark(BlazeLandmark):
    """The face landmark model from MediaPipe.

    """
    def __init__(self):
        super(BlazeFaceLandmark, self).__init__()

        # size of ROIs used for input
        self.resolution = 192

        self._define_layers()

    def _define_layers(self):
        self.backbone1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True),
            nn.PReLU(16),

            BlazeBlock(16, 16, 3, act='prelu'),
            BlazeBlock(16, 16, 3, act='prelu'),
            BlazeBlock(16, 32, 3, 2, act='prelu'),

            BlazeBlock(32, 32, 3, act='prelu'),
            BlazeBlock(32, 32, 3, act='prelu'),
            BlazeBlock(32, 64, 3, 2, act='prelu'),

            BlazeBlock(64, 64, 3, act='prelu'),
            BlazeBlock(64, 64, 3, act='prelu'),
            BlazeBlock(64, 128, 3, 2, act='prelu'),

            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, 2, act='prelu'),

            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
        )


        self.backbone2a = nn.Sequential(
            BlazeBlock(128, 128, 3, 2, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            nn.Conv2d(128, 32, 1, padding=0, bias=True),
            nn.PReLU(32),
            BlazeBlock(32, 32, 3, act='prelu'),
            nn.Conv2d(32, 1404, 3, padding=0, bias=True)
        )

        self.backbone2b = nn.Sequential(
            BlazeBlock(128, 128, 3, 2, act='prelu'),
            nn.Conv2d(128, 32, 1, padding=0, bias=True),
            nn.PReLU(32),
            BlazeBlock(32, 32, 3, act='prelu'),
            nn.Conv2d(32, 1, 3, padding=0, bias=True)
        )

    def forward(self, x):
        if x.shape[0] == 0:
            return torch.zeros((0,)), torch.zeros((0, 468, 3))

        x = F.pad(x, (0, 1, 0, 1), "constant", 0)

        x = self.backbone1(x)
        landmarks = self.backbone2a(x).view(-1, 468, 3) / 192
        flag = self.backbone2b(x).sigmoid().view(-1)

        return flag, landmarks
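A shape check for the landmark head, run here with randomly initialized weights purely to confirm the expected tensor layout (one confidence flag per ROI, 468 3-D landmarks normalized by the 192-pixel ROI size):

```python
import torch
from blazeface_landmark import BlazeFaceLandmark

model = BlazeFaceLandmark().eval()
rois = torch.zeros((2, 3, 192, 192))    # a batch of two 192x192 face crops, NCHW, values in [0, 1]
with torch.no_grad():
    flags, landmarks = model(rois)
print(flags.shape, landmarks.shape)     # torch.Size([2]) torch.Size([2, 468, 3])
```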
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg
ADDED
Git LFS Details
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py
ADDED
@@ -0,0 +1,389 @@
import numpy as np
import torch
import cv2
import sys
from blazebase import resize_pad, denormalize_detections
from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS
import time
import aidlite
import os

class post_mediapipe_face:
    def __init__(self):
        self.kp1 = 1
        self.kp2 = 0
        self.theta0 = 0.
        self.dscale = 1.5
        self.dy = 0.
        self.x_scale = 256.0
        self.y_scale = 256.0
        self.h_scale = 256.0
        self.w_scale = 256.0
        self.num_keypoints = 6
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.min_score_thresh = 0.4  # 0.65
        self.score_clipping_thresh = 100.0
        self.min_suppression_threshold = 0.3
        self.resolution = 192

    def detection2roi(self, detection):
        xc = (detection[:, 1] + detection[:, 3]) / 2
        yc = (detection[:, 0] + detection[:, 2]) / 2
        scale = (detection[:, 3] - detection[:, 1])  # assumes square boxes
        yc += self.dy * scale
        scale *= self.dscale
        # compute box rotation
        x0 = detection[:, 4+2*self.kp1]
        y0 = detection[:, 4+2*self.kp1+1]
        x1 = detection[:, 4+2*self.kp2]
        y1 = detection[:, 4+2*self.kp2+1]
        theta = torch.atan2(y0-y1, x0-x1) - self.theta0
        return xc, yc, scale, theta

    def _decode_boxes(self, raw_boxes, anchors):
        boxes = torch.zeros_like(raw_boxes)

        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]

        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]

        boxes[..., 0] = y_center - h / 2.  # ymin
        boxes[..., 1] = x_center - w / 2.  # xmin
        boxes[..., 2] = y_center + h / 2.  # ymax
        boxes[..., 3] = x_center + w / 2.  # xmax

        for k in range(self.num_keypoints):
            offset = 4 + k*2
            keypoint_x = raw_boxes[..., offset    ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
            boxes[..., offset    ] = keypoint_x
            boxes[..., offset + 1] = keypoint_y
        return boxes

    def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
        assert raw_box_tensor.ndimension() == 3
        assert raw_box_tensor.shape[1] == self.num_anchors
        assert raw_box_tensor.shape[2] == self.num_coords

        assert raw_score_tensor.ndimension() == 3
        assert raw_score_tensor.shape[1] == self.num_anchors
        assert raw_score_tensor.shape[2] == self.num_classes

        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]

        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)

        thresh = self.score_clipping_thresh
        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)

        # Note: we stripped off the last dimension from the scores tensor
        # because there is only one class. Now we can simply use a mask
        # to filter out the boxes with too low confidence.
        mask = detection_scores >= self.min_score_thresh

        # Because each image from the batch can have a different number of
        # detections, process them one at a time using a loop.
        output_detections = []
        for i in range(raw_box_tensor.shape[0]):
            boxes = detection_boxes[i, mask[i]]
            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
            output_detections.append(torch.cat((boxes, scores), dim=-1))

        return output_detections

    def extract_roi(self, frame, xc, yc, theta, scale):
        resolution = 192
        # take points on unit square and transform them according to the roi
        points = torch.tensor([[-1, -1, 1, 1],
                               [-1, 1, -1, 1]], device=scale.device).view(1, 2, 4)
        points = points * scale.view(-1, 1, 1) / 2
        theta = theta.view(-1, 1, 1)
        R = torch.cat((
            torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
            torch.cat((torch.sin(theta), torch.cos(theta)), 2),
            ), 1)
        center = torch.cat((xc.view(-1, 1, 1), yc.view(-1, 1, 1)), 1)
        points = R @ points + center

        # use the points to compute the affine transform that maps
        # these points back to the output square
        res = resolution
        points1 = np.array([[0, 0, res-1],
                            [0, res-1, 0]], dtype=np.float32).T
        affines = []
        imgs = []
        for i in range(points.shape[0]):
            pts = points[i, :, :3].detach().numpy().T
            M = cv2.getAffineTransform(pts, points1)
            img = cv2.warpAffine(frame, M, (res, res))  # , borderValue=127.5)
            img = torch.tensor(img, device=scale.device)
            imgs.append(img)
            affine = cv2.invertAffineTransform(M).astype('float32')
            affine = torch.tensor(affine, device=scale.device)
            affines.append(affine)
        if imgs:
            imgs = torch.stack(imgs).permute(0, 3, 1, 2).float() / 255.  # / 127.5 - 1.0
            affines = torch.stack(affines)
        else:
            imgs = torch.zeros((0, 3, res, res), device=scale.device)
            affines = torch.zeros((0, 2, 3), device=scale.device)

        return imgs, affines, points

    def denormalize_landmarks(self, landmarks, affines):
        landmarks[:, :, :2] *= self.resolution
        for i in range(len(landmarks)):
            landmark, affine = landmarks[i], affines[i]
            landmark = (affine[:, :2] @ landmark[:, :2].T + affine[:, 2:]).T
            landmarks[i, :, :2] = landmark
        return landmarks

    def intersect(self, box_a, box_b):
        A = box_a.size(0)
        B = box_b.size(0)
        max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                           box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
        min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                           box_b[:, :2].unsqueeze(0).expand(A, B, 2))
        inter = torch.clamp((max_xy - min_xy), min=0)
        return inter[:, :, 0] * inter[:, :, 1]

    def jaccard(self, box_a, box_b):
        inter = self.intersect(box_a, box_b)
        area_a = ((box_a[:, 2]-box_a[:, 0]) *
                  (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
        area_b = ((box_b[:, 2]-box_b[:, 0]) *
                  (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
        union = area_a + area_b - inter
        return inter / union  # [A,B]

    def overlap_similarity(self, box, other_boxes):
        """Computes the IOU between a bounding box and set of other boxes."""
        return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)

    def _weighted_non_max_suppression(self, detections):
        if len(detections) == 0: return []
        output_detections = []

        # Sort the detections from highest to lowest score.
        remaining = torch.argsort(detections[:, self.num_coords], descending=True)

        while len(remaining) > 0:
            detection = detections[remaining[0]]

            # Compute the overlap between the first box and the other
            # remaining boxes. (Note that the other_boxes also include
            # the first_box.)
            first_box = detection[:4]
            other_boxes = detections[remaining, :4]
            ious = self.overlap_similarity(first_box, other_boxes)

            # If two detections don't overlap enough, they are considered
            # to be from different faces.
            mask = ious > self.min_suppression_threshold
            overlapping = remaining[mask]
            remaining = remaining[~mask]

            # Take an average of the coordinates from the overlapping
            # detections, weighted by their confidence scores.
            weighted_detection = detection.clone()
            if len(overlapping) > 1:
                coordinates = detections[overlapping, :self.num_coords]
                scores = detections[overlapping, self.num_coords:self.num_coords+1]
                total_score = scores.sum()
                weighted = (coordinates * scores).sum(dim=0) / total_score
                weighted_detection[:self.num_coords] = weighted
                weighted_detection[self.num_coords] = total_score / len(overlapping)

            output_detections.append(weighted_detection)

        return output_detections


# Module-level helper (moved out of the class body) so the bare call further below resolves.
def draw_detections(img, detections, with_keypoints=True):
    if isinstance(detections, torch.Tensor):
        detections = detections.detach().numpy()

    if detections.ndim == 1:
        detections = np.expand_dims(detections, axis=0)

    n_keypoints = detections.shape[1] // 2 - 2

    for i in range(detections.shape[0]):
        ymin = detections[i, 0]
        xmin = detections[i, 1]
        ymax = detections[i, 2]
        xmax = detections[i, 3]

        start_point = (int(xmin), int(ymin))
        end_point = (int(xmax), int(ymax))
        img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)

        if with_keypoints:
            for k in range(n_keypoints):
                kp_x = int(detections[i, 4 + k*2    ])
                kp_y = int(detections[i, 4 + k*2 + 1])
                cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
    return img


post_process = post_mediapipe_face()

class faceDetectionQnn:
    def __init__(self):
        super().__init__()
        self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetctor_w8a16.qnn216.ctx.bin"))
        if self.model is None:
            print("Create model failed !")
            return

        self.config = aidlite.Config.create_instance()
        if self.config is None:
            print("build_interpretper_from_model_and_config failed !")
            return

        self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
        self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        self.config.is_quantify_model = 1

        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
            return
        input_shapes = [[1, 3, 256, 256]]
        output_shapes = [[1, 896, 16], [1, 896, 1]]
        self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
                                        output_shapes, aidlite.DataType.TYPE_FLOAT32)

        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")

        print(" model load success!")

    def __call__(self, input):
        self.interpreter.set_input_tensor(0, input)
        self.interpreter.invoke()
        features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896, 16).copy()
        features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896, 1).copy()
        return features_0, features_1


class faceLandmarkQnn:
    def __init__(self):
        super().__init__()
        self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a16.qnn216.ctx.bin"))
        if self.model is None:
            print("Create model failed !")
            return

        self.config = aidlite.Config.create_instance()
        if self.config is None:
            print("build_interpretper_from_model_and_config failed !")
            return

        self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
        self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        self.config.is_quantify_model = 1

        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
            return
        input_shapes = [[1, 3, 192, 192]]
        output_shapes = [[1], [1, 468, 3]]
        self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
                                        output_shapes, aidlite.DataType.TYPE_FLOAT32)

        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")

        print(" model load success!")

    def __call__(self, input):
        self.interpreter.set_input_tensor(0, input)
        self.interpreter.invoke()
        features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
        features_1 = self.interpreter.get_output_tensor(1).reshape(1, 468, 3).copy()
        return features_0, features_1


anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/anchors_face_back.npy")), dtype=torch.float32, device='cpu')
face_detc = faceDetectionQnn()
face_rec = faceLandmarkQnn()

image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "coco.jpg")

frame_ct = 0
image = cv2.imread(image_path)

frame = np.ascontiguousarray(image[:, :, ::-1])

img1, img2, scale, pad = resize_pad(frame)

input = (img1 / 255).astype(np.float32)
input = np.transpose(input, (2, 0, 1))
input = input[np.newaxis, ...]
t0 = time.time()
out = face_detc(input)
use_time = round((time.time() - t0) * 1000, 2)
print(f"face detection inference_time:{use_time} ms")
detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)

filtered_detections = []
num_coords = 16
for i in range(len(detections)):
    faces = post_process._weighted_non_max_suppression(detections[i])
    faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1))
    filtered_detections.append(faces)

face_detections = denormalize_detections(filtered_detections[0], scale, pad)

xc, yc, scale, theta = post_process.detection2roi(face_detections)

img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
if box.size()[0] != 0:
    t2 = time.time()
    flags, normalized_landmarks = face_rec(img.numpy())

    use_time = round((time.time() - t2) * 1000, 2)
    print(f"landmark inference_time:{use_time} ms")

    landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)

    for i in range(len(flags)):
        landmark, flag = landmarks[i], flags[i]
        if flag > .4:  # 0.5
            draw_landmarks(frame, landmark[:, :2], FACE_CONNECTIONS, size=1)
        else:
            print("not detect face !")

draw_roi(frame, box)
draw_detections(frame, face_detections)
cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)), '%04d.jpg' % frame_ct), frame[:, :, ::-1])
face_detc.interpreter.destory()
face_rec.interpreter.destory()
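The detector context binary is always fed a [1, 3, 256, 256] float32 tensor scaled to [0, 1]. The small helper below simply factors out the preprocessing steps already performed inline in demo_qnn.py; the function name is ours for illustration and is not part of the AidLite SDK:

```python
import numpy as np
from blazebase import resize_pad

def preprocess_for_detector(bgr_image):
    """Letterbox to 256x256, scale to [0, 1], and reorder to 1x3x256x256 float32,
    mirroring what demo_qnn.py passes to faceDetectionQnn.__call__."""
    rgb = np.ascontiguousarray(bgr_image[:, :, ::-1])          # BGR -> RGB
    img256, _, scale, pad = resize_pad(rgb)                    # letterboxed 256x256 frame
    tensor = (img256 / 255).astype(np.float32)                 # normalize to [0, 1]
    tensor = np.transpose(tensor, (2, 0, 1))[np.newaxis, ...]  # HWC -> NCHW with batch dim
    return tensor, scale, pad                                  # scale/pad undo the letterboxing later
```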
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py
ADDED
@@ -0,0 +1,57 @@
import torch
import os
from typing import Callable, Tuple
from blazeface import BlazeFace
from blazeface_landmark import BlazeFaceLandmark

class FaceDetector(torch.nn.Module):
    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
        anchors: torch.Tensor,
    ):
        super().__init__()
        self.detector = detector
        self.anchors = anchors

    def forward(self, image):
        return self.detector(image)

back_detector = True
face_detector = BlazeFace(back_model=back_detector)
face_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/blazefaceback.pth"))
face_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/anchors_face_back.npy"))
face_detect = FaceDetector(face_detector, face_detector.anchors)
num_params = sum(p.numel() for p in face_detect.parameters() if p.requires_grad)
print(f'Number of face_detect parameters: {num_params}')

face_d_in = torch.randn(1, 3, 256, 256, dtype=torch.float32)
source_model = torch.jit.trace(face_detect, face_d_in)
source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetctor.pt"))
print("export face detect ok!")


class FaceLandmarkDetector(torch.nn.Module):
    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
    ):
        super().__init__()
        self.detector = detector

    def forward(self, image):
        return self.detector(image)

face_regressor = BlazeFaceLandmark()
face_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/blazeface_landmark.pth"))
face_regres = FaceLandmarkDetector(face_regressor)
num_params = sum(p.numel() for p in face_regres.parameters() if p.requires_grad)
print(f'Number of face_regres parameters: {num_params}')

face_r_in = torch.randn(1, 3, 192, 192, dtype=torch.float32)
source_model = torch.jit.trace(face_regres, face_r_in)
source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark.pt"))
print("export face landmark ok!")
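Once the script has run, the two traced TorchScript modules it writes can be reloaded and sanity-checked before conversion to QNN; a brief sketch, assuming the files were saved to the `../models` paths used above:

```python
import torch

detector = torch.jit.load("../models/m_faceDetctor.pt").eval()
landmark = torch.jit.load("../models/m_faceLandmark.pt").eval()

with torch.no_grad():
    r, c = detector(torch.zeros(1, 3, 256, 256))     # raw box regressions and scores
    flag, lm = landmark(torch.zeros(1, 3, 192, 192)) # per-ROI confidence and landmarks

print(r.shape, c.shape)      # (1, 896, 16) (1, 896, 1)
print(flag.shape, lm.shape)  # (1,) (1, 468, 3)
```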
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py
ADDED
@@ -0,0 +1,125 @@
import numpy as np
import cv2
import torch

def draw_detections(img, detections, with_keypoints=True):
    if isinstance(detections, torch.Tensor):
        detections = detections.cpu().numpy()

    if detections.ndim == 1:
        detections = np.expand_dims(detections, axis=0)

    n_keypoints = detections.shape[1] // 2 - 2

    for i in range(detections.shape[0]):
        ymin = detections[i, 0]
        xmin = detections[i, 1]
        ymax = detections[i, 2]
        xmax = detections[i, 3]

        start_point = (int(xmin), int(ymin))
        end_point = (int(xmax), int(ymax))
        img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)

        if with_keypoints:
            for k in range(n_keypoints):
                kp_x = int(detections[i, 4 + k*2])
                kp_y = int(detections[i, 4 + k*2 + 1])
                cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
    return img


def draw_roi(img, roi):
    for i in range(roi.shape[0]):
        (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i]
        cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2)
        cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2)
        cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2)
        cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2)


def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
    points = points[:,:2]
    for point in points:
        x, y = point
        x, y = int(x), int(y)
        cv2.circle(img, (x, y), size, color, thickness=size)
    for connection in connections:
        x0, y0 = points[connection[0]]
        x1, y1 = points[connection[1]]
        x0, y0 = int(x0), int(y0)
        x1, y1 = int(x1), int(y1)
        cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size)



# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
#            8   12  16  20
#            |   |   |   |
#            7   11  15  19
#        4   |   |   |   |
#        |   6   10  14  18
#        3   |   |   |   |
#        |   5---9---13--17
#        2    \         /
#         \    \       /
#          1    \     /
#           \    \   /
#            ------0-
HAND_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (5, 6), (6, 7), (7, 8),
    (9, 10), (10, 11), (11, 12),
    (13, 14), (14, 15), (15, 16),
    (17, 18), (18, 19), (19, 20),
    (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
]

POSE_CONNECTIONS = [
    (0,1), (1,2), (2,3), (3,7),
    (0,4), (4,5), (5,6), (6,8),
    (9,10),
    (11,13), (13,15), (15,17), (17,19), (19,15), (15,21),
    (12,14), (14,16), (16,18), (18,20), (20,16), (16,22),
    (11,12), (12,24), (24,23), (23,11)
]

# Vertex indices can be found in
# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
# Found in github.com/google/mediapipe/python/solutions/face_mesh.py
FACE_CONNECTIONS = [
    # Lips.
    (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
    (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
    (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
    (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
    (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
    (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
    (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
    (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
    # Left eye.
    (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
    (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
    (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
    (398, 362),
    # Left eyebrow.
    (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
    (293, 334), (334, 296), (296, 336),
    # Right eye.
    (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
    (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
    (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
    (173, 133),
    # Right eyebrow.
    (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
    (105, 66), (66, 107),
    # Face oval.
    (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
    (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
    (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
    (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
    (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
    (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
    (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
    (109, 10)
]
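The drawing helpers above operate on plain NumPy arrays, so they can be exercised without a model in the loop. The following is a minimal, illustrative sketch (the detection values, keypoint positions, and canvas size are made up for demonstration, not taken from the repository) of how draw_detections and draw_landmarks might be called:

```python
import numpy as np
import cv2

from visualization import draw_detections, draw_landmarks

# Blank 480x640 BGR canvas standing in for a camera frame (illustrative only).
canvas = np.zeros((480, 640, 3), dtype=np.uint8)

# One fake detection row in the detector's layout:
# [ymin, xmin, ymax, xmax, 6 keypoints as (x, y) pairs, score].
det = np.array([[120, 200, 360, 440,
                 260, 180, 380, 180, 320, 240,
                 320, 300, 230, 250, 410, 250,
                 0.9]], dtype=np.float32)
canvas = draw_detections(canvas, det)

# draw_landmarks expects an (N, 2+) array of pixel coordinates.
pts = np.array([[300, 200], [340, 200], [320, 260]], dtype=np.float32)
draw_landmarks(canvas, pts, connections=[(0, 1), (1, 2)], size=2)

cv2.imwrite("visualization_smoke_test.jpg", canvas)
```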
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md
ADDED
@@ -0,0 +1,63 @@
## Model Information
### Source model
- Input shape: [1x3x256x256], [1x3x192x192]
- Number of parameters: 0.13M, 0.6M
- Model size: 0.58MB, 2.32MB
- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]

Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)

### Converted model

- Precision: INT8
- Backend: QNN2.16
- Target Device: FV01 QCS6490

## Inference with AidLite SDK

### SDK installation
Model Farm uses the AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)

- Install AidLite SDK

```bash
# Install the appropriate version of the AidLite SDK
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Download the QNN version that matches the backend above, e.g. to install the QNN2.23 AidLite package: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# AidLite SDK C++ library check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# AidLite SDK Python library check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
#### python
```bash
cd python
python3 demo_qnn.py
```

#### c++
```bash
# The cnpy library is required for loading .npy files (run the commands below from the default terminal directory)
git clone https://github.com/rogersce/cnpy.git
cd cnpy
mkdir build && cd build
cmake ..
make
sudo make install

cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp
mkdir build && cd build
cmake ..
make
./run_test
```
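Before either demo invokes the detector it letterboxes the input image to 256x256 and converts it to a 1x3x256x256 float32 tensor in [0, 1], matching the input shape listed above. The snippet below is a minimal sketch of that step, assuming only OpenCV and NumPy; the helper name prepare_detector_input is invented for illustration, and the repository's own resize_pad in python/blazebase.py remains the reference implementation.

```python
import cv2
import numpy as np

def prepare_detector_input(image_path):
    """Letterbox an image to 256x256 and return a (1, 3, 256, 256) float32 tensor in [0, 1]."""
    img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]
    if h >= w:
        h1, w1 = 256, 256 * w // h
        padh, padw = 0, 256 - w1
        scale = w / w1
    else:
        h1, w1 = 256 * h // w, 256
        padh, padw = 256 - h1, 0
        scale = h / h1
    resized = cv2.resize(img, (w1, h1))
    padded = np.pad(resized, ((padh // 2, padh - padh // 2),
                              (padw // 2, padw - padw // 2), (0, 0)))
    pad = (int(padh // 2 * scale), int(padw // 2 * scale))
    tensor = padded.astype(np.float32).transpose(2, 0, 1)[None] / 255.0
    return tensor, scale, pad

# Example: tensor, scale, pad = prepare_detector_input("cpp/coco.jpg")
```

The returned scale and pad are later used to map normalized detections back to the original image coordinates, as done in denormalize_detections.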
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,34 @@
cmake_minimum_required (VERSION 3.5)
project("run_test")

find_package(OpenCV REQUIRED)
find_library(CNPY_LIB cnpy REQUIRED)

message(STATUS "OpenCV Library status:")
message(STATUS "> version: ${OpenCV_VERSION}")
message(STATUS "> include: ${OpenCV_INCLUDE_DIRS}")

set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")

include_directories(
    /usr/local/include
    /usr/include/opencv4
)

link_directories(
    /usr/local/lib/
)

file(GLOB SRC_LISTS
    ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
)

add_executable(run_test ${SRC_LISTS})

target_link_libraries(run_test
    aidlite
    ${OpenCV_LIBS}
    pthread
    jsoncpp
    ${CNPY_LIB}
)
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94
size 14464
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg
ADDED
(binary image file, tracked with Git LFS)
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp
ADDED
@@ -0,0 +1,909 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <aidlux/aidlite/aidlite.hpp>
#include <vector>
#include <numeric>
#include <cmath>
#include <jsoncpp/json/json.h>
#include <tuple>
#include <algorithm>
#include <sstream>
#include <string>
#include <cassert>
#include "cnpy.h"

using namespace cv;
using namespace std;
using namespace Aidlux::Aidlite;


// Face landmark connection indices (from MediaPipe Face Mesh)
const std::vector<std::pair<int, int>> FACE_CONNECTIONS = {
    {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17},
    {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291},
    {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0},
    {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291},
    {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14},
    {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308},
    {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13},
    {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308},
    {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380},
    {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388},
    {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398},
    {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285},
    {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7},
    {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154},
    {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160},
    {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133},
    {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105},
    {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332},
    {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454},
    {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365},
    {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152},
    {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136},
    {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234},
    {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103},
    {103, 67}, {67, 109}, {109, 10}
};

struct Args {
    std::string faceDetector_model = "../../models/m_faceDetector_w8a8.qnn216.ctx.bin";
    std::string faceLandmark_model = "../../models/m_faceLandmark_w8a8.qnn216.ctx.bin";
    std::string imgs = "../coco.jpg";
    int invoke_nums = 10;
    std::string model_type = "QNN";
};


Args parse_args(int argc, char* argv[]) {
    Args args;
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg == "--faceDetector_model" && i + 1 < argc) {
            args.faceDetector_model = argv[++i];
        } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
            args.faceLandmark_model = argv[++i];
        } else if (arg == "--imgs" && i + 1 < argc) {
            args.imgs = argv[++i];
        } else if (arg == "--invoke_nums" && i + 1 < argc) {
            args.invoke_nums = std::stoi(argv[++i]);
        } else if (arg == "--model_type" && i + 1 < argc) {
            args.model_type = argv[++i];
        }
    }
    return args;
}

std::string to_lower(const std::string& str) {
    std::string lower_str = str;
    std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
        return std::tolower(c);
    });
    return lower_str;
}

std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
    cnpy::NpyArray arr = cnpy::npy_load(path);
    float* data_ptr = arr.data<float>();

    size_t num_rows = arr.shape[0]; // 896
    size_t num_cols = arr.shape[1]; // 4

    std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
    for (size_t i = 0; i < num_rows; ++i) {
        for (size_t j = 0; j < num_cols; ++j) {
            anchors[i][j] = data_ptr[i * num_cols + j];
        }
    }

    return anchors;
}


// Draw face keypoints and connection lines
void draw_landmarks(
    cv::Mat& img,
    const std::vector<cv::Point2f>& points,
    const std::vector<float>& flags,
    const std::vector<std::pair<int, int>>& connections,
    float threshold = 0.4f,
    cv::Scalar point_color = cv::Scalar(0, 255, 0),
    cv::Scalar line_color = cv::Scalar(0, 0, 0),
    int size = 2)
{
    // Draw keypoints
    for (size_t i = 0; i < points.size(); ++i) {
        if (i < flags.size() && flags[i] > threshold) {
            int x = static_cast<int>(points[i].x);
            int y = static_cast<int>(points[i].y);
            cv::circle(img, cv::Point(x, y), size, point_color, size);
        }
    }

    // Draw connection lines (both endpoints must be visible)
    for (const auto& conn : connections) {
        int i0 = conn.first;
        int i1 = conn.second;
        if (i0 < points.size() && i1 < points.size() &&
            i0 < flags.size() && i1 < flags.size() &&
            flags[i0] > threshold && flags[i1] > threshold)
        {
            cv::line(img, points[i0], points[i1], line_color, size);
        }
    }
}


std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
    int orig_h = img.rows; // 480
    int orig_w = img.cols; // 640

    // Step 1: resize width to 256, keep aspect ratio
    int w1 = 256;
    int h1 = w1 * orig_h / orig_w; // equivalent to int(256 * h / w)

    // Step 2: compute padding in height direction
    int padh = 256 - h1;
    int padw = 0;

    int padh1 = padh / 2;
    int padh2 = padh1 + (padh % 2);
    int padw1 = padw / 2;
    int padw2 = padw1 + (padw % 2);

    // Step 3: resize to (w1, h1)
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1)

    // Step 4: pad to (256, 256)
    cv::Mat padded;
    cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));

    // Step 5: resize padded to 128x128
    cv::Mat resized128;
    cv::resize(padded, resized128, cv::Size(128, 128));

    // Step 6: compute scale and pad in original image space
    float scale = static_cast<float>(orig_h) / h1; // h / h1
    cv::Point pad_point(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));

    return std::make_tuple(padded, resized128, scale, pad_point);
}


// Convert the image to 1xCxHxW format and normalize (divide by 255)
std::vector<float> preprocess_image(const cv::Mat& img) {
    int H = img.rows;
    int W = img.cols;
    int C = img.channels(); // should be 3

    std::vector<float> chw(H * W * C);      // CHW
    std::vector<float> nchw(1 * C * H * W); // NCHW

    // 1. HWC -> CHW + normalize (float32 / 255.0)
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int c = 0; c < C; ++c) {
                // OpenCV uses BGR order
                float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
                chw[c * H * W + h * W + w] = value;
            }
        }
    }

    // 2. CHW -> NCHW (add batch dimension, actually just copy)
    for (int i = 0; i < C * H * W; ++i) {
        nchw[i] = chw[i];
    }

    return nchw; // shape: [1, 3, H, W]
}


// Compute IoU using only the first 4 coordinates (the box position is stored in the first 4 values)
float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
    float x1 = std::max(box1[0], box2[0]);
    float y1 = std::max(box1[1], box2[1]);
    float x2 = std::min(box1[2], box2[2]);
    float y2 = std::min(box1[3], box2[3]);

    float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
    float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
    float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
    float union_area = box1_area + box2_area - inter_area;

    return union_area > 0 ? inter_area / union_area : 0.0f;
}

std::vector<std::vector<float>> weighted_non_max_suppression(
    std::vector<std::vector<float>>& detections,
    int num_coords = 16,
    float min_suppression_threshold = 0.3f)
{
    if (detections.empty()) return {};

    std::vector<int> indices(detections.size());
    std::iota(indices.begin(), indices.end(), 0);

    // Sort by confidence in descending order
    std::sort(indices.begin(), indices.end(), [&](int a, int b) {
        return detections[a][num_coords] > detections[b][num_coords];
    });

    std::vector<std::vector<float>> output;

    while (!indices.empty()) {
        int best_idx = indices.front();
        const auto& best_det = detections[best_idx];
        std::vector<int> overlapping = { best_idx };

        for (size_t i = 1; i < indices.size(); ++i) {
            float iou = IoU(best_det, detections[indices[i]]);
            if (iou > min_suppression_threshold) {
                overlapping.push_back(indices[i]);
            }
        }

        // Update the remaining indices
        std::vector<int> new_indices;
        for (int idx : indices) {
            if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
                new_indices.push_back(idx);
            }
        }
        indices = new_indices;

        // Weighted average: coordinates * confidence
        if (overlapping.size() == 1) {
            output.push_back(best_det);
        } else {
            std::vector<float> weighted(num_coords + 1, 0.0f);
            float total_score = 0.0f;

            for (int idx : overlapping) {
                float score = detections[idx][num_coords];
                total_score += score;
                for (int k = 0; k < num_coords; ++k) {
                    weighted[k] += detections[idx][k] * score;
                }
            }

            for (int k = 0; k < num_coords; ++k) {
                weighted[k] /= total_score;
            }
            weighted[num_coords] = total_score / overlapping.size(); // use the average score

            // std::cout << "Weighted box: ";
            // for (float v : weighted) std::cout << v << " ";
            // std::cout << "\n";

            output.push_back(weighted);
        }
    }

    // TODO
    auto x = output[0];
    output.clear();
    output.push_back(x);

    return output;
}


std::vector<std::vector<float>> denormalize_detections(
    const std::vector<std::vector<float>>& detections,
    float scale,
    const cv::Point& pad
) {
    std::vector<std::vector<float>> result = detections;

    for (size_t i = 0; i < result.size(); ++i) {
        std::vector<float>& det = result[i];

        // bbox coords: x1, y1, x2, y2
        det[0] = det[0] * scale * 256.0f - pad.x; // x1
        det[1] = det[1] * scale * 256.0f - pad.y; // y1
        det[2] = det[2] * scale * 256.0f - pad.x; // x2
        det[3] = det[3] * scale * 256.0f - pad.y; // y2

        // keypoints (starting from index 4): format [y, x, y, x, ...]
        for (size_t k = 4; k + 1 < det.size(); k += 2) {
            det[k] = det[k] * scale * 256.0f - pad.y;         // y
            det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
        }
    }

    return result;
}


void detection2roi(
    const std::vector<std::vector<float>>& detections,
    std::vector<float>& xc,
    std::vector<float>& yc,
    std::vector<float>& scale,
    std::vector<float>& theta,
    int kp1, int kp2, // keypoint indices
    float dy, float dscale, float theta0
) {
    size_t N = detections.size();
    xc.resize(N);
    yc.resize(N);
    scale.resize(N);
    theta.resize(N);

    for (size_t i = 0; i < N; ++i) {
        const std::vector<float>& det = detections[i];

        float x1 = det[1];
        float x2 = det[3];
        float y1 = det[0];
        float y2 = det[2];

        float x_center = (x1 + x2) / 2.0f;
        float y_center = (y1 + y2) / 2.0f;
        float box_scale = (x2 - x1); // assumes square box

        // shift yc
        y_center += dy * box_scale;
        box_scale *= dscale;

        // Get the positions of the two keypoints
        int base = 4;
        int idx_y0 = base + 2 * kp1;
        int idx_x0 = base + 2 * kp1 + 1;
        int idx_y1 = base + 2 * kp2;
        int idx_x1 = base + 2 * kp2 + 1;

        float x0 = det[idx_x0];
        float y0 = det[idx_y0];
        float x1_kp = det[idx_x1];
        float y1_kp = det[idx_y1];

        float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;

        // Write the outputs
        xc[i] = x_center;
        yc[i] = y_center;
        scale[i] = box_scale;
        // TODO: theta here should be adjusted for the actual use case
        // theta[i] = angle; // use the computed angle if needed
        theta[i] = -0.0094;
    }
}


void extract_roi(
    const cv::Mat& frame,
    const std::vector<float>& xc,
    const std::vector<float>& yc,
    const std::vector<float>& theta,
    const std::vector<float>& scale,
    std::vector<cv::Mat>& cropped_rois,
    std::vector<cv::Mat>& affine_matrices,
    std::vector<std::vector<cv::Point2f>>& roi_boxes, // also return the box corner points
    int resolution = 192
) {
    cropped_rois.clear();
    affine_matrices.clear();
    roi_boxes.clear();

    for (size_t i = 0; i < xc.size(); ++i) {
        float s = scale[i] / 2.0f;
        float cos_t = std::cos(theta[i]);
        float sin_t = std::sin(theta[i]);

        // The 4 unit-square corners after the transform (same order as in the Python code)
        std::vector<cv::Point2f> points(4);
        // [-1, -1]
        points[0].x = xc[i] + (-s * cos_t + s * sin_t);
        points[0].y = yc[i] + (-s * sin_t - s * cos_t);
        // [1, -1]
        points[1].x = xc[i] + ( s * cos_t + s * sin_t);
        points[1].y = yc[i] + ( s * sin_t - s * cos_t);
        // [-1, 1]
        points[2].x = xc[i] + (-s * cos_t - s * sin_t);
        points[2].y = yc[i] + (-s * sin_t + s * cos_t);
        // [1, 1]
        points[3].x = xc[i] + ( s * cos_t - s * sin_t);
        points[3].y = yc[i] + ( s * sin_t + s * cos_t);

        // Use the first three points to compute the affine transform
        std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
        std::vector<cv::Point2f> dst_pts = {
            cv::Point2f(0, 0),
            cv::Point2f(resolution - 1, 0),
            cv::Point2f(0, resolution - 1)
        };

        cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
        cv::Mat M_inv;
        cv::invertAffineTransform(M, M_inv);

        cv::Mat cropped;
        cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
        cropped_rois.push_back(cropped);
        affine_matrices.push_back(M_inv);
        roi_boxes.push_back(points); // store the transformed box corners
    }
}

std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
    int N = imgs.size();
    if (N == 0) return {};

    int H = 192;
    int W = 192;
    int C = 3; // assume 3 channels (BGR)

    std::vector<float> output;
    output.reserve(N * C * H * W);

    for (int n = 0; n < N; ++n) {
        cv::Mat img_float;
        imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]

        // Split channels (HWC -> CHW)
        std::vector<cv::Mat> channels(3);
        cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R

        for (int c = 0; c < C; ++c) {
            for (int i = 0; i < H; ++i) {
                for (int j = 0; j < W; ++j) {
                    output.push_back(channels[c].at<float>(i, j));
                }
            }
        }
    }

    return output; // shape: N x C x H x W
}

// resolution is usually 192
std::vector<cv::Point2f> denormalize_landmarks(
    const std::vector<float>& normalized_landmarks,
    const std::vector<cv::Mat>& affines,
    int resolution = 192)
{
    std::vector<cv::Point2f> output;

    // Check input sizes
    const int num_faces = 1; // assume a single face
    const int num_landmarks = 468;
    if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
        std::cerr << "Error: Input size mismatch. Expected "
                  << num_faces * num_landmarks * 3 << " landmarks and "
                  << num_faces << " affine matrices." << std::endl;
        throw std::runtime_error("Input size mismatch");
    }

    for (int i = 0; i < num_faces; ++i) {
        const cv::Mat& affine = affines[i]; // 2x3 CV_32F
        for (int j = 0; j < num_landmarks; ++j) {
            int idx = i * num_landmarks * 3 + j * 3;
            float x = normalized_landmarks[idx + 0] * resolution;
            float y = normalized_landmarks[idx + 1] * resolution;
            // float z = normalized_landmarks[idx + 2]; // optional

            // 2x1 input vector
            cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);

            // Extract the rotation and translation parts of the affine matrix
            cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
            cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
            M2x2.convertTo(M2x2, CV_32F);
            t2x1.convertTo(t2x1, CV_32F);

            // Apply the inverse affine transform
            cv::Mat out = M2x2 * pt + t2x1;

            // Store as Point2f
            output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
        }
    }

    return output; // denormalized landmarks: 468 Point2f values
}


void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
    for (const auto& roi : boxes) {
        if (roi.size() < 4) continue;

        const cv::Point2f& p1 = roi[0];
        const cv::Point2f& p2 = roi[1];
        const cv::Point2f& p3 = roi[2];
        const cv::Point2f& p4 = roi[3];

        cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2);   // black
        cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
        cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2);   // black
        cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2);   // black
    }
}


void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
    for (const auto& det : detections) {
        if (det.size() < 4) continue;

        float ymin = det[0];
        float xmin = det[1];
        float ymax = det[2];
        float xmax = det[3];

        cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);

        if (with_keypoints && det.size() > 4) {
            int n_keypoints = (det.size() - 4) / 2;
            for (int k = 0; k < n_keypoints; ++k) {
                int kp_x = int(det[4 + k * 2]);
                int kp_y = int(det[4 + k * 2 + 1]);
                cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
            }
        }
    }
}


std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
    std::ifstream in(filename);
    std::vector<std::vector<float>> anchors;

    if (!in.is_open()) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return anchors;
    }

    std::string line;
    while (std::getline(in, line)) {
        std::istringstream ss(line);
        std::vector<float> anchor;
        float value;
        while (ss >> value) {
            anchor.push_back(value);
        }
        if (!anchor.empty()) {
            anchors.push_back(anchor);
        }
    }

    in.close();
    return anchors;
}

// sigmoid function
float sigmoid(float x) {
    return 1.0f / (1.0f + std::exp(-x));
}

// clamp function
float clamp(float x, float min_val, float max_val) {
    return std::max(min_val, std::min(max_val, x));
}

// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
std::vector<std::vector<std::vector<float>>> decode_boxes(
    const std::vector<float>& raw_boxes,
    const std::vector<std::vector<float>>& anchors,
    int batch, int num_anchors, int num_coords,
    float x_scale, float y_scale, float w_scale, float h_scale,
    int num_keypoints)
{
    std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
        std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));

    for (int b = 0; b < batch; ++b) {
        for (int i = 0; i < num_anchors; ++i) {
            int base = b * num_anchors * num_coords + i * num_coords;

            float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
            float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
            float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
            float h = raw_boxes[base + 3] / h_scale * anchors[i][3];

            decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
            decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
            decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
            decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax

            for (int k = 0; k < num_keypoints; ++k) {
                int offset = 4 + k * 2;
                float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
                float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
                decoded_boxes[b][i][offset] = keypoint_x;
                decoded_boxes[b][i][offset + 1] = keypoint_y;
            }
        }
    }

    return decoded_boxes;
}

std::vector<std::vector<std::vector<float>>> tensors_to_detections(
    const std::vector<float>& raw_box_tensor,
    const std::vector<float>& raw_score_tensor,
    const std::vector<std::vector<float>>& anchors,
    int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
    float x_scale, float y_scale, float w_scale, float h_scale,
    float score_clipping_thresh, float min_score_thresh)
{
    assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
    assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
    assert(anchors.size() == size_t(num_anchors));

    auto detection_boxes = decode_boxes(
        raw_box_tensor, anchors, batch, num_anchors, num_coords,
        x_scale, y_scale, w_scale, h_scale, num_keypoints);

    std::vector<std::vector<std::vector<float>>> output_detections;

    for (int b = 0; b < batch; ++b) {
        std::vector<std::vector<float>> detections;

        for (int i = 0; i < num_anchors; ++i) {
            int score_index = b * num_anchors * num_classes + i * num_classes;

            // single-class case: take class 0
            float score_raw = raw_score_tensor[score_index];
            float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));

            if (score >= min_score_thresh) {
                std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
                det.push_back(score);      // append the confidence score
                detections.push_back(det); // shape [num_coords+1]
            }
        }

        output_detections.push_back(detections); // one vector per batch element
    }

    return output_detections;
}


int invoke(const Args& args) {
    std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
              << args.faceLandmark_model << "\n"
              << "Image Path: " << args.imgs << "\n"
              << "Inference Nums: " << args.invoke_nums << "\n"
              << "Model Type: " << args.model_type << "\n";
    // ============================================================= faceDetector_model start
    Model* model1 = Model::create_instance(args.faceDetector_model);
    if(model1 == nullptr){
        printf("Create model1 failed !\n");
        return EXIT_FAILURE;
    }
    Config* config1 = Config::create_instance();
    if(config1 == nullptr){
        printf("Create config1 failed !\n");
        return EXIT_FAILURE;
    }
    config1->implement_type = ImplementType::TYPE_LOCAL;
    std::string model_type_lower1 = to_lower(args.model_type);
    if (model_type_lower1 == "qnn"){
        config1->framework_type = FrameworkType::TYPE_QNN;
    } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
        config1->framework_type = FrameworkType::TYPE_SNPE2;
    }
    config1->accelerate_type = AccelerateType::TYPE_DSP;
    config1->is_quantify_model = 1;

    std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
    std::vector<std::vector<uint32_t>> output_shapes1 = {{1,896,16},{1,896,1}};
    model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
    std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
    if(fast_interpreter1 == nullptr){
        printf("build_interpretper_from_model_and_config failed !\n");
        return EXIT_FAILURE;
    }
    int result = fast_interpreter1->init();
    if(result != EXIT_SUCCESS){
        printf("interpreter->init() failed !\n");
        return EXIT_FAILURE;
    }
    // load model
    result = fast_interpreter1->load_model();
    if(result != EXIT_SUCCESS){
        printf("interpreter->load_model() failed !\n");
        return EXIT_FAILURE;
    }
    printf("detect model load success!\n");
    // ============================================================= faceDetector_model over

    // ============================================================= faceLandmark_model start
    Model* model2 = Model::create_instance(args.faceLandmark_model);
    if(model2 == nullptr){
        printf("Create model2 failed !\n");
        return EXIT_FAILURE;
    }
    Config* config2 = Config::create_instance();
    if(config2 == nullptr){
        printf("Create config2 failed !\n");
        return EXIT_FAILURE;
    }
    config2->implement_type = ImplementType::TYPE_LOCAL;
    std::string model_type_lower2 = to_lower(args.model_type);
    if (model_type_lower2 == "qnn"){
        config2->framework_type = FrameworkType::TYPE_QNN;
    } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
        config2->framework_type = FrameworkType::TYPE_SNPE2;
    }
    config2->accelerate_type = AccelerateType::TYPE_DSP;
    config2->is_quantify_model = 1;

    std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,192,192}};
    std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1,468,3}};
    model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
    std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
    if(fast_interpreter2 == nullptr){
        printf("build_interpretper_from_model_and_config2 failed !\n");
        return EXIT_FAILURE;
    }
    result = fast_interpreter2->init();
    if(result != EXIT_SUCCESS){
        printf("interpreter2->init() failed !\n");
        return EXIT_FAILURE;
    }
    // load model
    result = fast_interpreter2->load_model();
    if(result != EXIT_SUCCESS){
        printf("interpreter2->load_model() failed !\n");
        return EXIT_FAILURE;
    }
    printf("detect model2 load success!\n");
    // ============================================================= faceLandmark_model over


    auto anchors = load_anchors_from_npy("../anchors_float32.npy");
    cv::Mat frame = cv::imread(args.imgs);
    if (frame.empty()) {
        printf("detect image load failed!\n");
        return 1;
    }
    // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
    cv::Mat input_data;
    cv::Mat frame_clone1 = frame.clone();
    cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
    cv::Mat frame_clone = frame.clone();


    cv::Mat img1, img2;
    float scale;
    cv::Point pad;
    std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
    std::vector<float> input_tensor = preprocess_image(img1);

    float *outdata0 = nullptr;
    float *outdata1 = nullptr;
    std::vector<float> invoke_time;
    for (int i = 0; i < args.invoke_nums; ++i) {
        result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
        if(result != EXIT_SUCCESS){
            printf("interpreter->set_input_tensor() failed !\n");
            return EXIT_FAILURE;
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        result = fast_interpreter1->invoke();
        auto t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> cost_time = t2 - t1;
        invoke_time.push_back(cost_time.count() * 1000);
        if(result != EXIT_SUCCESS){
            printf("interpreter->invoke() failed !\n");
            return EXIT_FAILURE;
        }
        uint32_t out_data_0 = 0;
        result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
        if(result != EXIT_SUCCESS){
            printf("interpreter1->get_output_tensor() 0 failed !\n");
            return EXIT_FAILURE;
        }

        uint32_t out_data_1 = 0;
        result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
        if(result != EXIT_SUCCESS){
            printf("interpreter1->get_output_tensor() 1 failed !\n");
            return EXIT_FAILURE;
        }

    }

    std::vector<float> tensor_1_896_16(outdata0, outdata0 + 896*16);
    std::vector<float> tensor_1_896_1(outdata1, outdata1 + 896*1);

    std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
        tensor_1_896_16, tensor_1_896_1, anchors,
        1, 896, 16, 1, 6,
        256.0f, 256.0f, 256.0f, 256.0f,
        100.0f, 0.4f);


    std::vector<std::vector<std::vector<float>>> filtered_detections;
    for (size_t i = 0; i < detections.size(); ++i) {
        std::vector<std::vector<float>>& dets = detections[i];
        std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
        filtered_detections.push_back(faces);
    }


    // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
    // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
    std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);

    // std::cout << "face_detections size: " << face_detections.size() << "\n";
    std::vector<float> xc, yc, scales, theta;
    int kp1 = 0, kp2 = 1;  // keypoint indices
    float dy = 0.0f;       // set according to the model definition
    float dscale = 1.5f;   // scale factor
    float theta0 = 0.0f;   // reference angle

    detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
    std::vector<cv::Mat> rois;
    std::vector<cv::Mat> affines;
    std::vector<std::vector<cv::Point2f>> boxes;

    extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
    if (!boxes.empty()) {
        std::cout << "Detected " << boxes.size() << " faces.\n";
        // A face was detected; continue processing boxes[0] ...
        std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);

        float *outdata1_0 = nullptr;
        float *outdata1_1 = nullptr;

        result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
        if(result != EXIT_SUCCESS){
            printf("interpreter2->set_input_tensor() failed !\n");
            return EXIT_FAILURE;
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        result = fast_interpreter2->invoke();
        auto t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> cost_time = t2 - t1;
        invoke_time.push_back(cost_time.count() * 1000);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->invoke() failed !\n");
            return EXIT_FAILURE;
        }
        uint32_t out_data_1_0 = 0;
        result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->get_output_tensor() 0 failed !\n");
            return EXIT_FAILURE;
        }

        uint32_t out_data_1_1 = 0;
        result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1);
        if(result != EXIT_SUCCESS){
            printf("interpreter2->get_output_tensor() 1 failed !\n");
            return EXIT_FAILURE;
        }

        std::vector<float> flags(outdata1_0, outdata1_0 + 1);
        std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 468*3);

        std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
        draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS);
    } else {
        std::cout << "no face detected!" << std::endl;
    }


    draw_roi(frame_clone1, boxes);
    draw_detections(frame_clone1, face_detections);
    cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
    cv::imwrite("vis_result.jpg", frame_clone1);


    fast_interpreter1->destory();
    fast_interpreter2->destory();
    return 0;

}


int main(int argc, char* argv[]) {
    Args args = parse_args(argc, argv);
    return invoke(args);
}
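The decode_boxes routine above is the standard anchor-based decoding used by BlazeFace: each raw prediction is an offset relative to its anchor, scaled by the 256-pixel input size. As a cross-check, the same arithmetic can be written vectorized in NumPy. This is an illustrative sketch, not part of the repository; the shapes (896 anchors, 16 coordinates, 6 keypoints, scale 256) follow the values hard-coded in invoke(), and the function name decode_boxes_np is made up here.

```python
import numpy as np

def decode_boxes_np(raw_boxes, anchors, scale=256.0, num_keypoints=6):
    """Vectorized equivalent of decode_boxes() in run_test.cpp.

    raw_boxes: (896, 16) raw detector output; anchors: (896, 4) as [xc, yc, w, h]."""
    x_center = raw_boxes[:, 0] / scale * anchors[:, 2] + anchors[:, 0]
    y_center = raw_boxes[:, 1] / scale * anchors[:, 3] + anchors[:, 1]
    w = raw_boxes[:, 2] / scale * anchors[:, 2]
    h = raw_boxes[:, 3] / scale * anchors[:, 3]

    out = np.zeros_like(raw_boxes)
    out[:, 0] = y_center - h / 2.0  # ymin
    out[:, 1] = x_center - w / 2.0  # xmin
    out[:, 2] = y_center + h / 2.0  # ymax
    out[:, 3] = x_center + w / 2.0  # xmax
    for k in range(num_keypoints):
        off = 4 + 2 * k
        out[:, off] = raw_boxes[:, off] / scale * anchors[:, 2] + anchors[:, 0]
        out[:, off + 1] = raw_boxes[:, off + 1] / scale * anchors[:, 3] + anchors[:, 1]
    return out

# Usage sketch:
# anchors = np.load("cpp/anchors_float32.npy")  # shape (896, 4)
# boxes = decode_boxes_np(raw_boxes, anchors)
```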
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42ccf2e3a2ee4ff2adf15ea7b00b453bb1a0a183ebd764e8542eb9d56182191d
size 720424
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:355785d3eeb5a26ad29e3b128d803d3f20b443e01bed3249ff4013ac57d634b4
size 1068128
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
size 28800
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94
size 14464
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py
ADDED
@@ -0,0 +1,513 @@
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


def resize_pad(img):
    """ resize and pad images to be input to the detectors

    The face and palm detector networks take 256x256 and 128x128 images
    as input. As such the input image is padded and resized to fit the
    size while maintaining the aspect ratio.

    Returns:
        img1: 256x256
        img2: 128x128
        scale: scale factor between original image and 256x256 image
        pad: pixels of padding in the original image
    """

    size0 = img.shape
    if size0[0]>=size0[1]:
        h1 = 256
        w1 = 256 * size0[1] // size0[0]
        padh = 0
        padw = 256 - w1
        scale = size0[1] / w1
    else:
        h1 = 256 * size0[0] // size0[1]
        w1 = 256
        padh = 256 - h1
        padw = 0
        scale = size0[0] / h1
    padh1 = padh//2
    padh2 = padh//2 + padh%2
    padw1 = padw//2
    padw2 = padw//2 + padw%2
    img1 = cv2.resize(img, (w1,h1))
    img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
    pad = (int(padh1 * scale), int(padw1 * scale))
    img2 = cv2.resize(img1, (128,128))
    return img1, img2, scale, pad


def denormalize_detections(detections, scale, pad):
    """ maps detection coordinates from [0,1] to image coordinates

    The face and palm detector networks take 256x256 and 128x128 images
    as input. As such the input image is padded and resized to fit the
    size while maintaining the aspect ratio. This function maps the
    normalized coordinates back to the original image coordinates.

    Inputs:
        detections: nxm tensor. n is the number of detections.
            m is 4+2*k where the first 4 values are the bounding
            box coordinates and k is the number of additional
            keypoints output by the detector.
        scale: scalar that was used to resize the image
        pad: padding in the x and y dimensions

    """
    detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
    detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
    detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
    detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]

    detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
    detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
    return detections




class BlazeBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
        super(BlazeBlock, self).__init__()

        self.stride = stride
        self.kernel_size = kernel_size
        self.channel_pad = out_channels - in_channels

        # TFLite uses slightly different padding than PyTorch
        # on the depthwise conv layer when the stride is 2.
        if stride == 2:
            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
            padding = 0
        else:
            padding = (kernel_size - 1) // 2

        self.convs = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                      kernel_size=kernel_size, stride=stride, padding=padding,
                      groups=in_channels, bias=True),
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=1, stride=1, padding=0, bias=True),
        )

        if skip_proj:
            self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                       kernel_size=1, stride=1, padding=0, bias=True)
        else:
            self.skip_proj = None

        if act == 'relu':
            self.act = nn.ReLU(inplace=True)
        elif act == 'prelu':
            self.act = nn.PReLU(out_channels)
        else:
            raise NotImplementedError("unknown activation %s"%act)

    def forward(self, x):
        if self.stride == 2:
            if self.kernel_size==3:
                h = F.pad(x, (0, 2, 0, 2), "constant", 0)
            else:
                h = F.pad(x, (1, 2, 1, 2), "constant", 0)
            x = self.max_pool(x)
        else:
            h = x

        if self.skip_proj is not None:
            x = self.skip_proj(x)
        elif self.channel_pad > 0:
            x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)


        return self.act(self.convs(h) + x)


class FinalBlazeBlock(nn.Module):
    def __init__(self, channels, kernel_size=3):
        super(FinalBlazeBlock, self).__init__()

        # TFLite uses slightly different padding than PyTorch
        # on the depthwise conv layer when the stride is 2.
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels=channels, out_channels=channels,
                      kernel_size=kernel_size, stride=2, padding=0,
                      groups=channels, bias=True),
            nn.Conv2d(in_channels=channels, out_channels=channels,
                      kernel_size=1, stride=1, padding=0, bias=True),
        )

        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        h = F.pad(x, (0, 2, 0, 2), "constant", 0)

        return self.act(self.convs(h))


class BlazeBase(nn.Module):
    """ Base class for media pipe models. """

    def _device(self):
        """Which device (CPU or GPU) is being used by this model?"""
        return self.classifier_8.weight.device

    def load_weights(self, path):
        self.load_state_dict(torch.load(path))
        self.eval()


class BlazeLandmark(BlazeBase):
    """ Base class for landmark models. """

    def extract_roi(self, frame, xc, yc, theta, scale):

        # take points on unit square and transform them according to the roi
        points = torch.tensor([[-1, -1, 1, 1],
                               [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
        points = points * scale.view(-1,1,1)/2
        theta = theta.view(-1, 1, 1)
        R = torch.cat((
            torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
            torch.cat((torch.sin(theta), torch.cos(theta)), 2),
            ), 1)
        center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
        points = R @ points + center

        # use the points to compute the affine transform that maps
        # these points back to the output square
        res = self.resolution
        points1 = np.array([[0, 0, res-1],
                            [0, res-1, 0]], dtype=np.float32).T
        affines = []
        imgs = []
        for i in range(points.shape[0]):
            pts = points[i, :, :3].cpu().numpy().T
            M = cv2.getAffineTransform(pts, points1)
            img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
            img = torch.tensor(img, device=scale.device)
            imgs.append(img)
            affine = cv2.invertAffineTransform(M).astype('float32')
            affine = torch.tensor(affine, device=scale.device)
            affines.append(affine)
        if imgs:
            imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
            affines = torch.stack(affines)
        else:
            imgs = torch.zeros((0, 3, res, res), device=scale.device)
            affines = torch.zeros((0, 2, 3), device=scale.device)

        return imgs, affines, points

    def denormalize_landmarks(self, landmarks, affines):
        landmarks[:,:,:2] *= self.resolution
        for i in range(len(landmarks)):
            landmark, affine = landmarks[i], affines[i]
            landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
            landmarks[i,:,:2] = landmark
        return landmarks



class BlazeDetector(BlazeBase):
    """ Base class for detector models.

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/hollance/BlazeFace-PyTorch and
    https://github.com/google/mediapipe/
    """
    def load_anchors(self, path):
        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
        assert(self.anchors.ndimension() == 2)
        assert(self.anchors.shape[0] == self.num_anchors)
        assert(self.anchors.shape[1] == 4)

    def _preprocess(self, x):
        """Converts the image pixels to the range [-1, 1]."""
        return x.float() / 255.# 127.5 - 1.0

    def predict_on_image(self, img):
        """Makes a prediction on a single image.

        Arguments:
            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
                 shape (3, H, W). The image's height and width should be
                 128 pixels.

        Returns:
            A tensor with face detections.
        """
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img).permute((2, 0, 1))

        return self.predict_on_batch(img.unsqueeze(0))[0]

    def predict_on_batch(self, x):
        """Makes a prediction on a batch of images.

        Arguments:
            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
               shape (b, 3, H, W). The height and width should be 128 pixels.

        Returns:
            A list containing a tensor of face detections for each image in
            the batch. If no faces are found for an image, returns a tensor
            of shape (0, 17).

        Each face detection is a PyTorch tensor consisting of 17 numbers:
            - ymin, xmin, ymax, xmax
            - x,y-coordinates for the 6 keypoints
            - confidence score
        """
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).permute((0, 3, 1, 2))

        assert x.shape[1] == 3
        assert x.shape[2] == self.y_scale
        assert x.shape[3] == self.x_scale

        # 1. Preprocess the images into tensors:
        x = x.to(self._device())
        x = self._preprocess(x)

        # 2. Run the neural network:
        with torch.no_grad():
            out = self.__call__(x)

        # 3. Postprocess the raw predictions:
        detections = self._tensors_to_detections(out[0], out[1], self.anchors)

        # 4. Non-maximum suppression to remove overlapping detections:
        filtered_detections = []
        for i in range(len(detections)):
            faces = self._weighted_non_max_suppression(detections[i])
            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
            filtered_detections.append(faces)

        return filtered_detections


    def detection2roi(self, detection):
        """ Convert detections from detector to an oriented bounding box.

        Adapted from:
        # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt

        The center and size of the box is calculated from the center
        of the detected box. Rotation is calculated from the vector
        between kp1 and kp2 relative to theta0. The box is scaled
|
304 |
+
and shifted by dscale and dy.
|
305 |
+
|
306 |
+
"""
|
307 |
+
if self.detection2roi_method == 'box':
|
308 |
+
# compute box center and scale
|
309 |
+
# use mediapipe/calculators/util/detections_to_rects_calculator.cc
|
310 |
+
xc = (detection[:,1] + detection[:,3]) / 2
|
311 |
+
yc = (detection[:,0] + detection[:,2]) / 2
|
312 |
+
scale = (detection[:,3] - detection[:,1]) # assumes square boxes
|
313 |
+
|
314 |
+
elif self.detection2roi_method == 'alignment':
|
315 |
+
# compute box center and scale
|
316 |
+
# use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
|
317 |
+
xc = detection[:,4+2*self.kp1]
|
318 |
+
yc = detection[:,4+2*self.kp1+1]
|
319 |
+
x1 = detection[:,4+2*self.kp2]
|
320 |
+
y1 = detection[:,4+2*self.kp2+1]
|
321 |
+
scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
|
322 |
+
else:
|
323 |
+
raise NotImplementedError(
|
324 |
+
"detection2roi_method [%s] not supported"%self.detection2roi_method)
|
325 |
+
|
326 |
+
yc += self.dy * scale
|
327 |
+
scale *= self.dscale
|
328 |
+
|
329 |
+
# compute box rotation
|
330 |
+
x0 = detection[:,4+2*self.kp1]
|
331 |
+
y0 = detection[:,4+2*self.kp1+1]
|
332 |
+
x1 = detection[:,4+2*self.kp2]
|
333 |
+
y1 = detection[:,4+2*self.kp2+1]
|
334 |
+
#theta = np.arctan2(y0-y1, x0-x1) - self.theta0
|
335 |
+
theta = torch.atan2(y0-y1, x0-x1) - self.theta0
|
336 |
+
return xc, yc, scale, theta
|
337 |
+
|
338 |
+
|
339 |
+
def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
|
340 |
+
"""The output of the neural network is a tensor of shape (b, 896, 16)
|
341 |
+
containing the bounding box regressor predictions, as well as a tensor
|
342 |
+
of shape (b, 896, 1) with the classification confidences.
|
343 |
+
|
344 |
+
This function converts these two "raw" tensors into proper detections.
|
345 |
+
Returns a list of (num_detections, 17) tensors, one for each image in
|
346 |
+
the batch.
|
347 |
+
|
348 |
+
This is based on the source code from:
|
349 |
+
mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
|
350 |
+
mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
|
351 |
+
"""
|
352 |
+
assert raw_box_tensor.ndimension() == 3
|
353 |
+
assert raw_box_tensor.shape[1] == self.num_anchors
|
354 |
+
assert raw_box_tensor.shape[2] == self.num_coords
|
355 |
+
|
356 |
+
assert raw_score_tensor.ndimension() == 3
|
357 |
+
assert raw_score_tensor.shape[1] == self.num_anchors
|
358 |
+
assert raw_score_tensor.shape[2] == self.num_classes
|
359 |
+
|
360 |
+
assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
|
361 |
+
|
362 |
+
detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
|
363 |
+
|
364 |
+
thresh = self.score_clipping_thresh
|
365 |
+
raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
|
366 |
+
detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
|
367 |
+
|
368 |
+
# Note: we stripped off the last dimension from the scores tensor
|
369 |
+
# because there is only has one class. Now we can simply use a mask
|
370 |
+
# to filter out the boxes with too low confidence.
|
371 |
+
mask = detection_scores >= self.min_score_thresh
|
372 |
+
|
373 |
+
# Because each image from the batch can have a different number of
|
374 |
+
# detections, process them one at a time using a loop.
|
375 |
+
output_detections = []
|
376 |
+
for i in range(raw_box_tensor.shape[0]):
|
377 |
+
boxes = detection_boxes[i, mask[i]]
|
378 |
+
scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
|
379 |
+
output_detections.append(torch.cat((boxes, scores), dim=-1))
|
380 |
+
|
381 |
+
return output_detections
|
382 |
+
|
383 |
+
def _decode_boxes(self, raw_boxes, anchors):
|
384 |
+
"""Converts the predictions into actual coordinates using
|
385 |
+
the anchor boxes. Processes the entire batch at once.
|
386 |
+
"""
|
387 |
+
boxes = torch.zeros_like(raw_boxes)
|
388 |
+
|
389 |
+
x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
|
390 |
+
y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
|
391 |
+
|
392 |
+
w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
|
393 |
+
h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
|
394 |
+
|
395 |
+
boxes[..., 0] = y_center - h / 2. # ymin
|
396 |
+
boxes[..., 1] = x_center - w / 2. # xmin
|
397 |
+
boxes[..., 2] = y_center + h / 2. # ymax
|
398 |
+
boxes[..., 3] = x_center + w / 2. # xmax
|
399 |
+
|
400 |
+
for k in range(self.num_keypoints):
|
401 |
+
offset = 4 + k*2
|
402 |
+
keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
|
403 |
+
keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
|
404 |
+
boxes[..., offset ] = keypoint_x
|
405 |
+
boxes[..., offset + 1] = keypoint_y
|
406 |
+
|
407 |
+
return boxes
|
408 |
+
|
409 |
+
def _weighted_non_max_suppression(self, detections):
|
410 |
+
"""The alternative NMS method as mentioned in the BlazeFace paper:
|
411 |
+
|
412 |
+
"We replace the suppression algorithm with a blending strategy that
|
413 |
+
estimates the regression parameters of a bounding box as a weighted
|
414 |
+
mean between the overlapping predictions."
|
415 |
+
|
416 |
+
The original MediaPipe code assigns the score of the most confident
|
417 |
+
detection to the weighted detection, but we take the average score
|
418 |
+
of the overlapping detections.
|
419 |
+
|
420 |
+
The input detections should be a Tensor of shape (count, 17).
|
421 |
+
|
422 |
+
Returns a list of PyTorch tensors, one for each detected face.
|
423 |
+
|
424 |
+
This is based on the source code from:
|
425 |
+
mediapipe/calculators/util/non_max_suppression_calculator.cc
|
426 |
+
mediapipe/calculators/util/non_max_suppression_calculator.proto
|
427 |
+
"""
|
428 |
+
if len(detections) == 0: return []
|
429 |
+
|
430 |
+
output_detections = []
|
431 |
+
|
432 |
+
# Sort the detections from highest to lowest score.
|
433 |
+
remaining = torch.argsort(detections[:, self.num_coords], descending=True)
|
434 |
+
|
435 |
+
while len(remaining) > 0:
|
436 |
+
detection = detections[remaining[0]]
|
437 |
+
|
438 |
+
# Compute the overlap between the first box and the other
|
439 |
+
# remaining boxes. (Note that the other_boxes also include
|
440 |
+
# the first_box.)
|
441 |
+
first_box = detection[:4]
|
442 |
+
other_boxes = detections[remaining, :4]
|
443 |
+
ious = overlap_similarity(first_box, other_boxes)
|
444 |
+
|
445 |
+
# If two detections don't overlap enough, they are considered
|
446 |
+
# to be from different faces.
|
447 |
+
mask = ious > self.min_suppression_threshold
|
448 |
+
overlapping = remaining[mask]
|
449 |
+
remaining = remaining[~mask]
|
450 |
+
|
451 |
+
# Take an average of the coordinates from the overlapping
|
452 |
+
# detections, weighted by their confidence scores.
|
453 |
+
weighted_detection = detection.clone()
|
454 |
+
if len(overlapping) > 1:
|
455 |
+
coordinates = detections[overlapping, :self.num_coords]
|
456 |
+
scores = detections[overlapping, self.num_coords:self.num_coords+1]
|
457 |
+
total_score = scores.sum()
|
458 |
+
weighted = (coordinates * scores).sum(dim=0) / total_score
|
459 |
+
weighted_detection[:self.num_coords] = weighted
|
460 |
+
weighted_detection[self.num_coords] = total_score / len(overlapping)
|
461 |
+
|
462 |
+
output_detections.append(weighted_detection)
|
463 |
+
|
464 |
+
return output_detections
|
465 |
+
|
466 |
+
|
467 |
+
# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
|
468 |
+
|
469 |
+
def intersect(box_a, box_b):
|
470 |
+
""" We resize both tensors to [A,B,2] without new malloc:
|
471 |
+
[A,2] -> [A,1,2] -> [A,B,2]
|
472 |
+
[B,2] -> [1,B,2] -> [A,B,2]
|
473 |
+
Then we compute the area of intersect between box_a and box_b.
|
474 |
+
Args:
|
475 |
+
box_a: (tensor) bounding boxes, Shape: [A,4].
|
476 |
+
box_b: (tensor) bounding boxes, Shape: [B,4].
|
477 |
+
Return:
|
478 |
+
(tensor) intersection area, Shape: [A,B].
|
479 |
+
"""
|
480 |
+
A = box_a.size(0)
|
481 |
+
B = box_b.size(0)
|
482 |
+
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
|
483 |
+
box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
|
484 |
+
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
|
485 |
+
box_b[:, :2].unsqueeze(0).expand(A, B, 2))
|
486 |
+
inter = torch.clamp((max_xy - min_xy), min=0)
|
487 |
+
return inter[:, :, 0] * inter[:, :, 1]
|
488 |
+
|
489 |
+
|
490 |
+
def jaccard(box_a, box_b):
|
491 |
+
"""Compute the jaccard overlap of two sets of boxes. The jaccard overlap
|
492 |
+
is simply the intersection over union of two boxes. Here we operate on
|
493 |
+
ground truth boxes and default boxes.
|
494 |
+
E.g.:
|
495 |
+
A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
|
496 |
+
Args:
|
497 |
+
box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
|
498 |
+
box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
|
499 |
+
Return:
|
500 |
+
jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
|
501 |
+
"""
|
502 |
+
inter = intersect(box_a, box_b)
|
503 |
+
area_a = ((box_a[:, 2]-box_a[:, 0]) *
|
504 |
+
(box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
|
505 |
+
area_b = ((box_b[:, 2]-box_b[:, 0]) *
|
506 |
+
(box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
|
507 |
+
union = area_a + area_b - inter
|
508 |
+
return inter / union # [A,B]
|
509 |
+
|
510 |
+
|
511 |
+
def overlap_similarity(box, other_boxes):
|
512 |
+
"""Computes the IOU between a bounding box and set of other boxes."""
|
513 |
+
return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
|
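
Note: the IoU helpers above (`intersect`, `jaccard`, `overlap_similarity`) are plain module-level functions, so they can be exercised on their own. A minimal sketch with made-up box values, assuming `blazebase.py` is importable from the current directory:

```python
import torch

from blazebase import jaccard, overlap_similarity

# Boxes use the same corner layout as the detections: [ymin, xmin, ymax, xmax].
box = torch.tensor([0.2, 0.2, 0.6, 0.6])
others = torch.tensor([[0.2, 0.2, 0.6, 0.6],   # identical box  -> IoU 1.0
                       [0.4, 0.4, 0.8, 0.8],   # partial overlap
                       [0.7, 0.7, 0.9, 0.9]])  # disjoint box   -> IoU 0.0

print(jaccard(box.unsqueeze(0), others))   # shape [1, 3]
print(overlap_similarity(box, others))     # shape [3], same values squeezed
```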
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py
ADDED
@@ -0,0 +1,182 @@
```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock


class BlazeFace(BlazeDetector):
    """The BlazeFace face detection model from MediaPipe.

    The version from MediaPipe is simpler than the one in the paper;
    it does not use the "double" BlazeBlocks.

    Because we won't be training this model, it doesn't need to have
    batchnorm layers. These have already been "folded" into the conv
    weights by TFLite.

    The conversion to PyTorch is fairly straightforward, but there are
    some small differences between TFLite and PyTorch in how they handle
    padding on conv layers with stride 2.

    This version works on batches, while the MediaPipe version can only
    handle a single image at a time.

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/hollance/BlazeFace-PyTorch and
    https://github.com/google/mediapipe/
    """
    def __init__(self, back_model=False):
        super(BlazeFace, self).__init__()

        # These are the settings from the MediaPipe example graph
        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.score_clipping_thresh = 100.0
        self.back_model = back_model
        if back_model:
            self.x_scale = 256.0
            self.y_scale = 256.0
            self.h_scale = 256.0
            self.w_scale = 256.0
            self.min_score_thresh = 0.65
        else:
            self.x_scale = 128.0
            self.y_scale = 128.0
            self.h_scale = 128.0
            self.w_scale = 128.0
            self.min_score_thresh = 0.75
        self.min_suppression_threshold = 0.3
        self.num_keypoints = 6

        # These settings are for converting detections to ROIs which can then
        # be extracted and fed into the landmark network
        # use mediapipe/calculators/util/detections_to_rects_calculator.cc
        self.detection2roi_method = 'box'
        # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
        self.kp1 = 1
        self.kp2 = 0
        self.theta0 = 0.
        self.dscale = 1.5
        self.dy = 0.

        self._define_layers()

    def _define_layers(self):
        if self.back_model:
            self.backbone = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
                nn.ReLU(inplace=True),

                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24, stride=2),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 24),
                BlazeBlock(24, 48, stride=2),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 48),
                BlazeBlock(48, 96, stride=2),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
            )
            self.final = FinalBlazeBlock(96)
            self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True)
            self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

            self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True)
            self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)
        else:
            self.backbone1 = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
                nn.ReLU(inplace=True),

                BlazeBlock(24, 24),
                BlazeBlock(24, 28),
                BlazeBlock(28, 32, stride=2),
                BlazeBlock(32, 36),
                BlazeBlock(36, 42),
                BlazeBlock(42, 48, stride=2),
                BlazeBlock(48, 56),
                BlazeBlock(56, 64),
                BlazeBlock(64, 72),
                BlazeBlock(72, 80),
                BlazeBlock(80, 88),
            )

            self.backbone2 = nn.Sequential(
                BlazeBlock(88, 96, stride=2),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
                BlazeBlock(96, 96),
            )

            self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
            self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

            self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
            self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)

    def forward(self, x):
        # TFLite uses slightly different padding on the first conv layer
        # than PyTorch, so do it manually.
        x = F.pad(x, (1, 2, 1, 2), "constant", 0)

        b = x.shape[0]  # batch size, needed for reshaping later

        if self.back_model:
            x = self.backbone(x)       # (b, 16, 16, 96)
            h = self.final(x)          # (b, 8, 8, 96)
        else:
            x = self.backbone1(x)      # (b, 88, 16, 16)
            h = self.backbone2(x)      # (b, 96, 8, 8)

        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
        # permute the output from the conv layers before reshaping it.

        c1 = self.classifier_8(x)      # (b, 2, 16, 16)
        c1 = c1.permute(0, 2, 3, 1)    # (b, 16, 16, 2)
        c1 = c1.reshape(b, -1, 1)      # (b, 512, 1)

        c2 = self.classifier_16(h)     # (b, 6, 8, 8)
        c2 = c2.permute(0, 2, 3, 1)    # (b, 8, 8, 6)
        c2 = c2.reshape(b, -1, 1)      # (b, 384, 1)

        c = torch.cat((c1, c2), dim=1)  # (b, 896, 1)

        r1 = self.regressor_8(x)       # (b, 32, 16, 16)
        r1 = r1.permute(0, 2, 3, 1)    # (b, 16, 16, 32)
        r1 = r1.reshape(b, -1, 16)     # (b, 512, 16)

        r2 = self.regressor_16(h)      # (b, 96, 8, 8)
        r2 = r2.permute(0, 2, 3, 1)    # (b, 8, 8, 96)
        r2 = r2.reshape(b, -1, 16)     # (b, 384, 16)

        r = torch.cat((r1, r2), dim=1)  # (b, 896, 16)
        return [r, c]
```
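
Note: a minimal sketch of how this detector is typically driven through the `BlazeDetector` helpers from `blazebase.py`. The checkpoint and anchor paths below are assumptions; point them at wherever the floating-point weights and anchors actually live.

```python
import cv2

from blazeface import BlazeFace

net = BlazeFace(back_model=True)                      # 256x256 "back" variant
net.load_weights("../models/blazefaceback.pth")       # assumed checkpoint location
net.load_anchors("../models/anchors_face_back.npy")   # assumed anchor file location

img = cv2.imread("coco.jpg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (256, 256))                     # back model expects 256x256 input

detections = net.predict_on_image(img)
# Each row: ymin, xmin, ymax, xmax, six (x, y) keypoints, confidence score.
print(detections.shape)
```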
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py
ADDED
@@ -0,0 +1,74 @@
```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from blazebase import BlazeLandmark, BlazeBlock


class BlazeFaceLandmark(BlazeLandmark):
    """The face landmark model from MediaPipe."""

    def __init__(self):
        super(BlazeFaceLandmark, self).__init__()

        # size of ROIs used for input
        self.resolution = 192

        self._define_layers()

    def _define_layers(self):
        self.backbone1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True),
            nn.PReLU(16),

            BlazeBlock(16, 16, 3, act='prelu'),
            BlazeBlock(16, 16, 3, act='prelu'),
            BlazeBlock(16, 32, 3, 2, act='prelu'),

            BlazeBlock(32, 32, 3, act='prelu'),
            BlazeBlock(32, 32, 3, act='prelu'),
            BlazeBlock(32, 64, 3, 2, act='prelu'),

            BlazeBlock(64, 64, 3, act='prelu'),
            BlazeBlock(64, 64, 3, act='prelu'),
            BlazeBlock(64, 128, 3, 2, act='prelu'),

            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, 2, act='prelu'),

            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
        )

        self.backbone2a = nn.Sequential(
            BlazeBlock(128, 128, 3, 2, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            BlazeBlock(128, 128, 3, act='prelu'),
            nn.Conv2d(128, 32, 1, padding=0, bias=True),
            nn.PReLU(32),
            BlazeBlock(32, 32, 3, act='prelu'),
            nn.Conv2d(32, 1404, 3, padding=0, bias=True)
        )

        self.backbone2b = nn.Sequential(
            BlazeBlock(128, 128, 3, 2, act='prelu'),
            nn.Conv2d(128, 32, 1, padding=0, bias=True),
            nn.PReLU(32),
            BlazeBlock(32, 32, 3, act='prelu'),
            nn.Conv2d(32, 1, 3, padding=0, bias=True)
        )

    def forward(self, x):
        if x.shape[0] == 0:
            return torch.zeros((0,)), torch.zeros((0, 468, 3))

        x = F.pad(x, (0, 1, 0, 1), "constant", 0)

        x = self.backbone1(x)
        landmarks = self.backbone2a(x).view(-1, 468, 3) / 192
        flag = self.backbone2b(x).sigmoid().view(-1)

        return flag, landmarks
```
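
Note: a quick shape check for the landmark model above, using random tensors in place of real 192x192 face ROIs (illustrative only):

```python
import torch

from blazeface_landmark import BlazeFaceLandmark

net = BlazeFaceLandmark()
rois = torch.rand(2, 3, 192, 192)   # two fake ROIs with values already in [0, 1]
flags, landmarks = net(rois)

print(flags.shape)       # torch.Size([2])         face-presence confidence per ROI
print(landmarks.shape)   # torch.Size([2, 468, 3]) normalized x, y, z per landmark
```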
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg
ADDED
Git LFS Details
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py
ADDED
@@ -0,0 +1,424 @@
```python
import numpy as np
import torch
import cv2
import sys
from blazebase import resize_pad, denormalize_detections
from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS
import time
import aidlite
import os


class post_mediapipe_face:
    def __init__(self):
        self.kp1 = 1
        self.kp2 = 0
        self.theta0 = 0.
        self.dscale = 1.5
        self.dy = 0.
        self.x_scale = 256.0
        self.y_scale = 256.0
        self.h_scale = 256.0
        self.w_scale = 256.0
        self.num_keypoints = 6
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.min_score_thresh = 0.4  # 0.65
        self.score_clipping_thresh = 100.0
        self.min_suppression_threshold = 0.3
        self.resolution = 192

    def detection2roi(self, detection):
        xc = (detection[:, 1] + detection[:, 3]) / 2
        yc = (detection[:, 0] + detection[:, 2]) / 2
        scale = (detection[:, 3] - detection[:, 1])  # assumes square boxes
        yc += self.dy * scale
        scale *= self.dscale
        # compute box rotation
        x0 = detection[:, 4 + 2*self.kp1]
        y0 = detection[:, 4 + 2*self.kp1 + 1]
        x1 = detection[:, 4 + 2*self.kp2]
        y1 = detection[:, 4 + 2*self.kp2 + 1]
        theta = torch.atan2(y0 - y1, x0 - x1) - self.theta0
        return xc, yc, scale, theta

    def _decode_boxes(self, raw_boxes, anchors):
        boxes = torch.zeros_like(raw_boxes)

        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]

        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]

        boxes[..., 0] = y_center - h / 2.  # ymin
        boxes[..., 1] = x_center - w / 2.  # xmin
        boxes[..., 2] = y_center + h / 2.  # ymax
        boxes[..., 3] = x_center + w / 2.  # xmax

        for k in range(self.num_keypoints):
            offset = 4 + k*2
            keypoint_x = raw_boxes[..., offset    ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
            boxes[..., offset    ] = keypoint_x
            boxes[..., offset + 1] = keypoint_y
        return boxes

    def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
        assert raw_box_tensor.ndimension() == 3
        assert raw_box_tensor.shape[1] == self.num_anchors
        assert raw_box_tensor.shape[2] == self.num_coords

        assert raw_score_tensor.ndimension() == 3
        assert raw_score_tensor.shape[1] == self.num_anchors
        assert raw_score_tensor.shape[2] == self.num_classes

        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]

        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)

        thresh = self.score_clipping_thresh
        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)

        # Note: we stripped off the last dimension from the scores tensor
        # because there is only one class. Now we can simply use a mask
        # to filter out the boxes with too low confidence.
        mask = detection_scores >= self.min_score_thresh

        # Because each image from the batch can have a different number of
        # detections, process them one at a time using a loop.
        output_detections = []
        for i in range(raw_box_tensor.shape[0]):
            boxes = detection_boxes[i, mask[i]]
            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
            output_detections.append(torch.cat((boxes, scores), dim=-1))

        return output_detections

    def extract_roi(self, frame, xc, yc, theta, scale):
        resolution = 192
        # take points on unit square and transform them according to the roi
        points = torch.tensor([[-1, -1, 1, 1],
                               [-1, 1, -1, 1]], device=scale.device).view(1, 2, 4)
        points = points * scale.view(-1, 1, 1) / 2
        theta = theta.view(-1, 1, 1)
        R = torch.cat((
            torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
            torch.cat((torch.sin(theta), torch.cos(theta)), 2),
        ), 1)
        center = torch.cat((xc.view(-1, 1, 1), yc.view(-1, 1, 1)), 1)
        points = R @ points + center

        # use the points to compute the affine transform that maps
        # these points back to the output square
        res = resolution
        points1 = np.array([[0, 0, res - 1],
                            [0, res - 1, 0]], dtype=np.float32).T
        affines = []
        imgs = []
        for i in range(points.shape[0]):
            pts = points[i, :, :3].detach().numpy().T
            M = cv2.getAffineTransform(pts, points1)
            img = cv2.warpAffine(frame, M, (res, res))  # , borderValue=127.5)
            img = torch.tensor(img, device=scale.device)
            imgs.append(img)
            affine = cv2.invertAffineTransform(M).astype('float32')
            affine = torch.tensor(affine, device=scale.device)
            affines.append(affine)
        if imgs:
            imgs = torch.stack(imgs).permute(0, 3, 1, 2).float() / 255.  # / 127.5 - 1.0
            affines = torch.stack(affines)
        else:
            imgs = torch.zeros((0, 3, res, res), device=scale.device)
            affines = torch.zeros((0, 2, 3), device=scale.device)

        return imgs, affines, points

    def denormalize_landmarks(self, landmarks, affines):
        landmarks[:, :, :2] *= self.resolution
        for i in range(len(landmarks)):
            landmark, affine = landmarks[i], affines[i]
            landmark = (affine[:, :2] @ landmark[:, :2].T + affine[:, 2:]).T
            landmarks[i, :, :2] = landmark
        return landmarks

    def intersect(self, box_a, box_b):
        A = box_a.size(0)
        B = box_b.size(0)
        max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                           box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
        min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                           box_b[:, :2].unsqueeze(0).expand(A, B, 2))
        inter = torch.clamp((max_xy - min_xy), min=0)
        return inter[:, :, 0] * inter[:, :, 1]

    def jaccard(self, box_a, box_b):
        inter = self.intersect(box_a, box_b)
        area_a = ((box_a[:, 2] - box_a[:, 0]) *
                  (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
        area_b = ((box_b[:, 2] - box_b[:, 0]) *
                  (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
        union = area_a + area_b - inter
        return inter / union  # [A,B]

    def overlap_similarity(self, box, other_boxes):
        """Computes the IOU between a bounding box and set of other boxes."""
        return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)

    def _weighted_non_max_suppression(self, detections):
        if len(detections) == 0:
            return []
        output_detections = []

        # Sort the detections from highest to lowest score.
        remaining = torch.argsort(detections[:, self.num_coords], descending=True)

        while len(remaining) > 0:
            detection = detections[remaining[0]]

            # Compute the overlap between the first box and the other
            # remaining boxes. (Note that the other_boxes also include
            # the first_box.)
            first_box = detection[:4]
            other_boxes = detections[remaining, :4]
            ious = self.overlap_similarity(first_box, other_boxes)

            # If two detections don't overlap enough, they are considered
            # to be from different faces.
            mask = ious > self.min_suppression_threshold
            overlapping = remaining[mask]
            remaining = remaining[~mask]

            # Take an average of the coordinates from the overlapping
            # detections, weighted by their confidence scores.
            weighted_detection = detection.clone()
            if len(overlapping) > 1:
                coordinates = detections[overlapping, :self.num_coords]
                scores = detections[overlapping, self.num_coords:self.num_coords + 1]
                total_score = scores.sum()
                weighted = (coordinates * scores).sum(dim=0) / total_score
                weighted_detection[:self.num_coords] = weighted
                weighted_detection[self.num_coords] = total_score / len(overlapping)

            output_detections.append(weighted_detection)

        return output_detections


def draw_detections(img, detections, with_keypoints=True):
    if isinstance(detections, torch.Tensor):
        detections = detections.detach().numpy()

    if detections.ndim == 1:
        detections = np.expand_dims(detections, axis=0)

    n_keypoints = detections.shape[1] // 2 - 2

    for i in range(detections.shape[0]):
        ymin = detections[i, 0]
        xmin = detections[i, 1]
        ymax = detections[i, 2]
        xmax = detections[i, 3]

        start_point = (int(xmin), int(ymin))
        end_point = (int(xmax), int(ymax))
        img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)

        if with_keypoints:
            for k in range(n_keypoints):
                kp_x = int(detections[i, 4 + k*2])
                kp_y = int(detections[i, 4 + k*2 + 1])
                cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
    return img


post_process = post_mediapipe_face()


class faceDetectionQnn:
    def __init__(self):
        super().__init__()
        self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a8.qnn216.ctx.bin"))
        if self.model is None:
            print("Create model failed !")
            return

        self.config = aidlite.Config.create_instance()
        if self.config is None:
            print("build_interpretper_from_model_and_config failed !")
            return

        self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
        self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        self.config.is_quantify_model = 1

        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
            return
        input_shapes = [[1, 3, 256, 256]]
        output_shapes = [[1, 896, 16], [1, 896, 1]]
        self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
                                        output_shapes, aidlite.DataType.TYPE_FLOAT32)

        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")

        print(" model load success!")

    def __call__(self, input):
        self.interpreter.set_input_tensor(0, input)
        invoke_time = []
        invoke_nums = 10
        for i in range(invoke_nums):
            result = self.interpreter.set_input_tensor(0, input.data)
            if result != 0:
                print("interpreter set_input_tensor() failed")
            t1 = time.time()
            result = self.interpreter.invoke()
            cost_time = (time.time() - t1) * 1000
            invoke_time.append(cost_time)

        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / invoke_nums
        var_invoketime = np.var(invoke_time)
        print("====================================")
        print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("====================================")
        features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896, 16).copy()
        features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896, 1).copy()
        return features_0, features_1


class faceLandmarkQnn:
    def __init__(self):
        super().__init__()
        self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a8.qnn216.ctx.bin"))
        if self.model is None:
            print("Create model failed !")
            return

        self.config = aidlite.Config.create_instance()
        if self.config is None:
            print("build_interpretper_from_model_and_config failed !")
            return

        self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
        self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
        self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        self.config.is_quantify_model = 1

        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
            return
        input_shapes = [[1, 3, 192, 192]]
        output_shapes = [[1], [1, 468, 3]]
        self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
                                        output_shapes, aidlite.DataType.TYPE_FLOAT32)

        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
        result = self.interpreter.init()
        if result != 0:
            print("interpreter init failed !")
        result = self.interpreter.load_model()
        if result != 0:
            print("interpreter load model failed !")

        print(" model load success!")

    def __call__(self, input):
        self.interpreter.set_input_tensor(0, input)
        invoke_time = []
        invoke_nums = 10
        for i in range(invoke_nums):
            result = self.interpreter.set_input_tensor(0, input.data)
            if result != 0:
                print("interpreter set_input_tensor() failed")
            t1 = time.time()
            result = self.interpreter.invoke()
            cost_time = (time.time() - t1) * 1000
            invoke_time.append(cost_time)

        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / invoke_nums
        var_invoketime = np.var(invoke_time)
        print("====================================")
        print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("====================================")
        features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
        features_1 = self.interpreter.get_output_tensor(1).reshape(1, 468, 3).copy()
        return features_0, features_1


anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), "anchors_face_back.npy")), dtype=torch.float32, device='cpu')
# anchors_np = anchors.cpu().numpy().astype(np.float32)
# np.save("anchors_float32.npy", anchors_np)

face_detc = faceDetectionQnn()
face_rec = faceLandmarkQnn()

image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "coco.jpg")

frame_ct = 0
image = cv2.imread(image_path)

frame = np.ascontiguousarray(image[:, :, ::-1])

img1, img2, scale, pad = resize_pad(frame)

input = (img1 / 255).astype(np.float32)
input = np.transpose(input, (2, 0, 1))
input = input[np.newaxis, ...]
t0 = time.time()
out = face_detc(input)
use_time = round((time.time() - t0) * 1000, 2)
detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)

filtered_detections = []
num_coords = 16
for i in range(len(detections)):
    faces = post_process._weighted_non_max_suppression(detections[i])
    faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords + 1))
    filtered_detections.append(faces)

face_detections = denormalize_detections(filtered_detections[0], scale, pad)

xc, yc, scale, theta = post_process.detection2roi(face_detections)

img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
if box.size()[0] != 0:
    t2 = time.time()
    flags, normalized_landmarks = face_rec(img.numpy())

    use_time = round((time.time() - t2) * 1000, 2)

    landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)

    for i in range(len(flags)):
        landmark, flag = landmarks[i], flags[i]
        if flag > .4:  # 0.5
            draw_landmarks(frame, landmark[:, :2], FACE_CONNECTIONS, size=1)
else:
    print("not detect face !")

draw_roi(frame, box)
draw_detections(frame, face_detections)
cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)), '%04d.jpg' % frame_ct), frame[:, :, ::-1])
face_detc.interpreter.destory()
face_rec.interpreter.destory()
```
model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py
ADDED
@@ -0,0 +1,125 @@
```python
import numpy as np
import cv2
import torch


def draw_detections(img, detections, with_keypoints=True):
    if isinstance(detections, torch.Tensor):
        detections = detections.cpu().numpy()

    if detections.ndim == 1:
        detections = np.expand_dims(detections, axis=0)

    n_keypoints = detections.shape[1] // 2 - 2

    for i in range(detections.shape[0]):
        ymin = detections[i, 0]
        xmin = detections[i, 1]
        ymax = detections[i, 2]
        xmax = detections[i, 3]

        start_point = (int(xmin), int(ymin))
        end_point = (int(xmax), int(ymax))
        img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)

        if with_keypoints:
            for k in range(n_keypoints):
                kp_x = int(detections[i, 4 + k*2])
                kp_y = int(detections[i, 4 + k*2 + 1])
                cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
    return img


def draw_roi(img, roi):
    for i in range(roi.shape[0]):
        (x1, x2, x3, x4), (y1, y2, y3, y4) = roi[i]
        cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 0), 2)
        cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0, 255, 0), 2)
        cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0, 0, 0), 2)
        cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0, 0, 0), 2)


def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
    points = points[:, :2]
    for point in points:
        x, y = point
        x, y = int(x), int(y)
        cv2.circle(img, (x, y), size, color, thickness=size)
    for connection in connections:
        x0, y0 = points[connection[0]]
        x1, y1 = points[connection[1]]
        x0, y0 = int(x0), int(y0)
        x1, y1 = int(x1), int(y1)
        cv2.line(img, (x0, y0), (x1, y1), (0, 0, 0), size)


# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
#        8   12  16  20
#        |   |   |   |
#        7   11  15  19
#    4   |   |   |   |
#    |   6   10  14  18
#    3   |   |   |   |
#    |   5---9---13--17
#    2    \         /
#     \    \       /
#      1    \     /
#       \    \   /
#        ------0-
HAND_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (5, 6), (6, 7), (7, 8),
    (9, 10), (10, 11), (11, 12),
    (13, 14), (14, 15), (15, 16),
    (17, 18), (18, 19), (19, 20),
    (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
]

POSE_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 7),
    (0, 4), (4, 5), (5, 6), (6, 8),
    (9, 10),
    (11, 13), (13, 15), (15, 17), (17, 19), (19, 15), (15, 21),
    (12, 14), (14, 16), (16, 18), (18, 20), (20, 16), (16, 22),
    (11, 12), (12, 24), (24, 23), (23, 11)
]

# Vertex indices can be found in
# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
# Found in github.com/google/mediapipe/python/solutions/face_mesh.py
FACE_CONNECTIONS = [
    # Lips.
    (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
    (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
    (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
    (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
    (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
    (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
    (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
    (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
    # Left eye.
    (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
    (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
    (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
    (398, 362),
    # Left eyebrow.
    (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
    (293, 334), (334, 296), (296, 336),
    # Right eye.
    (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
    (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
    (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
    (173, 133),
    # Right eyebrow.
    (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
    (105, 66), (66, 107),
    # Face oval.
    (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
    (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
    (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
    (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
    (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
    (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
    (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
    (109, 10)
]
```
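
Note: a small illustrative sketch of the drawing helpers above on a blank canvas; all coordinates below are made up rather than real model output.

```python
import numpy as np

from visualization import draw_detections, draw_landmarks

canvas = np.zeros((256, 256, 3), dtype=np.uint8)

# One fake detection row: ymin, xmin, ymax, xmax, six (x, y) keypoints, score.
det = np.array([[40, 40, 200, 200,
                 80, 90, 160, 90, 120, 130, 120, 160, 60, 120, 180, 120,
                 0.9]], dtype=np.float32)
draw_detections(canvas, det)

# draw_landmarks expects an (N, 2+) array of pixel coordinates plus index pairs.
points = np.array([[100, 100], [120, 110], [140, 100]], dtype=np.float32)
draw_landmarks(canvas, points, connections=[(0, 1), (1, 2)], size=2)
```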
model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md
ADDED
@@ -0,0 +1,63 @@
## Model Information
### Source model
- Input shape: [1x3x256x256], [1x3x192x192]
- Number of parameters: 0.13M, 0.6M
- Model size: 0.58MB, 2.32MB
- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]

Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)

### Converted model

- Precision: FP16
- Backend: QNN2.16
- Target Device: SNM972 QCS8550

## Inference with AidLite SDK

### SDK installation
Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)

- Install AidLite SDK

```bash
# Install the appropriate version of the aidlite sdk
sudo aid-pkg update
sudo aid-pkg install aidlite-sdk
# Download the QNN version that matches the backend above, e.g. to install the QNN2.23 AidLite: sudo aid-pkg install aidlite-qnn223
sudo aid-pkg install aidlite-{QNN VERSION}
```

- Verify AidLite SDK

```bash
# aidlite sdk c++ check
python3 -c "import aidlite ; print(aidlite.get_library_version())"

# aidlite sdk python check
python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
```

### Run demo
#### python
```bash
cd python
python3 demo_qnn.py
```

#### c++
```bash
# The cnpy library is required to load .npy files (run the following from the default terminal path)
git clone https://github.com/rogersce/cnpy.git
cd cnpy
mkdir build && cd build
cmake ..
make
sudo make install

cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp
mkdir build && cd build
cmake ..
make
./run_test
```
model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc
ADDED
Binary file (16.5 kB).
model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc
ADDED
Binary file (16.5 kB).