diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..85cfee69e0412a68ebbcf4fd822d65022339186e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..19042355074f86c6cb4409726a1e7319d5c82f29
--- /dev/null
+++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md
@@ -0,0 +1,63 @@
+## Model Information
+### Source model
+- Input shape: 256x256
+- Number of parameters: 0.13M, 0.6M
+- Model size: 0.58MB, 2.32MB
+- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]
+
+Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+### Converted model
+
+- Precision: INT16
+- Backend: QNN2.16
+- Target Device: FV01 QCS6490
+
+## Inference with AidLite SDK
+
+### SDK installation
+Model Farm uses the AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
+
+- Install AidLite SDK
+
+```bash
+# Install the appropriate version of the AidLite SDK
+sudo aid-pkg update
+sudo aid-pkg install aidlite-sdk
+# Install the QNN variant that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+sudo aid-pkg install aidlite-{QNN VERSION}
+```
+
+- Verify AidLite SDK
+
+```bash
+# Check the AidLite SDK C++ library version
+python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+# Check the AidLite SDK Python library version
+python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+```
+
+### Run demo
+#### python
+```bash
+cd python
+python3 demo_qnn.py
+```
+
+#### c++
+```bash
+# The cnpy library is required for loading the .npy anchor files (run these commands from the default terminal path)
+git clone https://github.com/rogersce/cnpy.git
+cd cnpy
+mkdir build && cd build
+cmake ..
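+# cnpy installs its header and library under /usr/local by default, which is
+# where cpp/CMakeLists.txt looks for them (include_directories/link_directories)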
+make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp +mkdir build && cd build +cmake .. +make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy 
REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e046e317fa11e6f130903db87f29b4cef6b4ae5 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, 
{397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetctor_w8a16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + // if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + // } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + // if (i0 < points.size() && i1 < points.size() && + // i0 < flags.size() && i1 < flags.size() && + // flags[i0] > threshold && flags[i1] > threshold) + // { + cv::line(img, points[i0], points[i1], line_color, size); + // } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, 
cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; 
i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + 
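+    // Note: in this demo the ROI crops passed in by invoke() come from frame_clone1,
+    // which has already been converted BGR -> RGB, so the planes produced by
+    // cv::split() below are actually R, G, B.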
std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); 
+ return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
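+        // Landmark stage, for the detected face:
+        //   1. preprocess_imgs_to_nchw() converts each 192x192 ROI crop into a
+        //      normalized NCHW float tensor in [0, 1];
+        //   2. fast_interpreter2 runs the face-landmark context binary and returns
+        //      a confidence flag plus 468 normalized (x, y, z) landmarks;
+        //   3. denormalize_landmarks() maps the landmarks back to the original image
+        //      through the inverse affine matrices produced by extract_roi().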
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9fe34feb756f128a94ee424dacfec09bb01a811 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c529987e67f82e58a608a394aabf245a3afa19ac2f761981894f70b4df9fdca +size 2439235 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c31116b779d40ad7f2ae3bd8d633370af894d1 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9658c6459c5d5450d7da9d5fbb74b3beca11157f4cdb35e4d948aa6b4efc0ded 
+size 594825 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt new file mode 100644 index 0000000000000000000000000000000000000000..524cbb24eb52d05a050437f9dd5fe735c349bd72 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56336b04831d9f9f41bdcddcd4598e5660a2925451ee50da634fea6598ce6620 +size 855238 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..f4815d40ae5719217efb7af7d1c4859163f1fc3b --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f7e7016506a415bb7e02aaf9469a5fd406d31bb7349d3ae0fe97f1a0cb3b9a +size 728616 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt new file mode 100644 index 0000000000000000000000000000000000000000..d280840347f2fbc83daff7553385fc689ffdd848 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96374d173e67c5c3690b75d030b729e23e41de6b1a1ebd5daef7ff3992118c54 +size 2643322 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..363b5bb1c4c319e12c8545f826ed6b43a11c3a03 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61316298a6690650feea876b64b2efe520940d753af3264202689b12dd1c779e +size 1096800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..811afcc48481b7da1cc1417a349a3d0f8bf7dab3 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b278e84efa32b0e25d982219d31438f74a73b58af62b7f4751df3076221078 +size 173585 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc 
b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
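+    The returned scale and pad are what denormalize_detections() uses to map
+    detections back into the original image coordinates.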
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
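+        # (Padding is therefore applied manually in forward(), biased toward the
+        # right/bottom, to reproduce TFLite's asymmetric 'SAME' padding.)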
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
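+
+    Concrete landmark models are expected to define self.resolution; extract_roi()
+    crops and rotates each detection into a resolution x resolution patch, and
+    denormalize_landmarks() maps the predicted landmarks back through the inverse
+    affine transforms returned alongside the crops.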
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
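For orientation, a minimal usage sketch of the class defined here. The weight and anchor paths are the ones `export_jit.py` in this diff uses; running it therefore assumes those files are present, and the random input is only a shape check.

```python
import torch
from blazeface import BlazeFace

net = BlazeFace(back_model=True)                      # the 256x256 "back" variant
net.load_weights("../models/blazefaceback.pth")       # same paths as export_jit.py
net.load_anchors("../models/anchors_face_back.npy")

x = torch.randn(1, 3, 256, 256)                       # dummy input, shapes only
with torch.no_grad():
    raw_boxes, raw_scores = net(x)                    # forward() returns [boxes, scores]
print(raw_boxes.shape, raw_scores.shape)              # (1, 896, 16) and (1, 896, 1)
```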
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
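The manual padding mentioned in the comment above reproduces TFLite's asymmetric "SAME" padding for a stride-2, 5x5 convolution. A quick standalone shape check, assuming the 256x256 back model:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.zeros(1, 3, 256, 256)
x = F.pad(x, (1, 2, 1, 2), "constant", 0)              # pad W by (1, 2) and H by (1, 2) -> 259x259
conv = nn.Conv2d(3, 24, kernel_size=5, stride=2, padding=0)
print(conv(x).shape)                                    # torch.Size([1, 24, 128, 128])
```

Downstream, the 16x16 feature map contributes 16*16*2 = 512 anchors and the 8x8 map contributes 8*8*6 = 384, which is where the 896 anchors assumed by the decoder come from.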
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..eb902b5d3886ae0e09f7433f6d95a05c076b78a9 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py @@ -0,0 +1,389 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetctor_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
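+        # Note: TYPE_QNN selects the QNN backend described in the README, TYPE_DSP
+        # runs inference on the Hexagon DSP, and is_quantify_model (set just below)
+        # marks the loaded context binary as a quantized model.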
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + self.interpreter.invoke() + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + self.interpreter.invoke() + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] 
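The preprocessing above can be summarized in one hypothetical helper (a sketch only; `preprocess_for_detector` is not a function in this repo, and it reuses `resize_pad` from blazebase.py exactly as the demo does):

```python
import cv2
import numpy as np
from blazebase import resize_pad

def preprocess_for_detector(image_path):
    """Hypothetical helper mirroring the demo: BGR->RGB, letterbox to 256x256,
    scale to [0, 1], HWC->NCHW float32 with a batch dimension."""
    image = cv2.imread(image_path)
    frame = np.ascontiguousarray(image[:, :, ::-1])      # BGR -> RGB
    img1, img2, scale, pad = resize_pad(frame)           # img1 is the 256x256 letterboxed image
    x = (img1 / 255).astype(np.float32)                  # normalize to [0, 1]
    x = np.transpose(x, (2, 0, 1))[np.newaxis, ...]      # (1, 3, 256, 256)
    return x, frame, scale, pad
```

The returned `scale` and `pad` are what `denormalize_detections` later uses to map boxes back into the original frame.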
+t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +print(f"face detction inference_time:{use_time} ms") +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + print(f"landmark inference_time:{use_time} ms") + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..d2bfb877b73ba9f7826a20ae51ea5dcdddecf2ff --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py @@ -0,0 +1,57 @@ +import torch +import os +from typing import Callable, Tuple +from blazeface import BlazeFace +from blazeface_landmark import BlazeFaceLandmark + +class FaceDetector(torch.nn.Module): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + anchors: torch.Tensor, + ): + super().__init__() + self.detector = detector + self.anchors = anchors + + def forward(self, image): + return self.detector(image) + +back_detector = True +face_detector = BlazeFace(back_model=back_detector) +face_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazefaceback.pth")) +face_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_face_back.npy")) +face_detect = FaceDetector(face_detector,face_detector.anchors) +num_params = sum(p.numel() for p in face_detect.parameters() if p.requires_grad) +print(f'Number of face_detect parameters: {num_params}') + +face_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32) +source_model = torch.jit.trace(face_detect,face_d_in) +source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/m_faceDetctor.pt")) +print("export face detect ok!") + + + + + +class FaceLandmarkDetector(torch.nn.Module): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + ): + super().__init__() + self.detector = detector + + def forward(self, image): + return self.detector(image) + +face_regressor = BlazeFaceLandmark() +face_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazeface_landmark.pth")) 
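An optional sanity check after the detector trace above (a sketch that assumes it is appended to this script, so `face_detect` and `os` are in scope, and that `m_faceDetctor.pt` is the file just saved). Since tracing only records the same ops, the comparison should normally print True for both outputs:

```python
import torch

traced = torch.jit.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     "../models/m_faceDetctor.pt"))
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    boxes_eager, scores_eager = face_detect(x)     # eager wrapper defined above
    boxes_traced, scores_traced = traced(x)        # reloaded TorchScript module
print(torch.allclose(boxes_eager, boxes_traced, atol=1e-5),
      torch.allclose(scores_eager, scores_traced, atol=1e-5))
```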
+face_regres = FaceLandmarkDetector(face_regressor) +num_params = sum(p.numel() for p in face_regres.parameters() if p.requires_grad) +print(f'Number of face_regres parameters: {num_params}') + +face_r_in = torch.randn(1, 3, 192, 192,dtype= torch.float32) +source_model = torch.jit.trace(face_regres, face_r_in) +source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/m_faceLandmark.pt")) +print("export face landmark ok!") \ No newline at end of file diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. 
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. + (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7fb769136de5f3805e830684440b3b5897d99aec --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: INT8 +- Backend: QNN2.16 +- Target Device: FV01 QCS6490 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp +mkdir build && cd build +cmake .. 
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15a817e6b6b8d96cb1edff9e2ee5dae04e7fccde --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 
398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a8.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a8.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + 
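+    // Worked example: a 640x480 input resizes to 256x192, so padh = 64 and 32 px of
+    // zero padding go on the top and bottom before the final 256x256 letterbox.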
int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? 
inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] 
+ for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for 
(int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& 
raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
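+        // Pack the extracted ROI into a 1x3x192x192 NCHW tensor and run the
+        // landmark model: it returns a face-presence flag and 468 normalized
+        // 3D landmarks, which are mapped back onto the original image using
+        // the inverse affine transforms produced by extract_roi().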
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..b365b40ca2fa478ae2e677871408350abc5cf355 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ccf2e3a2ee4ff2adf15ea7b00b453bb1a0a183ebd764e8542eb9d56182191d +size 720424 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..56eeb3e249c0a2205e5ba5bc37379b35e9323b12 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355785d3eeb5a26ad29e3b128d803d3f20b443e01bed3249ff4013ac57d634b4 +size 1068128 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. + + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. 
+ scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
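+            # Example: for two overlapping boxes with scores 0.9 and 0.6, the
+            # blended coordinates are (0.9*box_a + 0.6*box_b) / 1.5 and the
+            # reported confidence is the mean score (0.9 + 0.6) / 2 = 0.75.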
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
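+        # F.pad takes (left, right, top, bottom): TFLite's SAME padding for the
+        # 5x5 stride-2 stem conv pads 1 pixel before and 2 after in each spatial
+        # dimension, which PyTorch's symmetric conv padding cannot express.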
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..36a0bca99d904b047a4fb4dc4bcec0b93a4651dd --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py @@ -0,0 +1,424 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
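+        # Run the QNN context binary on the DSP (HTP) backend; the flag below
+        # tells AidLite the context was exported from a quantized (w8a8) model.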
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +# anchors_np = anchors.cpu().numpy().astype(np.float32) +# np.save("anchors_float32.npy", anchors_np) + +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin 
= detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..72fa909e339bab96870b8548fed5822dbc87d6fd --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: FP16 +- Backend: QNN2.16 +- Target Device: SNM972 QCS8550 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp +mkdir build && cd build +cmake .. 
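+# If cnpy or OpenCV were installed to a non-default prefix, you may need to
+# pass -DCMAKE_PREFIX_PATH=<install prefix> so cmake can find them.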
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS 
"-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..536aca649b9d8cbd0d95d2022cbdc32c40b1ed4a --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 
93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_fp16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_fp16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), 
static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - 
pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to 
[0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + 
return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
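+        // A face was found: convert each cropped 192x192 ROI into a normalized NCHW float tensor,
+        // run the landmark model on it, and map the 468 normalized landmarks back to the original
+        // image space using the inverse affine matrices produced by extract_roi().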
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..839433e794ebb61e5e0166b62966f383e6c43d60 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751919064bb2fe2682c29f514a76318d7f2d6518013cbe88e1b2c4cef8b2bb20 +size 668864 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d80e19b1013f1b872011af7abf82f3850271938 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba9a12397917cef8dcf20df00d5855fa6be79789e2bf2ae1cbd24ccf32ee666 +size 1674312 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ 
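For reference, the C++ demo above (run_test.cpp) accepts a few optional command-line flags parsed in parse_args; the defaults are the values in the Args struct, so when run from the cpp/build directory an explicit invocation would look like the following. The annotated output is written to vis_result.jpg.

```bash
./run_test \
  --faceDetector_model ../../models/m_faceDetector_fp16.qnn216.ctx.bin \
  --faceLandmark_model ../../models/m_faceLandmark_fp16.qnn216.ctx.bin \
  --imgs ../coco.jpg \
  --invoke_nums 10 \
  --model_type QNN
```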
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
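+    For example, a 640x480 (WxH) frame is resized to 256x192, padded with 32 rows of
+    zeros on the top and bottom to reach 256x256, and returns scale = 480/192 = 2.5
+    and pad = (80, 0) in original-image pixels.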
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
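+        # For stride 2, forward() pads the input asymmetrically before the depthwise conv
+        # ((0, 2, 0, 2) for a 3x3 kernel), and the max_pool defined below downsamples the
+        # skip branch to the matching spatial size.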
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
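+        # For the 256x256 back-model input this pad gives 259x259, and the stride-2 5x5 conv
+        # in the backbone then produces 128x128 feature maps, matching TFLite's SAME padding.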
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..4bacffe80b7dfb4b20ed6fe45557496d0a02925a --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_fp16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_fp16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), 
int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+    (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
+    (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
+    (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
+    (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
+    (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
+    (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
+    (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
+    (109, 10)
+]
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f6af35b4625656b8d75e35fceafd1023d67dea5b
--- /dev/null
+++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md
@@ -0,0 +1,63 @@
+## Model Information
+### Source model
+- Input shape: [1x3x256x256], [1x3x192x192]
+- Number of parameters: 0.13M, 0.6M
+- Model size: 0.58MB, 2.32MB
+- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]
+
+Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+### Converted model
+
+- Precision: INT8
+- Backend: QNN2.16
+- Target Device: SNM972 QCS8550
+
+## Inference with AidLite SDK
+
+### SDK installation
+Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
+
+- Install AidLite SDK
+
+```bash
+# Install the appropriate version of the AidLite SDK
+sudo aid-pkg update
+sudo aid-pkg install aidlite-sdk
+# Install the QNN build that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+sudo aid-pkg install aidlite-{QNN VERSION}
+```
+
+- Verify AidLite SDK
+
+```bash
+# aidlite sdk c++ check
+python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+# aidlite sdk python check
+python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+```
+
+### Run demo
+#### python
+```bash
+cd python
+python3 demo_qnn.py
+```
+
+#### c++
+```bash
+# The cnpy library is needed to load the .npy anchor file (run these commands from the default terminal directory)
+git clone https://github.com/rogersce/cnpy.git
+cd cnpy
+mkdir build && cd build
+cmake ..
+make
+sudo make install
+
+cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp
+mkdir build && cd build
+cmake ..
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS 
"-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15a817e6b6b8d96cb1edff9e2ee5dae04e7fccde --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 
93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a8.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a8.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), 
static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - 
pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to 
[0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + 
return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
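+        // Landmark stage (sketch of the flow below): the 192x192 ROI crops are packed
+        // into NCHW float32, run through the face-landmark context binary, and the 468
+        // normalized (x, y, z) landmarks are mapped back to original-image coordinates
+        // using the inverse affine matrices produced by extract_roi().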
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..71e0c857c57af5cd3fd5f945284f92ff77d831f9 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0abbe810564cd4afc9f7645ab15576b2444515c92567556a84aff7f9d4ace3 +size 347688 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..47fa26017d52528dfbb6efd01c2f632b7a0a1b35 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63778b82d307633a3c22ad2e25a228976e2df67b6e53a1366f042b9305d1b9c7 +size 797792 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ 
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
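        # The asymmetric padding is reproduced in forward() below: for kernel_size=3
        # and stride=2 the input is padded with F.pad(x, (0, 2, 0, 2)) -- nothing on
        # the top/left, two pixels on the bottom/right -- so e.g. a 128x128 map
        # becomes 130x130 and the 3x3 stride-2 depthwise conv yields 64x64, the same
        # size produced by the MaxPool2d(2, 2) skip branch.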
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
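    Subclasses (e.g. BlazeFaceLandmark later in this diff) set `resolution` and
    implement `forward`. `extract_roi` warps a rotated square crop around each
    detection to resolution x resolution and keeps the inverse affine transform;
    `denormalize_landmarks` applies that affine to map the predicted landmarks
    back into original-image coordinates.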
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
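            # Worked example: two overlapping boxes with scores 0.9 and 0.6 blend
            # into coordinates (0.9*c1 + 0.6*c2) / 1.5 and score 1.5 / 2 = 0.75,
            # i.e. coordinates are confidence-weighted, the score is a plain mean.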
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
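        # F.pad(x, (1, 2, 1, 2)) pads the width by 1 left / 2 right and the height
        # by 1 top / 2 bottom, so the 256x256 back-model input becomes 259x259 and
        # the 5x5 stride-2 first conv yields the expected 128x128 feature map
        # (128 -> 131 -> 64 for the 128x128 front model).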
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5547b031145d7e58913c711009e39f294ca046 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
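        # Anchor-relative decoding: each raw offset is divided by the 256.0 input
        # scale, multiplied by the anchor width/height and shifted by the anchor
        # center, e.g. a raw x of 12.8 with anchor (cx=0.5, w=1.0) decodes to
        # 12.8 / 256 * 1.0 + 0.5 = 0.55 in normalized image coordinates. The loop
        # below repeats the same mapping for the 6 keypoints.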
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
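        # Run the QNN context binary locally with DSP acceleration; the flag set
        # just below marks it as a quantized model (w8a8 for this variant).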
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), 
int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2d5ae2cc86de93979c3b3183841d880fbf720cc --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: W8A16 +- Backend: QNN2.16 +- Target Device: SNM972 QCS8550 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp +mkdir build && cd build +cmake .. 
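# cnpy (built and installed above) is required to load anchors_float32.npy;
# make and ./run_test below build the demo and run it on ../coco.jpg by default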
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + 
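# The flags below only silence deprecated-declaration warnings; the executable
# links against aidlite, OpenCV, pthread, jsoncpp and the cnpy library located
# by find_library() above.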
+set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8e051fdefc5ee932622844d3afb0c5d8f3f76d7 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, 
{172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point 
pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + 
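For reference, the blending strategy implemented by `weighted_non_max_suppression` above can be summarized in a few lines of NumPy. This is an illustrative sketch only, not part of the shipped sources; it assumes each detection row holds 16 coordinates (box `[ymin, xmin, ymax, xmax]` plus 6 keypoints) followed by one confidence score, which matches the decoded detections used throughout run_test.cpp.

```python
# Illustrative NumPy sketch of the weighted ("blending") NMS used above.
import numpy as np

def iou(box, boxes):
    """IoU of one [ymin, xmin, ymax, xmax] box against an (M, 4) array."""
    y1 = np.maximum(box[0], boxes[:, 0]); x1 = np.maximum(box[1], boxes[:, 1])
    y2 = np.minimum(box[2], boxes[:, 2]); x2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(y2 - y1, 0, None) * np.clip(x2 - x1, 0, None)
    area = lambda b: np.clip(b[..., 2] - b[..., 0], 0, None) * np.clip(b[..., 3] - b[..., 1], 0, None)
    return inter / np.maximum(area(box) + area(boxes) - inter, 1e-9)

def weighted_nms(dets, num_coords=16, iou_thresh=0.3):
    """dets: (N, num_coords + 1) array whose last column is the confidence score."""
    out, remaining = [], list(np.argsort(-dets[:, num_coords]))   # highest score first
    while remaining:
        best = dets[remaining[0]]
        overlap_mask = iou(best[:4], dets[remaining, :4]) > iou_thresh
        overlapping = [r for r, m in zip(remaining, overlap_mask) if m]
        remaining   = [r for r, m in zip(remaining, overlap_mask) if not m]
        scores = dets[overlapping, num_coords:num_coords + 1]
        blended = best.copy()
        blended[:num_coords] = (dets[overlapping, :num_coords] * scores).sum(0) / scores.sum()
        blended[num_coords] = scores.mean()        # average confidence of the cluster
        out.append(blended)
    return np.stack(out) if out else np.zeros((0, dets.shape[1]))
```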
det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, 
CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float 
clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..061668eb5b47e8cf8940e2169e4c3f144c05d65c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef9e6fc5d306e30b740020cb8cb496a1c207dd62c2aba75ebc0fa5ea83457fd +size 380456 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..0ad68e6aba188eb4eec12f2941c9afbb0b6d8751 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d029adb842fb3663541ada2f71b06741ccbe0e71aa0b62200ed74963b7d266bc +size 842848 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and 
b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
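As a concrete illustration of the scale and pad values described here, a worked example for a hypothetical 640x480 frame follows (illustrative numbers only, not part of the shipped code); the resulting `scale` and `pad` are exactly what `denormalize_detections` below uses to map detections back onto the original image.

```python
# Worked example of resize_pad()'s letterboxing math for a 640x480 frame.
h, w = 480, 640                        # original frame (rows, cols), illustrative

h1, w1 = 256 * h // w, 256             # height < width -> fix width at 256: 192 x 256
padh, padw = 256 - h1, 0               # 64 rows of padding, split 32 top / 32 bottom
scale = h / h1                         # 480 / 192 = 2.5
pad = (int((padh // 2) * scale), int((padw // 2) * scale))   # (80, 0) in original pixels

# denormalize_detections() then maps normalized coordinates back, e.g.:
ymin_norm, xmin_norm = 0.25, 0.5
ymin_px = ymin_norm * scale * 256 - pad[0]   # 0.25 * 2.5 * 256 - 80 = 80.0
xmin_px = xmin_norm * scale * 256 - pad[1]   # 0.50 * 2.5 * 256 -  0 = 320.0
```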
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
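As a quick sanity check of the IoU helpers (`intersect`, `jaccard`, `overlap_similarity`) defined at the end of blazebase.py above, a two-box example with illustrative values:

```python
# Two axis-aligned boxes in [min0, min1, max0, max1] order; expected IoU = 1 / 7.
import torch
from blazebase import jaccard

a = torch.tensor([[0., 0., 2., 2.]])
b = torch.tensor([[1., 1., 3., 3.]])
print(jaccard(a, b))   # tensor([[0.1429]]): intersection 1, union 4 + 4 - 1 = 7
```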
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..1f7eb48ab9aed5e3d6b2f4df608b9ec95182221f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py @@ -0,0 +1,423 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +print(f"face detction inference_time:{use_time} ms") +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + print(f"landmark inference_time:{use_time} ms") + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + 
xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +]
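
For readers tracing the post-processing in `demo_qnn.py`: the snippet below is a minimal standalone sketch (not part of this model farm package) of the anchor-decoding arithmetic implemented by `post_mediapipe_face._decode_boxes`. The anchor and raw prediction values are made up; the 256.0 scale constants mirror the class defaults defined above.

```python
# Minimal standalone sketch of the _decode_boxes arithmetic (made-up numbers,
# not repository code): one anchor, one raw detector prediction.
import numpy as np

x_scale = y_scale = w_scale = h_scale = 256.0          # class defaults above

# anchor: (x_center, y_center, w, h), normalized to [0, 1]
anchor = np.array([0.5, 0.5, 1.0, 1.0], dtype=np.float32)
# raw prediction: (dx, dy, w, h) in units of the 256x256 detector input
raw = np.array([12.0, -8.0, 64.0, 64.0], dtype=np.float32)

x_center = raw[0] / x_scale * anchor[2] + anchor[0]    # 0.546875
y_center = raw[1] / y_scale * anchor[3] + anchor[1]    # 0.468750
w = raw[2] / w_scale * anchor[2]                       # 0.25
h = raw[3] / h_scale * anchor[3]                       # 0.25

ymin, xmin = y_center - h / 2.0, x_center - w / 2.0
ymax, xmax = y_center + h / 2.0, x_center + w / 2.0
# normalized box; denormalize_detections later maps it back to the original
# image using the scale/pad returned by resize_pad
print(ymin, xmin, ymax, xmax)
```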
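
Likewise, a small self-contained sketch (fabricated detections, not repository code) of the score-weighted blending performed inside `_weighted_non_max_suppression`: overlapping boxes that pass the IoU test are averaged, weighted by their confidence scores, rather than simply keeping the single top-scoring box.

```python
# Minimal standalone sketch of the weighted-blending step used by
# _weighted_non_max_suppression (fabricated detections, not repository code).
import torch

num_coords = 16                                  # 4 box values + 6 keypoints * 2
det_a = torch.zeros(num_coords + 1)
det_b = torch.zeros(num_coords + 1)
det_a[:4] = torch.tensor([0.10, 0.10, 0.50, 0.50]); det_a[num_coords] = 0.9
det_b[:4] = torch.tensor([0.12, 0.12, 0.52, 0.52]); det_b[num_coords] = 0.6
overlapping = torch.stack([det_a, det_b])        # assume both passed the IoU test

scores = overlapping[:, num_coords:num_coords + 1]
blended_coords = (overlapping[:, :num_coords] * scores).sum(dim=0) / scores.sum()

merged = det_a.clone()
merged[:num_coords] = blended_coords
merged[num_coords] = scores.sum() / len(overlapping)   # averaged confidence
print(merged[:4], merged[num_coords])            # single blended box + score
```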