qc903113684 committed
Commit 7c475da · verified · 1 parent: 72d88ef

Upload 90 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/README.md +64 -0
  2. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt +34 -0
  3. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy +3 -0
  4. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/hand.jpg +0 -0
  5. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp +923 -0
  6. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handDetctor_w8a8.qnn216.ctx.bin +3 -0
  7. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handLandmark_w8a8.qnn216.ctx.bin +3 -0
  8. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  9. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  10. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/anchors_palm.npy +3 -0
  11. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py +513 -0
  12. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazehand_landmark.py +115 -0
  13. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazepalm.py +157 -0
  14. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py +420 -0
  15. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/export_jit.py +66 -0
  16. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/hand.jpg +0 -0
  17. model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/visualization.py +125 -0
  18. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/README.md +64 -0
  19. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt +34 -0
  20. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy +3 -0
  21. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/hand.jpg +0 -0
  22. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/run_test.cpp +923 -0
  23. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/anchors_palm.npy +3 -0
  24. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazehand_landmark.pth +3 -0
  25. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazepalm.pth +3 -0
  26. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector.pt +3 -0
  27. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector_w8a16.qnn216.ctx.bin +3 -0
  28. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark.pt +3 -0
  29. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark_w8a16.qnn216.ctx.bin +3 -0
  30. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/0000.jpg +0 -0
  31. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  32. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  33. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazebase.py +513 -0
  34. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazehand_landmark.py +115 -0
  35. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazepalm.py +157 -0
  36. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/demo_qnn.py +386 -0
  37. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/export_jit.py +66 -0
  38. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/hand.jpg +0 -0
  39. model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/visualization.py +125 -0
  40. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/README.md +64 -0
  41. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt +34 -0
  42. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy +3 -0
  43. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/hand.jpg +0 -0
  44. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp +923 -0
  45. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handDetctor_fp16.qnn216.ctx.bin +3 -0
  46. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handLandmark_fp16.qnn216.ctx.bin +3 -0
  47. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc +0 -0
  48. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc +0 -0
  49. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/anchors_palm.npy +3 -0
  50. model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py +513 -0
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
+ ## Model Information
+ ### Source model
+ - Input shape: [1x3x256x256], [1x3x256x256]
+ - Number of parameters: 1.76M, 2.01M
+ - Model size: 7.11MB, 8.09MB
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
+
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+ ### Converted model
+
+ - Precision: INT8
+ - Backend: QNN2.16
+ - Target Device: FV01 QCS6490
+
+ ## Inference with AidLite SDK
+
+ ### SDK installation
+ Model Farm uses the AidLite SDK as its model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/).
+
+ - Install AidLite SDK
+
+ ```bash
+ # Install the appropriate version of the AidLite SDK
+ sudo aid-pkg update
+ sudo aid-pkg install aidlite-sdk
+ # Install the QNN variant that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+ sudo aid-pkg install aidlite-{QNN VERSION}
+ ```
+
+ - Verify AidLite SDK
+
+ ```bash
+ # AidLite SDK C++ library check
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+ # AidLite SDK Python library check
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+ ```
+
+ ### Run demo
+ #### python
+ ```bash
+ cd python
+ python3 demo_qnn.py
+ ```
+
+
+ #### c++
+ ```bash
+ # The cnpy library is required to load the .npy file (run these commands from the terminal's default directory)
+ git clone https://github.com/rogersce/cnpy.git
+ cd cnpy
+ mkdir build && cd build
+ cmake ..
+ make
+ sudo make install
+
+ cd mediapipe-hand/model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp
+ mkdir build && cd build
+ cmake ..
+ make
+ ./run_test
+ ```
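For quick reference, the sketch below condenses the AidLite C++ inference flow that the full demo (cpp/run_test.cpp, added later in this commit) follows for the palm-detector model. The API calls, model path, and tensor shapes are taken from this repository; the zero-filled input buffer and the shortened error handling are assumptions for illustration only, since the real demo feeds a resized, padded, /255-normalized 256x256 RGB image and then post-processes the outputs.

```cpp
// Minimal sketch: load the INT8 palm-detector context binary with AidLite and run one inference.
// Mirrors the calls used in run_test.cpp; the input here is a dummy zero buffer.
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>
#include <aidlux/aidlite/aidlite.hpp>

using namespace Aidlux::Aidlite;

int main() {
    Model* model = Model::create_instance("../../models/m_handDetctor_w8a8.qnn216.ctx.bin");
    Config* config = Config::create_instance();
    if (model == nullptr || config == nullptr) return EXIT_FAILURE;

    config->implement_type = ImplementType::TYPE_LOCAL;
    config->framework_type = FrameworkType::TYPE_QNN;   // QNN2.16 backend
    config->accelerate_type = AccelerateType::TYPE_DSP; // run on the DSP/HTP
    config->is_quantify_model = 1;

    // Shapes from the README: input [1,3,256,256], outputs [1,2944,18] and [1,2944,1].
    std::vector<std::vector<uint32_t>> input_shapes = {{1, 3, 256, 256}};
    std::vector<std::vector<uint32_t>> output_shapes = {{1, 2944, 18}, {1, 2944, 1}};
    model->set_model_properties(input_shapes, DataType::TYPE_FLOAT32,
                                output_shapes, DataType::TYPE_FLOAT32);

    std::unique_ptr<Interpreter> interpreter =
        InterpreterBuilder::build_interpretper_from_model_and_config(model, config);
    if (interpreter == nullptr || interpreter->init() != EXIT_SUCCESS ||
        interpreter->load_model() != EXIT_SUCCESS) {
        return EXIT_FAILURE;
    }

    // Dummy NCHW input; the full demo fills this with the preprocessed image instead.
    std::vector<float> input(1 * 3 * 256 * 256, 0.0f);
    interpreter->set_input_tensor(0, input.data());
    interpreter->invoke();

    float* raw_boxes = nullptr;   // [1, 2944, 18]
    float* raw_scores = nullptr;  // [1, 2944, 1]
    uint32_t len0 = 0, len1 = 0;
    interpreter->get_output_tensor(0, (void**)&raw_boxes, &len0);
    interpreter->get_output_tensor(1, (void**)&raw_scores, &len1);
    printf("invoke done, output lengths: %u %u\n", len0, len1);

    interpreter->destory();
    return 0;
}
```

In the full demo these two raw tensors are then decoded against anchors_float32.npy, filtered with weighted non-max suppression, used to crop a rotated hand ROI, and finally passed to the landmark model.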
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
+ cmake_minimum_required (VERSION 3.5)
+ project("run_test")
+
+ find_package(OpenCV REQUIRED)
+ find_library(CNPY_LIB cnpy REQUIRED)
+
+ message(STATUS "OpenCV library status:")
+ message(STATUS ">version: ${OpenCV_VERSION}")
+ message(STATUS "Include: ${OpenCV_INCLUDE_DIRS}")
+
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+
+ include_directories(
+     /usr/local/include
+     /usr/include/opencv4
+ )
+
+ link_directories(
+     /usr/local/lib/
+ )
+
+ file(GLOB SRC_LISTS
+     ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
+ )
+
+ add_executable(run_test ${SRC_LISTS})
+
+ target_link_libraries(run_test
+     aidlite
+     ${OpenCV_LIBS}
+     pthread
+     jsoncpp
+     ${CNPY_LIB}
+ )
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
+ size 47232
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices
32
+ float dy = -0.5f; // set according to the model definition
33
+ float dscale = 2.6f; // scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetctor_w8a8.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_w8a8.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw the hand keypoints and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // Draw the keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // Draw the connection lines (both endpoints should be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // equivalent to int(256 * h / w)
146
+
147
+ // Adjust the scale according to the aspect ratio
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW and normalize (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU from the first 4 coordinates only (the box coordinates come first)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // Sort by confidence in descending order
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // Update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // Weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // use the average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO: currently keeps only the highest-scoring detection (single-hand demo)
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices
340
+ float dy, float dscale, float theta0
341
+ ) {
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // shift yc
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // Get the positions of the two keypoints
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // Write the outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta should be adjusted to the actual input; a fixed value is used here
383
+ // theta[i] = angle; // use the computed angle if needed
384
+ theta[i] = -0.8461;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // also return the box corner points
398
+ int resolution = 256
399
+ ) {
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // The 4 unit-square corners after the transform (same order as in the Python code)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // Compute the affine transform from the first three points
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // Check input sizes
483
+ const int num_faces = 1; // assume a single hand
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // Extract the rotation and translation from the affine matrix
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // Apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // Store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid function
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp function
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // Single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch item
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ result = fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ result = fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " hands.\n";
859
+ // A hand was detected; continue processing boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "no hand detected!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handDetctor_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28dab2c773e07727138edb87bf4ba8259d81b7fe424db7b69e5d67a6fbc28ac4
+ size 3589672
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/models/m_handLandmark_w8a8.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d03fe17aac4e260aaa0d810aee512c4a5e9a9c64a63dadd1ff1bf8bd0d98dc1
+ size 7031432
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB).
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB).
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
+ size 94336
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaing the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaining the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 values are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calculated from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, num_coords + 1).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
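The IoU helpers above are plain module-level functions, so they can be exercised in isolation. A minimal sketch (not part of the uploaded files; the box values are made up for illustration), assuming blazebase.py is importable from the working directory:

```python
import torch
from blazebase import jaccard, overlap_similarity

# Boxes in the same [min, min, max, max] layout used by the detections above.
box_a = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
box_b = torch.tensor([[1.0, 1.0, 3.0, 3.0],    # overlaps box_a, IoU = 1/7
                      [2.0, 2.0, 4.0, 4.0]])   # touches box_a only at a corner, IoU = 0
print(jaccard(box_a, box_b))                   # tensor([[0.1429, 0.0000]])
print(overlap_similarity(box_a[0], box_b))     # tensor([0.1429, 0.0000])
```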
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazehand_landmark.py ADDED
@@ -0,0 +1,115 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeLandmark, BlazeBlock
7
+
8
+ class BlazeHandLandmark(BlazeLandmark):
9
+ """The hand landmark model from MediaPipe.
10
+
11
+ """
12
+ def __init__(self):
13
+ super(BlazeHandLandmark, self).__init__()
14
+
15
+ # size of ROIs used for input
16
+ self.resolution = 256
17
+
18
+ self._define_layers()
19
+
20
+ def _define_layers(self):
21
+ self.backbone1 = nn.Sequential(
22
+ nn.Conv2d(in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=0, bias=True),
23
+ nn.ReLU(inplace=True),
24
+
25
+ BlazeBlock(24, 24, 5),
26
+ BlazeBlock(24, 24, 5),
27
+ BlazeBlock(24, 48, 5, 2),
28
+ )
29
+
30
+ self.backbone2 = nn.Sequential(
31
+ BlazeBlock(48, 48, 5),
32
+ BlazeBlock(48, 48, 5),
33
+ BlazeBlock(48, 96, 5, 2),
34
+ )
35
+
36
+ self.backbone3 = nn.Sequential(
37
+ BlazeBlock(96, 96, 5),
38
+ BlazeBlock(96, 96, 5),
39
+ BlazeBlock(96, 96, 5, 2),
40
+ )
41
+
42
+ self.backbone4 = nn.Sequential(
43
+ BlazeBlock(96, 96, 5),
44
+ BlazeBlock(96, 96, 5),
45
+ BlazeBlock(96, 96, 5, 2),
46
+ )
47
+
48
+ self.blaze5 = BlazeBlock(96, 96, 5)
49
+ self.blaze6 = BlazeBlock(96, 96, 5)
50
+ self.conv7 = nn.Conv2d(96, 48, 1, bias=True)
51
+
52
+ self.backbone8 = nn.Sequential(
53
+ BlazeBlock(48, 48, 5),
54
+ BlazeBlock(48, 48, 5),
55
+ BlazeBlock(48, 48, 5),
56
+ BlazeBlock(48, 48, 5),
57
+ BlazeBlock(48, 96, 5, 2),
58
+ BlazeBlock(96, 96, 5),
59
+ BlazeBlock(96, 96, 5),
60
+ BlazeBlock(96, 96, 5),
61
+ BlazeBlock(96, 96, 5),
62
+ BlazeBlock(96, 288, 5, 2),
63
+ BlazeBlock(288, 288, 5),
64
+ BlazeBlock(288, 288, 5),
65
+ BlazeBlock(288, 288, 5),
66
+ BlazeBlock(288, 288, 5),
67
+ BlazeBlock(288, 288, 5, 2),
68
+ BlazeBlock(288, 288, 5),
69
+ BlazeBlock(288, 288, 5),
70
+ BlazeBlock(288, 288, 5),
71
+ BlazeBlock(288, 288, 5),
72
+ BlazeBlock(288, 288, 5, 2),
73
+ BlazeBlock(288, 288, 5),
74
+ BlazeBlock(288, 288, 5),
75
+ BlazeBlock(288, 288, 5),
76
+ BlazeBlock(288, 288, 5),
77
+ BlazeBlock(288, 288, 5, 2),
78
+ BlazeBlock(288, 288, 5),
79
+ BlazeBlock(288, 288, 5),
80
+ BlazeBlock(288, 288, 5),
81
+ BlazeBlock(288, 288, 5),
82
+ )
83
+
84
+ self.hand_flag = nn.Conv2d(288, 1, 2, bias=True)
85
+ self.handed = nn.Conv2d(288, 1, 2, bias=True)
86
+ self.landmarks = nn.Conv2d(288, 63, 2, bias=True)
87
+
88
+
89
+ def forward(self, x):
90
+ if x.shape[0] == 0:
91
+ return torch.zeros((0,)), torch.zeros((0,)), torch.zeros((0, 21, 3))
92
+
93
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
94
+
95
+ x = self.backbone1(x)
96
+ y = self.backbone2(x)
97
+ z = self.backbone3(y)
98
+ w = self.backbone4(z)
99
+
100
+ z = z + F.interpolate(w, scale_factor=2, mode='bilinear')
101
+ z = self.blaze5(z)
102
+
103
+ y = y + F.interpolate(z, scale_factor=2, mode='bilinear')
104
+ y = self.blaze6(y)
105
+ y = self.conv7(y)
106
+
107
+ x = x + F.interpolate(y, scale_factor=2, mode='bilinear')
108
+
109
+ x = self.backbone8(x)
110
+
111
+ hand_flag = self.hand_flag(x).view(-1).sigmoid()
112
+ handed = self.handed(x).view(-1).sigmoid()
113
+ landmarks = self.landmarks(x).view(-1, 21, 3) / 256
114
+
115
+ return hand_flag, handed, landmarks
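A quick shape check for this landmark head, useful before tracing or exporting it. This is only a sketch; loading the .pth weights (path assumed, as in python/export_jit.py) is optional when only checking shapes:

```python
import torch
from blazehand_landmark import BlazeHandLandmark

hand_lm = BlazeHandLandmark()
# hand_lm.load_weights("../models/blazehand_landmark.pth")  # assumed path, as in export_jit.py

roi = torch.zeros((1, 3, 256, 256))   # one 256x256 hand ROI
with torch.no_grad():
    flag, handed, landmarks = hand_lm(roi)
print(flag.shape, handed.shape, landmarks.shape)
# torch.Size([1]) torch.Size([1]) torch.Size([1, 21, 3])
```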
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/blazepalm.py ADDED
@@ -0,0 +1,157 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeDetector, BlazeBlock
7
+
8
+
9
+ class BlazePalm(BlazeDetector):
10
+ """The palm detection model from MediaPipe. """
11
+ def __init__(self):
12
+ super(BlazePalm, self).__init__()
13
+
14
+ # These are the settings from the MediaPipe example graph
15
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt
16
+ self.num_classes = 1
17
+ self.num_anchors = 2944
18
+ self.num_coords = 18
19
+ self.score_clipping_thresh = 100.0
20
+ self.x_scale = 256.0
21
+ self.y_scale = 256.0
22
+ self.h_scale = 256.0
23
+ self.w_scale = 256.0
24
+ self.min_score_thresh = 0.5
25
+ self.min_suppression_threshold = 0.3
26
+ self.num_keypoints = 7
27
+
28
+ # These settings are for converting detections to ROIs which can then
29
+ # be extracted and feed into the landmark network
30
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
31
+ self.detection2roi_method = 'box'
32
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt
33
+ self.kp1 = 0
34
+ self.kp2 = 2
35
+ self.theta0 = np.pi/2
36
+ self.dscale = 2.6
37
+ self.dy = -0.5
38
+
39
+ self._define_layers()
40
+
41
+ def _define_layers(self):
42
+ self.backbone1 = nn.Sequential(
43
+ nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=0, bias=True),
44
+ nn.ReLU(inplace=True),
45
+
46
+ BlazeBlock(32, 32),
47
+ BlazeBlock(32, 32),
48
+ BlazeBlock(32, 32),
49
+ BlazeBlock(32, 32),
50
+ BlazeBlock(32, 32),
51
+ BlazeBlock(32, 32),
52
+ BlazeBlock(32, 32),
53
+
54
+ BlazeBlock(32, 64, stride=2),
55
+ BlazeBlock(64, 64),
56
+ BlazeBlock(64, 64),
57
+ BlazeBlock(64, 64),
58
+ BlazeBlock(64, 64),
59
+ BlazeBlock(64, 64),
60
+ BlazeBlock(64, 64),
61
+ BlazeBlock(64, 64),
62
+
63
+ BlazeBlock(64, 128, stride=2),
64
+ BlazeBlock(128, 128),
65
+ BlazeBlock(128, 128),
66
+ BlazeBlock(128, 128),
67
+ BlazeBlock(128, 128),
68
+ BlazeBlock(128, 128),
69
+ BlazeBlock(128, 128),
70
+ BlazeBlock(128, 128),
71
+
72
+ )
73
+
74
+ self.backbone2 = nn.Sequential(
75
+ BlazeBlock(128, 256, stride=2),
76
+ BlazeBlock(256, 256),
77
+ BlazeBlock(256, 256),
78
+ BlazeBlock(256, 256),
79
+ BlazeBlock(256, 256),
80
+ BlazeBlock(256, 256),
81
+ BlazeBlock(256, 256),
82
+ BlazeBlock(256, 256),
83
+ )
84
+
85
+ self.backbone3 = nn.Sequential(
86
+ BlazeBlock(256, 256, stride=2),
87
+ BlazeBlock(256, 256),
88
+ BlazeBlock(256, 256),
89
+ BlazeBlock(256, 256),
90
+ BlazeBlock(256, 256),
91
+ BlazeBlock(256, 256),
92
+ BlazeBlock(256, 256),
93
+ BlazeBlock(256, 256),
94
+ )
95
+
96
+ self.conv_transpose1 = nn.ConvTranspose2d(in_channels=256, out_channels=256, kernel_size=2, stride=2, padding=0, bias=True)
97
+ self.blaze1 = BlazeBlock(256, 256)
98
+
99
+ self.conv_transpose2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2, padding=0, bias=True)
100
+ self.blaze2 = BlazeBlock(128, 128)
101
+
102
+ self.classifier_32 = nn.Conv2d(128, 2, 1, bias=True)
103
+ self.classifier_16 = nn.Conv2d(256, 2, 1, bias=True)
104
+ self.classifier_8 = nn.Conv2d(256, 6, 1, bias=True)
105
+
106
+ self.regressor_32 = nn.Conv2d(128, 36, 1, bias=True)
107
+ self.regressor_16 = nn.Conv2d(256, 36, 1, bias=True)
108
+ self.regressor_8 = nn.Conv2d(256, 108, 1, bias=True)
109
+
110
+ def forward(self, x):
111
+ b = x.shape[0] # batch size, needed for reshaping later
112
+
113
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
114
+
115
+ x = self.backbone1(x) # (b, 128, 32, 32)
116
+ y = self.backbone2(x) # (b, 256, 16, 16)
117
+ z = self.backbone3(y) # (b, 256, 8, 8)
118
+
119
+ y = y + F.relu(self.conv_transpose1(z), True)
120
+ y = self.blaze1(y)
121
+
122
+ x = x + F.relu(self.conv_transpose2(y), True)
123
+ x = self.blaze2(x)
124
+
125
+
126
+ # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
127
+ # permute the output from the conv layers before reshaping it.
128
+
129
+ c1 = self.classifier_8(z) # (b, 6, 8, 8)
130
+ c1 = c1.permute(0, 2, 3, 1) # (b, 8, 8, 6)
131
+ c1 = c1.reshape(b, -1, 1) # (b, 384, 1)
132
+
133
+ c2 = self.classifier_16(y) # (b, 2, 16, 16)
134
+ c2 = c2.permute(0, 2, 3, 1) # (b, 16, 16, 2)
135
+ c2 = c2.reshape(b, -1, 1) # (b, 512, 1)
136
+
137
+ c3 = self.classifier_32(x) # (b, 2, 32, 32)
138
+ c3 = c3.permute(0, 2, 3, 1) # (b, 32, 32, 2)
139
+ c3 = c3.reshape(b, -1, 1) # (b, 2048, 1)
140
+
141
+ c = torch.cat((c3, c2, c1), dim=1) # (b, 2944, 1)
142
+
143
+ r1 = self.regressor_8(z) # (b, 108, 8, 8)
144
+ r1 = r1.permute(0, 2, 3, 1) # (b, 8, 8, 108)
145
+ r1 = r1.reshape(b, -1, 18) # (b, 384, 18)
146
+
147
+ r2 = self.regressor_16(y) # (b, 36, 16, 16)
148
+ r2 = r2.permute(0, 2, 3, 1) # (b, 16, 16, 36)
149
+ r2 = r2.reshape(b, -1, 18) # (b, 512, 18)
150
+
151
+ r3 = self.regressor_32(x) # (b, 36, 32, 32)
152
+ r3 = r3.permute(0, 2, 3, 1) # (b, 32, 32, 36)
153
+ r3 = r3.reshape(b, -1, 18) # (b, 2048, 18)
154
+
155
+ r = torch.cat((r3, r2, r1), dim=1) # (b, 2944, 18)
156
+
157
+ return [r, c]
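A minimal sketch of running this detector head on a dummy input and handing its raw outputs to the BlazeDetector postprocessing inherited from blazebase.py. The anchor path is an assumption (the file ships as python/anchors_palm.npy, and load_anchors is used the same way in python/export_jit.py); with random weights and a blank input the detection list is simply empty, but the raw shapes match the converted QNN model ([1, 2944, 18] and [1, 2944, 1]):

```python
import torch
from blazepalm import BlazePalm

palm = BlazePalm()
palm.load_anchors("anchors_palm.npy")   # assumed path
palm.min_score_thresh = 0.75

x = torch.zeros((1, 3, 256, 256))
with torch.no_grad():
    r, c = palm(x)                      # (1, 2944, 18), (1, 2944, 1)

dets = palm._tensors_to_detections(r, c, palm.anchors)
hands = palm._weighted_non_max_suppression(dets[0])
print(r.shape, c.shape, len(hands))
```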
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py ADDED
@@ -0,0 +1,420 @@
1
+ import numpy as np
2
+ import torch
3
+ import cv2
4
+ import sys
5
+ from blazebase import resize_pad, denormalize_detections
6
+ from visualization import draw_landmarks, draw_roi, HAND_CONNECTIONS
7
+ import time
8
+ import aidlite
9
+ import os
10
+
11
+
12
+ class post_mediapipe_hand:
13
+ def __init__(self):
14
+ self.kp1 = 0
15
+ self.kp2 = 2
16
+ self.theta0 = 1.5707963267948966
17
+ self.dscale = 2.6
18
+ self.dy = -0.5
19
+ self.x_scale = 256.0
20
+ self.y_scale = 256.0
21
+ self.h_scale = 256.0
22
+ self.w_scale = 256.0
23
+ self.num_keypoints = 7
24
+ self.num_classes = 1
25
+ self.num_anchors = 2944
26
+ self.num_coords = 18
27
+ self.min_score_thresh = 0.75
28
+ self.score_clipping_thresh = 100.0
29
+ self.min_suppression_threshold = 0.3
30
+ self.resolution = 256
31
+
32
+
33
+ def detection2roi(self,detection):
34
+ xc = (detection[:,1] + detection[:,3]) / 2
35
+ yc = (detection[:,0] + detection[:,2]) / 2
36
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
37
+ yc += self.dy * scale
38
+ scale *= self.dscale
39
+ # compute box rotation
40
+ x0 = detection[:,4+2*self.kp1]
41
+ y0 = detection[:,4+2*self.kp1+1]
42
+ x1 = detection[:,4+2*self.kp2]
43
+ y1 = detection[:,4+2*self.kp2+1]
44
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
45
+ return xc, yc, scale, theta
46
+
47
+ def _decode_boxes( self,raw_boxes, anchors):
48
+ boxes = torch.zeros_like(raw_boxes)
49
+
50
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
51
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
52
+
53
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
54
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
55
+
56
+ boxes[..., 0] = y_center - h / 2. # ymin
57
+ boxes[..., 1] = x_center - w / 2. # xmin
58
+ boxes[..., 2] = y_center + h / 2. # ymax
59
+ boxes[..., 3] = x_center + w / 2. # xmax
60
+
61
+ for k in range(self.num_keypoints):
62
+ offset = 4 + k*2
63
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
64
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
65
+ boxes[..., offset ] = keypoint_x
66
+ boxes[..., offset + 1] = keypoint_y
67
+ return boxes
68
+
69
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
70
+ assert raw_box_tensor.ndimension() == 3
71
+ assert raw_box_tensor.shape[1] == self.num_anchors
72
+ assert raw_box_tensor.shape[2] == self.num_coords
73
+
74
+ assert raw_score_tensor.ndimension() == 3
75
+ assert raw_score_tensor.shape[1] == self.num_anchors
76
+ assert raw_score_tensor.shape[2] == self.num_classes
77
+
78
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
79
+
80
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
81
+
82
+ thresh = self.score_clipping_thresh
83
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
84
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
85
+
86
+ # Note: we stripped off the last dimension from the scores tensor
87
+ # because there is only has one class. Now we can simply use a mask
88
+ # to filter out the boxes with too low confidence.
89
+ mask = detection_scores >= self.min_score_thresh
90
+
91
+ # Because each image from the batch can have a different number of
92
+ # detections, process them one at a time using a loop.
93
+ output_detections = []
94
+ for i in range(raw_box_tensor.shape[0]):
95
+ boxes = detection_boxes[i, mask[i]]
96
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
97
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
98
+
99
+ return output_detections
100
+
101
+ def extract_roi( self,frame, xc, yc, theta, scale):
102
+ # take points on unit square and transform them according to the roi
103
+ points = torch.tensor([[-1, -1, 1, 1],
104
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
105
+ points = points * scale.view(-1,1,1)/2
106
+ theta = theta.view(-1, 1, 1)
107
+ R = torch.cat((
108
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
109
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
110
+ ), 1)
111
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
112
+ points = R @ points + center
113
+
114
+ # use the points to compute the affine transform that maps
115
+ # these points back to the output square
116
+ res = self.resolution
117
+ points1 = np.array([[0, 0, res-1],
118
+ [0, res-1, 0]], dtype=np.float32).T
119
+ affines = []
120
+ imgs = []
121
+ for i in range(points.shape[0]):
122
+ pts = points[i, :, :3].detach().numpy().T
123
+ M = cv2.getAffineTransform(pts, points1)
124
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
125
+ img = torch.tensor(img, device=scale.device)
126
+ imgs.append(img)
127
+ affine = cv2.invertAffineTransform(M).astype('float32')
128
+ affine = torch.tensor(affine, device=scale.device)
129
+ affines.append(affine)
130
+ if imgs:
131
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
132
+ affines = torch.stack(affines)
133
+ else:
134
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
135
+ affines = torch.zeros((0, 2, 3), device=scale.device)
136
+
137
+ return imgs, affines, points
138
+
139
+ def denormalize_landmarks(self, landmarks, affines):
140
+ landmarks[:,:,:2] *= self.resolution
141
+ for i in range(len(landmarks)):
142
+ landmark, affine = landmarks[i], affines[i]
143
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
144
+ landmarks[i,:,:2] = landmark
145
+ return landmarks
146
+
147
+ def intersect(self,box_a, box_b):
148
+ A = box_a.size(0)
149
+ B = box_b.size(0)
150
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
151
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
152
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
153
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
154
+ inter = torch.clamp((max_xy - min_xy), min=0)
155
+ return inter[:, :, 0] * inter[:, :, 1]
156
+
157
+ def jaccard(self,box_a, box_b):
158
+ inter = self.intersect(box_a, box_b)
159
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
160
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
161
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
162
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
163
+ union = area_a + area_b - inter
164
+ return inter / union # [A,B]
165
+
166
+
167
+ def overlap_similarity(self,box, other_boxes):
168
+ """Computes the IOU between a bounding box and set of other boxes."""
169
+ return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
170
+
171
+ def _weighted_non_max_suppression(self,detections):
172
+ if len(detections) == 0: return []
173
+ output_detections = []
174
+
175
+ # Sort the detections from highest to lowest score.
176
+ remaining = torch.argsort(detections[:, num_coords], descending=True)
177
+
178
+ while len(remaining) > 0:
179
+ detection = detections[remaining[0]]
180
+
181
+ # Compute the overlap between the first box and the other
182
+ # remaining boxes. (Note that the other_boxes also include
183
+ # the first_box.)
184
+ first_box = detection[:4]
185
+ other_boxes = detections[remaining, :4]
186
+ ious = self.overlap_similarity(first_box, other_boxes)
187
+
188
+ # If two detections don't overlap enough, they are considered
189
+ # to be from different faces.
190
+ mask = ious > self.min_suppression_threshold
191
+ overlapping = remaining[mask]
192
+ remaining = remaining[~mask]
193
+
194
+ # Take an average of the coordinates from the overlapping
195
+ # detections, weighted by their confidence scores.
196
+ weighted_detection = detection.clone()
197
+ if len(overlapping) > 1:
198
+ coordinates = detections[overlapping, :num_coords]
199
+ scores = detections[overlapping, num_coords:num_coords+1]
200
+ total_score = scores.sum()
201
+ weighted = (coordinates * scores).sum(dim=0) / total_score
202
+ weighted_detection[:num_coords] = weighted
203
+ weighted_detection[num_coords] = total_score / len(overlapping)
204
+
205
+ output_detections.append(weighted_detection)
206
+
207
+ return output_detections
208
+
209
+ def draw_detections(img, detections, with_keypoints=True):
210
+ if isinstance(detections, torch.Tensor):
211
+ detections = detections.detach().numpy()
212
+
213
+ if detections.ndim == 1:
214
+ detections = np.expand_dims(detections, axis=0)
215
+
216
+ n_keypoints = detections.shape[1] // 2 - 2
217
+
218
+ for i in range(detections.shape[0]):
219
+ ymin = detections[i, 0]
220
+ xmin = detections[i, 1]
221
+ ymax = detections[i, 2]
222
+ xmax = detections[i, 3]
223
+
224
+ start_point = (int(xmin), int(ymin))
225
+ end_point = (int(xmax), int(ymax))
226
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
227
+
228
+ if with_keypoints:
229
+ for k in range(n_keypoints):
230
+ kp_x = int(detections[i, 4 + k*2 ])
231
+ kp_y = int(detections[i, 4 + k*2 + 1])
232
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
233
+ return img
234
+
235
+
236
+
237
+ post_process=post_mediapipe_hand()
238
+
239
+ class handDetectionQnn:
240
+ def __init__(self):
241
+ super().__init__()
242
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handDetctor_w8a8.qnn216.ctx.bin"))
243
+ if self.model is None:
244
+ print("Create model failed !")
245
+ return
246
+
247
+ self.config = aidlite.Config.create_instance()
248
+ if self.config is None:
249
+ print("build_interpretper_from_model_and_config failed !")
250
+ return
251
+
252
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
253
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
254
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
255
+ self.config.is_quantify_model = 1
256
+
257
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
258
+ if self.interpreter is None:
259
+ print("build_interpretper_from_model_and_config failed !")
260
+ return
261
+ input_shapes = [[1,3, 256, 256]]
262
+ output_shapes = [[1, 2944,18],[1,2944,1]]
263
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
264
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
265
+
266
+ if self.interpreter is None:
267
+ print("build_interpretper_from_model_and_config failed !")
268
+ result = self.interpreter.init()
269
+ if result != 0:
270
+ print(f"interpreter init failed !")
271
+ result = self.interpreter.load_model()
272
+ if result != 0:
273
+ print("interpreter load model failed !")
274
+
275
+ print(" model load success!")
276
+
277
+ def __call__(self, input):
278
+ self.interpreter.set_input_tensor(0,input)
279
+ invoke_time=[]
280
+ invoke_nums =10
281
+ for i in range(invoke_nums):
282
+ result = self.interpreter.set_input_tensor(0, input.data)
283
+ if result != 0:
284
+ print("interpreter set_input_tensor() failed")
285
+ t1=time.time()
286
+ result = self.interpreter.invoke()
287
+ cost_time = (time.time()-t1)*1000
288
+ invoke_time.append(cost_time)
289
+
290
+ max_invoke_time = max(invoke_time)
291
+ min_invoke_time = min(invoke_time)
292
+ mean_invoke_time = sum(invoke_time)/invoke_nums
293
+ var_invoketime=np.var(invoke_time)
294
+ print("====================================")
295
+ print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
296
+ print("====================================")
297
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1, 2944,18).copy()
298
+ features_1 = self.interpreter.get_output_tensor(1).reshape(1, 2944,1).copy()
299
+ return features_0,features_1
300
+
301
+
302
+ class handLandmarkQnn:
303
+ def __init__(self):
304
+ super().__init__()
305
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handLandmark_w8a8.qnn216.ctx.bin"))
306
+ if self.model is None:
307
+ print("Create model failed !")
308
+ return
309
+
310
+ self.config = aidlite.Config.create_instance()
311
+ if self.config is None:
312
+ print("build_interpretper_from_model_and_config failed !")
313
+ return
314
+
315
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
316
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
317
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
318
+ self.config.is_quantify_model = 1
319
+
320
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
321
+ if self.interpreter is None:
322
+ print("build_interpretper_from_model_and_config failed !")
323
+ return
324
+ input_shapes = [[1, 3, 256, 256]]
325
+ output_shapes = [[1],[1],[1,21,3]]
326
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
327
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
328
+
329
+ if self.interpreter is None:
330
+ print("build_interpretper_from_model_and_config failed !")
331
+ result = self.interpreter.init()
332
+ if result != 0:
333
+ print(f"interpreter init failed !")
334
+ result = self.interpreter.load_model()
335
+ if result != 0:
336
+ print("interpreter load model failed !")
337
+
338
+ print(" model load success!")
339
+
340
+ def __call__(self, input):
341
+ self.interpreter.set_input_tensor(0,input)
342
+ invoke_time=[]
343
+ invoke_nums =10
344
+ for i in range(invoke_nums):
345
+ result = self.interpreter.set_input_tensor(0, input.data)
346
+ if result != 0:
347
+ print("interpreter set_input_tensor() failed")
348
+ t1=time.time()
349
+ result = self.interpreter.invoke()
350
+ cost_time = (time.time()-t1)*1000
351
+ invoke_time.append(cost_time)
352
+
353
+ max_invoke_time = max(invoke_time)
354
+ min_invoke_time = min(invoke_time)
355
+ mean_invoke_time = sum(invoke_time)/invoke_nums
356
+ var_invoketime=np.var(invoke_time)
357
+ print("====================================")
358
+ print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
359
+ print("====================================")
360
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
361
+ features_1 = self.interpreter.get_output_tensor(2).reshape(1,21,3).copy()
362
+ return features_0,features_1
363
+
364
+
365
+
366
+ anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_palm.npy")), dtype=torch.float32, device='cpu')
367
+ # anchors_np = anchors.cpu().numpy().astype(np.float32)
368
+ # np.save("anchors_float32.npy", anchors_np)
369
+ hand_detc = handDetectionQnn()
370
+ hand_rec = handLandmarkQnn()
371
+
372
+ image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"hand.jpg")
373
+
374
+ frame_ct=0
375
+ image = cv2.imread(image_path)
376
+
377
+ frame = np.ascontiguousarray(image[:,:,::-1])
378
+
379
+ img1, img2, scale, pad = resize_pad(frame)
380
+
381
+ input = (img1 / 255).astype(np.float32)
382
+ input = np.transpose(input, (2, 0, 1))
383
+ input = input[np.newaxis, ...]
384
+ t0 = time.time()
385
+ out = hand_detc(input)
386
+ use_time = round((time.time() - t0) * 1000, 2)
387
+ detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)
388
+
389
+ filtered_detections = []
390
+ num_coords = 18
391
+ for i in range(len(detections)):
392
+ faces = post_process._weighted_non_max_suppression(detections[i])
393
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1))
394
+ filtered_detections.append(faces)
395
+
396
+ face_detections = denormalize_detections(filtered_detections[0], scale, pad)
397
+
398
+ xc, yc, scale, theta = post_process.detection2roi(face_detections)
399
+
400
+ img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
401
+ if box.size()[0]!=0:
402
+ t2 = time.time()
403
+ flags, normalized_landmarks = hand_rec(img.numpy())
404
+
405
+ use_time = round((time.time() - t2) * 1000, 2)
406
+
407
+ landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)
408
+
409
+ for i in range(len(flags)):
410
+ landmark, flag = landmarks[i], flags[i]
411
+ if flag>.4: # 0.5
412
+ draw_landmarks(frame, landmark[:,:2], HAND_CONNECTIONS, size=2)
413
+ else:
414
+ print("not detect palm !")
415
+
416
+ draw_roi(frame, box)
417
+ draw_detections(frame, face_detections)
418
+ cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1])
419
+ hand_detc.interpreter.destory()
420
+ hand_rec.interpreter.destory()
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/export_jit.py ADDED
@@ -0,0 +1,66 @@
 
1
+ import numpy as np
2
+ import torch
3
+ import os
4
+ from typing import Callable, Tuple
5
+ from blazepalm import BlazePalm
6
+ from blazehand_landmark import BlazeHandLandmark
7
+
8
+
9
+ gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10
+ torch.set_grad_enabled(False)
11
+
12
+
13
+
14
+ class HandDetector(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
18
+ anchors: torch.Tensor,
19
+ ):
20
+ super().__init__()
21
+ self.detector = detector
22
+ self.anchors = anchors
23
+
24
+ def forward(self, image):
25
+ return self.detector(image)
26
+
27
+ class HandLandmarkDetector(torch.nn.Module):
28
+ def __init__(
29
+ self,
30
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
31
+ ):
32
+ super().__init__()
33
+ self.detector = detector
34
+
35
+ def forward(self, image):
36
+ return self.detector(image)
37
+
38
+
39
+
40
+
41
+ palm_detector = BlazePalm().to(gpu)
42
+ palm_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazepalm.pth"))
43
+ palm_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy"))
44
+ palm_detector.min_score_thresh = .75
45
+
46
+ num_params = sum(p.numel() for p in palm_detector.parameters() if p.requires_grad)
47
+ print(f'Number of palm_detector parameters: {num_params}')
48
+
49
+ hand_regressor = BlazeHandLandmark().to(gpu)
50
+ hand_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazehand_landmark.pth"))
51
+ num_params = sum(p.numel() for p in hand_regressor.parameters() if p.requires_grad)
52
+ print(f'Number of hand_landmark parameters: {num_params}')
53
+
54
+ hand_detect = HandDetector(palm_detector,palm_detector.anchors)
55
+ hand_regres = HandLandmarkDetector(hand_regressor)
56
+
57
+ hand_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
58
+
59
+ source_model = torch.jit.trace(hand_detect.to("cpu"),hand_d_in)
60
+ source_model.save("m_handDetector.pt")
61
+ print("export hand detect ok!")
62
+
63
+ hand_r_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
64
+ source_model = torch.jit.trace(hand_regres.to("cpu"), hand_r_in)
65
+ source_model.save("m_handLandmark.pt")
66
+ print("export hand landmark ok!")
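A quick way to sanity-check the traced modules before feeding them to the QNN conversion tooling. A sketch only, assuming the two .pt files written above sit in the current directory:

```python
import torch

det = torch.jit.load("m_handDetector.pt")
lmk = torch.jit.load("m_handLandmark.pt")

x = torch.zeros((1, 3, 256, 256))
r, c = det(x)
flag, handed, pts = lmk(x)
print(r.shape, c.shape)       # torch.Size([1, 2944, 18]) torch.Size([1, 2944, 1])
print(flag.shape, pts.shape)  # torch.Size([1]) torch.Size([1, 21, 3])
```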
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_int8_aidlite/python/visualization.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import cv2
3
+ import torch
4
+
5
+ def draw_detections(img, detections, with_keypoints=True):
6
+ if isinstance(detections, torch.Tensor):
7
+ detections = detections.cpu().numpy()
8
+
9
+ if detections.ndim == 1:
10
+ detections = np.expand_dims(detections, axis=0)
11
+
12
+ n_keypoints = detections.shape[1] // 2 - 2
13
+
14
+ for i in range(detections.shape[0]):
15
+ ymin = detections[i, 0]
16
+ xmin = detections[i, 1]
17
+ ymax = detections[i, 2]
18
+ xmax = detections[i, 3]
19
+
20
+ start_point = (int(xmin), int(ymin))
21
+ end_point = (int(xmax), int(ymax))
22
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
23
+
24
+ if with_keypoints:
25
+ for k in range(n_keypoints):
26
+ kp_x = int(detections[i, 4 + k*2 ])
27
+ kp_y = int(detections[i, 4 + k*2 + 1])
28
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
29
+ return img
30
+
31
+
32
+ def draw_roi(img, roi):
33
+ for i in range(roi.shape[0]):
34
+ (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i]
35
+ cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2)
36
+ cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2)
37
+ cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2)
38
+ cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2)
39
+
40
+
41
+ def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
42
+ points = points[:,:2]
43
+ for point in points:
44
+ x, y = point
45
+ x, y = int(x), int(y)
46
+ cv2.circle(img, (x, y), size, color, thickness=size)
47
+ for connection in connections:
48
+ x0, y0 = points[connection[0]]
49
+ x1, y1 = points[connection[1]]
50
+ x0, y0 = int(x0), int(y0)
51
+ x1, y1 = int(x1), int(y1)
52
+ cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size)
53
+
54
+
55
+
56
+ # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
57
+ # 8 12 16 20
58
+ # | | | |
59
+ # 7 11 15 19
60
+ # 4 | | | |
61
+ # | 6 10 14 18
62
+ # 3 | | | |
63
+ # | 5---9---13--17
64
+ # 2 \ /
65
+ # \ \ /
66
+ # 1 \ /
67
+ # \ \ /
68
+ # ------0-
69
+ HAND_CONNECTIONS = [
70
+ (0, 1), (1, 2), (2, 3), (3, 4),
71
+ (5, 6), (6, 7), (7, 8),
72
+ (9, 10), (10, 11), (11, 12),
73
+ (13, 14), (14, 15), (15, 16),
74
+ (17, 18), (18, 19), (19, 20),
75
+ (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
76
+ ]
77
+
78
+ POSE_CONNECTIONS = [
79
+ (0,1), (1,2), (2,3), (3,7),
80
+ (0,4), (4,5), (5,6), (6,8),
81
+ (9,10),
82
+ (11,13), (13,15), (15,17), (17,19), (19,15), (15,21),
83
+ (12,14), (14,16), (16,18), (18,20), (20,16), (16,22),
84
+ (11,12), (12,24), (24,23), (23,11)
85
+ ]
86
+
87
+ # Vertex indices can be found in
88
+ # github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
89
+ # Found in github.com/google/mediapipe/python/solutions/face_mesh.py
90
+ FACE_CONNECTIONS = [
91
+ # Lips.
92
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
93
+ (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
94
+ (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
95
+ (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
96
+ (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
97
+ (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
98
+ (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
99
+ (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
100
+ # Left eye.
101
+ (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
102
+ (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
103
+ (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
104
+ (398, 362),
105
+ # Left eyebrow.
106
+ (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
107
+ (293, 334), (334, 296), (296, 336),
108
+ # Right eye.
109
+ (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
110
+ (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
111
+ (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
112
+ (173, 133),
113
+ # Right eyebrow.
114
+ (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
115
+ (105, 66), (66, 107),
116
+ # Face oval.
117
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
118
+ (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
119
+ (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
120
+ (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
121
+ (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
122
+ (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
123
+ (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
124
+ (109, 10)
125
+ ]
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: [1x3x256x256], [1x3x256x256]
4
+ - Number of parameters: 1.76M, 2.01M
5
+ - Model size: 7.11MB, 8.09MB
6
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
7
+
8
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: W8A16
13
+ - Backend: QNN2.16
14
+ - Target Device: FV01 QCS6490
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Download the QNN version that matches the backend above, e.g. to install QNN2.23 AidLite: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # aidlite sdk c++ check
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # aidlite sdk python check
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ #### python
43
+ ```bash
44
+ cd python
45
+ python3 demo_qnn.py
46
+ ```
47
+
48
+ #### c++
49
+ ```bash
50
+ # Loading .npy files requires the cnpy library (run the following from the default terminal directory)
51
+ git clone https://github.com/rogersce/cnpy.git
52
+ cd cnpy
53
+ mkdir build && cd build
54
+ cmake ..
55
+ make
56
+ sudo make install
57
+
58
+ cd mediapipe-hand/model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp
59
+ mkdir build && cd build
60
+ cmake ..
61
+ make
62
+ ./run_test
63
+ ```
64
+
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
1
+ cmake_minimum_required (VERSION 3.5)
2
+ project("run_test")
3
+
4
+ find_package(OpenCV REQUIRED)
5
+ find_library(CNPY_LIB cnpy REQUIRED)
6
+
7
+ message(STATUS "OpenCV Library status:")
8
+ message(STATUS ">version:${OpenCV_VERSION}")
9
+ message(STATUS "Include:${OpenCV_INCLUDE_DIRS}")
10
+
11
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
12
+
13
+ include_directories(
14
+ /usr/local/include
15
+ /usr/include/opencv4
16
+ )
17
+
18
+ link_directories(
19
+ /usr/local/lib/
20
+ )
21
+
22
+ file(GLOB SRC_LISTS
23
+ ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
24
+ )
25
+
26
+ add_executable(run_test ${SRC_LISTS})
27
+
28
+ target_link_libraries(run_test
29
+ aidlite
30
+ ${OpenCV_LIBS}
31
+ pthread
32
+ jsoncpp
33
+ ${CNPY_LIB}
34
+ )
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
3
+ size 47232
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices
32
+ float dy = -0.5f; // vertical offset defined by the model
33
+ float dscale = 2.6f; // scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetector_w8a16.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_w8a16.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw hand keypoints and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // Draw the keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // Draw the connection lines (both endpoints must be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w)
146
+
147
+ // Adjust the scale according to the image width and height
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW and normalize it (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU using only the first 4 coordinates (the box occupies the first 4 values)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // Sort by confidence in descending order
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // Update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // Weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // use the average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices used to estimate rotation
340
+ float dy, float dscale, float theta0
341
+ ) {
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // shift the box center vertically by dy * scale
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // positions of the two keypoints used for the rotation estimate
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // write the outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta should be derived from the keypoint angle computed above;
+ // a fixed value is used here as a temporary workaround.
383
+ // theta[i] = angle; // use the computed rotation angle instead
384
+ theta[i] = -0.8461f;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // also returns the transformed box corner points
398
+ int resolution = 256
399
+ ) {
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // The four unit-square corners transformed into image coordinates (same order as the Python reference)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // Compute the affine transform from the first three corners
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // validate input sizes
483
+ const int num_faces = 1; // assume a single detected hand (identifier kept from the face pipeline)
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional depth value
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // split the affine into its 2x2 rotation/scale part and 2x1 translation
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f (one per hand landmark)
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence score
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch entry
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ result = fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ result = fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " hand ROI(s).\n";
859
+ // a hand was detected; continue processing with boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "no hand detected!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
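The IoU and weighted-blend arithmetic used by weighted_non_max_suppression in run_test.cpp above is easy to spot-check offline. The following minimal NumPy sketch is not part of the repository: it mirrors that math on two hand-picked boxes, and the simplified layout (4 box coordinates plus a score, keypoints omitted) is an illustrative assumption; the real detections carry 18 coordinates plus a score.

    import numpy as np

    def iou(a, b):
        # same min/max intersection-over-union as the C++ IoU() helper
        x1, y1 = max(a[0], b[0]), max(a[1], b[1])
        x2, y2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
        area_a = max(0.0, a[2] - a[0]) * max(0.0, a[3] - a[1])
        area_b = max(0.0, b[2] - b[0]) * max(0.0, b[3] - b[1])
        union = area_a + area_b - inter
        return inter / union if union > 0 else 0.0

    # two overlapping detections: 4 box coords + score (keypoints omitted for brevity)
    d1 = np.array([0.10, 0.10, 0.50, 0.50, 0.9])
    d2 = np.array([0.12, 0.12, 0.52, 0.52, 0.6])
    print(iou(d1, d2))          # ~0.82, well above the 0.3 suppression threshold

    # weighted blend as in weighted_non_max_suppression:
    # coordinates averaged by score, final score is the mean of the overlapping scores
    scores = np.array([d1[-1], d2[-1]])
    coords = np.vstack([d1[:4], d2[:4]])
    blended = (coords * scores[:, None]).sum(axis=0) / scores.sum()
    print(blended, scores.mean())   # [0.108 0.108 0.508 0.508] 0.75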
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
3
+ size 94336
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazehand_landmark.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0be6683b0a2f003a3dd3f38da5d12eee3368828d707a04fda247a9793bcb80
3
+ size 8090697
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/blazepalm.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f39b855c35b7d31bee1d9fdcdf0a819763bcfd8d59dabfed00d04b0eafd3eba
3
+ size 7088188
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeba4a7e513d23ddd1cd96fae5d22eb620118d0d786e830dc40f8aab149d29d
3
+ size 7460035
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handDetector_w8a16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14348a3ca0562d7cb3e31c46eef90b02cf5bfa5a87e1020e782b466245bf7ecb
3
+ size 3601960
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1789c79df6bb361e045b83c7361053722ffbd9e443507fc1dfd33c0abae82f0
3
+ size 8486422
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/models/m_handLandmark_w8a16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0252aa6a2c6865f78d77d8403804e1570c17d9a81d79d2b760fa81de893dcc61
3
+ size 7039624
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/0000.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB). View file
 
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB). View file
 
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaing the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaing the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 valuse are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calcualted from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only has one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, 17).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
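resize_pad above keeps the aspect ratio by scaling the long side to 256 and padding the short side, returning the scale factor and padding needed to map detections back to the source image. A minimal usage sketch, not part of the repository; the 1280x720 landscape frame size is an arbitrary example:

    import numpy as np
    from blazebase import resize_pad

    frame = np.zeros((720, 1280, 3), dtype=np.uint8)   # dummy landscape frame
    img1, img2, scale, pad = resize_pad(frame)
    print(img1.shape, img2.shape)   # (256, 256, 3) (128, 128, 3)
    print(scale, pad)               # 5.0 (280, 0): scale-back factor and (pad_y, pad_x) in source pixels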
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazehand_landmark.py ADDED
@@ -0,0 +1,115 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeLandmark, BlazeBlock
7
+
8
+ class BlazeHandLandmark(BlazeLandmark):
9
+ """The hand landmark model from MediaPipe.
10
+
11
+ """
12
+ def __init__(self):
13
+ super(BlazeHandLandmark, self).__init__()
14
+
15
+ # size of ROIs used for input
16
+ self.resolution = 256
17
+
18
+ self._define_layers()
19
+
20
+ def _define_layers(self):
21
+ self.backbone1 = nn.Sequential(
22
+ nn.Conv2d(in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=0, bias=True),
23
+ nn.ReLU(inplace=True),
24
+
25
+ BlazeBlock(24, 24, 5),
26
+ BlazeBlock(24, 24, 5),
27
+ BlazeBlock(24, 48, 5, 2),
28
+ )
29
+
30
+ self.backbone2 = nn.Sequential(
31
+ BlazeBlock(48, 48, 5),
32
+ BlazeBlock(48, 48, 5),
33
+ BlazeBlock(48, 96, 5, 2),
34
+ )
35
+
36
+ self.backbone3 = nn.Sequential(
37
+ BlazeBlock(96, 96, 5),
38
+ BlazeBlock(96, 96, 5),
39
+ BlazeBlock(96, 96, 5, 2),
40
+ )
41
+
42
+ self.backbone4 = nn.Sequential(
43
+ BlazeBlock(96, 96, 5),
44
+ BlazeBlock(96, 96, 5),
45
+ BlazeBlock(96, 96, 5, 2),
46
+ )
47
+
48
+ self.blaze5 = BlazeBlock(96, 96, 5)
49
+ self.blaze6 = BlazeBlock(96, 96, 5)
50
+ self.conv7 = nn.Conv2d(96, 48, 1, bias=True)
51
+
52
+ self.backbone8 = nn.Sequential(
53
+ BlazeBlock(48, 48, 5),
54
+ BlazeBlock(48, 48, 5),
55
+ BlazeBlock(48, 48, 5),
56
+ BlazeBlock(48, 48, 5),
57
+ BlazeBlock(48, 96, 5, 2),
58
+ BlazeBlock(96, 96, 5),
59
+ BlazeBlock(96, 96, 5),
60
+ BlazeBlock(96, 96, 5),
61
+ BlazeBlock(96, 96, 5),
62
+ BlazeBlock(96, 288, 5, 2),
63
+ BlazeBlock(288, 288, 5),
64
+ BlazeBlock(288, 288, 5),
65
+ BlazeBlock(288, 288, 5),
66
+ BlazeBlock(288, 288, 5),
67
+ BlazeBlock(288, 288, 5, 2),
68
+ BlazeBlock(288, 288, 5),
69
+ BlazeBlock(288, 288, 5),
70
+ BlazeBlock(288, 288, 5),
71
+ BlazeBlock(288, 288, 5),
72
+ BlazeBlock(288, 288, 5, 2),
73
+ BlazeBlock(288, 288, 5),
74
+ BlazeBlock(288, 288, 5),
75
+ BlazeBlock(288, 288, 5),
76
+ BlazeBlock(288, 288, 5),
77
+ BlazeBlock(288, 288, 5, 2),
78
+ BlazeBlock(288, 288, 5),
79
+ BlazeBlock(288, 288, 5),
80
+ BlazeBlock(288, 288, 5),
81
+ BlazeBlock(288, 288, 5),
82
+ )
83
+
84
+ self.hand_flag = nn.Conv2d(288, 1, 2, bias=True)
85
+ self.handed = nn.Conv2d(288, 1, 2, bias=True)
86
+ self.landmarks = nn.Conv2d(288, 63, 2, bias=True)
87
+
88
+
89
+ def forward(self, x):
90
+ if x.shape[0] == 0:
91
+ return torch.zeros((0,)), torch.zeros((0,)), torch.zeros((0, 21, 3))
92
+
93
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
94
+
95
+ x = self.backbone1(x)
96
+ y = self.backbone2(x)
97
+ z = self.backbone3(y)
98
+ w = self.backbone4(z)
99
+
100
+ z = z + F.interpolate(w, scale_factor=2, mode='bilinear')
101
+ z = self.blaze5(z)
102
+
103
+ y = y + F.interpolate(z, scale_factor=2, mode='bilinear')
104
+ y = self.blaze6(y)
105
+ y = self.conv7(y)
106
+
107
+ x = x + F.interpolate(y, scale_factor=2, mode='bilinear')
108
+
109
+ x = self.backbone8(x)
110
+
111
+ hand_flag = self.hand_flag(x).view(-1).sigmoid()
112
+ handed = self.handed(x).view(-1).sigmoid()
113
+ landmarks = self.landmarks(x).view(-1, 21, 3) / 256
114
+
115
+ return hand_flag, handed, landmarks
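A minimal sketch, not part of the repository, that runs the landmark network on a dummy ROI batch to confirm the output shapes; the weight path is an assumption based on the models/ directory of this repo:

    import torch
    from blazehand_landmark import BlazeHandLandmark

    net = BlazeHandLandmark()
    net.load_weights("../models/blazehand_landmark.pth")   # assumed relative path
    roi = torch.zeros((1, 3, 256, 256))                    # one normalized 256x256 ROI
    with torch.no_grad():
        flag, handed, landmarks = net(roi)
    print(flag.shape, handed.shape, landmarks.shape)       # [1], [1], [1, 21, 3]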
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/blazepalm.py ADDED
@@ -0,0 +1,157 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from blazebase import BlazeDetector, BlazeBlock
7
+
8
+
9
+ class BlazePalm(BlazeDetector):
10
+ """The palm detection model from MediaPipe. """
11
+ def __init__(self):
12
+ super(BlazePalm, self).__init__()
13
+
14
+ # These are the settings from the MediaPipe example graph
15
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_gpu.pbtxt
16
+ self.num_classes = 1
17
+ self.num_anchors = 2944
18
+ self.num_coords = 18
19
+ self.score_clipping_thresh = 100.0
20
+ self.x_scale = 256.0
21
+ self.y_scale = 256.0
22
+ self.h_scale = 256.0
23
+ self.w_scale = 256.0
24
+ self.min_score_thresh = 0.5
25
+ self.min_suppression_threshold = 0.3
26
+ self.num_keypoints = 7
27
+
28
+ # These settings are for converting detections to ROIs which can then
29
+ # be extracted and feed into the landmark network
30
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
31
+ self.detection2roi_method = 'box'
32
+ # mediapipe/graphs/hand_tracking/subgraphs/hand_detection_cpu.pbtxt
33
+ self.kp1 = 0
34
+ self.kp2 = 2
35
+ self.theta0 = np.pi/2
36
+ self.dscale = 2.6
37
+ self.dy = -0.5
38
+
39
+ self._define_layers()
40
+
41
+ def _define_layers(self):
42
+ self.backbone1 = nn.Sequential(
43
+ nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=0, bias=True),
44
+ nn.ReLU(inplace=True),
45
+
46
+ BlazeBlock(32, 32),
47
+ BlazeBlock(32, 32),
48
+ BlazeBlock(32, 32),
49
+ BlazeBlock(32, 32),
50
+ BlazeBlock(32, 32),
51
+ BlazeBlock(32, 32),
52
+ BlazeBlock(32, 32),
53
+
54
+ BlazeBlock(32, 64, stride=2),
55
+ BlazeBlock(64, 64),
56
+ BlazeBlock(64, 64),
57
+ BlazeBlock(64, 64),
58
+ BlazeBlock(64, 64),
59
+ BlazeBlock(64, 64),
60
+ BlazeBlock(64, 64),
61
+ BlazeBlock(64, 64),
62
+
63
+ BlazeBlock(64, 128, stride=2),
64
+ BlazeBlock(128, 128),
65
+ BlazeBlock(128, 128),
66
+ BlazeBlock(128, 128),
67
+ BlazeBlock(128, 128),
68
+ BlazeBlock(128, 128),
69
+ BlazeBlock(128, 128),
70
+ BlazeBlock(128, 128),
71
+
72
+ )
73
+
74
+ self.backbone2 = nn.Sequential(
75
+ BlazeBlock(128, 256, stride=2),
76
+ BlazeBlock(256, 256),
77
+ BlazeBlock(256, 256),
78
+ BlazeBlock(256, 256),
79
+ BlazeBlock(256, 256),
80
+ BlazeBlock(256, 256),
81
+ BlazeBlock(256, 256),
82
+ BlazeBlock(256, 256),
83
+ )
84
+
85
+ self.backbone3 = nn.Sequential(
86
+ BlazeBlock(256, 256, stride=2),
87
+ BlazeBlock(256, 256),
88
+ BlazeBlock(256, 256),
89
+ BlazeBlock(256, 256),
90
+ BlazeBlock(256, 256),
91
+ BlazeBlock(256, 256),
92
+ BlazeBlock(256, 256),
93
+ BlazeBlock(256, 256),
94
+ )
95
+
96
+ self.conv_transpose1 = nn.ConvTranspose2d(in_channels=256, out_channels=256, kernel_size=2, stride=2, padding=0, bias=True)
97
+ self.blaze1 = BlazeBlock(256, 256)
98
+
99
+ self.conv_transpose2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2, padding=0, bias=True)
100
+ self.blaze2 = BlazeBlock(128, 128)
101
+
102
+ self.classifier_32 = nn.Conv2d(128, 2, 1, bias=True)
103
+ self.classifier_16 = nn.Conv2d(256, 2, 1, bias=True)
104
+ self.classifier_8 = nn.Conv2d(256, 6, 1, bias=True)
105
+
106
+ self.regressor_32 = nn.Conv2d(128, 36, 1, bias=True)
107
+ self.regressor_16 = nn.Conv2d(256, 36, 1, bias=True)
108
+ self.regressor_8 = nn.Conv2d(256, 108, 1, bias=True)
109
+
110
+ def forward(self, x):
111
+ b = x.shape[0] # batch size, needed for reshaping later
112
+
113
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
114
+
115
+ x = self.backbone1(x) # (b, 128, 32, 32)
116
+ y = self.backbone2(x) # (b, 256, 16, 16)
117
+ z = self.backbone3(y) # (b, 256, 8, 8)
118
+
119
+ y = y + F.relu(self.conv_transpose1(z), True)
120
+ y = self.blaze1(y)
121
+
122
+ x = x + F.relu(self.conv_transpose2(y), True)
123
+ x = self.blaze2(x)
124
+
125
+
126
+ # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
127
+ # permute the output from the conv layers before reshaping it.
128
+
129
+ c1 = self.classifier_8(z) # (b, 2, 16, 16)
130
+ c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2)
131
+ c1 = c1.reshape(b, -1, 1) # (b, 512, 1)
132
+
133
+ c2 = self.classifier_16(y) # (b, 6, 8, 8)
134
+ c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6)
135
+ c2 = c2.reshape(b, -1, 1) # (b, 384, 1)
136
+
137
+ c3 = self.classifier_32(x) # (b, 6, 8, 8)
138
+ c3 = c3.permute(0, 2, 3, 1) # (b, 8, 8, 6)
139
+ c3 = c3.reshape(b, -1, 1) # (b, 384, 1)
140
+
141
+ c = torch.cat((c3, c2, c1), dim=1) # (b, 896, 1)
142
+
143
+ r1 = self.regressor_8(z) # (b, 32, 16, 16)
144
+ r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32)
145
+ r1 = r1.reshape(b, -1, 18) # (b, 512, 16)
146
+
147
+ r2 = self.regressor_16(y) # (b, 96, 8, 8)
148
+ r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96)
149
+ r2 = r2.reshape(b, -1, 18) # (b, 384, 16)
150
+
151
+ r3 = self.regressor_32(x) # (b, 96, 8, 8)
152
+ r3 = r3.permute(0, 2, 3, 1) # (b, 8, 8, 96)
153
+ r3 = r3.reshape(b, -1, 18) # (b, 384, 16)
154
+
155
+ r = torch.cat((r3, r2, r1), dim=1) # (b, 896, 16)
156
+
157
+ return [r, c]
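A minimal sketch, not part of the repository, that runs the PyTorch palm detector end to end on a single 256x256 RGB image; the weight and anchor paths are assumptions based on the models/ directory, and the 0.75 threshold mirrors the value used in demo_qnn.py:

    import numpy as np
    from blazepalm import BlazePalm

    net = BlazePalm()
    net.load_weights("../models/blazepalm.pth")        # assumed relative path
    net.load_anchors("../models/anchors_palm.npy")     # 2944 x 4 anchor table
    net.min_score_thresh = 0.75                        # same threshold demo_qnn.py uses

    img = np.zeros((256, 256, 3), dtype=np.uint8)      # placeholder RGB frame, already resized/padded
    detections = net.predict_on_image(img)             # tensor of shape (N, 19): box, 7 keypoints, score
    print(detections.shape)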
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/demo_qnn.py ADDED
@@ -0,0 +1,386 @@
1
+ import numpy as np
2
+ import torch
3
+ import cv2
4
+ import sys
5
+ from blazebase import resize_pad, denormalize_detections
6
+ from visualization import draw_landmarks, draw_roi, HAND_CONNECTIONS
7
+ import time
8
+ import aidlite
9
+ import os
10
+
11
+
12
+ class post_mediapipe_hand:
13
+ def __init__(self):
14
+ self.kp1 = 0
15
+ self.kp2 = 2
16
+ self.theta0 = 1.5707963267948966
17
+ self.dscale = 2.6
18
+ self.dy = -0.5
19
+ self.x_scale = 256.0
20
+ self.y_scale = 256.0
21
+ self.h_scale = 256.0
22
+ self.w_scale = 256.0
23
+ self.num_keypoints = 7
24
+ self.num_classes = 1
25
+ self.num_anchors = 2944
26
+ self.num_coords = 18
27
+ self.min_score_thresh = 0.75
28
+ self.score_clipping_thresh = 100.0
29
+ self.min_suppression_threshold = 0.3
30
+ self.resolution = 256
31
+
32
+
33
+ def detection2roi(self,detection):
34
+ xc = (detection[:,1] + detection[:,3]) / 2
35
+ yc = (detection[:,0] + detection[:,2]) / 2
36
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
37
+ yc += self.dy * scale
38
+ scale *= self.dscale
39
+ # compute box rotation
40
+ x0 = detection[:,4+2*self.kp1]
41
+ y0 = detection[:,4+2*self.kp1+1]
42
+ x1 = detection[:,4+2*self.kp2]
43
+ y1 = detection[:,4+2*self.kp2+1]
44
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
45
+ return xc, yc, scale, theta
46
+
47
+ def _decode_boxes( self,raw_boxes, anchors):
48
+ boxes = torch.zeros_like(raw_boxes)
49
+
50
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
51
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
52
+
53
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
54
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
55
+
56
+ boxes[..., 0] = y_center - h / 2. # ymin
57
+ boxes[..., 1] = x_center - w / 2. # xmin
58
+ boxes[..., 2] = y_center + h / 2. # ymax
59
+ boxes[..., 3] = x_center + w / 2. # xmax
60
+
61
+ for k in range(self.num_keypoints):
62
+ offset = 4 + k*2
63
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
64
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
65
+ boxes[..., offset ] = keypoint_x
66
+ boxes[..., offset + 1] = keypoint_y
67
+ return boxes
68
+
69
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
70
+ assert raw_box_tensor.ndimension() == 3
71
+ assert raw_box_tensor.shape[1] == self.num_anchors
72
+ assert raw_box_tensor.shape[2] == self.num_coords
73
+
74
+ assert raw_score_tensor.ndimension() == 3
75
+ assert raw_score_tensor.shape[1] == self.num_anchors
76
+ assert raw_score_tensor.shape[2] == self.num_classes
77
+
78
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
79
+
80
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
81
+
82
+ thresh = self.score_clipping_thresh
83
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
84
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
85
+
86
+ # Note: we stripped off the last dimension from the scores tensor
87
+ # because there is only one class. Now we can simply use a mask
88
+ # to filter out the boxes with too low confidence.
89
+ mask = detection_scores >= self.min_score_thresh
90
+
91
+ # Because each image from the batch can have a different number of
92
+ # detections, process them one at a time using a loop.
93
+ output_detections = []
94
+ for i in range(raw_box_tensor.shape[0]):
95
+ boxes = detection_boxes[i, mask[i]]
96
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
97
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
98
+
99
+ return output_detections
100
+
101
+ def extract_roi( self,frame, xc, yc, theta, scale):
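+ # Crop a rotated square patch around each ROI via an affine warp; returns the normalized crops, the inverse affine matrices, and the ROI corner points.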
102
+ # take points on unit square and transform them according to the roi
103
+ points = torch.tensor([[-1, -1, 1, 1],
104
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
105
+ points = points * scale.view(-1,1,1)/2
106
+ theta = theta.view(-1, 1, 1)
107
+ R = torch.cat((
108
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
109
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
110
+ ), 1)
111
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
112
+ points = R @ points + center
113
+
114
+ # use the points to compute the affine transform that maps
115
+ # these points back to the output square
116
+ res = self.resolution
117
+ points1 = np.array([[0, 0, res-1],
118
+ [0, res-1, 0]], dtype=np.float32).T
119
+ affines = []
120
+ imgs = []
121
+ for i in range(points.shape[0]):
122
+ pts = points[i, :, :3].detach().numpy().T
123
+ M = cv2.getAffineTransform(pts, points1)
124
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
125
+ img = torch.tensor(img, device=scale.device)
126
+ imgs.append(img)
127
+ affine = cv2.invertAffineTransform(M).astype('float32')
128
+ affine = torch.tensor(affine, device=scale.device)
129
+ affines.append(affine)
130
+ if imgs:
131
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
132
+ affines = torch.stack(affines)
133
+ else:
134
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
135
+ affines = torch.zeros((0, 2, 3), device=scale.device)
136
+
137
+ return imgs, affines, points
138
+
139
+ def denormalize_landmarks(self, landmarks, affines):
140
+ landmarks[:,:,:2] *= self.resolution
141
+ for i in range(len(landmarks)):
142
+ landmark, affine = landmarks[i], affines[i]
143
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
144
+ landmarks[i,:,:2] = landmark
145
+ return landmarks
146
+
147
+ def intersect(self,box_a, box_b):
148
+ A = box_a.size(0)
149
+ B = box_b.size(0)
150
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
151
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
152
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
153
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
154
+ inter = torch.clamp((max_xy - min_xy), min=0)
155
+ return inter[:, :, 0] * inter[:, :, 1]
156
+
157
+ def jaccard(self,box_a, box_b):
158
+ inter = self.intersect(box_a, box_b)
159
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
160
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
161
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
162
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
163
+ union = area_a + area_b - inter
164
+ return inter / union # [A,B]
165
+
166
+
167
+ def overlap_similarity(self,box, other_boxes):
168
+ """Computes the IOU between a bounding box and set of other boxes."""
169
+ return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
170
+
171
+ def _weighted_non_max_suppression(self,detections):
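+ # MediaPipe-style weighted NMS: overlapping detections are merged into a confidence-weighted average instead of being discarded.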
172
+ if len(detections) == 0: return []
173
+ output_detections = []
174
+
175
+ # Sort the detections from highest to lowest score.
176
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
177
+
178
+ while len(remaining) > 0:
179
+ detection = detections[remaining[0]]
180
+
181
+ # Compute the overlap between the first box and the other
182
+ # remaining boxes. (Note that the other_boxes also include
183
+ # the first_box.)
184
+ first_box = detection[:4]
185
+ other_boxes = detections[remaining, :4]
186
+ ious = self.overlap_similarity(first_box, other_boxes)
187
+
188
+ # If two detections don't overlap enough, they are considered
189
+ # to be from different faces.
190
+ mask = ious > self.min_suppression_threshold
191
+ overlapping = remaining[mask]
192
+ remaining = remaining[~mask]
193
+
194
+ # Take an average of the coordinates from the overlapping
195
+ # detections, weighted by their confidence scores.
196
+ weighted_detection = detection.clone()
197
+ if len(overlapping) > 1:
198
+ coordinates = detections[overlapping, :self.num_coords]
199
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
200
+ total_score = scores.sum()
201
+ weighted = (coordinates * scores).sum(dim=0) / total_score
202
+ weighted_detection[:self.num_coords] = weighted
203
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
204
+
205
+ output_detections.append(weighted_detection)
206
+
207
+ return output_detections
208
+
209
+ def draw_detections(self, img, detections, with_keypoints=True):
210
+ if isinstance(detections, torch.Tensor):
211
+ detections = detections.detach().numpy()
212
+
213
+ if detections.ndim == 1:
214
+ detections = np.expand_dims(detections, axis=0)
215
+
216
+ n_keypoints = detections.shape[1] // 2 - 2
217
+
218
+ for i in range(detections.shape[0]):
219
+ ymin = detections[i, 0]
220
+ xmin = detections[i, 1]
221
+ ymax = detections[i, 2]
222
+ xmax = detections[i, 3]
223
+
224
+ start_point = (int(xmin), int(ymin))
225
+ end_point = (int(xmax), int(ymax))
226
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
227
+
228
+ if with_keypoints:
229
+ for k in range(n_keypoints):
230
+ kp_x = int(detections[i, 4 + k*2 ])
231
+ kp_y = int(detections[i, 4 + k*2 + 1])
232
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
233
+ return img
234
+
235
+
236
+
237
+ post_process=post_mediapipe_hand()
238
+
239
+ class handDetectionQnn:
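+ # Palm detector wrapper: loads the QNN context binary with AidLite (DSP backend); input 1x3x256x256, outputs 1x2944x18 box/keypoint regressions and 1x2944x1 scores.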
240
+ def __init__(self):
241
+ super().__init__()
242
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handDetector_w8a16.qnn216.ctx.bin"))
243
+ if self.model is None:
244
+ print("Create model failed !")
245
+ return
246
+
247
+ self.config = aidlite.Config.create_instance()
248
+ if self.config is None:
249
+ print("build_interpretper_from_model_and_config failed !")
250
+ return
251
+
252
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
253
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
254
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
255
+ self.config.is_quantify_model = 1
256
+
257
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
258
+ if self.interpreter is None:
259
+ print("build_interpretper_from_model_and_config failed !")
260
+ return
261
+ input_shapes = [[1,3, 256, 256]]
262
+ output_shapes = [[1, 2944,18],[1,2944,1]]
263
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
264
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
265
+
266
+ if self.interpreter is None:
267
+ print("build_interpretper_from_model_and_config failed !")
268
+ result = self.interpreter.init()
269
+ if result != 0:
270
+ print(f"interpreter init failed !")
271
+ result = self.interpreter.load_model()
272
+ if result != 0:
273
+ print("interpreter load model failed !")
274
+
275
+ print(" model load success!")
276
+
277
+ def __call__(self, input):
278
+ self.interpreter.set_input_tensor(0,input)
279
+ self.interpreter.invoke()
280
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1, 2944,18).copy()
281
+ features_1 = self.interpreter.get_output_tensor(1).reshape(1, 2944,1).copy()
282
+ return features_0,features_1
283
+
284
+
285
+ class handLandmarkQnn:
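+ # Hand landmark wrapper: loads the QNN context binary with AidLite (DSP backend); input 1x3x256x256, outputs a hand-presence flag and 1x21x3 normalized landmarks.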
286
+ def __init__(self):
287
+ super().__init__()
288
+ self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_handLandmark_w8a16.qnn216.ctx.bin"))
289
+ if self.model is None:
290
+ print("Create model failed !")
291
+ return
292
+
293
+ self.config = aidlite.Config.create_instance()
294
+ if self.config is None:
295
+ print("build_interpretper_from_model_and_config failed !")
296
+ return
297
+
298
+ self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL
299
+ self.config.framework_type = aidlite.FrameworkType.TYPE_QNN
300
+ self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
301
+ self.config.is_quantify_model = 1
302
+
303
+ self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
304
+ if self.interpreter is None:
305
+ print("build_interpretper_from_model_and_config failed !")
306
+ return
307
+ input_shapes = [[1, 3, 256, 256]]
308
+ output_shapes = [[1],[1],[1,21,3]]
309
+ self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32,
310
+ output_shapes, aidlite.DataType.TYPE_FLOAT32)
311
+
312
+ if self.interpreter is None:
313
+ print("build_interpretper_from_model_and_config failed !")
314
+ result = self.interpreter.init()
315
+ if result != 0:
316
+ print(f"interpreter init failed !")
317
+ result = self.interpreter.load_model()
318
+ if result != 0:
319
+ print("interpreter load model failed !")
320
+
321
+ print(" model load success!")
322
+
323
+ def __call__(self, input):
324
+ self.interpreter.set_input_tensor(0,input)
325
+ self.interpreter.invoke()
326
+ features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy()
327
+ features_1 = self.interpreter.get_output_tensor(2).reshape(1,21,3).copy()
328
+ return features_0,features_1
329
+
330
+
331
+
332
+ anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy")), dtype=torch.float32, device='cpu')
333
+ hand_detc = handDetectionQnn()
334
+ hand_rec = handLandmarkQnn()
335
+
336
+ image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"hand.jpg")
337
+
338
+ frame_ct=0
339
+ image = cv2.imread(image_path)
340
+
341
+ frame = np.ascontiguousarray(image[:,:,::-1])
342
+
343
+ img1, img2, scale, pad = resize_pad(frame)
344
+
345
+ input = (img1 / 255).astype(np.float32)
346
+ input = np.transpose(input, (2, 0, 1))
347
+ input = input[np.newaxis, ...]
348
+ t0 = time.time()
349
+ out = hand_detc(input)
350
+ use_time = round((time.time() - t0) * 1000, 2)
351
+ print(f"face detction inference_time:{use_time} ms")
352
+ detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors)
353
+
354
+ filtered_detections = []
355
+ num_coords = 18
356
+ for i in range(len(detections)):
357
+ faces = post_process._weighted_non_max_suppression(detections[i])
358
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1))
359
+ filtered_detections.append(faces)
360
+
361
+ face_detections = denormalize_detections(filtered_detections[0], scale, pad)
362
+
363
+ xc, yc, scale, theta = post_process.detection2roi(face_detections)
364
+
365
+ img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale)
366
+ if box.size()[0]!=0:
367
+ t2 = time.time()
368
+ flags, normalized_landmarks = hand_rec(img.numpy())
369
+
370
+ use_time = round((time.time() - t2) * 1000, 2)
371
+ print(f"landmark inference_time:{use_time} ms")
372
+
373
+ landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine)
374
+
375
+ for i in range(len(flags)):
376
+ landmark, flag = landmarks[i], flags[i]
377
+ if flag>.4: # 0.5
378
+ draw_landmarks(frame, landmark[:,:2], HAND_CONNECTIONS, size=2)
379
+ else:
380
+ print("not detect palm !")
381
+
382
+ draw_roi(frame, box)
383
+ draw_detections(frame, face_detections)
384
+ cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1])
385
+ hand_detc.interpreter.destory()
386
+ hand_rec.interpreter.destory()
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/export_jit.py ADDED
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import torch
3
+ import os
4
+ from typing import Callable, Tuple
5
+ from blazepalm import BlazePalm
6
+ from blazehand_landmark import BlazeHandLandmark
7
+
8
+
9
+ gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10
+ torch.set_grad_enabled(False)
11
+
12
+
13
+
14
+ class HandDetector(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
18
+ anchors: torch.Tensor,
19
+ ):
20
+ super().__init__()
21
+ self.detector = detector
22
+ self.anchors = anchors
23
+
24
+ def forward(self, image):
25
+ return self.detector(image)
26
+
27
+ class HandLandmarkDetector(torch.nn.Module):
28
+ def __init__(
29
+ self,
30
+ detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
31
+ ):
32
+ super().__init__()
33
+ self.detector = detector
34
+
35
+ def forward(self, image):
36
+ return self.detector(image)
37
+
38
+
39
+
40
+
41
+ palm_detector = BlazePalm().to(gpu)
42
+ palm_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazepalm.pth"))
43
+ palm_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_palm.npy"))
44
+ palm_detector.min_score_thresh = .75
45
+
46
+ num_params = sum(p.numel() for p in palm_detector.parameters() if p.requires_grad)
47
+ print(f'Number of palm_detector parameters: {num_params}')
48
+
49
+ hand_regressor = BlazeHandLandmark().to(gpu)
50
+ hand_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazehand_landmark.pth"))
51
+ num_params = sum(p.numel() for p in hand_regressor.parameters() if p.requires_grad)
52
+ print(f'Number of hand_landmark parameters: {num_params}')
53
+
54
+ hand_detect = HandDetector(palm_detector,palm_detector.anchors)
55
+ hand_regres = HandLandmarkDetector(hand_regressor)
56
+
57
+ hand_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
58
+
59
+ source_model = torch.jit.trace(hand_detect.to("cpu"),hand_d_in)
60
+ source_model.save("m_handDetector.pt")
61
+ print("export hand detect ok!")
62
+
63
+ hand_r_in = torch.randn(1, 3, 256, 256,dtype= torch.float32)
64
+ source_model = torch.jit.trace(hand_regres.to("cpu"), hand_r_in)
65
+ source_model.save("m_handLandmark.pt")
66
+ print("export hand landmark ok!")
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/hand.jpg ADDED
model_farm_mediapipehand_qcs6490_qnn2.16_w8a16_aidlite/python/visualization.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import cv2
3
+ import torch
4
+
5
+ def draw_detections(img, detections, with_keypoints=True):
6
+ if isinstance(detections, torch.Tensor):
7
+ detections = detections.cpu().numpy()
8
+
9
+ if detections.ndim == 1:
10
+ detections = np.expand_dims(detections, axis=0)
11
+
12
+ n_keypoints = detections.shape[1] // 2 - 2
13
+
14
+ for i in range(detections.shape[0]):
15
+ ymin = detections[i, 0]
16
+ xmin = detections[i, 1]
17
+ ymax = detections[i, 2]
18
+ xmax = detections[i, 3]
19
+
20
+ start_point = (int(xmin), int(ymin))
21
+ end_point = (int(xmax), int(ymax))
22
+ img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1)
23
+
24
+ if with_keypoints:
25
+ for k in range(n_keypoints):
26
+ kp_x = int(detections[i, 4 + k*2 ])
27
+ kp_y = int(detections[i, 4 + k*2 + 1])
28
+ cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2)
29
+ return img
30
+
31
+
32
+ def draw_roi(img, roi):
33
+ for i in range(roi.shape[0]):
34
+ (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i]
35
+ cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2)
36
+ cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2)
37
+ cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2)
38
+ cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2)
39
+
40
+
41
+ def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2):
42
+ points = points[:,:2]
43
+ for point in points:
44
+ x, y = point
45
+ x, y = int(x), int(y)
46
+ cv2.circle(img, (x, y), size, color, thickness=size)
47
+ for connection in connections:
48
+ x0, y0 = points[connection[0]]
49
+ x1, y1 = points[connection[1]]
50
+ x0, y0 = int(x0), int(y0)
51
+ x1, y1 = int(x1), int(y1)
52
+ cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size)
53
+
54
+
55
+
56
+ # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
57
+ # 8 12 16 20
58
+ # | | | |
59
+ # 7 11 15 19
60
+ # 4 | | | |
61
+ # | 6 10 14 18
62
+ # 3 | | | |
63
+ # | 5---9---13--17
64
+ # 2 \ /
65
+ # \ \ /
66
+ # 1 \ /
67
+ # \ \ /
68
+ # ------0-
69
+ HAND_CONNECTIONS = [
70
+ (0, 1), (1, 2), (2, 3), (3, 4),
71
+ (5, 6), (6, 7), (7, 8),
72
+ (9, 10), (10, 11), (11, 12),
73
+ (13, 14), (14, 15), (15, 16),
74
+ (17, 18), (18, 19), (19, 20),
75
+ (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
76
+ ]
77
+
78
+ POSE_CONNECTIONS = [
79
+ (0,1), (1,2), (2,3), (3,7),
80
+ (0,4), (4,5), (5,6), (6,8),
81
+ (9,10),
82
+ (11,13), (13,15), (15,17), (17,19), (19,15), (15,21),
83
+ (12,14), (14,16), (16,18), (18,20), (20,16), (16,22),
84
+ (11,12), (12,24), (24,23), (23,11)
85
+ ]
86
+
87
+ # Vertex indices can be found in
88
+ # github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png
89
+ # Found in github.com/google/mediapipe/python/solutions/face_mesh.py
90
+ FACE_CONNECTIONS = [
91
+ # Lips.
92
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17),
93
+ (17, 314), (314, 405), (405, 321), (321, 375), (375, 291),
94
+ (61, 185), (185, 40), (40, 39), (39, 37), (37, 0),
95
+ (0, 267), (267, 269), (269, 270), (270, 409), (409, 291),
96
+ (78, 95), (95, 88), (88, 178), (178, 87), (87, 14),
97
+ (14, 317), (317, 402), (402, 318), (318, 324), (324, 308),
98
+ (78, 191), (191, 80), (80, 81), (81, 82), (82, 13),
99
+ (13, 312), (312, 311), (311, 310), (310, 415), (415, 308),
100
+ # Left eye.
101
+ (263, 249), (249, 390), (390, 373), (373, 374), (374, 380),
102
+ (380, 381), (381, 382), (382, 362), (263, 466), (466, 388),
103
+ (388, 387), (387, 386), (386, 385), (385, 384), (384, 398),
104
+ (398, 362),
105
+ # Left eyebrow.
106
+ (276, 283), (283, 282), (282, 295), (295, 285), (300, 293),
107
+ (293, 334), (334, 296), (296, 336),
108
+ # Right eye.
109
+ (33, 7), (7, 163), (163, 144), (144, 145), (145, 153),
110
+ (153, 154), (154, 155), (155, 133), (33, 246), (246, 161),
111
+ (161, 160), (160, 159), (159, 158), (158, 157), (157, 173),
112
+ (173, 133),
113
+ # Right eyebrow.
114
+ (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105),
115
+ (105, 66), (66, 107),
116
+ # Face oval.
117
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
118
+ (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
119
+ (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
120
+ (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
121
+ (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
122
+ (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
123
+ (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
124
+ (109, 10)
125
+ ]
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/README.md ADDED
@@ -0,0 +1,64 @@
1
+ ## Model Information
2
+ ### Source model
3
+ - Input shape: [1x3x256x256], [1x3x256x256]
4
+ - Number of parameters: 1.76M, 2.01M
5
+ - Model size: 7.11MB, 8.09MB
6
+ - Output shape: [1x2944x18, 1x2944x1], [1, 1, 1x21x3]
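+ (The two values in each item above are for the palm detector and the hand landmark model, respectively.)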
7
+
8
+ Source model repository: [MediaPipe-Hand-Detection](https://github.com/zmurez/MediaPipePyTorch/)
9
+
10
+ ### Converted model
11
+
12
+ - Precision: FP16
13
+ - Backend: QNN2.16
14
+ - Target Device: SNM972 QCS8550
15
+
16
+ ## Inference with AidLite SDK
17
+
18
+ ### SDK installation
19
+ Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
20
+
21
+ - install AidLite SDK
22
+
23
+ ```bash
24
+ # Install the appropriate version of the aidlite sdk
25
+ sudo aid-pkg update
26
+ sudo aid-pkg install aidlite-sdk
27
+ # Install the QNN version of AidLite that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
28
+ sudo aid-pkg install aidlite-{QNN VERSION}
29
+ ```
30
+
31
+ - Verify AidLite SDK
32
+
33
+ ```bash
34
+ # Check the AidLite SDK C++ library version
35
+ python3 -c "import aidlite ; print(aidlite.get_library_version())"
36
+
37
+ # Check the AidLite SDK Python library version
38
+ python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
39
+ ```
40
+
41
+ ### Run demo
42
+ #### python
43
+ ```bash
44
+ cd python
45
+ python3 demo_qnn.py
46
+ ```
47
+
48
+ #### c++
49
+ ```bash
50
+ # The cnpy library is required for loading .npy files (run the following from the default terminal directory)
51
+ git clone https://github.com/rogersce/cnpy.git
52
+ cd cnpy
53
+ mkdir build && cd build
54
+ cmake ..
55
+ make
56
+ sudo make install
57
+
58
+ cd mediapipe-hand/model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp
59
+ mkdir build && cd build
60
+ cmake ..
61
+ make
62
+ ./run_test
63
+ ```
64
+
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,34 @@
1
+ cmake_minimum_required (VERSION 3.5)
2
+ project("run_test")
3
+
4
+ find_package(OpenCV REQUIRED)
5
+ find_library(CNPY_LIB cnpy REQUIRED)
6
+
7
+ message(STATUS "oPENCV Library status:")
8
+ message(STATUS ">version:${OpenCV_VERSION}")
9
+ message(STATUS "Include:${OpenCV_INCLUDE_DIRS}")
10
+
11
+ set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
12
+
13
+ include_directories(
14
+ /usr/local/include
15
+ /usr/include/opencv4
16
+ )
17
+
18
+ link_directories(
19
+ /usr/local/lib/
20
+ )
21
+
22
+ file(GLOB SRC_LISTS
23
+ ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp
24
+ )
25
+
26
+ add_executable(run_test ${SRC_LISTS})
27
+
28
+ target_link_libraries(run_test
29
+ aidlite
30
+ ${OpenCV_LIBS}
31
+ pthread
32
+ jsoncpp
33
+ ${CNPY_LIB}
34
+ )
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df91d5dc452f5098bd2618bae51fed413a1f6d3774bea5fbfac1a846d4ee8466
3
+ size 47232
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/hand.jpg ADDED
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp ADDED
@@ -0,0 +1,923 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <opencv2/opencv.hpp>
4
+ #include <aidlux/aidlite/aidlite.hpp>
5
+ #include <vector>
6
+ #include <numeric>
7
+ #include <cmath>
8
+ #include <jsoncpp/json/json.h>
9
+ #include <tuple>
10
+ #include <algorithm>
11
+ #include <sstream>
12
+ #include <string>
13
+ #include <cassert>
14
+ #include "cnpy.h"
15
+
16
+ using namespace cv;
17
+ using namespace std;
18
+ using namespace Aidlux::Aidlite;
19
+
20
+
21
+ // Hand landmark connection indices (from MediaPipe Hands)
22
+ const std::vector<std::pair<int, int>> HAND_CONNECTIONS = {
23
+ {0, 1}, {1, 2}, {2, 3}, {3, 4},
24
+ {5, 6}, {6, 7}, {7, 8},
25
+ {9, 10}, {10, 11}, {11, 12},
26
+ {13, 14}, {14, 15}, {15, 16},
27
+ {17, 18}, {18, 19}, {19, 20},
28
+ {0, 5}, {5, 9}, {9, 13}, {13, 17}, {0, 17}
29
+ };
30
+
31
+ int kp1 = 0, kp2 = 2; // keypoint indices used to estimate rotation
32
+ float dy = -0.5f; // vertical offset, per the model definition
33
+ float dscale = 2.6f; // ROI scale factor
34
+ float theta0 = 1.5707963267948966; // reference angle (pi/2)
35
+ int batch=1;
36
+ int num_anchors=2944;
37
+ int num_coords=18;
38
+ int num_classes=1;
39
+ int num_keypoints=7;
40
+ float x_scale=256.0;
41
+ float y_scale=256.0;
42
+ float w_scale=256.0;
43
+ float h_scale=256.0;
44
+ float score_clipping_thresh=100.0;
45
+ float min_score_thresh=0.75;
46
+
47
+ struct Args {
48
+ std::string faceDetector_model = "../../models/m_handDetctor_fp16.qnn216.ctx.bin";
49
+ std::string faceLandmark_model = "../../models/m_handLandmark_fp16.qnn216.ctx.bin";
50
+ std::string imgs = "../hand.jpg";
51
+ int invoke_nums = 10;
52
+ std::string model_type = "QNN";
53
+ };
54
+
55
+
56
+ Args parse_args(int argc, char* argv[]) {
57
+ Args args;
58
+ for (int i = 1; i < argc; ++i) {
59
+ std::string arg = argv[i];
60
+ if (arg == "--faceDetector_model" && i + 1 < argc) {
61
+ args.faceDetector_model = argv[++i];
62
+ } else if (arg == "--faceLandmark_model" && i + 1 < argc) {
63
+ args.faceLandmark_model = argv[++i];
64
+ } else if (arg == "--imgs" && i + 1 < argc) {
65
+ args.imgs = argv[++i];
66
+ } else if (arg == "--invoke_nums" && i + 1 < argc) {
67
+ args.invoke_nums = std::stoi(argv[++i]);
68
+ } else if (arg == "--model_type" && i + 1 < argc) {
69
+ args.model_type = argv[++i];
70
+ }
71
+ }
72
+ return args;
73
+ }
74
+
75
+ std::string to_lower(const std::string& str) {
76
+ std::string lower_str = str;
77
+ std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) {
78
+ return std::tolower(c);
79
+ });
80
+ return lower_str;
81
+ }
82
+
83
+ std::vector<std::vector<float>> load_anchors_from_npy(const std::string& path) {
84
+ cnpy::NpyArray arr = cnpy::npy_load(path);
85
+ float* data_ptr = arr.data<float>();
86
+
87
+ size_t num_rows = arr.shape[0]; // 2944
88
+ size_t num_cols = arr.shape[1]; // 4
89
+
90
+ std::vector<std::vector<float>> anchors(num_rows, std::vector<float>(num_cols));
91
+ for (size_t i = 0; i < num_rows; ++i) {
92
+ for (size_t j = 0; j < num_cols; ++j) {
93
+ anchors[i][j] = data_ptr[i * num_cols + j];
94
+ }
95
+ }
96
+
97
+ return anchors;
98
+ }
99
+
100
+
101
+ // Draw hand landmark points and connection lines
102
+ void draw_landmarks(
103
+ cv::Mat& img,
104
+ const std::vector<cv::Point2f>& points,
105
+ const std::vector<float>& flags,
106
+ const std::vector<std::pair<int, int>>& connections,
107
+ float threshold = 0.4f,
108
+ cv::Scalar point_color = cv::Scalar(0, 255, 0),
109
+ cv::Scalar line_color = cv::Scalar(0, 0, 0),
110
+ int size = 2)
111
+ {
112
+ // draw keypoints
113
+ for (size_t i = 0; i < points.size(); ++i) {
114
+ // if (i < flags.size() && flags[i] > threshold) {
115
+ int x = static_cast<int>(points[i].x);
116
+ int y = static_cast<int>(points[i].y);
117
+ cv::circle(img, cv::Point(x, y), size, point_color, size);
118
+ // }
119
+ }
120
+
121
+
122
+ // draw connection lines (both endpoints should be visible)
123
+ for (const auto& conn : connections) {
124
+ int i0 = conn.first;
125
+ int i1 = conn.second;
126
+ // if (i0 < points.size() && i1 < points.size() &&
127
+ // i0 < flags.size() && i1 < flags.size() &&
128
+ // flags[i0] > threshold && flags[i1] > threshold)
129
+ // {
130
+ cv::line(img, points[i0], points[i1], line_color, size);
131
+ // }
132
+ }
133
+ }
134
+
135
+
136
+ std::tuple<cv::Mat, cv::Mat, float, cv::Point> resize_pad(const cv::Mat& img) {
137
+ int h = img.rows;
138
+ int w = img.cols;
139
+
140
+ int h1, w1, padh = 0, padw = 0;
141
+ float scale = 1.0f;
142
+
143
+ // Step 1: resize width to 256, keep aspect ratio
144
+ // int w1 = 256;
145
+ // int h1 = w1 * orig_h / orig_w; // equivalent to int(256 * h / w)
146
+
147
+ // choose the scale according to the aspect ratio
148
+ if (h >= w) {
149
+ h1 = 256;
150
+ w1 = 256 * w / h;
151
+ padw = 256 - w1;
152
+ scale = static_cast<float>(w) / w1;
153
+ } else {
154
+ w1 = 256;
155
+ h1 = 256 * h / w;
156
+ padh = 256 - h1;
157
+ scale = static_cast<float>(h) / h1;
158
+ }
159
+
160
+ // std::cout << "Original size: (" << h << ", " << w << "), padding: (" << padh << ", " << padw << ")\n";
161
+ // Step 2: compute padding in height direction
162
+ int padh1 = padh / 2;
163
+ int padh2 = padh - padh1;
164
+ int padw1 = padw / 2;
165
+ int padw2 = padw - padw1;
166
+ // std::cout << "Padding: (" << padh1 << ", " << padh2 << "), (" << padw1 << ", " << padw2 << ")\n";
167
+
168
+ // Resize to (w1, h1)
169
+ cv::Mat resized;
170
+ cv::resize(img, resized, cv::Size(w1, h1));
171
+
172
+ // Pad to 256x256
173
+ cv::Mat padded;
174
+ cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
175
+
176
+ // Final resize to 128x128
177
+ cv::Mat resized_small;
178
+ cv::resize(padded, resized_small, cv::Size(128, 128));
179
+
180
+ // Compute offset in original scale
181
+ cv::Point pad_offset(static_cast<int>(padh1 * scale), static_cast<int>(padw1 * scale));
182
+
183
+ return std::make_tuple(padded, resized_small, scale, pad_offset);
184
+ }
185
+
186
+
187
+ // Convert the image to 1xCxHxW float and normalize (divide by 255)
188
+ std::vector<float> preprocess_image(const cv::Mat& img) {
189
+ int H = img.rows;
190
+ int W = img.cols;
191
+ int C = img.channels(); // should be 3
192
+
193
+ std::vector<float> chw(H * W * C); // CHW
194
+ std::vector<float> nchw(1 * C * H * W); // NCHW
195
+
196
+ // 1. HWC → CHW + normalize (float32 / 255.0)
197
+ for (int h = 0; h < H; ++h) {
198
+ for (int w = 0; w < W; ++w) {
199
+ for (int c = 0; c < C; ++c) {
200
+ // OpenCV uses BGR order
201
+ float value = img.at<cv::Vec3b>(h, w)[c] / 255.0f;
202
+ chw[c * H * W + h * W + w] = value;
203
+ }
204
+ }
205
+ }
206
+
207
+ // 2. CHW → NCHW (add batch dimension, actually just copy)
208
+ for (int i = 0; i < C * H * W; ++i) {
209
+ nchw[i] = chw[i];
210
+ }
211
+
212
+ return nchw; // shape: [1, 3, H, W]
213
+ }
214
+
215
+
216
+ // Compute IoU using only the first 4 coordinates (the box corners)
217
+ float IoU(const std::vector<float>& box1, const std::vector<float>& box2) {
218
+ float x1 = std::max(box1[0], box2[0]);
219
+ float y1 = std::max(box1[1], box2[1]);
220
+ float x2 = std::min(box1[2], box2[2]);
221
+ float y2 = std::min(box1[3], box2[3]);
222
+
223
+ float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
224
+ float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]);
225
+ float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]);
226
+ float union_area = box1_area + box2_area - inter_area;
227
+
228
+ return union_area > 0 ? inter_area / union_area : 0.0f;
229
+ }
230
+
231
+ std::vector<std::vector<float>> weighted_non_max_suppression(
232
+ std::vector<std::vector<float>>& detections,
233
+ int num_coords = 18,
234
+ float min_suppression_threshold = 0.3f)
235
+ {
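+ // Weighted NMS: group detections by IoU and replace each group with a confidence-weighted average box (mirrors the Python post-processing).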
236
+ if (detections.empty()) return {};
237
+
238
+ std::vector<int> indices(detections.size());
239
+ std::iota(indices.begin(), indices.end(), 0);
240
+
241
+ // sort by confidence, descending
242
+ std::sort(indices.begin(), indices.end(), [&](int a, int b) {
243
+ return detections[a][num_coords] > detections[b][num_coords];
244
+ });
245
+
246
+ std::vector<std::vector<float>> output;
247
+
248
+ while (!indices.empty()) {
249
+ int best_idx = indices.front();
250
+ const auto& best_det = detections[best_idx];
251
+ std::vector<int> overlapping = { best_idx };
252
+
253
+ for (size_t i = 1; i < indices.size(); ++i) {
254
+ float iou = IoU(best_det, detections[indices[i]]);
255
+ if (iou > min_suppression_threshold) {
256
+ overlapping.push_back(indices[i]);
257
+ }
258
+ }
259
+
260
+ // update the remaining indices
261
+ std::vector<int> new_indices;
262
+ for (int idx : indices) {
263
+ if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) {
264
+ new_indices.push_back(idx);
265
+ }
266
+ }
267
+ indices = new_indices;
268
+
269
+ // weighted average: coordinates * confidence
270
+ if (overlapping.size() == 1) {
271
+ output.push_back(best_det);
272
+ } else {
273
+ std::vector<float> weighted(num_coords + 1, 0.0f);
274
+ float total_score = 0.0f;
275
+
276
+ for (int idx : overlapping) {
277
+ float score = detections[idx][num_coords];
278
+ total_score += score;
279
+ for (int k = 0; k < num_coords; ++k) {
280
+ weighted[k] += detections[idx][k] * score;
281
+ }
282
+ }
283
+
284
+ for (int k = 0; k < num_coords; ++k) {
285
+ weighted[k] /= total_score;
286
+ }
287
+ weighted[num_coords] = total_score / overlapping.size(); // average score
288
+
289
+ // std::cout << "Weighted box: ";
290
+ // for (float v : weighted) std::cout << v << " ";
291
+ // std::cout << "\n";
292
+
293
+ output.push_back(weighted);
294
+ }
295
+ }
296
+
297
+ // TODO: currently only the highest-scoring detection is kept (single-hand demo)
298
+ auto x = output[0];
299
+ output.clear();
300
+ output.push_back(x);
301
+
302
+ return output;
303
+ }
304
+
305
+
306
+ std::vector<std::vector<float>> denormalize_detections(
307
+ const std::vector<std::vector<float>>& detections,
308
+ float scale,
309
+ const cv::Point& pad
310
+ ) {
311
+ std::vector<std::vector<float>> result = detections;
312
+
313
+ for (size_t i = 0; i < result.size(); ++i) {
314
+ std::vector<float>& det = result[i];
315
+
316
+ // bbox coords: x1, y1, x2, y2
317
+ det[0] = det[0] * scale * 256.0f - pad.x; // x1
318
+ det[1] = det[1] * scale * 256.0f - pad.y; // y1
319
+ det[2] = det[2] * scale * 256.0f - pad.x; // x2
320
+ det[3] = det[3] * scale * 256.0f - pad.y; // y2
321
+
322
+ // keypoints (starting from index 4): format [y, x, y, x, ...]
323
+ for (size_t k = 4; k + 1 < det.size(); k += 2) {
324
+ det[k] = det[k] * scale * 256.0f - pad.y; // y
325
+ det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x
326
+ }
327
+ }
328
+
329
+ return result;
330
+ }
331
+
332
+
333
+ void detection2roi(
334
+ const std::vector<std::vector<float>>& detections,
335
+ std::vector<float>& xc,
336
+ std::vector<float>& yc,
337
+ std::vector<float>& scale,
338
+ std::vector<float>& theta,
339
+ int kp1, int kp2, // keypoint indices
340
+ float dy, float dscale, float theta0
341
+ ) {
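+ // Convert detection boxes into rotated square ROIs (center, scale, rotation) for the landmark model.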
342
+ size_t N = detections.size();
343
+ xc.resize(N);
344
+ yc.resize(N);
345
+ scale.resize(N);
346
+ theta.resize(N);
347
+
348
+ for (size_t i = 0; i < N; ++i) {
349
+ const std::vector<float>& det = detections[i];
350
+
351
+ float x1 = det[1];
352
+ float x2 = det[3];
353
+ float y1 = det[0];
354
+ float y2 = det[2];
355
+
356
+ float x_center = (x1 + x2) / 2.0f;
357
+ float y_center = (y1 + y2) / 2.0f;
358
+ float box_scale = (x2 - x1); // assumes square box
359
+
360
+ // vertical center offset
361
+ y_center += dy * box_scale;
362
+ box_scale *= dscale;
363
+
364
+ // get the positions of the two keypoints
365
+ int base = 4;
366
+ int idx_y0 = base + 2 * kp1;
367
+ int idx_x0 = base + 2 * kp1 + 1;
368
+ int idx_y1 = base + 2 * kp2;
369
+ int idx_x1 = base + 2 * kp2 + 1;
370
+
371
+ float x0 = det[idx_x0];
372
+ float y0 = det[idx_y0];
373
+ float x1_kp = det[idx_x1];
374
+ float y1_kp = det[idx_y1];
375
+
376
+ float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0;
377
+
378
+ // write outputs
379
+ xc[i] = x_center;
380
+ yc[i] = y_center;
381
+ scale[i] = box_scale;
382
+ // TODO: theta is hard-coded for the demo image; adjust for real inputs
383
+ // theta[i] = angle; // use the computed angle if needed
384
+ theta[i] = -0.8461;
385
+ }
386
+ }
387
+
388
+
389
+ void extract_roi(
390
+ const cv::Mat& frame,
391
+ const std::vector<float>& xc,
392
+ const std::vector<float>& yc,
393
+ const std::vector<float>& theta,
394
+ const std::vector<float>& scale,
395
+ std::vector<cv::Mat>& cropped_rois,
396
+ std::vector<cv::Mat>& affine_matrices,
397
+ std::vector<std::vector<cv::Point2f>>& roi_boxes, // 添加返回点坐标
398
+ int resolution = 256
399
+ ) {
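+ // Warp each rotated ROI to a resolution x resolution crop and keep the inverse affine so landmarks can be mapped back to the original image.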
400
+ cropped_rois.clear();
401
+ affine_matrices.clear();
402
+ roi_boxes.clear();
403
+
404
+ for (size_t i = 0; i < xc.size(); ++i) {
405
+ float s = scale[i] / 2.0f;
406
+ float cos_t = std::cos(theta[i]);
407
+ float sin_t = std::sin(theta[i]);
408
+
409
+ // the four unit-square corners after rotation/scaling (same order as in the Python code)
410
+ std::vector<cv::Point2f> points(4);
411
+ // [-1, -1]
412
+ points[0].x = xc[i] + (-s * cos_t + s * sin_t);
413
+ points[0].y = yc[i] + (-s * sin_t - s * cos_t);
414
+ // [1, -1]
415
+ points[1].x = xc[i] + ( s * cos_t + s * sin_t);
416
+ points[1].y = yc[i] + ( s * sin_t - s * cos_t);
417
+ // [-1, 1]
418
+ points[2].x = xc[i] + (-s * cos_t - s * sin_t);
419
+ points[2].y = yc[i] + (-s * sin_t + s * cos_t);
420
+ // [1, 1]
421
+ points[3].x = xc[i] + ( s * cos_t - s * sin_t);
422
+ points[3].y = yc[i] + ( s * sin_t + s * cos_t);
423
+
424
+ // compute the affine transform from the first three points
425
+ std::vector<cv::Point2f> src_pts = { points[0], points[1], points[2] };
426
+ std::vector<cv::Point2f> dst_pts = {
427
+ cv::Point2f(0, 0),
428
+ cv::Point2f(resolution - 1, 0),
429
+ cv::Point2f(0, resolution - 1)
430
+ };
431
+
432
+ cv::Mat M = cv::getAffineTransform(src_pts, dst_pts);
433
+ cv::Mat M_inv;
434
+ cv::invertAffineTransform(M, M_inv);
435
+
436
+ cv::Mat cropped;
437
+ cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5));
438
+ cropped_rois.push_back(cropped);
439
+ affine_matrices.push_back(M_inv);
440
+ roi_boxes.push_back(points); // store the transformed box corners
441
+ }
442
+ }
443
+
444
+ std::vector<float> preprocess_imgs_to_nchw(const std::vector<cv::Mat>& imgs) {
445
+ int N = imgs.size();
446
+ if (N == 0) return {};
447
+
448
+ int H = 256;
449
+ int W = 256;
450
+ int C = 3; // assume 3 channels (BGR)
451
+
452
+ std::vector<float> output;
453
+ output.reserve(N * C * H * W);
454
+
455
+ for (int n = 0; n < N; ++n) {
456
+ cv::Mat img_float;
457
+ imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1]
458
+
459
+ // Split channels (HWC → CHW)
460
+ std::vector<cv::Mat> channels(3);
461
+ cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R
462
+
463
+ for (int c = 0; c < C; ++c) {
464
+ for (int i = 0; i < H; ++i) {
465
+ for (int j = 0; j < W; ++j) {
466
+ output.push_back(channels[c].at<float>(i, j));
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ return output; // shape: N x C x H x W
473
+ }
474
+
475
+ std::vector<cv::Point2f> denormalize_landmarks(
476
+ const std::vector<float>& normalized_landmarks,
477
+ const std::vector<cv::Mat>& affines,
478
+ int resolution = 256)
479
+ {
480
+ std::vector<cv::Point2f> output;
481
+
482
+ // check input sizes
483
+ const int num_faces = 1; // assume a single hand
484
+ const int num_landmarks = 21;
485
+ if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) {
486
+ std::cerr << "Error: Input size mismatch. Expected "
487
+ << num_faces * num_landmarks * 3 << " landmarks and "
488
+ << num_faces << " affine matrices." << std::endl;
489
+ throw std::runtime_error("Input size mismatch");
490
+ }
491
+
492
+ for (int i = 0; i < num_faces; ++i) {
493
+ const cv::Mat& affine = affines[i]; // 2x3 CV_32F
494
+ for (int j = 0; j < num_landmarks; ++j) {
495
+ int idx = i * num_landmarks * 3 + j * 3;
496
+ float x = normalized_landmarks[idx + 0] * resolution;
497
+ float y = normalized_landmarks[idx + 1] * resolution;
498
+ // float z = normalized_landmarks[idx + 2]; // optional
499
+
500
+ // 2x1 input vector
501
+ cv::Mat pt = (cv::Mat_<float>(2, 1) << x, y);
502
+
503
+ // extract the rotation and translation parts of the affine matrix
504
+ cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone();
505
+ cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone();
506
+ M2x2.convertTo(M2x2, CV_32F);
507
+ t2x1.convertTo(t2x1, CV_32F);
508
+
509
+ // apply the inverse affine transform
510
+ cv::Mat out = M2x2 * pt + t2x1;
511
+
512
+ // store as Point2f
513
+ output.emplace_back(out.at<float>(0, 0), out.at<float>(1, 0));
514
+ }
515
+ }
516
+
517
+ return output; // denormalized landmarks: 21 Point2f per hand
518
+ }
519
+
520
+
521
+ void draw_roi(cv::Mat& img, const std::vector<std::vector<cv::Point2f>>& boxes) {
522
+ for (const auto& roi : boxes) {
523
+ if (roi.size() < 4) continue;
524
+
525
+ const cv::Point2f& p1 = roi[0];
526
+ const cv::Point2f& p2 = roi[1];
527
+ const cv::Point2f& p3 = roi[2];
528
+ const cv::Point2f& p4 = roi[3];
529
+
530
+ cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // black
531
+ cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // green
532
+ cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // black
533
+ cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // black
534
+ }
535
+ }
536
+
537
+
538
+ void draw_detections(cv::Mat& img, const std::vector<std::vector<float>>& detections, bool with_keypoints = true) {
539
+ for (const auto& det : detections) {
540
+ if (det.size() < 4) continue;
541
+
542
+ float ymin = det[0];
543
+ float xmin = det[1];
544
+ float ymax = det[2];
545
+ float xmax = det[3];
546
+
547
+ cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1);
548
+
549
+ if (with_keypoints && det.size() > 4) {
550
+ int n_keypoints = (det.size() - 4) / 2;
551
+ for (int k = 0; k < n_keypoints; ++k) {
552
+ int kp_x = int(det[4 + k * 2]);
553
+ int kp_y = int(det[4 + k * 2 + 1]);
554
+ cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2);
555
+ }
556
+ }
557
+ }
558
+ }
559
+
560
+
561
+ std::vector<std::vector<float>> loadAnchors(const std::string& filename) {
562
+ std::ifstream in(filename);
563
+ std::vector<std::vector<float>> anchors;
564
+
565
+ if (!in.is_open()) {
566
+ std::cerr << "Failed to open file: " << filename << std::endl;
567
+ return anchors;
568
+ }
569
+
570
+ std::string line;
571
+ while (std::getline(in, line)) {
572
+ std::istringstream ss(line);
573
+ std::vector<float> anchor;
574
+ float value;
575
+ while (ss >> value) {
576
+ anchor.push_back(value);
577
+ }
578
+ if (!anchor.empty()) {
579
+ anchors.push_back(anchor);
580
+ }
581
+ }
582
+
583
+ in.close();
584
+ return anchors;
585
+ }
586
+
587
+ // sigmoid
588
+ float sigmoid(float x) {
589
+ return 1.0f / (1.0f + std::exp(-x));
590
+ }
591
+
592
+ // clamp
593
+ float clamp(float x, float min_val, float max_val) {
594
+ return std::max(min_val, std::min(max_val, x));
595
+ }
596
+
597
+ // shape: [batch, num_anchors, num_coords] => [batch][anchor][coord]
598
+ std::vector<std::vector<std::vector<float>>> decode_boxes(
599
+ const std::vector<float>& raw_boxes,
600
+ const std::vector<std::vector<float>>& anchors,
601
+ int batch, int num_anchors, int num_coords,
602
+ float x_scale, float y_scale, float w_scale, float h_scale,
603
+ int num_keypoints)
604
+ {
605
+ std::vector<std::vector<std::vector<float>>> decoded_boxes(batch,
606
+ std::vector<std::vector<float>>(num_anchors, std::vector<float>(num_coords, 0)));
607
+
608
+ for (int b = 0; b < batch; ++b) {
609
+ for (int i = 0; i < num_anchors; ++i) {
610
+ int base = b * num_anchors * num_coords + i * num_coords;
611
+
612
+ float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0];
613
+ float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1];
614
+ float w = raw_boxes[base + 2] / w_scale * anchors[i][2];
615
+ float h = raw_boxes[base + 3] / h_scale * anchors[i][3];
616
+
617
+ decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin
618
+ decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin
619
+ decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax
620
+ decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax
621
+
622
+ for (int k = 0; k < num_keypoints; ++k) {
623
+ int offset = 4 + k * 2;
624
+ float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0];
625
+ float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1];
626
+ decoded_boxes[b][i][offset] = keypoint_x;
627
+ decoded_boxes[b][i][offset + 1] = keypoint_y;
628
+ }
629
+ }
630
+ }
631
+
632
+ return decoded_boxes;
633
+ }
634
+
635
+ std::vector<std::vector<std::vector<float>>> tensors_to_detections(
636
+ const std::vector<float>& raw_box_tensor,
637
+ const std::vector<float>& raw_score_tensor,
638
+ const std::vector<std::vector<float>>& anchors,
639
+ int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints,
640
+ float x_scale, float y_scale, float w_scale, float h_scale,
641
+ float score_clipping_thresh, float min_score_thresh)
642
+ {
643
+ assert(raw_box_tensor.size() == batch * num_anchors * num_coords);
644
+ assert(raw_score_tensor.size() == batch * num_anchors * num_classes);
645
+ assert(anchors.size() == size_t(num_anchors));
646
+
647
+ auto detection_boxes = decode_boxes(
648
+ raw_box_tensor, anchors, batch, num_anchors, num_coords,
649
+ x_scale, y_scale, w_scale, h_scale, num_keypoints);
650
+
651
+ std::vector<std::vector<std::vector<float>>> output_detections;
652
+
653
+ for (int b = 0; b < batch; ++b) {
654
+ std::vector<std::vector<float>> detections;
655
+
656
+ for (int i = 0; i < num_anchors; ++i) {
657
+ int score_index = b * num_anchors * num_classes + i * num_classes;
658
+
659
+ // single-class case: take class 0
660
+ float score_raw = raw_score_tensor[score_index];
661
+ float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh));
662
+
663
+ if (score >= min_score_thresh) {
664
+ std::vector<float> det = detection_boxes[b][i]; // shape [num_coords]
665
+ det.push_back(score); // append the confidence score
666
+ detections.push_back(det); // shape [num_coords+1]
667
+ }
668
+ }
669
+
670
+ output_detections.push_back(detections); // one vector per batch element
671
+ }
672
+
673
+ return output_detections;
674
+ }
675
+
676
+
677
+ int invoke(const Args& args) {
678
+ std::cout << "Start main ... ... Model Path: " << args.faceDetector_model << "\n"
679
+ << args.faceLandmark_model << "\n"
680
+ << "Image Path: " << args.imgs << "\n"
681
+ << "Inference Nums: " << args.invoke_nums << "\n"
682
+ << "Model Type: " << args.model_type << "\n";
683
+ // =============================================================faceDetector_model start
684
+ Model* model1 = Model::create_instance(args.faceDetector_model);
685
+ if(model1 == nullptr){
686
+ printf("Create model1 failed !\n");
687
+ return EXIT_FAILURE;
688
+ }
689
+ Config* config1 = Config::create_instance();
690
+ if(config1 == nullptr){
691
+ printf("Create config1 failed !\n");
692
+ return EXIT_FAILURE;
693
+ }
694
+ config1->implement_type = ImplementType::TYPE_LOCAL;
695
+ std::string model_type_lower1 = to_lower(args.model_type);
696
+ if (model_type_lower1 == "qnn"){
697
+ config1->framework_type = FrameworkType::TYPE_QNN;
698
+ } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") {
699
+ config1->framework_type = FrameworkType::TYPE_SNPE2;
700
+ }
701
+ config1->accelerate_type = AccelerateType::TYPE_DSP;
702
+ config1->is_quantify_model = 1;
703
+
704
+ std::vector<std::vector<uint32_t>> input_shapes1 = {{1,3,256,256}};
705
+ std::vector<std::vector<uint32_t>> output_shapes1 = {{1,2944,18},{1,2944,1}};
706
+ model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
707
+ std::unique_ptr<Interpreter> fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1);
708
+ if(fast_interpreter1 == nullptr){
709
+ printf("build_interpretper_from_model_and_config failed !\n");
710
+ return EXIT_FAILURE;
711
+ }
712
+ int result = fast_interpreter1->init();
713
+ if(result != EXIT_SUCCESS){
714
+ printf("interpreter->init() failed !\n");
715
+ return EXIT_FAILURE;
716
+ }
717
+ // load model
718
+ fast_interpreter1->load_model();
719
+ if(result != EXIT_SUCCESS){
720
+ printf("interpreter->load_model() failed !\n");
721
+ return EXIT_FAILURE;
722
+ }
723
+ printf("detect model load success!\n");
724
+ // =============================================================faceDetector_model over
725
+
726
+ // =============================================================faceLandmark_model start
727
+ Model* model2 = Model::create_instance(args.faceLandmark_model);
728
+ if(model2 == nullptr){
729
+ printf("Create model2 failed !\n");
730
+ return EXIT_FAILURE;
731
+ }
732
+ Config* config2 = Config::create_instance();
733
+ if(config2 == nullptr){
734
+ printf("Create config2 failed !\n");
735
+ return EXIT_FAILURE;
736
+ }
737
+ config2->implement_type = ImplementType::TYPE_LOCAL;
738
+ std::string model_type_lower2 = to_lower(args.model_type);
739
+ if (model_type_lower2 == "qnn"){
740
+ config2->framework_type = FrameworkType::TYPE_QNN;
741
+ } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") {
742
+ config2->framework_type = FrameworkType::TYPE_SNPE2;
743
+ }
744
+ config2->accelerate_type = AccelerateType::TYPE_DSP;
745
+ config2->is_quantify_model = 1;
746
+
747
+ std::vector<std::vector<uint32_t>> input_shapes2 = {{1,3,256,256}};
748
+ std::vector<std::vector<uint32_t>> output_shapes2 = {{1},{1},{1,21,3}};
749
+ model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32);
750
+ std::unique_ptr<Interpreter> fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2);
751
+ if(fast_interpreter2 == nullptr){
752
+ printf("build_interpretper_from_model_and_config2 failed !\n");
753
+ return EXIT_FAILURE;
754
+ }
755
+ result = fast_interpreter2->init();
756
+ if(result != EXIT_SUCCESS){
757
+ printf("interpreter2->init() failed !\n");
758
+ return EXIT_FAILURE;
759
+ }
760
+ // load model
761
+ fast_interpreter2->load_model();
762
+ if(result != EXIT_SUCCESS){
763
+ printf("interpreter2->load_model() failed !\n");
764
+ return EXIT_FAILURE;
765
+ }
766
+ printf("detect model2 load success!\n");
767
+ // =============================================================faceLandmark_model over
768
+
769
+
770
+ auto anchors = load_anchors_from_npy("../anchors_float32.npy");
771
+ cv::Mat frame = cv::imread(args.imgs);
772
+ if (frame.empty()) {
773
+ printf("detect image load failed!\n");
774
+ return 1;
775
+ }
776
+ // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows);
777
+ cv::Mat input_data;
778
+ cv::Mat frame_clone1 = frame.clone();
779
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB);
780
+ cv::Mat frame_clone = frame.clone();
781
+
782
+
783
+ cv::Mat img1, img2;
784
+ float scale;
785
+ cv::Point pad;
786
+ std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1);
787
+ std::vector<float> input_tensor = preprocess_image(img1);
788
+
789
+ float *outdata0 = nullptr;
790
+ float *outdata1 = nullptr;
791
+ std::vector<float> invoke_time;
792
+ for (int i = 0; i < args.invoke_nums; ++i) {
793
+ result = fast_interpreter1->set_input_tensor(0, input_tensor.data());
794
+ if(result != EXIT_SUCCESS){
795
+ printf("interpreter->set_input_tensor() failed !\n");
796
+ return EXIT_FAILURE;
797
+ }
798
+ auto t1 = std::chrono::high_resolution_clock::now();
799
+ result = fast_interpreter1->invoke();
800
+ auto t2 = std::chrono::high_resolution_clock::now();
801
+ std::chrono::duration<double> cost_time = t2 - t1;
802
+ invoke_time.push_back(cost_time.count() * 1000);
803
+ if(result != EXIT_SUCCESS){
804
+ printf("interpreter->invoke() failed !\n");
805
+ return EXIT_FAILURE;
806
+ }
807
+ uint32_t out_data_0 = 0;
808
+ result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0);
809
+ if(result != EXIT_SUCCESS){
810
+ printf("interpreter1->get_output_tensor() 0 failed !\n");
811
+ return EXIT_FAILURE;
812
+ }
813
+
814
+ uint32_t out_data_1 = 0;
815
+ result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1);
816
+ if(result != EXIT_SUCCESS){
817
+ printf("interpreter1->get_output_tensor() 1 failed !\n");
818
+ return EXIT_FAILURE;
819
+ }
820
+
821
+ }
822
+
823
+ std::vector<float> tensor_1_896_16(outdata0, outdata0 + 2944*18);
824
+ std::vector<float> tensor_1_896_1(outdata1, outdata1 + 2944*1);
825
+
826
+ std::vector<std::vector<std::vector<float>>> detections = tensors_to_detections(
827
+ tensor_1_896_16, tensor_1_896_1, anchors,
828
+ batch, num_anchors, num_coords, num_classes, num_keypoints,
829
+ x_scale, y_scale, w_scale, h_scale,
830
+ score_clipping_thresh, min_score_thresh);
831
+
832
+
833
+ std::vector<std::vector<std::vector<float>>> filtered_detections;
834
+ for (size_t i = 0; i < detections.size(); ++i) {
835
+ std::vector<std::vector<float>>& dets = detections[i];
836
+ std::vector<std::vector<float>> faces = weighted_non_max_suppression(dets);
837
+ filtered_detections.push_back(faces);
838
+ }
839
+
840
+
841
+ // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n";
842
+ // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n";
843
+ std::vector<std::vector<float>> face_detections = denormalize_detections(filtered_detections[0], scale, pad);
844
+
845
+ // std::cout << "face_detections size: " << face_detections.size() << "\n";
846
+ std::vector<float> xc, yc, scales, theta;
847
+
848
+
849
+ detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0);
850
+ std::vector<cv::Mat> rois;
851
+ std::vector<cv::Mat> affines;
852
+ std::vector<std::vector<cv::Point2f>> boxes;
853
+
854
+ // std::cout << "xc size: " << xc.size() << ", yc size: " << yc.size() << ", scales size: " << scales.size() << ", theta size: " << theta.size() << "\n";
855
+ // std::cout << "xc: " << xc[0] << ", yc: " << yc[0] << ", scales: " << scales[0] << ", theta: " << theta[0] << "\n";
856
+ extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes);
857
+ if (!boxes.empty()) {
858
+ std::cout << "Detected " << boxes.size() << " faces.\n";
859
+ // hand detected, continue processing boxes[0] ...
860
+ std::vector<float> input_tensor = preprocess_imgs_to_nchw(rois);
861
+
862
+ // for (int i = 0; i < 5; ++i) {
863
+ // std::cout << "input_tensor:" << i << ": " << input_tensor[i] << std::endl;
864
+ // }
865
+
866
+ float *outdata1_0 = nullptr;
867
+ float *outdata1_1 = nullptr;
868
+
869
+ result = fast_interpreter2->set_input_tensor(0, input_tensor.data());
870
+ if(result != EXIT_SUCCESS){
871
+ printf("interpreter2->set_input_tensor() failed !\n");
872
+ return EXIT_FAILURE;
873
+ }
874
+ auto t1 = std::chrono::high_resolution_clock::now();
875
+ result = fast_interpreter2->invoke();
876
+ auto t2 = std::chrono::high_resolution_clock::now();
877
+ std::chrono::duration<double> cost_time = t2 - t1;
878
+ invoke_time.push_back(cost_time.count() * 1000);
879
+ if(result != EXIT_SUCCESS){
880
+ printf("interpreter2->invoke() failed !\n");
881
+ return EXIT_FAILURE;
882
+ }
883
+ uint32_t out_data_1_0 = 0;
884
+ result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0);
885
+ if(result != EXIT_SUCCESS){
886
+ printf("interpreter2->get_output_tensor() 0 failed !\n");
887
+ return EXIT_FAILURE;
888
+ }
889
+
890
+ uint32_t out_data_1_1 = 0;
891
+ result = fast_interpreter2->get_output_tensor(2, (void**)&outdata1_1, &out_data_1_1);
892
+ if(result != EXIT_SUCCESS){
893
+ printf("interpreter2->get_output_tensor() 1 failed !\n");
894
+ return EXIT_FAILURE;
895
+ }
896
+
897
+ std::vector<float> flags(outdata1_0, outdata1_0 + 1);
898
+ std::vector<float> normalized_landmarks(outdata1_1, outdata1_1 + 21*3);
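+ // The landmark model returns a hand-presence flag (output index 0) and 21 landmarks
+ // (x, y, z) normalized to the landmark crop (output index 2); the remaining output
+ // index is not read here.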
899
+
900
+ std::vector<cv::Point2f> landmarks = denormalize_landmarks(normalized_landmarks, affines);
901
+ draw_landmarks(frame_clone1, landmarks, flags, HAND_CONNECTIONS);
902
+ } else {
903
+ std::cout << "not detect face!" << std::endl;
904
+ }
905
+
906
+
907
+ draw_roi(frame_clone1, boxes);
908
+ draw_detections(frame_clone1, face_detections);
909
+ cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR);
910
+ cv::imwrite("vis_result.jpg", frame_clone1);
911
+
912
+
913
+ fast_interpreter1->destory();
914
+ fast_interpreter2->destory();
915
+ return 0;
916
+
917
+ }
918
+
919
+
920
+ int main(int argc, char* argv[]) {
921
+ Args args = parse_args(argc, argv);
922
+ return invoke(args);
923
+ }
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handDetctor_fp16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9773e6e34cd5ff5a6f78602b4229c0f1faa3e938d267f29e97c8fc3cf43a16
3
+ size 4243224
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/models/m_handLandmark_fp16.qnn216.ctx.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e303d20df51ba36ebe46213d6ee39b327b1dc9e52a546f2d12dd81ac4bfc3d7c
3
+ size 6796200
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc ADDED
Binary file (16.5 kB).
 
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc ADDED
Binary file (4.59 kB).
 
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/anchors_palm.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa4a27ad6bee24ba3185a42fe3a47115540b0b27fa5956a291f03756183b41
3
+ size 94336
model_farm_mediapipehand_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py ADDED
@@ -0,0 +1,513 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def resize_pad(img):
9
+ """ resize and pad images to be input to the detectors
10
+
11
+ The face and palm detector networks take 256x256 and 128x128 images
12
+ as input. As such the input image is padded and resized to fit the
13
+ size while maintaining the aspect ratio.
14
+
15
+ Returns:
16
+ img1: 256x256
17
+ img2: 128x128
18
+ scale: scale factor between original image and 256x256 image
19
+ pad: pixels of padding in the original image
20
+ """
21
+
22
+ size0 = img.shape
23
+ if size0[0]>=size0[1]:
24
+ h1 = 256
25
+ w1 = 256 * size0[1] // size0[0]
26
+ padh = 0
27
+ padw = 256 - w1
28
+ scale = size0[1] / w1
29
+ else:
30
+ h1 = 256 * size0[0] // size0[1]
31
+ w1 = 256
32
+ padh = 256 - h1
33
+ padw = 0
34
+ scale = size0[0] / h1
35
+ padh1 = padh//2
36
+ padh2 = padh//2 + padh%2
37
+ padw1 = padw//2
38
+ padw2 = padw//2 + padw%2
39
+ img1 = cv2.resize(img, (w1,h1))
40
+ img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0)))
41
+ pad = (int(padh1 * scale), int(padw1 * scale))
42
+ img2 = cv2.resize(img1, (128,128))
43
+ return img1, img2, scale, pad
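+ # Worked example: a 480x640 (HxW) frame takes the else branch, giving h1 = 192, w1 = 256,
+ # padh = 64, padw = 0; img1 is the padded 256x256 image, img2 its 128x128 resize,
+ # scale = 480/192 = 2.5 and pad = (80, 0) in original-image pixels.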
44
+
45
+
46
+ def denormalize_detections(detections, scale, pad):
47
+ """ maps detection coordinates from [0,1] to image coordinates
48
+
49
+ The face and palm detector networks take 256x256 and 128x128 images
50
+ as input. As such the input image is padded and resized to fit the
51
+ size while maintaining the aspect ratio. This function maps the
52
+ normalized coordinates back to the original image coordinates.
53
+
54
+ Inputs:
55
+ detections: nxm tensor. n is the number of detections.
56
+ m is 4+2*k where the first 4 values are the bounding
57
+ box coordinates and k is the number of additional
58
+ keypoints output by the detector.
59
+ scale: scalar that was used to resize the image
60
+ pad: padding in the x and y dimensions
61
+
62
+ """
63
+ detections[:, 0] = detections[:, 0] * scale * 256 - pad[0]
64
+ detections[:, 1] = detections[:, 1] * scale * 256 - pad[1]
65
+ detections[:, 2] = detections[:, 2] * scale * 256 - pad[0]
66
+ detections[:, 3] = detections[:, 3] * scale * 256 - pad[1]
67
+
68
+ detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1]
69
+ detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0]
70
+ return detections
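+ # Example: with scale = 2.5 and pad = (80, 0) from resize_pad, a normalized ymin of 0.5
+ # maps back to 0.5 * 2.5 * 256 - 80 = 240 pixels in the original image.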
71
+
72
+
73
+
74
+
75
+ class BlazeBlock(nn.Module):
76
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False):
77
+ super(BlazeBlock, self).__init__()
78
+
79
+ self.stride = stride
80
+ self.kernel_size = kernel_size
81
+ self.channel_pad = out_channels - in_channels
82
+
83
+ # TFLite uses slightly different padding than PyTorch
84
+ # on the depthwise conv layer when the stride is 2.
85
+ if stride == 2:
86
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
87
+ padding = 0
88
+ else:
89
+ padding = (kernel_size - 1) // 2
90
+
91
+ self.convs = nn.Sequential(
92
+ nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
93
+ kernel_size=kernel_size, stride=stride, padding=padding,
94
+ groups=in_channels, bias=True),
95
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
96
+ kernel_size=1, stride=1, padding=0, bias=True),
97
+ )
98
+
99
+ if skip_proj:
100
+ self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
101
+ kernel_size=1, stride=1, padding=0, bias=True)
102
+ else:
103
+ self.skip_proj = None
104
+
105
+ if act == 'relu':
106
+ self.act = nn.ReLU(inplace=True)
107
+ elif act == 'prelu':
108
+ self.act = nn.PReLU(out_channels)
109
+ else:
110
+ raise NotImplementedError("unknown activation %s"%act)
111
+
112
+ def forward(self, x):
113
+ if self.stride == 2:
114
+ if self.kernel_size==3:
115
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
116
+ else:
117
+ h = F.pad(x, (1, 2, 1, 2), "constant", 0)
118
+ x = self.max_pool(x)
119
+ else:
120
+ h = x
121
+
122
+ if self.skip_proj is not None:
123
+ x = self.skip_proj(x)
124
+ elif self.channel_pad > 0:
125
+ x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
126
+
127
+
128
+ return self.act(self.convs(h) + x)
129
+
130
+
131
+ class FinalBlazeBlock(nn.Module):
132
+ def __init__(self, channels, kernel_size=3):
133
+ super(FinalBlazeBlock, self).__init__()
134
+
135
+ # TFLite uses slightly different padding than PyTorch
136
+ # on the depthwise conv layer when the stride is 2.
137
+ self.convs = nn.Sequential(
138
+ nn.Conv2d(in_channels=channels, out_channels=channels,
139
+ kernel_size=kernel_size, stride=2, padding=0,
140
+ groups=channels, bias=True),
141
+ nn.Conv2d(in_channels=channels, out_channels=channels,
142
+ kernel_size=1, stride=1, padding=0, bias=True),
143
+ )
144
+
145
+ self.act = nn.ReLU(inplace=True)
146
+
147
+ def forward(self, x):
148
+ h = F.pad(x, (0, 2, 0, 2), "constant", 0)
149
+
150
+ return self.act(self.convs(h))
151
+
152
+
153
+ class BlazeBase(nn.Module):
154
+ """ Base class for media pipe models. """
155
+
156
+ def _device(self):
157
+ """Which device (CPU or GPU) is being used by this model?"""
158
+ return self.classifier_8.weight.device
159
+
160
+ def load_weights(self, path):
161
+ self.load_state_dict(torch.load(path))
162
+ self.eval()
163
+
164
+
165
+ class BlazeLandmark(BlazeBase):
166
+ """ Base class for landmark models. """
167
+
168
+ def extract_roi(self, frame, xc, yc, theta, scale):
169
+
170
+ # take points on unit square and transform them according to the roi
171
+ points = torch.tensor([[-1, -1, 1, 1],
172
+ [-1, 1, -1, 1]], device=scale.device).view(1,2,4)
173
+ points = points * scale.view(-1,1,1)/2
174
+ theta = theta.view(-1, 1, 1)
175
+ R = torch.cat((
176
+ torch.cat((torch.cos(theta), -torch.sin(theta)), 2),
177
+ torch.cat((torch.sin(theta), torch.cos(theta)), 2),
178
+ ), 1)
179
+ center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1)
180
+ points = R @ points + center
181
+
182
+ # use the points to compute the affine transform that maps
183
+ # these points back to the output square
184
+ res = self.resolution
185
+ points1 = np.array([[0, 0, res-1],
186
+ [0, res-1, 0]], dtype=np.float32).T
187
+ affines = []
188
+ imgs = []
189
+ for i in range(points.shape[0]):
190
+ pts = points[i, :, :3].cpu().numpy().T
191
+ M = cv2.getAffineTransform(pts, points1)
192
+ img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5)
193
+ img = torch.tensor(img, device=scale.device)
194
+ imgs.append(img)
195
+ affine = cv2.invertAffineTransform(M).astype('float32')
196
+ affine = torch.tensor(affine, device=scale.device)
197
+ affines.append(affine)
198
+ if imgs:
199
+ imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0
200
+ affines = torch.stack(affines)
201
+ else:
202
+ imgs = torch.zeros((0, 3, res, res), device=scale.device)
203
+ affines = torch.zeros((0, 2, 3), device=scale.device)
204
+
205
+ return imgs, affines, points
206
+
207
+ def denormalize_landmarks(self, landmarks, affines):
208
+ landmarks[:,:,:2] *= self.resolution
209
+ for i in range(len(landmarks)):
210
+ landmark, affine = landmarks[i], affines[i]
211
+ landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T
212
+ landmarks[i,:,:2] = landmark
213
+ return landmarks
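+ # The landmarks arrive in normalized crop coordinates; multiplying by self.resolution
+ # and applying the stored inverse affine (computed in extract_roi) maps them back onto
+ # the original frame.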
214
+
215
+
216
+
217
+ class BlazeDetector(BlazeBase):
218
+ """ Base class for detector models.
219
+
220
+ Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
221
+ https://github.com/hollance/BlazeFace-PyTorch and
222
+ https://github.com/google/mediapipe/
223
+ """
224
+ def load_anchors(self, path):
225
+ self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
226
+ assert(self.anchors.ndimension() == 2)
227
+ assert(self.anchors.shape[0] == self.num_anchors)
228
+ assert(self.anchors.shape[1] == 4)
229
+
230
+ def _preprocess(self, x):
231
+ """Converts the image pixels to the range [-1, 1]."""
232
+ return x.float() / 255.# 127.5 - 1.0
233
+
234
+ def predict_on_image(self, img):
235
+ """Makes a prediction on a single image.
236
+
237
+ Arguments:
238
+ img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
239
+ shape (3, H, W). The image's height and width should be
240
+ 128 pixels.
241
+
242
+ Returns:
243
+ A tensor with face detections.
244
+ """
245
+ if isinstance(img, np.ndarray):
246
+ img = torch.from_numpy(img).permute((2, 0, 1))
247
+
248
+ return self.predict_on_batch(img.unsqueeze(0))[0]
249
+
250
+ def predict_on_batch(self, x):
251
+ """Makes a prediction on a batch of images.
252
+
253
+ Arguments:
254
+ x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
255
+ shape (b, 3, H, W). The height and width should be 128 pixels.
256
+
257
+ Returns:
258
+ A list containing a tensor of face detections for each image in
259
+ the batch. If no faces are found for an image, returns a tensor
260
+ of shape (0, 17).
261
+
262
+ Each face detection is a PyTorch tensor consisting of 17 numbers:
263
+ - ymin, xmin, ymax, xmax
264
+ - x,y-coordinates for the 6 keypoints
265
+ - confidence score
266
+ """
267
+ if isinstance(x, np.ndarray):
268
+ x = torch.from_numpy(x).permute((0, 3, 1, 2))
269
+
270
+ assert x.shape[1] == 3
271
+ assert x.shape[2] == self.y_scale
272
+ assert x.shape[3] == self.x_scale
273
+
274
+ # 1. Preprocess the images into tensors:
275
+ x = x.to(self._device())
276
+ x = self._preprocess(x)
277
+
278
+ # 2. Run the neural network:
279
+ with torch.no_grad():
280
+ out = self.__call__(x)
281
+
282
+ # 3. Postprocess the raw predictions:
283
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
284
+
285
+ # 4. Non-maximum suppression to remove overlapping detections:
286
+ filtered_detections = []
287
+ for i in range(len(detections)):
288
+ faces = self._weighted_non_max_suppression(detections[i])
289
+ faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1))
290
+ filtered_detections.append(faces)
291
+
292
+ return filtered_detections
293
+
294
+
295
+ def detection2roi(self, detection):
296
+ """ Convert detections from detector to an oriented bounding box.
297
+
298
+ Adapted from:
299
+ # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt
300
+
301
+ The center and size of the box is calculated from the center
302
+ of the detected box. Rotation is calculated from the vector
303
+ between kp1 and kp2 relative to theta0. The box is scaled
304
+ and shifted by dscale and dy.
305
+
306
+ """
307
+ if self.detection2roi_method == 'box':
308
+ # compute box center and scale
309
+ # use mediapipe/calculators/util/detections_to_rects_calculator.cc
310
+ xc = (detection[:,1] + detection[:,3]) / 2
311
+ yc = (detection[:,0] + detection[:,2]) / 2
312
+ scale = (detection[:,3] - detection[:,1]) # assumes square boxes
313
+
314
+ elif self.detection2roi_method == 'alignment':
315
+ # compute box center and scale
316
+ # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc
317
+ xc = detection[:,4+2*self.kp1]
318
+ yc = detection[:,4+2*self.kp1+1]
319
+ x1 = detection[:,4+2*self.kp2]
320
+ y1 = detection[:,4+2*self.kp2+1]
321
+ scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2
322
+ else:
323
+ raise NotImplementedError(
324
+ "detection2roi_method [%s] not supported"%self.detection2roi_method)
325
+
326
+ yc += self.dy * scale
327
+ scale *= self.dscale
328
+
329
+ # compute box rotation
330
+ x0 = detection[:,4+2*self.kp1]
331
+ y0 = detection[:,4+2*self.kp1+1]
332
+ x1 = detection[:,4+2*self.kp2]
333
+ y1 = detection[:,4+2*self.kp2+1]
334
+ #theta = np.arctan2(y0-y1, x0-x1) - self.theta0
335
+ theta = torch.atan2(y0-y1, x0-x1) - self.theta0
336
+ return xc, yc, scale, theta
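+ # Summary: the ROI is a square centered on the detection (box center or keypoint kp1),
+ # sized from the box width or 2x the kp1-kp2 distance, rotated by
+ # atan2(y0 - y1, x0 - x1) - theta0, then shifted by dy*scale along y and scaled by dscale.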
337
+
338
+
339
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
340
+ """The output of the neural network is a tensor of shape (b, 896, 16)
341
+ containing the bounding box regressor predictions, as well as a tensor
342
+ of shape (b, 896, 1) with the classification confidences.
343
+
344
+ This function converts these two "raw" tensors into proper detections.
345
+ Returns a list of (num_detections, 17) tensors, one for each image in
346
+ the batch.
347
+
348
+ This is based on the source code from:
349
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
350
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
351
+ """
352
+ assert raw_box_tensor.ndimension() == 3
353
+ assert raw_box_tensor.shape[1] == self.num_anchors
354
+ assert raw_box_tensor.shape[2] == self.num_coords
355
+
356
+ assert raw_score_tensor.ndimension() == 3
357
+ assert raw_score_tensor.shape[1] == self.num_anchors
358
+ assert raw_score_tensor.shape[2] == self.num_classes
359
+
360
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
361
+
362
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
363
+
364
+ thresh = self.score_clipping_thresh
365
+ raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
366
+ detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
367
+
368
+ # Note: we stripped off the last dimension from the scores tensor
369
+ # because there is only one class. Now we can simply use a mask
370
+ # to filter out the boxes with too low confidence.
371
+ mask = detection_scores >= self.min_score_thresh
372
+
373
+ # Because each image from the batch can have a different number of
374
+ # detections, process them one at a time using a loop.
375
+ output_detections = []
376
+ for i in range(raw_box_tensor.shape[0]):
377
+ boxes = detection_boxes[i, mask[i]]
378
+ scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
379
+ output_detections.append(torch.cat((boxes, scores), dim=-1))
380
+
381
+ return output_detections
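+ # Each tensor in the returned list has shape (num_detections, num_coords + 1): the
+ # decoded box and keypoints followed by the sigmoid confidence as the last column.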
382
+
383
+ def _decode_boxes(self, raw_boxes, anchors):
384
+ """Converts the predictions into actual coordinates using
385
+ the anchor boxes. Processes the entire batch at once.
386
+ """
387
+ boxes = torch.zeros_like(raw_boxes)
388
+
389
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
390
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
391
+
392
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
393
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
394
+
395
+ boxes[..., 0] = y_center - h / 2. # ymin
396
+ boxes[..., 1] = x_center - w / 2. # xmin
397
+ boxes[..., 2] = y_center + h / 2. # ymax
398
+ boxes[..., 3] = x_center + w / 2. # xmax
399
+
400
+ for k in range(self.num_keypoints):
401
+ offset = 4 + k*2
402
+ keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0]
403
+ keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
404
+ boxes[..., offset ] = keypoint_x
405
+ boxes[..., offset + 1] = keypoint_y
406
+
407
+ return boxes
408
+
409
+ def _weighted_non_max_suppression(self, detections):
410
+ """The alternative NMS method as mentioned in the BlazeFace paper:
411
+
412
+ "We replace the suppression algorithm with a blending strategy that
413
+ estimates the regression parameters of a bounding box as a weighted
414
+ mean between the overlapping predictions."
415
+
416
+ The original MediaPipe code assigns the score of the most confident
417
+ detection to the weighted detection, but we take the average score
418
+ of the overlapping detections.
419
+
420
+ The input detections should be a Tensor of shape (count, 17).
421
+
422
+ Returns a list of PyTorch tensors, one for each detected face.
423
+
424
+ This is based on the source code from:
425
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
426
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
427
+ """
428
+ if len(detections) == 0: return []
429
+
430
+ output_detections = []
431
+
432
+ # Sort the detections from highest to lowest score.
433
+ remaining = torch.argsort(detections[:, self.num_coords], descending=True)
434
+
435
+ while len(remaining) > 0:
436
+ detection = detections[remaining[0]]
437
+
438
+ # Compute the overlap between the first box and the other
439
+ # remaining boxes. (Note that the other_boxes also include
440
+ # the first_box.)
441
+ first_box = detection[:4]
442
+ other_boxes = detections[remaining, :4]
443
+ ious = overlap_similarity(first_box, other_boxes)
444
+
445
+ # If two detections don't overlap enough, they are considered
446
+ # to be from different faces.
447
+ mask = ious > self.min_suppression_threshold
448
+ overlapping = remaining[mask]
449
+ remaining = remaining[~mask]
450
+
451
+ # Take an average of the coordinates from the overlapping
452
+ # detections, weighted by their confidence scores.
453
+ weighted_detection = detection.clone()
454
+ if len(overlapping) > 1:
455
+ coordinates = detections[overlapping, :self.num_coords]
456
+ scores = detections[overlapping, self.num_coords:self.num_coords+1]
457
+ total_score = scores.sum()
458
+ weighted = (coordinates * scores).sum(dim=0) / total_score
459
+ weighted_detection[:self.num_coords] = weighted
460
+ weighted_detection[self.num_coords] = total_score / len(overlapping)
461
+
462
+ output_detections.append(weighted_detection)
463
+
464
+ return output_detections
465
+
466
+
467
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
468
+
469
+ def intersect(box_a, box_b):
470
+ """ We resize both tensors to [A,B,2] without new malloc:
471
+ [A,2] -> [A,1,2] -> [A,B,2]
472
+ [B,2] -> [1,B,2] -> [A,B,2]
473
+ Then we compute the area of intersect between box_a and box_b.
474
+ Args:
475
+ box_a: (tensor) bounding boxes, Shape: [A,4].
476
+ box_b: (tensor) bounding boxes, Shape: [B,4].
477
+ Return:
478
+ (tensor) intersection area, Shape: [A,B].
479
+ """
480
+ A = box_a.size(0)
481
+ B = box_b.size(0)
482
+ max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
483
+ box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
484
+ min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
485
+ box_b[:, :2].unsqueeze(0).expand(A, B, 2))
486
+ inter = torch.clamp((max_xy - min_xy), min=0)
487
+ return inter[:, :, 0] * inter[:, :, 1]
488
+
489
+
490
+ def jaccard(box_a, box_b):
491
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
492
+ is simply the intersection over union of two boxes. Here we operate on
493
+ ground truth boxes and default boxes.
494
+ E.g.:
495
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
496
+ Args:
497
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
498
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
499
+ Return:
500
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
501
+ """
502
+ inter = intersect(box_a, box_b)
503
+ area_a = ((box_a[:, 2]-box_a[:, 0]) *
504
+ (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
505
+ area_b = ((box_b[:, 2]-box_b[:, 0]) *
506
+ (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
507
+ union = area_a + area_b - inter
508
+ return inter / union # [A,B]
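+ # Example: box_a = [[0, 0, 1, 1]] and box_b = [[0, 0.5, 1, 1.5]] overlap over an area
+ # of 0.5; each box has area 1, so the returned IoU is 0.5 / 1.5 = 1/3.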
509
+
510
+
511
+ def overlap_similarity(box, other_boxes):
512
+ """Computes the IOU between a bounding box and set of other boxes."""
513
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)