diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..85cfee69e0412a68ebbcf4fd822d65022339186e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg filter=lfs diff=lfs merge=lfs -text
+model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..19042355074f86c6cb4409726a1e7319d5c82f29
--- /dev/null
+++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/README.md
@@ -0,0 +1,63 @@
+## Model Information
+### Source model
+- Input shape: 256x256
+- Number of parameters: 0.13M, 0.6M
+- Model size: 0.58MB, 2.32MB
+- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]
+
+Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+### Converted model
+
+- Precision: INT16
+- Backend: QNN2.16
+- Target Device: FV01 QCS6490
+
+## Inference with AidLite SDK
+
+### SDK installation
+Model Farm uses the AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
+
+- Install AidLite SDK
+
+```bash
+# Install the appropriate version of the AidLite SDK
+sudo aid-pkg update
+sudo aid-pkg install aidlite-sdk
+# Install the QNN variant that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+sudo aid-pkg install aidlite-{QNN VERSION}
+```
+
+- Verify AidLite SDK
+
+```bash
+# Check the AidLite SDK C++ library version
+python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+# Check the AidLite SDK Python library version
+python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+```
+
+### Run demo
+#### python
+```bash
+cd python
+python3 demo_qnn.py
+```
+
+#### c++
+```bash
+# The cnpy library is required for loading the .npy anchor files (run these commands from the default terminal path)
+git clone https://github.com/rogersce/cnpy.git
+cd cnpy
+mkdir build && cd build
+cmake ..
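+# cnpy installs its header and library under /usr/local by default, which is
+# where cpp/CMakeLists.txt looks for them (include_directories/link_directories)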
+make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp +mkdir build && cd build +cmake .. +make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy 
REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e046e317fa11e6f130903db87f29b4cef6b4ae5 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, 
{397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetctor_w8a16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + // if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + // } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + // if (i0 < points.size() && i1 < points.size() && + // i0 < flags.size() && i1 < flags.size() && + // flags[i0] > threshold && flags[i1] > threshold) + // { + cv::line(img, points[i0], points[i1], line_color, size); + // } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, 
cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; 
i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + 
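+    // Note: in this demo the ROI crops passed in by invoke() come from frame_clone1,
+    // which has already been converted BGR -> RGB, so the planes produced by
+    // cv::split() below are actually R, G, B.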
std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); 
+ return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
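+        // Landmark stage, for the detected face:
+        //   1. preprocess_imgs_to_nchw() converts each 192x192 ROI crop into a
+        //      normalized NCHW float tensor in [0, 1];
+        //   2. fast_interpreter2 runs the face-landmark context binary and returns
+        //      a confidence flag plus 468 normalized (x, y, z) landmarks;
+        //   3. denormalize_landmarks() maps the landmarks back to the original image
+        //      through the inverse affine matrices produced by extract_roi().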
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9fe34feb756f128a94ee424dacfec09bb01a811 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazeface_landmark.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c529987e67f82e58a608a394aabf245a3afa19ac2f761981894f70b4df9fdca +size 2439235 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c31116b779d40ad7f2ae3bd8d633370af894d1 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/blazefaceback.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9658c6459c5d5450d7da9d5fbb74b3beca11157f4cdb35e4d948aa6b4efc0ded 
+size 594825 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt new file mode 100644 index 0000000000000000000000000000000000000000..524cbb24eb52d05a050437f9dd5fe735c349bd72 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56336b04831d9f9f41bdcddcd4598e5660a2925451ee50da634fea6598ce6620 +size 855238 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..f4815d40ae5719217efb7af7d1c4859163f1fc3b --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceDetctor_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f7e7016506a415bb7e02aaf9469a5fd406d31bb7349d3ae0fe97f1a0cb3b9a +size 728616 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt new file mode 100644 index 0000000000000000000000000000000000000000..d280840347f2fbc83daff7553385fc689ffdd848 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96374d173e67c5c3690b75d030b729e23e41de6b1a1ebd5daef7ff3992118c54 +size 2643322 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..363b5bb1c4c319e12c8545f826ed6b43a11c3a03 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61316298a6690650feea876b64b2efe520940d753af3264202689b12dd1c779e +size 1096800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..811afcc48481b7da1cc1417a349a3d0f8bf7dab3 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/0000.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b278e84efa32b0e25d982219d31438f74a73b58af62b7f4751df3076221078 +size 173585 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc 
b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
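+    The returned scale and pad are what denormalize_detections() uses to map
+    detections back into the original image coordinates.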
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
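+        # (Padding is therefore applied manually in forward(), biased toward the
+        # right/bottom, to reproduce TFLite's asymmetric 'SAME' padding.)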
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
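+
+    Concrete landmark models are expected to define self.resolution; extract_roi()
+    crops and rotates each detection into a resolution x resolution patch, and
+    denormalize_landmarks() maps the predicted landmarks back through the inverse
+    affine transforms returned alongside the crops.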
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
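For orientation, a minimal usage sketch of the class defined here. The weight and anchor paths are the ones `export_jit.py` in this diff uses; running it therefore assumes those files are present, and the random input is only a shape check.

```python
import torch
from blazeface import BlazeFace

net = BlazeFace(back_model=True)                      # the 256x256 "back" variant
net.load_weights("../models/blazefaceback.pth")       # same paths as export_jit.py
net.load_anchors("../models/anchors_face_back.npy")

x = torch.randn(1, 3, 256, 256)                       # dummy input, shapes only
with torch.no_grad():
    raw_boxes, raw_scores = net(x)                    # forward() returns [boxes, scores]
print(raw_boxes.shape, raw_scores.shape)              # (1, 896, 16) and (1, 896, 1)
```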
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
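The manual padding mentioned in the comment above reproduces TFLite's asymmetric "SAME" padding for a stride-2, 5x5 convolution. A quick standalone shape check, assuming the 256x256 back model:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.zeros(1, 3, 256, 256)
x = F.pad(x, (1, 2, 1, 2), "constant", 0)              # pad W by (1, 2) and H by (1, 2) -> 259x259
conv = nn.Conv2d(3, 24, kernel_size=5, stride=2, padding=0)
print(conv(x).shape)                                    # torch.Size([1, 24, 128, 128])
```

Downstream, the 16x16 feature map contributes 16*16*2 = 512 anchors and the 8x8 map contributes 8*8*6 = 384, which is where the 896 anchors assumed by the decoder come from.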
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..eb902b5d3886ae0e09f7433f6d95a05c076b78a9 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/demo_qnn.py @@ -0,0 +1,389 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetctor_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
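+        # Note: TYPE_QNN selects the QNN backend described in the README, TYPE_DSP
+        # runs inference on the Hexagon DSP, and is_quantify_model (set just below)
+        # marks the loaded context binary as a quantized model.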
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + self.interpreter.invoke() + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + self.interpreter.invoke() + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] 
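The preprocessing above can be summarized in one hypothetical helper (a sketch only; `preprocess_for_detector` is not a function in this repo, and it reuses `resize_pad` from blazebase.py exactly as the demo does):

```python
import cv2
import numpy as np
from blazebase import resize_pad

def preprocess_for_detector(image_path):
    """Hypothetical helper mirroring the demo: BGR->RGB, letterbox to 256x256,
    scale to [0, 1], HWC->NCHW float32 with a batch dimension."""
    image = cv2.imread(image_path)
    frame = np.ascontiguousarray(image[:, :, ::-1])      # BGR -> RGB
    img1, img2, scale, pad = resize_pad(frame)           # img1 is the 256x256 letterboxed image
    x = (img1 / 255).astype(np.float32)                  # normalize to [0, 1]
    x = np.transpose(x, (2, 0, 1))[np.newaxis, ...]      # (1, 3, 256, 256)
    return x, frame, scale, pad
```

The returned `scale` and `pad` are what `denormalize_detections` later uses to map boxes back into the original frame.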
+t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +print(f"face detction inference_time:{use_time} ms") +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + print(f"landmark inference_time:{use_time} ms") + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..d2bfb877b73ba9f7826a20ae51ea5dcdddecf2ff --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/export_jit.py @@ -0,0 +1,57 @@ +import torch +import os +from typing import Callable, Tuple +from blazeface import BlazeFace +from blazeface_landmark import BlazeFaceLandmark + +class FaceDetector(torch.nn.Module): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + anchors: torch.Tensor, + ): + super().__init__() + self.detector = detector + self.anchors = anchors + + def forward(self, image): + return self.detector(image) + +back_detector = True +face_detector = BlazeFace(back_model=back_detector) +face_detector.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazefaceback.pth")) +face_detector.load_anchors(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/anchors_face_back.npy")) +face_detect = FaceDetector(face_detector,face_detector.anchors) +num_params = sum(p.numel() for p in face_detect.parameters() if p.requires_grad) +print(f'Number of face_detect parameters: {num_params}') + +face_d_in = torch.randn(1, 3, 256, 256,dtype= torch.float32) +source_model = torch.jit.trace(face_detect,face_d_in) +source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/m_faceDetctor.pt")) +print("export face detect ok!") + + + + + +class FaceLandmarkDetector(torch.nn.Module): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + ): + super().__init__() + self.detector = detector + + def forward(self, image): + return self.detector(image) + +face_regressor = BlazeFaceLandmark() +face_regressor.load_weights(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/blazeface_landmark.pth")) 
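An optional sanity check after the detector trace above (a sketch that assumes it is appended to this script, so `face_detect` and `os` are in scope, and that `m_faceDetctor.pt` is the file just saved). Since tracing only records the same ops, the comparison should normally print True for both outputs:

```python
import torch

traced = torch.jit.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     "../models/m_faceDetctor.pt"))
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    boxes_eager, scores_eager = face_detect(x)     # eager wrapper defined above
    boxes_traced, scores_traced = traced(x)        # reloaded TorchScript module
print(torch.allclose(boxes_eager, boxes_traced, atol=1e-5),
      torch.allclose(scores_eager, scores_traced, atol=1e-5))
```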
+face_regres = FaceLandmarkDetector(face_regressor) +num_params = sum(p.numel() for p in face_regres.parameters() if p.requires_grad) +print(f'Number of face_regres parameters: {num_params}') + +face_r_in = torch.randn(1, 3, 192, 192,dtype= torch.float32) +source_model = torch.jit.trace(face_regres, face_r_in) +source_model.save(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../models/m_faceLandmark.pt")) +print("export face landmark ok!") \ No newline at end of file diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. 
+ (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. + (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7fb769136de5f3805e830684440b3b5897d99aec --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: INT8 +- Backend: QNN2.16 +- Target Device: FV01 QCS6490 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp +mkdir build && cd build +cmake .. 
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15a817e6b6b8d96cb1edff9e2ee5dae04e7fccde --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 
398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a8.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a8.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + 
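+    // Worked example: a 640x480 input resizes to 256x192, so padh = 64 and 32 px of
+    // zero padding go on the top and bottom before the final 256x256 letterbox.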
int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? 
inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] 
+ for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for 
(int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& 
raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
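+        // Pack the extracted ROI into a 1x3x192x192 NCHW tensor and run the
+        // landmark model: it returns a face-presence flag and 468 normalized
+        // 3D landmarks, which are mapped back onto the original image using
+        // the inverse affine transforms produced by extract_roi().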
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..b365b40ca2fa478ae2e677871408350abc5cf355 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ccf2e3a2ee4ff2adf15ea7b00b453bb1a0a183ebd764e8542eb9d56182191d +size 720424 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..56eeb3e249c0a2205e5ba5bc37379b35e9323b12 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355785d3eeb5a26ad29e3b128d803d3f20b443e01bed3249ff4013ac57d634b4 +size 1068128 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. + + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. 
+ scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
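+            # Example: for two overlapping boxes with scores 0.9 and 0.6, the
+            # blended coordinates are (0.9*box_a + 0.6*box_b) / 1.5 and the
+            # reported confidence is the mean score (0.9 + 0.6) / 2 = 0.75.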
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
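+        # F.pad takes (left, right, top, bottom): TFLite's SAME padding for the
+        # 5x5 stride-2 stem conv pads 1 pixel before and 2 after in each spatial
+        # dimension, which PyTorch's symmetric conv padding cannot express.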
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..36a0bca99d904b047a4fb4dc4bcec0b93a4651dd --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/demo_qnn.py @@ -0,0 +1,424 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
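+        # Run the QNN context binary on the DSP (HTP) backend; the flag below
+        # tells AidLite the context was exported from a quantized (w8a8) model.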
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +# anchors_np = anchors.cpu().numpy().astype(np.float32) +# np.save("anchors_float32.npy", anchors_np) + +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs6490_qnn2.16_int8_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin 
= detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..72fa909e339bab96870b8548fed5822dbc87d6fd --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: FP16 +- Backend: QNN2.16 +- Target Device: SNM972 QCS8550 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp +mkdir build && cd build +cmake .. 
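+# If cnpy or OpenCV were installed to a non-default prefix, you may need to
+# pass -DCMAKE_PREFIX_PATH=<install prefix> so cmake can find them.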
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS 
"-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..536aca649b9d8cbd0d95d2022cbdc32c40b1ed4a --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 
93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_fp16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_fp16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), 
static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - 
pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to 
[0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + 
return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
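+        // A face was found: convert each cropped 192x192 ROI into a normalized NCHW float tensor,
+        // run the landmark model on it, and map the 468 normalized landmarks back to the original
+        // image space using the inverse affine matrices produced by extract_roi().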
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..839433e794ebb61e5e0166b62966f383e6c43d60 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceDetector_fp16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751919064bb2fe2682c29f514a76318d7f2d6518013cbe88e1b2c4cef8b2bb20 +size 668864 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d80e19b1013f1b872011af7abf82f3850271938 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/models/m_faceLandmark_fp16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba9a12397917cef8dcf20df00d5855fa6be79789e2bf2ae1cbd24ccf32ee666 +size 1674312 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ 
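For reference, the C++ demo above (run_test.cpp) accepts a few optional command-line flags parsed in parse_args; the defaults are the values in the Args struct, so when run from the cpp/build directory an explicit invocation would look like the following. The annotated output is written to vis_result.jpg.

```bash
./run_test \
  --faceDetector_model ../../models/m_faceDetector_fp16.qnn216.ctx.bin \
  --faceLandmark_model ../../models/m_faceLandmark_fp16.qnn216.ctx.bin \
  --imgs ../coco.jpg \
  --invoke_nums 10 \
  --model_type QNN
```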
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
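+    For example, a 640x480 (WxH) frame is resized to 256x192, padded with 32 rows of
+    zeros on the top and bottom to reach 256x256, and returns scale = 480/192 = 2.5
+    and pad = (80, 0) in original-image pixels.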
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
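+        # For stride 2, forward() pads the input asymmetrically before the depthwise conv
+        # ((0, 2, 0, 2) for a 3x3 kernel), and the max_pool defined below downsamples the
+        # skip branch to the matching spatial size.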
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
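+        # For the 256x256 back-model input this pad gives 259x259, and the stride-2 5x5 conv
+        # in the backbone then produces 128x128 feature maps, matching TFLite's SAME padding.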
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..4bacffe80b7dfb4b20ed6fe45557496d0a02925a --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/demo_qnn.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_fp16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_fp16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_fp16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), 
int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+    (10, 338), (338, 297), (297, 332), (332, 284), (284, 251),
+    (251, 389), (389, 356), (356, 454), (454, 323), (323, 361),
+    (361, 288), (288, 397), (397, 365), (365, 379), (379, 378),
+    (378, 400), (400, 377), (377, 152), (152, 148), (148, 176),
+    (176, 149), (149, 150), (150, 136), (136, 172), (172, 58),
+    (58, 132), (132, 93), (93, 234), (234, 127), (127, 162),
+    (162, 21), (21, 54), (54, 103), (103, 67), (67, 109),
+    (109, 10)
+]
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f6af35b4625656b8d75e35fceafd1023d67dea5b
--- /dev/null
+++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/README.md
@@ -0,0 +1,63 @@
+## Model Information
+### Source model
+- Input shape: [1x3x256x256], [1x3x192x192]
+- Number of parameters: 0.13M, 0.6M
+- Model size: 0.58MB, 2.32MB
+- Output shape: [1x896x16, 1x896x1], [1, 1x468x3]
+
+Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/)
+
+### Converted model
+
+- Precision: INT8
+- Backend: QNN2.16
+- Target Device: SNM972 QCS8550
+
+## Inference with AidLite SDK
+
+### SDK installation
+Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/)
+
+- Install AidLite SDK
+
+```bash
+# Install the appropriate version of the AidLite SDK
+sudo aid-pkg update
+sudo aid-pkg install aidlite-sdk
+# Install the QNN build that matches the backend above, e.g. for QNN2.23: sudo aid-pkg install aidlite-qnn223
+sudo aid-pkg install aidlite-{QNN VERSION}
+```
+
+- Verify AidLite SDK
+
+```bash
+# aidlite sdk c++ check
+python3 -c "import aidlite ; print(aidlite.get_library_version())"
+
+# aidlite sdk python check
+python3 -c "import aidlite ; print(aidlite.get_py_library_version())"
+```
+
+### Run demo
+#### python
+```bash
+cd python
+python3 demo_qnn.py
+```
+
+#### c++
+```bash
+# The cnpy library is needed to load the .npy anchor file (run these commands from the default terminal directory)
+git clone https://github.com/rogersce/cnpy.git
+cd cnpy
+mkdir build && cd build
+cmake ..
+make
+sudo make install
+
+cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp
+mkdir build && cd build
+cmake ..
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + +set(CMAKE_CXX_FLAGS 
"-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15a817e6b6b8d96cb1edff9e2ee5dae04e7fccde --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, {172, 58}, {58, 132}, {132, 
93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a8.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a8.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point pad_point(static_cast(padh1 * scale), 
static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + det[1] = det[1] * scale * 256.0f - 
pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, CV_32FC3, 1.0 / 255.0); // Normalize to 
[0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float clamp(float x, float min_val, float max_val) { + 
return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
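+        // Landmark stage (sketch of the flow below): the 192x192 ROI crops are packed
+        // into NCHW float32, run through the face-landmark context binary, and the 468
+        // normalized (x, y, z) landmarks are mapped back to original-image coordinates
+        // using the inverse affine matrices produced by extract_roi().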
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..71e0c857c57af5cd3fd5f945284f92ff77d831f9 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceDetector_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0abbe810564cd4afc9f7645ab15576b2444515c92567556a84aff7f9d4ace3 +size 347688 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..47fa26017d52528dfbb6efd01c2f632b7a0a1b35 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/models/m_faceLandmark_w8a8.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63778b82d307633a3c22ad2e25a228976e2df67b6e53a1366f042b9305d1b9c7 +size 797792 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ 
diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
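        # The asymmetric padding is reproduced in forward() below: for kernel_size=3
        # and stride=2 the input is padded with F.pad(x, (0, 2, 0, 2)) -- nothing on
        # the top/left, two pixels on the bottom/right -- so e.g. a 128x128 map
        # becomes 130x130 and the 3x3 stride-2 depthwise conv yields 64x64, the same
        # size produced by the MaxPool2d(2, 2) skip branch.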
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
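    Subclasses (e.g. BlazeFaceLandmark later in this diff) set `resolution` and
    implement `forward`. `extract_roi` warps a rotated square crop around each
    detection to resolution x resolution and keeps the inverse affine transform;
    `denormalize_landmarks` applies that affine to map the predicted landmarks
    back into original-image coordinates.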
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
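            # Worked example: two overlapping boxes with scores 0.9 and 0.6 blend
            # into coordinates (0.9*c1 + 0.6*c2) / 1.5 and score 1.5 / 2 = 0.75,
            # i.e. coordinates are confidence-weighted, the score is a plain mean.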
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
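        # F.pad(x, (1, 2, 1, 2)) pads the width by 1 left / 2 right and the height
        # by 1 top / 2 bottom, so the 256x256 back-model input becomes 259x259 and
        # the 5x5 stride-2 first conv yields the expected 128x128 feature map
        # (128 -> 131 -> 64 for the 128x128 front model).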
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5547b031145d7e58913c711009e39f294ca046 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/demo_qnn.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
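        # Anchor-relative decoding: each raw offset is divided by the 256.0 input
        # scale, multiplied by the anchor width/height and shifted by the anchor
        # center, e.g. a raw x of 12.8 with anchor (cx=0.5, w=1.0) decodes to
        # 12.8 / 256 * 1.0 + 0.5 = 0.55 in normalized image coordinates. The loop
        # below repeats the same mapping for the 6 keypoints.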
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
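        # Run the QNN context binary locally with DSP acceleration; the flag set
        # just below marks it as a quantized model (w8a8 for this variant).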
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a8.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_int8_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), 
int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +] diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2d5ae2cc86de93979c3b3183841d880fbf720cc --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/README.md @@ -0,0 +1,63 @@ +## Model Information +### Source model +- Input shape: [1x3x256x256],[1x3x192x192] +- Number of parameters:0.13M, 0.6M +- Model size:0.58MB, 2.32MB +- Output shape: [1x896x16, 1x896x1], [1, 1x486x3] + +Source model repository: [MediaPipe-Face-Detection](https://github.com/zmurez/MediaPipePyTorch/) + +### Converted model + +- Precision: W8A16 +- Backend: QNN2.16 +- Target Device: SNM972 QCS8550 + +## Inference with AidLite SDK + +### SDK installation +Model Farm uses AidLite SDK as the model inference SDK. For details, please refer to the [AidLite Developer Documentation](https://v2.docs.aidlux.com/en/sdk-api/aidlite-sdk/) + +- install AidLite SDK + +```bash +# Install the appropriate version of the aidlite sdk +sudo aid-pkg update +sudo aid-pkg install aidlite-sdk +# Download the qnn version that matches the above backend. Eg Install QNN2.23 Aidlite: sudo aid-pkg install aidlite-qnn223 +sudo aid-pkg install aidlite-{QNN VERSION} +``` + +- Verify AidLite SDK + +```bash +# aidlite sdk c++ check +python3 -c "import aidlite ; print(aidlite.get_library_version())" + +# aidlite sdk python check +python3 -c "import aidlite ; print(aidlite.get_py_library_version())" +``` + +### Run demo +#### python +```bash +cd python +python3 demo_qnn.py +``` + +#### c++ +```bash +# 加载.npy文件需要用到cnpy库(终端默认路径下执行即可) +git clone https://github.com/rogersce/cnpy.git +cd cnpy +mkdir build && cd build +cmake .. +make +sudo make install + +cd mediapipe-face-detection/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp +mkdir build && cd build +cmake .. 
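# cnpy (built and installed above) is required to load anchors_float32.npy;
# make and ./run_test below build the demo and run it on ../coco.jpg by default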
+make +./run_test +``` diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8655a0360f730c1fb7b4f81367a6279e5acc8060 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2044fec25dc146bd44e23e22b15c320e189fd99f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d9d716ed4bbd036e086c876ce0c5308d5a9a5d Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27b6da065e0c25967994af97ae8c72f67594bf5f Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e9b9b371d9ab88b3885f249fa1a9fc68d8751c1 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c16fa3b7c9a01a594bb21e59a6c00d0b19945548 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/__pycache__/visualization.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13cca3db86e64b8a8458fd55c52323bed1d4282f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required (VERSION 3.5) +project("run_test") + +find_package(OpenCV REQUIRED) +find_library(CNPY_LIB cnpy REQUIRED) + +message(STATUS "oPENCV Library status:") +message(STATUS ">version:${OpenCV_VERSION}") +message(STATUS "Include:${OpenCV_INCLUDE_DIRS}") + 
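# The flags below only silence deprecated-declaration warnings; the executable
# links against aidlite, OpenCV, pthread, jsoncpp and the cnpy library located
# by find_library() above.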
+set(CMAKE_CXX_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations") + +include_directories( + /usr/local/include + /usr/include/opencv4 +) + +link_directories( + /usr/local/lib/ +) + +file(GLOB SRC_LISTS + ${CMAKE_CURRENT_SOURCE_DIR}/run_test.cpp +) + +add_executable(run_test ${SRC_LISTS}) + +target_link_libraries(run_test + aidlite + ${OpenCV_LIBS} + pthread + jsoncpp + ${CNPY_LIB} +) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc25c0d2c161ee090e9a04c6003418bd15caf45c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/anchors_float32.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa15d63ca4c37eaa953ddb462623e52b07f9af52be5deb7b935b8d3c3d7d94 +size 14464 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8e051fdefc5ee932622844d3afb0c5d8f3f76d7 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/cpp/run_test.cpp @@ -0,0 +1,909 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cnpy.h" + +using namespace cv; +using namespace std; +using namespace Aidlux::Aidlite; + + +// 人脸 landmark 连接索引定义(来自 MediaPipe Face Mesh) +const std::vector> FACE_CONNECTIONS = { + {61, 146}, {146, 91}, {91, 181}, {181, 84}, {84, 17}, + {17, 314}, {314, 405}, {405, 321}, {321, 375}, {375, 291}, + {61, 185}, {185, 40}, {40, 39}, {39, 37}, {37, 0}, + {0, 267}, {267, 269}, {269, 270}, {270, 409}, {409, 291}, + {78, 95}, {95, 88}, {88, 178}, {178, 87}, {87, 14}, + {14, 317}, {317, 402}, {402, 318}, {318, 324}, {324, 308}, + {78, 191}, {191, 80}, {80, 81}, {81, 82}, {82, 13}, + {13, 312}, {312, 311}, {311, 310}, {310, 415}, {415, 308}, + {263, 249}, {249, 390}, {390, 373}, {373, 374}, {374, 380}, + {380, 381}, {381, 382}, {382, 362}, {263, 466}, {466, 388}, + {388, 387}, {387, 386}, {386, 385}, {385, 384}, {384, 398}, + {398, 362}, {276, 283}, {283, 282}, {282, 295}, {295, 285}, + {300, 293}, {293, 334}, {334, 296}, {296, 336}, {33, 7}, + {7, 163}, {163, 144}, {144, 145}, {145, 153}, {153, 154}, + {154, 155}, {155, 133}, {33, 246}, {246, 161}, {161, 160}, + {160, 159}, {159, 158}, {158, 157}, {157, 173}, {173, 133}, + {46, 53}, {53, 52}, {52, 65}, {65, 55}, {70, 63}, {63, 105}, + {105, 66}, {66, 107}, {10, 338}, {338, 297}, {297, 332}, + {332, 284}, {284, 251}, {251, 389}, {389, 356}, {356, 454}, + {454, 323}, {323, 361}, {361, 288}, {288, 397}, {397, 365}, + {365, 379}, {379, 378}, {378, 400}, {400, 377}, {377, 152}, + {152, 148}, {148, 176}, {176, 149}, {149, 150}, {150, 136}, + {136, 172}, 
{172, 58}, {58, 132}, {132, 93}, {93, 234}, + {234, 127}, {127, 162}, {162, 21}, {21, 54}, {54, 103}, + {103, 67}, {67, 109}, {109, 10} +}; + +struct Args { + std::string faceDetector_model = "../../models/m_faceDetector_w8a16.qnn216.ctx.bin"; + std::string faceLandmark_model = "../../models/m_faceLandmark_w8a16.qnn216.ctx.bin"; + std::string imgs = "../coco.jpg"; + int invoke_nums = 10; + std::string model_type = "QNN"; +}; + + +Args parse_args(int argc, char* argv[]) { + Args args; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "--faceDetector_model" && i + 1 < argc) { + args.faceDetector_model = argv[++i]; + } else if (arg == "--faceLandmark_model" && i + 1 < argc) { + args.faceLandmark_model = argv[++i]; + } else if (arg == "--imgs" && i + 1 < argc) { + args.imgs = argv[++i]; + } else if (arg == "--invoke_nums" && i + 1 < argc) { + args.invoke_nums = std::stoi(argv[++i]); + } else if (arg == "--model_type" && i + 1 < argc) { + args.model_type = argv[++i]; + } + } + return args; +} + +std::string to_lower(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return lower_str; +} + +std::vector> load_anchors_from_npy(const std::string& path) { + cnpy::NpyArray arr = cnpy::npy_load(path); + float* data_ptr = arr.data(); + + size_t num_rows = arr.shape[0]; // 896 + size_t num_cols = arr.shape[1]; // 4 + + std::vector> anchors(num_rows, std::vector(num_cols)); + for (size_t i = 0; i < num_rows; ++i) { + for (size_t j = 0; j < num_cols; ++j) { + anchors[i][j] = data_ptr[i * num_cols + j]; + } + } + + return anchors; +} + + +// 绘制人脸关键点和连接线 +void draw_landmarks( + cv::Mat& img, + const std::vector& points, + const std::vector& flags, + const std::vector>& connections, + float threshold = 0.4f, + cv::Scalar point_color = cv::Scalar(0, 255, 0), + cv::Scalar line_color = cv::Scalar(0, 0, 0), + int size = 2) +{ + // 画关键点 + for (size_t i = 0; i < points.size(); ++i) { + if (i < flags.size() && flags[i] > threshold) { + int x = static_cast(points[i].x); + int y = static_cast(points[i].y); + cv::circle(img, cv::Point(x, y), size, point_color, size); + } + } + + // 画连接线(两端都要可见) + for (const auto& conn : connections) { + int i0 = conn.first; + int i1 = conn.second; + if (i0 < points.size() && i1 < points.size() && + i0 < flags.size() && i1 < flags.size() && + flags[i0] > threshold && flags[i1] > threshold) + { + cv::line(img, points[i0], points[i1], line_color, size); + } + } +} + + +std::tuple resize_pad(const cv::Mat& img) { + int orig_h = img.rows; // 480 + int orig_w = img.cols; // 640 + + // Step 1: resize width to 256, keep aspect ratio + int w1 = 256; + int h1 = w1 * orig_h / orig_w; // 等效于 int(256 * h / w) + + // Step 2: compute padding in height direction + int padh = 256 - h1; + int padw = 0; + + int padh1 = padh / 2; + int padh2 = padh1 + (padh % 2); + int padw1 = padw / 2; + int padw2 = padw1 + (padw % 2); + + // Step 3: resize to (w1, h1) + cv::Mat resized; + cv::resize(img, resized, cv::Size(w1, h1)); // (256, h1) + + // Step 4: pad to (256, 256) + cv::Mat padded; + cv::copyMakeBorder(resized, padded, padh1, padh2, padw1, padw2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + + // Step 5: resize padded to 128×128 + cv::Mat resized128; + cv::resize(padded, resized128, cv::Size(128, 128)); + + // Step 6: compute scale and pad in original image space + float scale = static_cast(orig_h) / h1; // h / h1 + cv::Point 
pad_point(static_cast(padh1 * scale), static_cast(padw1 * scale)); + + return std::make_tuple(padded, resized128, scale, pad_point); +} + + +// 将图像转换为 1xC×H×W 格式并归一化(除以 255) +std::vector preprocess_image(const cv::Mat& img) { + int H = img.rows; + int W = img.cols; + int C = img.channels(); // should be 3 + + std::vector chw(H * W * C); // CHW + std::vector nchw(1 * C * H * W); // NCHW + + // 1. HWC → CHW + normalize (float32 / 255.0) + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int c = 0; c < C; ++c) { + // OpenCV uses BGR order + float value = img.at(h, w)[c] / 255.0f; + chw[c * H * W + h * W + w] = value; + } + } + } + + // 2. CHW → NCHW (add batch dimension, actually just copy) + for (int i = 0; i < C * H * W; ++i) { + nchw[i] = chw[i]; + } + + return nchw; // shape: [1, 3, H, W] +} + + +// 只用前4个坐标计算IOU(默认框位置在前4个坐标) +float IoU(const std::vector& box1, const std::vector& box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + float inter_area = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); + float box1_area = std::max(0.0f, box1[2] - box1[0]) * std::max(0.0f, box1[3] - box1[1]); + float box2_area = std::max(0.0f, box2[2] - box2[0]) * std::max(0.0f, box2[3] - box2[1]); + float union_area = box1_area + box2_area - inter_area; + + return union_area > 0 ? inter_area / union_area : 0.0f; +} + +std::vector> weighted_non_max_suppression( + std::vector>& detections, + int num_coords = 16, + float min_suppression_threshold = 0.3f) +{ + if (detections.empty()) return {}; + + std::vector indices(detections.size()); + std::iota(indices.begin(), indices.end(), 0); + + // 按置信度降序排序 + std::sort(indices.begin(), indices.end(), [&](int a, int b) { + return detections[a][num_coords] > detections[b][num_coords]; + }); + + std::vector> output; + + while (!indices.empty()) { + int best_idx = indices.front(); + const auto& best_det = detections[best_idx]; + std::vector overlapping = { best_idx }; + + for (size_t i = 1; i < indices.size(); ++i) { + float iou = IoU(best_det, detections[indices[i]]); + if (iou > min_suppression_threshold) { + overlapping.push_back(indices[i]); + } + } + + // 更新剩余索引 + std::vector new_indices; + for (int idx : indices) { + if (std::find(overlapping.begin(), overlapping.end(), idx) == overlapping.end()) { + new_indices.push_back(idx); + } + } + indices = new_indices; + + // 加权平均:坐标 * 置信度 + if (overlapping.size() == 1) { + output.push_back(best_det); + } else { + std::vector weighted(num_coords + 1, 0.0f); + float total_score = 0.0f; + + for (int idx : overlapping) { + float score = detections[idx][num_coords]; + total_score += score; + for (int k = 0; k < num_coords; ++k) { + weighted[k] += detections[idx][k] * score; + } + } + + for (int k = 0; k < num_coords; ++k) { + weighted[k] /= total_score; + } + weighted[num_coords] = total_score / overlapping.size(); // 取平均得分 + + // std::cout << "Weighted box: "; + // for (float v : weighted) std::cout << v << " "; + // std::cout << "\n"; + + output.push_back(weighted); + } + } + + // TODO + auto x = output[0]; + output.clear(); + output.push_back(x); + + return output; +} + + +std::vector> denormalize_detections( + const std::vector>& detections, + float scale, + const cv::Point& pad +) { + std::vector> result = detections; + + for (size_t i = 0; i < result.size(); ++i) { + std::vector& det = result[i]; + + // bbox coords: x1, y1, x2, y2 + det[0] = det[0] * scale * 256.0f - pad.x; // x1 + 
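For reference, the blending strategy implemented by `weighted_non_max_suppression` above can be summarized in a few lines of NumPy. This is an illustrative sketch only, not part of the shipped sources; it assumes each detection row holds 16 coordinates (box `[ymin, xmin, ymax, xmax]` plus 6 keypoints) followed by one confidence score, which matches the decoded detections used throughout run_test.cpp.

```python
# Illustrative NumPy sketch of the weighted ("blending") NMS used above.
import numpy as np

def iou(box, boxes):
    """IoU of one [ymin, xmin, ymax, xmax] box against an (M, 4) array."""
    y1 = np.maximum(box[0], boxes[:, 0]); x1 = np.maximum(box[1], boxes[:, 1])
    y2 = np.minimum(box[2], boxes[:, 2]); x2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(y2 - y1, 0, None) * np.clip(x2 - x1, 0, None)
    area = lambda b: np.clip(b[..., 2] - b[..., 0], 0, None) * np.clip(b[..., 3] - b[..., 1], 0, None)
    return inter / np.maximum(area(box) + area(boxes) - inter, 1e-9)

def weighted_nms(dets, num_coords=16, iou_thresh=0.3):
    """dets: (N, num_coords + 1) array whose last column is the confidence score."""
    out, remaining = [], list(np.argsort(-dets[:, num_coords]))   # highest score first
    while remaining:
        best = dets[remaining[0]]
        overlap_mask = iou(best[:4], dets[remaining, :4]) > iou_thresh
        overlapping = [r for r, m in zip(remaining, overlap_mask) if m]
        remaining   = [r for r, m in zip(remaining, overlap_mask) if not m]
        scores = dets[overlapping, num_coords:num_coords + 1]
        blended = best.copy()
        blended[:num_coords] = (dets[overlapping, :num_coords] * scores).sum(0) / scores.sum()
        blended[num_coords] = scores.mean()        # average confidence of the cluster
        out.append(blended)
    return np.stack(out) if out else np.zeros((0, dets.shape[1]))
```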
det[1] = det[1] * scale * 256.0f - pad.y; // y1 + det[2] = det[2] * scale * 256.0f - pad.x; // x2 + det[3] = det[3] * scale * 256.0f - pad.y; // y2 + + // keypoints (starting from index 4): format [y, x, y, x, ...] + for (size_t k = 4; k + 1 < det.size(); k += 2) { + det[k] = det[k] * scale * 256.0f - pad.y; // y + det[k + 1] = det[k + 1] * scale * 256.0f - pad.x; // x + } + } + + return result; +} + + +void detection2roi( + const std::vector>& detections, + std::vector& xc, + std::vector& yc, + std::vector& scale, + std::vector& theta, + int kp1, int kp2, // 关键点索引 + float dy, float dscale, float theta0 +) { + size_t N = detections.size(); + xc.resize(N); + yc.resize(N); + scale.resize(N); + theta.resize(N); + + for (size_t i = 0; i < N; ++i) { + const std::vector& det = detections[i]; + + float x1 = det[1]; + float x2 = det[3]; + float y1 = det[0]; + float y2 = det[2]; + + float x_center = (x1 + x2) / 2.0f; + float y_center = (y1 + y2) / 2.0f; + float box_scale = (x2 - x1); // assumes square box + + // yc 偏移 + y_center += dy * box_scale; + box_scale *= dscale; + + // 获取两个关键点的位置 + int base = 4; + int idx_y0 = base + 2 * kp1; + int idx_x0 = base + 2 * kp1 + 1; + int idx_y1 = base + 2 * kp2; + int idx_x1 = base + 2 * kp2 + 1; + + float x0 = det[idx_x0]; + float y0 = det[idx_y0]; + float x1_kp = det[idx_x1]; + float y1_kp = det[idx_y1]; + + float angle = std::atan2(y0 - y1_kp, x0 - x1_kp) - theta0; + + // 输出赋值 + xc[i] = x_center; + yc[i] = y_center; + scale[i] = box_scale; + // TODO: 这里的 theta 需要根据实际情况调整 + // theta[i] = angle; // 如果需要使用计算的角度 + theta[i] = -0.0094; + } +} + + +void extract_roi( + const cv::Mat& frame, + const std::vector& xc, + const std::vector& yc, + const std::vector& theta, + const std::vector& scale, + std::vector& cropped_rois, + std::vector& affine_matrices, + std::vector>& roi_boxes, // 添加返回点坐标 + int resolution = 192 +) { + cropped_rois.clear(); + affine_matrices.clear(); + roi_boxes.clear(); + + for (size_t i = 0; i < xc.size(); ++i) { + float s = scale[i] / 2.0f; + float cos_t = std::cos(theta[i]); + float sin_t = std::sin(theta[i]); + + // 定义4个 unit square 点经过变换后的点(顺序和 Python 中一样) + std::vector points(4); + // [-1, -1] + points[0].x = xc[i] + (-s * cos_t + s * sin_t); + points[0].y = yc[i] + (-s * sin_t - s * cos_t); + // [1, -1] + points[1].x = xc[i] + ( s * cos_t + s * sin_t); + points[1].y = yc[i] + ( s * sin_t - s * cos_t); + // [-1, 1] + points[2].x = xc[i] + (-s * cos_t - s * sin_t); + points[2].y = yc[i] + (-s * sin_t + s * cos_t); + // [1, 1] + points[3].x = xc[i] + ( s * cos_t - s * sin_t); + points[3].y = yc[i] + ( s * sin_t + s * cos_t); + + // 用前三个点计算仿射变换 + std::vector src_pts = { points[0], points[1], points[2] }; + std::vector dst_pts = { + cv::Point2f(0, 0), + cv::Point2f(resolution - 1, 0), + cv::Point2f(0, resolution - 1) + }; + + cv::Mat M = cv::getAffineTransform(src_pts, dst_pts); + cv::Mat M_inv; + cv::invertAffineTransform(M, M_inv); + + cv::Mat cropped; + cv::warpAffine(frame, cropped, M, cv::Size(resolution, resolution), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(127.5, 127.5, 127.5)); + cropped_rois.push_back(cropped); + affine_matrices.push_back(M_inv); + roi_boxes.push_back(points); // 添加变换后的 box 点 + } +} + +std::vector preprocess_imgs_to_nchw(const std::vector& imgs) { + int N = imgs.size(); + if (N == 0) return {}; + + int H = 192; + int W = 192; + int C = 3; // assume 3 channels (BGR) + + std::vector output; + output.reserve(N * C * H * W); + + for (int n = 0; n < N; ++n) { + cv::Mat img_float; + imgs[n].convertTo(img_float, 
CV_32FC3, 1.0 / 255.0); // Normalize to [0,1] + + // Split channels (HWC → CHW) + std::vector channels(3); + cv::split(img_float, channels); // channels[0] = B, [1] = G, [2] = R + + for (int c = 0; c < C; ++c) { + for (int i = 0; i < H; ++i) { + for (int j = 0; j < W; ++j) { + output.push_back(channels[c].at(i, j)); + } + } + } + } + + return output; // shape: N x C x H x W +} + +// resolution 一般为 192 +std::vector denormalize_landmarks( + const std::vector& normalized_landmarks, + const std::vector& affines, + int resolution = 192) +{ + std::vector output; + + // 检查输入尺寸 + const int num_faces = 1; // 假设只有一个人脸 + const int num_landmarks = 468; + if (normalized_landmarks.size() != num_faces * num_landmarks * 3 || affines.size() != num_faces) { + std::cerr << "Error: Input size mismatch. Expected " + << num_faces * num_landmarks * 3 << " landmarks and " + << num_faces << " affine matrices." << std::endl; + throw std::runtime_error("Input size mismatch"); + } + + for (int i = 0; i < num_faces; ++i) { + const cv::Mat& affine = affines[i]; // 2x3 CV_32F + for (int j = 0; j < num_landmarks; ++j) { + int idx = i * num_landmarks * 3 + j * 3; + float x = normalized_landmarks[idx + 0] * resolution; + float y = normalized_landmarks[idx + 1] * resolution; + // float z = normalized_landmarks[idx + 2]; // 可选使用 + + // 2x1 输入向量 + cv::Mat pt = (cv::Mat_(2, 1) << x, y); + + // 提取 affine 的旋转和平移 + cv::Mat M2x2 = affine(cv::Rect(0, 0, 2, 2)).clone(); + cv::Mat t2x1 = affine(cv::Rect(2, 0, 1, 2)).clone(); + M2x2.convertTo(M2x2, CV_32F); + t2x1.convertTo(t2x1, CV_32F); + + // 反仿射变换 + cv::Mat out = M2x2 * pt + t2x1; + + // 存储为 Point2f + output.emplace_back(out.at(0, 0), out.at(1, 0)); + } + } + + return output; // 输出为 denormalized landmarks,大小为 468 个 Point2f +} + + +void draw_roi(cv::Mat& img, const std::vector>& boxes) { + for (const auto& roi : boxes) { + if (roi.size() < 4) continue; + + const cv::Point2f& p1 = roi[0]; + const cv::Point2f& p2 = roi[1]; + const cv::Point2f& p3 = roi[2]; + const cv::Point2f& p4 = roi[3]; + + cv::line(img, p1, p2, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p1, p3, cv::Scalar(0, 255, 0), 2); // 绿色 + cv::line(img, p2, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + cv::line(img, p3, p4, cv::Scalar(0, 0, 0), 2); // 黑色 + } +} + + +void draw_detections(cv::Mat& img, const std::vector>& detections, bool with_keypoints = true) { + for (const auto& det : detections) { + if (det.size() < 4) continue; + + float ymin = det[0]; + float xmin = det[1]; + float ymax = det[2]; + float xmax = det[3]; + + cv::rectangle(img, cv::Point(int(xmin), int(ymin)), cv::Point(int(xmax), int(ymax)), cv::Scalar(255, 0, 0), 1); + + if (with_keypoints && det.size() > 4) { + int n_keypoints = (det.size() - 4) / 2; + for (int k = 0; k < n_keypoints; ++k) { + int kp_x = int(det[4 + k * 2]); + int kp_y = int(det[4 + k * 2 + 1]); + cv::circle(img, cv::Point(kp_x, kp_y), 2, cv::Scalar(0, 0, 255), 2); + } + } + } +} + + +std::vector> loadAnchors(const std::string& filename) { + std::ifstream in(filename); + std::vector> anchors; + + if (!in.is_open()) { + std::cerr << "Failed to open file: " << filename << std::endl; + return anchors; + } + + std::string line; + while (std::getline(in, line)) { + std::istringstream ss(line); + std::vector anchor; + float value; + while (ss >> value) { + anchor.push_back(value); + } + if (!anchor.empty()) { + anchors.push_back(anchor); + } + } + + in.close(); + return anchors; +} + +// sigmoid 函数 +float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// clamp 函数 +float 
clamp(float x, float min_val, float max_val) { + return std::max(min_val, std::min(max_val, x)); +} + +// shape: [batch, num_anchors, num_coords] => [batch][anchor][coord] +std::vector>> decode_boxes( + const std::vector& raw_boxes, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, + float x_scale, float y_scale, float w_scale, float h_scale, + int num_keypoints) +{ + std::vector>> decoded_boxes(batch, + std::vector>(num_anchors, std::vector(num_coords, 0))); + + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < num_anchors; ++i) { + int base = b * num_anchors * num_coords + i * num_coords; + + float x_center = raw_boxes[base + 0] / x_scale * anchors[i][2] + anchors[i][0]; + float y_center = raw_boxes[base + 1] / y_scale * anchors[i][3] + anchors[i][1]; + float w = raw_boxes[base + 2] / w_scale * anchors[i][2]; + float h = raw_boxes[base + 3] / h_scale * anchors[i][3]; + + decoded_boxes[b][i][0] = y_center - h / 2.0f; // ymin + decoded_boxes[b][i][1] = x_center - w / 2.0f; // xmin + decoded_boxes[b][i][2] = y_center + h / 2.0f; // ymax + decoded_boxes[b][i][3] = x_center + w / 2.0f; // xmax + + for (int k = 0; k < num_keypoints; ++k) { + int offset = 4 + k * 2; + float keypoint_x = raw_boxes[base + offset] / x_scale * anchors[i][2] + anchors[i][0]; + float keypoint_y = raw_boxes[base + offset + 1] / y_scale * anchors[i][3] + anchors[i][1]; + decoded_boxes[b][i][offset] = keypoint_x; + decoded_boxes[b][i][offset + 1] = keypoint_y; + } + } + } + + return decoded_boxes; +} + +std::vector>> tensors_to_detections( + const std::vector& raw_box_tensor, + const std::vector& raw_score_tensor, + const std::vector>& anchors, + int batch, int num_anchors, int num_coords, int num_classes, int num_keypoints, + float x_scale, float y_scale, float w_scale, float h_scale, + float score_clipping_thresh, float min_score_thresh) +{ + assert(raw_box_tensor.size() == batch * num_anchors * num_coords); + assert(raw_score_tensor.size() == batch * num_anchors * num_classes); + assert(anchors.size() == size_t(num_anchors)); + + auto detection_boxes = decode_boxes( + raw_box_tensor, anchors, batch, num_anchors, num_coords, + x_scale, y_scale, w_scale, h_scale, num_keypoints); + + std::vector>> output_detections; + + for (int b = 0; b < batch; ++b) { + std::vector> detections; + + for (int i = 0; i < num_anchors; ++i) { + int score_index = b * num_anchors * num_classes + i * num_classes; + + // 单类情况,取第0类 + float score_raw = raw_score_tensor[score_index]; + float score = sigmoid(clamp(score_raw, -score_clipping_thresh, score_clipping_thresh)); + + if (score >= min_score_thresh) { + std::vector det = detection_boxes[b][i]; // shape [num_coords] + det.push_back(score); // 追加置信度 + detections.push_back(det); // shape [num_coords+1] + } + } + + output_detections.push_back(detections); // 每个 batch 一个 vector + } + + return output_detections; +} + + +int invoke(const Args& args) { + std::cout << "Start main ... ... 
Model Path: " << args.faceDetector_model << "\n" + << args.faceLandmark_model << "\n" + << "Image Path: " << args.imgs << "\n" + << "Inference Nums: " << args.invoke_nums << "\n" + << "Model Type: " << args.model_type << "\n"; + // =============================================================faceDetector_model start + Model* model1 = Model::create_instance(args.faceDetector_model); + if(model1 == nullptr){ + printf("Create model1 failed !\n"); + return EXIT_FAILURE; + } + Config* config1 = Config::create_instance(); + if(config1 == nullptr){ + printf("Create config1 failed !\n"); + return EXIT_FAILURE; + } + config1->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower1 = to_lower(args.model_type); + if (model_type_lower1 == "qnn"){ + config1->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower1 == "snpe2" || model_type_lower1 == "snpe") { + config1->framework_type = FrameworkType::TYPE_SNPE2; + } + config1->accelerate_type = AccelerateType::TYPE_DSP; + config1->is_quantify_model = 1; + + std::vector> input_shapes1 = {{1,3,256,256}}; + std::vector> output_shapes1 = {{1,896,16},{1,896,1}}; + model1->set_model_properties(input_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes1, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter1 = InterpreterBuilder::build_interpretper_from_model_and_config(model1, config1); + if(fast_interpreter1 == nullptr){ + printf("build_interpretper_from_model_and_config failed !\n"); + return EXIT_FAILURE; + } + int result = fast_interpreter1->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter1->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter->load_model() failed !\n"); + return EXIT_FAILURE; + } + printf("detect model load success!\n"); + // =============================================================faceDetector_model over + + // =============================================================faceLandmark_model start + Model* model2 = Model::create_instance(args.faceLandmark_model); + if(model2 == nullptr){ + printf("Create model2 failed !\n"); + return EXIT_FAILURE; + } + Config* config2 = Config::create_instance(); + if(config2 == nullptr){ + printf("Create config2 failed !\n"); + return EXIT_FAILURE; + } + config2->implement_type = ImplementType::TYPE_LOCAL; + std::string model_type_lower2 = to_lower(args.model_type); + if (model_type_lower2 == "qnn"){ + config2->framework_type = FrameworkType::TYPE_QNN; + } else if (model_type_lower2 == "snpe2" || model_type_lower2 == "snpe") { + config2->framework_type = FrameworkType::TYPE_SNPE2; + } + config2->accelerate_type = AccelerateType::TYPE_DSP; + config2->is_quantify_model = 1; + + std::vector> input_shapes2 = {{1,3,192,192}}; + std::vector> output_shapes2 = {{1},{1,468,3}}; + model2->set_model_properties(input_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32, output_shapes2, Aidlux::Aidlite::DataType::TYPE_FLOAT32); + std::unique_ptr fast_interpreter2 = InterpreterBuilder::build_interpretper_from_model_and_config(model2, config2); + if(fast_interpreter2 == nullptr){ + printf("build_interpretper_from_model_and_config2 failed !\n"); + return EXIT_FAILURE; + } + result = fast_interpreter2->init(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->init() failed !\n"); + return EXIT_FAILURE; + } + // load model + fast_interpreter2->load_model(); + if(result != EXIT_SUCCESS){ + printf("interpreter2->load_model() failed !\n"); + 
return EXIT_FAILURE; + } + printf("detect model2 load success!\n"); + // =============================================================faceLandmark_model over + + + auto anchors = load_anchors_from_npy("../anchors_float32.npy"); + cv::Mat frame = cv::imread(args.imgs); + if (frame.empty()) { + printf("detect image load failed!\n"); + return 1; + } + // printf("img_src cols: %d, img_src rows: %d\n", frame.cols, frame.rows); + cv::Mat input_data; + cv::Mat frame_clone1 = frame.clone(); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_BGR2RGB); + cv::Mat frame_clone = frame.clone(); + + + cv::Mat img1, img2; + float scale; + cv::Point pad; + std::tie(img1, img2, scale, pad) = resize_pad(frame_clone1); + std::vector input_tensor = preprocess_image(img1); + + float *outdata0 = nullptr; + float *outdata1 = nullptr; + std::vector invoke_time; + for (int i = 0; i < args.invoke_nums; ++i) { + result = fast_interpreter1->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter1->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_0 = 0; + result = fast_interpreter1->get_output_tensor(0, (void**)&outdata0, &out_data_0); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1 = 0; + result = fast_interpreter1->get_output_tensor(1, (void**)&outdata1, &out_data_1); + if(result != EXIT_SUCCESS){ + printf("interpreter1->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + } + + std::vector tensor_1_896_16(outdata0, outdata0 + 896*16); + std::vector tensor_1_896_1(outdata1, outdata1 + 896*1); + + std::vector>> detections = tensors_to_detections( + tensor_1_896_16, tensor_1_896_1, anchors, + 1, 896, 16, 1, 6, + 256.0f, 256.0f, 256.0f, 256.0f, + 100.0f, 0.4f); + + + std::vector>> filtered_detections; + for (size_t i = 0; i < detections.size(); ++i) { + std::vector>& dets = detections[i]; + std::vector> faces = weighted_non_max_suppression(dets); + filtered_detections.push_back(faces); + } + + + // std::cout << "filtered_detections size: " << filtered_detections.size() << "\n"; + // std::cout << "scale: " << scale << ", pad: (" << pad.x << ", " << pad.y << ")\n"; + std::vector> face_detections = denormalize_detections(filtered_detections[0], scale, pad); + + // std::cout << "face_detections size: " << face_detections.size() << "\n"; + std::vector xc, yc, scales, theta; + int kp1 = 0, kp2 = 1; // 关键点索引 + float dy = 0.0f; // 根据模型定义设定 + float dscale = 1.5f; // 缩放因子 + float theta0 = 0.0f; // 基准角度 + + detection2roi(face_detections, xc, yc, scales, theta, kp1, kp2, dy, dscale, theta0); + std::vector rois; + std::vector affines; + std::vector> boxes; + + extract_roi(frame_clone1, xc, yc, theta, scales, rois, affines, boxes); + if (!boxes.empty()) { + std::cout << "Detected " << boxes.size() << " faces.\n"; + // 检测到人脸,继续处理 boxes[0] ... 
+ std::vector input_tensor = preprocess_imgs_to_nchw(rois); + + float *outdata1_0 = nullptr; + float *outdata1_1 = nullptr; + + result = fast_interpreter2->set_input_tensor(0, input_tensor.data()); + if(result != EXIT_SUCCESS){ + printf("interpreter2->set_input_tensor() failed !\n"); + return EXIT_FAILURE; + } + auto t1 = std::chrono::high_resolution_clock::now(); + result = fast_interpreter2->invoke(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration cost_time = t2 - t1; + invoke_time.push_back(cost_time.count() * 1000); + if(result != EXIT_SUCCESS){ + printf("interpreter2->invoke() failed !\n"); + return EXIT_FAILURE; + } + uint32_t out_data_1_0 = 0; + result = fast_interpreter2->get_output_tensor(0, (void**)&outdata1_0, &out_data_1_0); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 0 failed !\n"); + return EXIT_FAILURE; + } + + uint32_t out_data_1_1 = 0; + result = fast_interpreter2->get_output_tensor(1, (void**)&outdata1_1, &out_data_1_1); + if(result != EXIT_SUCCESS){ + printf("interpreter2->get_output_tensor() 1 failed !\n"); + return EXIT_FAILURE; + } + + std::vector flags(outdata1_0, outdata1_0 + 1); + std::vector normalized_landmarks(outdata1_1, outdata1_1 + 468*3); + + std::vector landmarks = denormalize_landmarks(normalized_landmarks, affines); + draw_landmarks(frame_clone1, landmarks, flags, FACE_CONNECTIONS); + } else { + std::cout << "not detect face!" << std::endl; + } + + + draw_roi(frame_clone1, boxes); + draw_detections(frame_clone1, face_detections); + cv::cvtColor(frame_clone1, frame_clone1, cv::COLOR_RGB2BGR); + cv::imwrite("vis_result.jpg", frame_clone1); + + + fast_interpreter1->destory(); + fast_interpreter2->destory(); + return 0; + +} + + +int main(int argc, char* argv[]) { + Args args = parse_args(argc, argv); + return invoke(args); +} diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..061668eb5b47e8cf8940e2169e4c3f144c05d65c --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceDetector_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef9e6fc5d306e30b740020cb8cb496a1c207dd62c2aba75ebc0fa5ea83457fd +size 380456 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin new file mode 100644 index 0000000000000000000000000000000000000000..0ad68e6aba188eb4eec12f2941c9afbb0b6d8751 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/models/m_faceLandmark_w8a16.qnn216.ctx.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d029adb842fb3663541ada2f71b06741ccbe0e71aa0b62200ed74963b7d266bc +size 842848 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f01c881032cc0ea0dafbbe76b4d02bf51f5dc2 Binary files /dev/null and 
b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15f793e9a3a77c122c6cae36e26c5bd9b77ab7e Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazebase.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d54b68c53f26e930688f7bd66bc42e64431265 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3960f9eaa3babd661de15273d7037029ce46e706 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/blazeface_landmark.cpython-39.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffaa52099e2019a3dc97a8eef82319610de6aaf4 Binary files /dev/null and b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/__pycache__/visualization.cpython-38.pyc differ diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ba12474802adea36a8e37ca648d46556cd35c92 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/anchors_face_back.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95 +size 28800 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py new file mode 100644 index 0000000000000000000000000000000000000000..414ae950199d7fa1ebae4fa540805cb53cf1ff0d --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazebase.py @@ -0,0 +1,513 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def resize_pad(img): + """ resize and pad images to be input to the detectors + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. 
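As a concrete illustration of the scale and pad values described here, a worked example for a hypothetical 640x480 frame follows (illustrative numbers only, not part of the shipped code); the resulting `scale` and `pad` are exactly what `denormalize_detections` below uses to map detections back onto the original image.

```python
# Worked example of resize_pad()'s letterboxing math for a 640x480 frame.
h, w = 480, 640                        # original frame (rows, cols), illustrative

h1, w1 = 256 * h // w, 256             # height < width -> fix width at 256: 192 x 256
padh, padw = 256 - h1, 0               # 64 rows of padding, split 32 top / 32 bottom
scale = h / h1                         # 480 / 192 = 2.5
pad = (int((padh // 2) * scale), int((padw // 2) * scale))   # (80, 0) in original pixels

# denormalize_detections() then maps normalized coordinates back, e.g.:
ymin_norm, xmin_norm = 0.25, 0.5
ymin_px = ymin_norm * scale * 256 - pad[0]   # 0.25 * 2.5 * 256 - 80 = 80.0
xmin_px = xmin_norm * scale * 256 - pad[1]   # 0.50 * 2.5 * 256 -  0 = 320.0
```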
+ + Returns: + img1: 256x256 + img2: 128x128 + scale: scale factor between original image and 256x256 image + pad: pixels of padding in the original image + """ + + size0 = img.shape + if size0[0]>=size0[1]: + h1 = 256 + w1 = 256 * size0[1] // size0[0] + padh = 0 + padw = 256 - w1 + scale = size0[1] / w1 + else: + h1 = 256 * size0[0] // size0[1] + w1 = 256 + padh = 256 - h1 + padw = 0 + scale = size0[0] / h1 + padh1 = padh//2 + padh2 = padh//2 + padh%2 + padw1 = padw//2 + padw2 = padw//2 + padw%2 + img1 = cv2.resize(img, (w1,h1)) + img1 = np.pad(img1, ((padh1, padh2), (padw1, padw2), (0,0))) + pad = (int(padh1 * scale), int(padw1 * scale)) + img2 = cv2.resize(img1, (128,128)) + return img1, img2, scale, pad + + +def denormalize_detections(detections, scale, pad): + """ maps detection coordinates from [0,1] to image coordinates + + The face and palm detector networks take 256x256 and 128x128 images + as input. As such the input image is padded and resized to fit the + size while maintaing the aspect ratio. This function maps the + normalized coordinates back to the original image coordinates. + + Inputs: + detections: nxm tensor. n is the number of detections. + m is 4+2*k where the first 4 valuse are the bounding + box coordinates and k is the number of additional + keypoints output by the detector. + scale: scalar that was used to resize the image + pad: padding in the x and y dimensions + + """ + detections[:, 0] = detections[:, 0] * scale * 256 - pad[0] + detections[:, 1] = detections[:, 1] * scale * 256 - pad[1] + detections[:, 2] = detections[:, 2] * scale * 256 - pad[0] + detections[:, 3] = detections[:, 3] * scale * 256 - pad[1] + + detections[:, 4::2] = detections[:, 4::2] * scale * 256 - pad[1] + detections[:, 5::2] = detections[:, 5::2] * scale * 256 - pad[0] + return detections + + + + +class BlazeBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, act='relu', skip_proj=False): + super(BlazeBlock, self).__init__() + + self.stride = stride + self.kernel_size = kernel_size + self.channel_pad = out_channels - in_channels + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. 
+ if stride == 2: + self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) + padding = 0 + else: + padding = (kernel_size - 1) // 2 + + self.convs = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=in_channels, bias=True), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + if skip_proj: + self.skip_proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=1, stride=1, padding=0, bias=True) + else: + self.skip_proj = None + + if act == 'relu': + self.act = nn.ReLU(inplace=True) + elif act == 'prelu': + self.act = nn.PReLU(out_channels) + else: + raise NotImplementedError("unknown activation %s"%act) + + def forward(self, x): + if self.stride == 2: + if self.kernel_size==3: + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + else: + h = F.pad(x, (1, 2, 1, 2), "constant", 0) + x = self.max_pool(x) + else: + h = x + + if self.skip_proj is not None: + x = self.skip_proj(x) + elif self.channel_pad > 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0) + + + return self.act(self.convs(h) + x) + + +class FinalBlazeBlock(nn.Module): + def __init__(self, channels, kernel_size=3): + super(FinalBlazeBlock, self).__init__() + + # TFLite uses slightly different padding than PyTorch + # on the depthwise conv layer when the stride is 2. + self.convs = nn.Sequential( + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=kernel_size, stride=2, padding=0, + groups=channels, bias=True), + nn.Conv2d(in_channels=channels, out_channels=channels, + kernel_size=1, stride=1, padding=0, bias=True), + ) + + self.act = nn.ReLU(inplace=True) + + def forward(self, x): + h = F.pad(x, (0, 2, 0, 2), "constant", 0) + + return self.act(self.convs(h)) + + +class BlazeBase(nn.Module): + """ Base class for media pipe models. """ + + def _device(self): + """Which device (CPU or GPU) is being used by this model?""" + return self.classifier_8.weight.device + + def load_weights(self, path): + self.load_state_dict(torch.load(path)) + self.eval() + + +class BlazeLandmark(BlazeBase): + """ Base class for landmark models. 
""" + + def extract_roi(self, frame, xc, yc, theta, scale): + + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = self.resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].cpu().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + + +class BlazeDetector(BlazeBase): + """ Base class for detector models. + + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + """ + def load_anchors(self, path): + self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device()) + assert(self.anchors.ndimension() == 2) + assert(self.anchors.shape[0] == self.num_anchors) + assert(self.anchors.shape[1] == 4) + + def _preprocess(self, x): + """Converts the image pixels to the range [-1, 1].""" + return x.float() / 255.# 127.5 - 1.0 + + def predict_on_image(self, img): + """Makes a prediction on a single image. + + Arguments: + img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of + shape (3, H, W). The image's height and width should be + 128 pixels. + + Returns: + A tensor with face detections. + """ + if isinstance(img, np.ndarray): + img = torch.from_numpy(img).permute((2, 0, 1)) + + return self.predict_on_batch(img.unsqueeze(0))[0] + + def predict_on_batch(self, x): + """Makes a prediction on a batch of images. + + Arguments: + x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of + shape (b, 3, H, W). The height and width should be 128 pixels. + + Returns: + A list containing a tensor of face detections for each image in + the batch. If no faces are found for an image, returns a tensor + of shape (0, 17). + + Each face detection is a PyTorch tensor consisting of 17 numbers: + - ymin, xmin, ymax, xmax + - x,y-coordinates for the 6 keypoints + - confidence score + """ + if isinstance(x, np.ndarray): + x = torch.from_numpy(x).permute((0, 3, 1, 2)) + + assert x.shape[1] == 3 + assert x.shape[2] == self.y_scale + assert x.shape[3] == self.x_scale + + # 1. 
Preprocess the images into tensors: + x = x.to(self._device()) + x = self._preprocess(x) + + # 2. Run the neural network: + with torch.no_grad(): + out = self.__call__(x) + + # 3. Postprocess the raw predictions: + detections = self._tensors_to_detections(out[0], out[1], self.anchors) + + # 4. Non-maximum suppression to remove overlapping detections: + filtered_detections = [] + for i in range(len(detections)): + faces = self._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, self.num_coords+1)) + filtered_detections.append(faces) + + return filtered_detections + + + def detection2roi(self, detection): + """ Convert detections from detector to an oriented bounding box. + + Adapted from: + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + + The center and size of the box is calculated from the center + of the detected box. Rotation is calcualted from the vector + between kp1 and kp2 relative to theta0. The box is scaled + and shifted by dscale and dy. + + """ + if self.detection2roi_method == 'box': + # compute box center and scale + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + + elif self.detection2roi_method == 'alignment': + # compute box center and scale + # use mediapipe/calculators/util/alignment_points_to_rects_calculator.cc + xc = detection[:,4+2*self.kp1] + yc = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + scale = ((xc-x1)**2 + (yc-y1)**2).sqrt() * 2 + else: + raise NotImplementedError( + "detection2roi_method [%s] not supported"%self.detection2roi_method) + + yc += self.dy * scale + scale *= self.dscale + + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + #theta = np.arctan2(y0-y1, x0-x1) - self.theta0 + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + """The output of the neural network is a tensor of shape (b, 896, 16) + containing the bounding box regressor predictions, as well as a tensor + of shape (b, 896, 1) with the classification confidences. + + This function converts these two "raw" tensors into proper detections. + Returns a list of (num_detections, 17) tensors, one for each image in + the batch. + + This is based on the source code from: + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc + mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto + """ + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. 
Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def _decode_boxes(self, raw_boxes, anchors): + """Converts the predictions into actual coordinates using + the anchor boxes. Processes the entire batch at once. + """ + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. # xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + + return boxes + + def _weighted_non_max_suppression(self, detections): + """The alternative NMS method as mentioned in the BlazeFace paper: + + "We replace the suppression algorithm with a blending strategy that + estimates the regression parameters of a bounding box as a weighted + mean between the overlapping predictions." + + The original MediaPipe code assigns the score of the most confident + detection to the weighted detection, but we take the average score + of the overlapping detections. + + The input detections should be a Tensor of shape (count, 17). + + Returns a list of PyTorch tensors, one for each detected face. + + This is based on the source code from: + mediapipe/calculators/util/non_max_suppression_calculator.cc + mediapipe/calculators/util/non_max_suppression_calculator.proto + """ + if len(detections) == 0: return [] + + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, self.num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. 
+ weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :self.num_coords] + scores = detections[overlapping, self.num_coords:self.num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:self.num_coords] = weighted + weighted_detection[self.num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + + +# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def overlap_similarity(box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return jaccard(box.unsqueeze(0), other_boxes).squeeze(0) diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..6d630b3b28ead006d2864f18f1637f6545dbe507 --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeDetector, BlazeBlock, FinalBlazeBlock + + +class BlazeFace(BlazeDetector): + """The BlazeFace face detection model from MediaPipe. + + The version from MediaPipe is simpler than the one in the paper; + it does not use the "double" BlazeBlocks. + + Because we won't be training this model, it doesn't need to have + batchnorm layers. These have already been "folded" into the conv + weights by TFLite. + + The conversion to PyTorch is fairly straightforward, but there are + some small differences between TFLite and PyTorch in how they handle + padding on conv layers with stride 2. + + This version works on batches, while the MediaPipe version can only + handle a single image at a time. 
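As a quick sanity check of the IoU helpers (`intersect`, `jaccard`, `overlap_similarity`) defined at the end of blazebase.py above, a two-box example with illustrative values:

```python
# Two axis-aligned boxes in [min0, min1, max0, max1] order; expected IoU = 1 / 7.
import torch
from blazebase import jaccard

a = torch.tensor([[0., 0., 2., 2.]])
b = torch.tensor([[1., 1., 3., 3.]])
print(jaccard(a, b))   # tensor([[0.1429]]): intersection 1, union 4 + 4 - 1 = 7
```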
+ + Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and + https://github.com/hollance/BlazeFace-PyTorch and + https://github.com/google/mediapipe/ + + """ + def __init__(self, back_model=False): + super(BlazeFace, self).__init__() + + # These are the settings from the MediaPipe example graph + # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.score_clipping_thresh = 100.0 + self.back_model = back_model + if back_model: + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.min_score_thresh = 0.65 + else: + self.x_scale = 128.0 + self.y_scale = 128.0 + self.h_scale = 128.0 + self.w_scale = 128.0 + self.min_score_thresh = 0.75 + self.min_suppression_threshold = 0.3 + self.num_keypoints = 6 + + # These settings are for converting detections to ROIs which can then + # be extracted and feed into the landmark network + # use mediapipe/calculators/util/detections_to_rects_calculator.cc + self.detection2roi_method = 'box' + # mediapipe/modules/face_landmark/face_detection_front_detection_to_roi.pbtxt + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + + self._define_layers() + + def _define_layers(self): + if self.back_model: + self.backbone = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24, stride=2), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 24), + BlazeBlock(24, 48, stride=2), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 48), + BlazeBlock(48, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + self.final = FinalBlazeBlock(96) + self.classifier_8 = nn.Conv2d(96, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(96, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + else: + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True), + nn.ReLU(inplace=True), + + BlazeBlock(24, 24), + BlazeBlock(24, 28), + BlazeBlock(28, 32, stride=2), + BlazeBlock(32, 36), + BlazeBlock(36, 42), + BlazeBlock(42, 48, stride=2), + BlazeBlock(48, 56), + BlazeBlock(56, 64), + BlazeBlock(64, 72), + BlazeBlock(72, 80), + BlazeBlock(80, 88), + ) + + self.backbone2 = nn.Sequential( + BlazeBlock(88, 96, stride=2), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + BlazeBlock(96, 96), + ) + + self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True) + self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True) + + self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True) + self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True) + + def forward(self, x): + # TFLite uses slightly different padding on the first conv layer + # than PyTorch, so do it manually. 
+ x = F.pad(x, (1, 2, 1, 2), "constant", 0) + + b = x.shape[0] # batch size, needed for reshaping later + + if self.back_model: + x = self.backbone(x) # (b, 16, 16, 96) + h = self.final(x) # (b, 8, 8, 96) + else: + x = self.backbone1(x) # (b, 88, 16, 16) + h = self.backbone2(x) # (b, 96, 8, 8) + + # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to + # permute the output from the conv layers before reshaping it. + + c1 = self.classifier_8(x) # (b, 2, 16, 16) + c1 = c1.permute(0, 2, 3, 1) # (b, 16, 16, 2) + c1 = c1.reshape(b, -1, 1) # (b, 512, 1) + + c2 = self.classifier_16(h) # (b, 6, 8, 8) + c2 = c2.permute(0, 2, 3, 1) # (b, 8, 8, 6) + c2 = c2.reshape(b, -1, 1) # (b, 384, 1) + + c = torch.cat((c1, c2), dim=1) # (b, 896, 1) + + r1 = self.regressor_8(x) # (b, 32, 16, 16) + r1 = r1.permute(0, 2, 3, 1) # (b, 16, 16, 32) + r1 = r1.reshape(b, -1, 16) # (b, 512, 16) + + r2 = self.regressor_16(h) # (b, 96, 8, 8) + r2 = r2.permute(0, 2, 3, 1) # (b, 8, 8, 96) + r2 = r2.reshape(b, -1, 16) # (b, 384, 16) + + r = torch.cat((r1, r2), dim=1) # (b, 896, 16) + return [r, c] + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..8b21d17e31c8a8a39e95abb27cc3220440a01fca --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/blazeface_landmark.py @@ -0,0 +1,74 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from blazebase import BlazeLandmark, BlazeBlock + +class BlazeFaceLandmark(BlazeLandmark): + """The face landmark model from MediaPipe. + + """ + def __init__(self): + super(BlazeFaceLandmark, self).__init__() + + # size of ROIs used for input + self.resolution = 192 + + self._define_layers() + + def _define_layers(self): + self.backbone1 = nn.Sequential( + nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=0, bias=True), + nn.PReLU(16), + + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 16, 3, act='prelu'), + BlazeBlock(16, 32, 3, 2, act='prelu'), + + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 32, 3, act='prelu'), + BlazeBlock(32, 64, 3, 2, act='prelu'), + + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 64, 3, act='prelu'), + BlazeBlock(64, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, 2, act='prelu'), + + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + ) + + + self.backbone2a = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + BlazeBlock(128, 128, 3, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1404, 3, padding=0, bias=True) + ) + + self.backbone2b = nn.Sequential( + BlazeBlock(128, 128, 3, 2, act='prelu'), + nn.Conv2d(128, 32, 1, padding=0, bias=True), + nn.PReLU(32), + BlazeBlock(32, 32, 3, act='prelu'), + nn.Conv2d(32, 1, 3, padding=0, bias=True) + ) + + def forward(self, x): + if x.shape[0] == 0: + return torch.zeros((0,)), torch.zeros((0, 468, 3)) + + x = F.pad(x, (0, 1, 0, 1), "constant", 0) + + x = self.backbone1(x) + landmarks = self.backbone2a(x).view(-1, 468, 3) / 192 + flag = self.backbone2b(x).sigmoid().view(-1) + + return flag, landmarks \ No newline at end of file diff --git 
a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9c7bdd09623bcb4d4a41697c32505964a6be4fbf --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/coco.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d96313cff4ba7511af36d8dbc358dd33b2815ec378680a09b97353cadb2378 +size 158750 diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py new file mode 100644 index 0000000000000000000000000000000000000000..1f7eb48ab9aed5e3d6b2f4df608b9ec95182221f --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/demo_qnn.py @@ -0,0 +1,423 @@ +import numpy as np +import torch +import cv2 +import sys +from blazebase import resize_pad, denormalize_detections +from visualization import draw_landmarks, draw_roi, FACE_CONNECTIONS +import time +import aidlite +import os + +class post_mediapipe_face: + def __init__(self): + self.kp1 = 1 + self.kp2 = 0 + self.theta0 = 0. + self.dscale = 1.5 + self.dy = 0. + self.x_scale = 256.0 + self.y_scale = 256.0 + self.h_scale = 256.0 + self.w_scale = 256.0 + self.num_keypoints = 6 + self.num_classes = 1 + self.num_anchors = 896 + self.num_coords = 16 + self.min_score_thresh = 0.4 #0.65 + self.score_clipping_thresh = 100.0 + self.min_suppression_threshold = 0.3 + self.resolution = 192 + + + def detection2roi(self,detection): + xc = (detection[:,1] + detection[:,3]) / 2 + yc = (detection[:,0] + detection[:,2]) / 2 + scale = (detection[:,3] - detection[:,1]) # assumes square boxes + yc += self.dy * scale + scale *= self.dscale + # compute box rotation + x0 = detection[:,4+2*self.kp1] + y0 = detection[:,4+2*self.kp1+1] + x1 = detection[:,4+2*self.kp2] + y1 = detection[:,4+2*self.kp2+1] + theta = torch.atan2(y0-y1, x0-x1) - self.theta0 + return xc, yc, scale, theta + + def _decode_boxes( self,raw_boxes, anchors): + boxes = torch.zeros_like(raw_boxes) + + x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0] + y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + + w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2] + h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3] + + boxes[..., 0] = y_center - h / 2. # ymin + boxes[..., 1] = x_center - w / 2. # xmin + boxes[..., 2] = y_center + h / 2. # ymax + boxes[..., 3] = x_center + w / 2. 
# xmax + + for k in range(self.num_keypoints): + offset = 4 + k*2 + keypoint_x = raw_boxes[..., offset ] / self.x_scale * anchors[:, 2] + anchors[:, 0] + keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1] + boxes[..., offset ] = keypoint_x + boxes[..., offset + 1] = keypoint_y + return boxes + + def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors): + assert raw_box_tensor.ndimension() == 3 + assert raw_box_tensor.shape[1] == self.num_anchors + assert raw_box_tensor.shape[2] == self.num_coords + + assert raw_score_tensor.ndimension() == 3 + assert raw_score_tensor.shape[1] == self.num_anchors + assert raw_score_tensor.shape[2] == self.num_classes + + assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0] + + detection_boxes = self._decode_boxes(raw_box_tensor, anchors) + + thresh = self.score_clipping_thresh + raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh) + detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1) + + # Note: we stripped off the last dimension from the scores tensor + # because there is only has one class. Now we can simply use a mask + # to filter out the boxes with too low confidence. + mask = detection_scores >= self.min_score_thresh + + # Because each image from the batch can have a different number of + # detections, process them one at a time using a loop. + output_detections = [] + for i in range(raw_box_tensor.shape[0]): + boxes = detection_boxes[i, mask[i]] + scores = detection_scores[i, mask[i]].unsqueeze(dim=-1) + output_detections.append(torch.cat((boxes, scores), dim=-1)) + + return output_detections + + def extract_roi( self,frame, xc, yc, theta, scale): + resolution = 192 + # take points on unit square and transform them according to the roi + points = torch.tensor([[-1, -1, 1, 1], + [-1, 1, -1, 1]], device=scale.device).view(1,2,4) + points = points * scale.view(-1,1,1)/2 + theta = theta.view(-1, 1, 1) + R = torch.cat(( + torch.cat((torch.cos(theta), -torch.sin(theta)), 2), + torch.cat((torch.sin(theta), torch.cos(theta)), 2), + ), 1) + center = torch.cat((xc.view(-1,1,1), yc.view(-1,1,1)), 1) + points = R @ points + center + + # use the points to compute the affine transform that maps + # these points back to the output square + res = resolution + points1 = np.array([[0, 0, res-1], + [0, res-1, 0]], dtype=np.float32).T + affines = [] + imgs = [] + for i in range(points.shape[0]): + pts = points[i, :, :3].detach().numpy().T + M = cv2.getAffineTransform(pts, points1) + img = cv2.warpAffine(frame, M, (res,res))#, borderValue=127.5) + img = torch.tensor(img, device=scale.device) + imgs.append(img) + affine = cv2.invertAffineTransform(M).astype('float32') + affine = torch.tensor(affine, device=scale.device) + affines.append(affine) + if imgs: + imgs = torch.stack(imgs).permute(0,3,1,2).float() / 255.#/ 127.5 - 1.0 + affines = torch.stack(affines) + else: + imgs = torch.zeros((0, 3, res, res), device=scale.device) + affines = torch.zeros((0, 2, 3), device=scale.device) + + return imgs, affines, points + + def denormalize_landmarks(self, landmarks, affines): + landmarks[:,:,:2] *= self.resolution + for i in range(len(landmarks)): + landmark, affine = landmarks[i], affines[i] + landmark = (affine[:,:2] @ landmark[:,:2].T + affine[:,2:]).T + landmarks[i,:,:2] = landmark + return landmarks + + def intersect(self,box_a, box_b): + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = 
torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + def jaccard(self,box_a, box_b): + inter = self.intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + + def overlap_similarity(self,box, other_boxes): + """Computes the IOU between a bounding box and set of other boxes.""" + return self.jaccard(box.unsqueeze(0), other_boxes).squeeze(0) + + def _weighted_non_max_suppression(self,detections): + if len(detections) == 0: return [] + output_detections = [] + + # Sort the detections from highest to lowest score. + remaining = torch.argsort(detections[:, num_coords], descending=True) + + while len(remaining) > 0: + detection = detections[remaining[0]] + + # Compute the overlap between the first box and the other + # remaining boxes. (Note that the other_boxes also include + # the first_box.) + first_box = detection[:4] + other_boxes = detections[remaining, :4] + ious = self.overlap_similarity(first_box, other_boxes) + + # If two detections don't overlap enough, they are considered + # to be from different faces. + mask = ious > self.min_suppression_threshold + overlapping = remaining[mask] + remaining = remaining[~mask] + + # Take an average of the coordinates from the overlapping + # detections, weighted by their confidence scores. + weighted_detection = detection.clone() + if len(overlapping) > 1: + coordinates = detections[overlapping, :num_coords] + scores = detections[overlapping, num_coords:num_coords+1] + total_score = scores.sum() + weighted = (coordinates * scores).sum(dim=0) / total_score + weighted_detection[:num_coords] = weighted + weighted_detection[num_coords] = total_score / len(overlapping) + + output_detections.append(weighted_detection) + + return output_detections + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.detach().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + + +post_process=post_mediapipe_face() + +class faceDetectionQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceDetector_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + 
self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1,3, 256, 256]] + output_shapes = [[1, 896,16],[1,896,1]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time = (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN Detection invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1, 896,16).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1, 896,1).copy() + return features_0,features_1 + + +class faceLandmarkQnn: + def __init__(self): + super().__init__() + self.model = aidlite.Model.create_instance(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../models/m_faceLandmark_w8a16.qnn216.ctx.bin")) + if self.model is None: + print("Create model failed !") + return + + self.config = aidlite.Config.create_instance() + if self.config is None: + print("build_interpretper_from_model_and_config failed !") + return + + self.config.implement_type = aidlite.ImplementType.TYPE_LOCAL + self.config.framework_type = aidlite.FrameworkType.TYPE_QNN + self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP + self.config.is_quantify_model = 1 + + self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config) + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + return + input_shapes = [[1, 3, 192, 192]] + output_shapes = [[1],[1,468,3]] + self.model.set_model_properties(input_shapes, aidlite.DataType.TYPE_FLOAT32, + output_shapes, aidlite.DataType.TYPE_FLOAT32) + + if self.interpreter is None: + print("build_interpretper_from_model_and_config failed !") + result = self.interpreter.init() + if result != 0: + print(f"interpreter init failed !") + result = self.interpreter.load_model() + if result != 0: + print("interpreter load model failed !") + + print(" model load success!") + + def __call__(self, input): + self.interpreter.set_input_tensor(0,input) + invoke_time=[] + invoke_nums =10 + for i in range(invoke_nums): + result = self.interpreter.set_input_tensor(0, input.data) + if result != 0: + print("interpreter set_input_tensor() failed") + t1=time.time() + result = self.interpreter.invoke() + cost_time 
= (time.time()-t1)*1000 + invoke_time.append(cost_time) + + max_invoke_time = max(invoke_time) + min_invoke_time = min(invoke_time) + mean_invoke_time = sum(invoke_time)/invoke_nums + var_invoketime=np.var(invoke_time) + print("====================================") + print(f"QNN LandMark invoke time:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}") + print("====================================") + features_0 = self.interpreter.get_output_tensor(0).reshape(1).copy() + features_1 = self.interpreter.get_output_tensor(1).reshape(1,468,3).copy() + return features_0,features_1 + + + +anchors = torch.tensor(np.load(os.path.join(os.path.dirname(os.path.abspath(__file__)),"anchors_face_back.npy")), dtype=torch.float32, device='cpu') +face_detc = faceDetectionQnn() +face_rec = faceLandmarkQnn() + +image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"coco.jpg") + +frame_ct=0 +image = cv2.imread(image_path) + +frame = np.ascontiguousarray(image[:,:,::-1]) + +img1, img2, scale, pad = resize_pad(frame) + +input = (img1 / 255).astype(np.float32) +input = np.transpose(input, (2, 0, 1)) +input = input[np.newaxis, ...] +t0 = time.time() +out = face_detc(input) +use_time = round((time.time() - t0) * 1000, 2) +print(f"face detction inference_time:{use_time} ms") +detections = post_process._tensors_to_detections(torch.from_numpy(out[0]), torch.from_numpy(out[1]), anchors) + +filtered_detections = [] +num_coords = 16 +for i in range(len(detections)): + faces = post_process._weighted_non_max_suppression(detections[i]) + faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, num_coords+1)) + filtered_detections.append(faces) + +face_detections = denormalize_detections(filtered_detections[0], scale, pad) + +xc, yc, scale, theta = post_process.detection2roi(face_detections) + +img, affine, box = post_process.extract_roi(frame, xc, yc, theta, scale) +if box.size()[0]!=0: + t2 = time.time() + flags, normalized_landmarks = face_rec(img.numpy()) + + use_time = round((time.time() - t2) * 1000, 2) + print(f"landmark inference_time:{use_time} ms") + + landmarks = post_process.denormalize_landmarks(torch.from_numpy(normalized_landmarks), affine) + + for i in range(len(flags)): + landmark, flag = landmarks[i], flags[i] + if flag>.4: # 0.5 + draw_landmarks(frame, landmark[:,:2], FACE_CONNECTIONS, size=1) +else: + print("not detect face !") + +draw_roi(frame, box) +draw_detections(frame, face_detections) +cv2.imwrite(os.path.join(os.path.dirname(os.path.abspath(__file__)),'%04d.jpg'%frame_ct), frame[:,:,::-1]) +face_detc.interpreter.destory() +face_rec.interpreter.destory() + + + diff --git a/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e1187c9d99336d53562f4fa3a4a32f6af6e769fb --- /dev/null +++ b/model_farm_mediapipefacedetection_qcs8550_qnn2.16_w8a16_aidlite/python/visualization.py @@ -0,0 +1,125 @@ +import numpy as np +import cv2 +import torch + +def draw_detections(img, detections, with_keypoints=True): + if isinstance(detections, torch.Tensor): + detections = detections.cpu().numpy() + + if detections.ndim == 1: + detections = np.expand_dims(detections, axis=0) + + n_keypoints = detections.shape[1] // 2 - 2 + + for i in range(detections.shape[0]): + ymin = detections[i, 0] + 
xmin = detections[i, 1] + ymax = detections[i, 2] + xmax = detections[i, 3] + + start_point = (int(xmin), int(ymin)) + end_point = (int(xmax), int(ymax)) + img = cv2.rectangle(img, start_point, end_point, (255, 0, 0), 1) + + if with_keypoints: + for k in range(n_keypoints): + kp_x = int(detections[i, 4 + k*2 ]) + kp_y = int(detections[i, 4 + k*2 + 1]) + cv2.circle(img, (kp_x, kp_y), 2, (0, 0, 255), thickness=2) + return img + + +def draw_roi(img, roi): + for i in range(roi.shape[0]): + (x1,x2,x3,x4), (y1,y2,y3,y4) = roi[i] + cv2.line(img, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x3), int(y3)), (0,255,0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x4), int(y4)), (0,0,0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x4), int(y4)), (0,0,0), 2) + + +def draw_landmarks(img, points, connections=[], color=(0, 255, 0), size=2): + points = points[:,:2] + for point in points: + x, y = point + x, y = int(x), int(y) + cv2.circle(img, (x, y), size, color, thickness=size) + for connection in connections: + x0, y0 = points[connection[0]] + x1, y1 = points[connection[1]] + x0, y0 = int(x0), int(y0) + x1, y1 = int(x1), int(y1) + cv2.line(img, (x0, y0), (x1, y1), (0,0,0), size) + + + +# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py +# 8 12 16 20 +# | | | | +# 7 11 15 19 +# 4 | | | | +# | 6 10 14 18 +# 3 | | | | +# | 5---9---13--17 +# 2 \ / +# \ \ / +# 1 \ / +# \ \ / +# ------0- +HAND_CONNECTIONS = [ + (0, 1), (1, 2), (2, 3), (3, 4), + (5, 6), (6, 7), (7, 8), + (9, 10), (10, 11), (11, 12), + (13, 14), (14, 15), (15, 16), + (17, 18), (18, 19), (19, 20), + (0, 5), (5, 9), (9, 13), (13, 17), (0, 17) +] + +POSE_CONNECTIONS = [ + (0,1), (1,2), (2,3), (3,7), + (0,4), (4,5), (5,6), (6,8), + (9,10), + (11,13), (13,15), (15,17), (17,19), (19,15), (15,21), + (12,14), (14,16), (16,18), (18,20), (20,16), (16,22), + (11,12), (12,24), (24,23), (23,11) +] + +# Vertex indices can be found in +# github.com/google/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualisation.png +# Found in github.com/google/mediapipe/python/solutions/face_mesh.py +FACE_CONNECTIONS = [ + # Lips. + (61, 146), (146, 91), (91, 181), (181, 84), (84, 17), + (17, 314), (314, 405), (405, 321), (321, 375), (375, 291), + (61, 185), (185, 40), (40, 39), (39, 37), (37, 0), + (0, 267), (267, 269), (269, 270), (270, 409), (409, 291), + (78, 95), (95, 88), (88, 178), (178, 87), (87, 14), + (14, 317), (317, 402), (402, 318), (318, 324), (324, 308), + (78, 191), (191, 80), (80, 81), (81, 82), (82, 13), + (13, 312), (312, 311), (311, 310), (310, 415), (415, 308), + # Left eye. + (263, 249), (249, 390), (390, 373), (373, 374), (374, 380), + (380, 381), (381, 382), (382, 362), (263, 466), (466, 388), + (388, 387), (387, 386), (386, 385), (385, 384), (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), (283, 282), (282, 295), (295, 285), (300, 293), + (293, 334), (334, 296), (296, 336), + # Right eye. + (33, 7), (7, 163), (163, 144), (144, 145), (145, 153), + (153, 154), (154, 155), (155, 133), (33, 246), (246, 161), + (161, 160), (160, 159), (159, 158), (158, 157), (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), (53, 52), (52, 65), (65, 55), (70, 63), (63, 105), + (105, 66), (66, 107), + # Face oval. 
+ (10, 338), (338, 297), (297, 332), (332, 284), (284, 251), + (251, 389), (389, 356), (356, 454), (454, 323), (323, 361), + (361, 288), (288, 397), (397, 365), (365, 379), (379, 378), + (378, 400), (400, 377), (377, 152), (152, 148), (148, 176), + (176, 149), (149, 150), (150, 136), (136, 172), (172, 58), + (58, 132), (132, 93), (93, 234), (234, 127), (127, 162), + (162, 21), (21, 54), (54, 103), (103, 67), (67, 109), + (109, 10) +]
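
For readers tracing the post-processing in `demo_qnn.py`: the snippet below is a minimal standalone sketch (not part of this model farm package) of the anchor-decoding arithmetic implemented by `post_mediapipe_face._decode_boxes`. The anchor and raw prediction values are made up; the 256.0 scale constants mirror the class defaults defined above.

```python
# Minimal standalone sketch of the _decode_boxes arithmetic (made-up numbers,
# not repository code): one anchor, one raw detector prediction.
import numpy as np

x_scale = y_scale = w_scale = h_scale = 256.0          # class defaults above

# anchor: (x_center, y_center, w, h), normalized to [0, 1]
anchor = np.array([0.5, 0.5, 1.0, 1.0], dtype=np.float32)
# raw prediction: (dx, dy, w, h) in units of the 256x256 detector input
raw = np.array([12.0, -8.0, 64.0, 64.0], dtype=np.float32)

x_center = raw[0] / x_scale * anchor[2] + anchor[0]    # 0.546875
y_center = raw[1] / y_scale * anchor[3] + anchor[1]    # 0.468750
w = raw[2] / w_scale * anchor[2]                       # 0.25
h = raw[3] / h_scale * anchor[3]                       # 0.25

ymin, xmin = y_center - h / 2.0, x_center - w / 2.0
ymax, xmax = y_center + h / 2.0, x_center + w / 2.0
# normalized box; denormalize_detections later maps it back to the original
# image using the scale/pad returned by resize_pad
print(ymin, xmin, ymax, xmax)
```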
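
Likewise, a small self-contained sketch (fabricated detections, not repository code) of the score-weighted blending performed inside `_weighted_non_max_suppression`: overlapping boxes that pass the IoU test are averaged, weighted by their confidence scores, rather than simply keeping the single top-scoring box.

```python
# Minimal standalone sketch of the weighted-blending step used by
# _weighted_non_max_suppression (fabricated detections, not repository code).
import torch

num_coords = 16                                  # 4 box values + 6 keypoints * 2
det_a = torch.zeros(num_coords + 1)
det_b = torch.zeros(num_coords + 1)
det_a[:4] = torch.tensor([0.10, 0.10, 0.50, 0.50]); det_a[num_coords] = 0.9
det_b[:4] = torch.tensor([0.12, 0.12, 0.52, 0.52]); det_b[num_coords] = 0.6
overlapping = torch.stack([det_a, det_b])        # assume both passed the IoU test

scores = overlapping[:, num_coords:num_coords + 1]
blended_coords = (overlapping[:, :num_coords] * scores).sum(dim=0) / scores.sum()

merged = det_a.clone()
merged[:num_coords] = blended_coords
merged[num_coords] = scores.sum() / len(overlapping)   # averaged confidence
print(merged[:4], merged[num_coords])            # single blended box + score
```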