#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>
#include <iostream>
#include <chrono>
#include <vector>
#include <array>
#include <algorithm>
std::vector<const char*> input_names = { "images", "orig_target_sizes" };
std::vector<const char*> output_names = { "labels", "boxes", "scores" };
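// NOTE: These names assume an RT-DETR-style detection model exported with inputs
// "images" (float32, [1, 3, H, W], pixel values in [0, 1]) and "orig_target_sizes"
// (int64, [1, 2]), and outputs "labels" (int64), "boxes" (float32, [1, N, 4], xyxy),
// and "scores" (float32, [1, N]). Adjust the lists if your export differs.
//
// Optional helper (a minimal sketch; requires ONNX Runtime 1.13+ for the
// *NameAllocated APIs): prints a session's actual input/output names so they can
// be checked against the hard-coded lists above.
void printModelIO(Ort::Session& session) {
    Ort::AllocatorWithDefaultOptions allocator;
    for (size_t i = 0; i < session.GetInputCount(); ++i) {
        std::cout << "Input  " << i << ": "
                  << session.GetInputNameAllocated(i, allocator).get() << "\n";
    }
    for (size_t i = 0; i < session.GetOutputCount(); ++i) {
        std::cout << "Output " << i << ": "
                  << session.GetOutputNameAllocated(i, allocator).get() << "\n";
    }
}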
/**
* @brief Draws bounding boxes, labels, and confidence scores on an image.
*
* This function takes an image, a list of labels, bounding boxes, and their corresponding confidence scores,
* and overlays the bounding boxes and labels on the image. The bounding boxes are adjusted to compensate
* for resizing and padding applied during preprocessing.
*
* @param image The input image (cv::Mat) on which to draw the bounding boxes and labels.
* @param labels A vector of integer labels corresponding to detected objects.
* @param boxes A vector of bounding boxes, where each box is represented as {x1, y1, x2, y2}.
* @param scores A vector of confidence scores corresponding to the bounding boxes.
* @param ratio The scaling factor used to resize the image during preprocessing.
* @param pad_w The horizontal padding applied to the image during preprocessing.
* @param pad_h The vertical padding applied to the image during preprocessing.
* @param thrh The confidence threshold; only boxes with scores above this value will be drawn (default is 0.4).
* @return A cv::Mat object containing the original image with bounding boxes, labels, and scores drawn on it.
*/
cv::Mat draw(
const cv::Mat& image,
const std::vector<int64_t>& labels,
const std::vector<std::vector<float>>& boxes,
const std::vector<float>& scores,
float ratio,
int pad_w,
int pad_h,
    float thrh = 0.4f)
{
// Clone the input image to preserve the original image
cv::Mat img = image.clone();
// Iterate over all detected objects
for (size_t i = 0; i < scores.size(); ++i) {
// Only process objects with confidence scores above the threshold
if (scores[i] > thrh) {
// Adjust bounding box coordinates to account for resizing and padding
float x1 = (boxes[i][0] - pad_w) / ratio; // Top-left x-coordinate
float y1 = (boxes[i][1] - pad_h) / ratio; // Top-left y-coordinate
float x2 = (boxes[i][2] - pad_w) / ratio; // Bottom-right x-coordinate
float y2 = (boxes[i][3] - pad_h) / ratio; // Bottom-right y-coordinate
            // Draw the bounding box on the image (rounding to integer pixel coordinates)
            cv::rectangle(img, cv::Point(cvRound(x1), cvRound(y1)), cv::Point(cvRound(x2), cvRound(y2)), cv::Scalar(0, 0, 255), 1);
            // Prepare the label text with class label and confidence score
            std::string label_text = "Label: " + std::to_string(labels[i]) +
                " Conf: " + std::to_string(scores[i]);
            // Draw the label text above the bounding box, clamped so it stays inside the image
            cv::putText(img, label_text, cv::Point(cvRound(x1), std::max(cvRound(y1) - 10, 10)), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 0, 0), 1);
}
}
// Return the annotated image
return img;
}
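// Worked example (hypothetical values): with ratio = 0.5, pad_w = 0, pad_h = 140,
// a letterbox-space box {100, 200, 300, 400} maps back to the original image as
// {(100 - 0) / 0.5, (200 - 140) / 0.5, (300 - 0) / 0.5, (400 - 140) / 0.5}
// = {200, 120, 600, 520}, i.e. the exact inverse of the transform applied by
// resizeWithAspectRatio() below.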
/**
* @brief Resizes an image while maintaining its aspect ratio and pads the resized image to a square of a specified size.
*
* This function scales the input image proportionally to fit within a square of the specified size while preserving
* the aspect ratio. It then pads the resized image with black pixels (value 0) to fill the remaining space, creating
* a square output image.
*
* @param image Input image (cv::Mat) to be resized and padded.
* @param size Target size of the square output image (both width and height will be equal to size).
* @param ratio Output parameter that will contain the scaling factor applied to the image.
 * @param pad_w Output parameter that will contain the padding applied to each of the left and right sides (half the total horizontal padding).
 * @param pad_h Output parameter that will contain the padding applied to each of the top and bottom sides (half the total vertical padding).
* @return A cv::Mat object containing the resized and padded square image.
*/
cv::Mat resizeWithAspectRatio(const cv::Mat& image, int size, float& ratio, int& pad_w, int& pad_h) {
// Get the original width and height of the input image
int original_width = image.cols;
int original_height = image.rows;
// Compute the scaling ratio to fit the image within the target size while maintaining aspect ratio
ratio = std::min(static_cast<float>(size) / original_width, static_cast<float>(size) / original_height);
int new_width = static_cast<int>(original_width * ratio); // New width after scaling
int new_height = static_cast<int>(original_height * ratio); // New height after scaling
// Resize the image using the computed dimensions
cv::Mat resized_image;
cv::resize(image, resized_image, cv::Size(new_width, new_height));
// Calculate the padding required to center the resized image in the square output
    pad_w = (size - new_width) / 2;   // Padding on each of the left and right sides
    pad_h = (size - new_height) / 2;  // Padding on each of the top and bottom sides
// Create a square output image filled with black pixels (value 0)
cv::Mat padded_image(size, size, resized_image.type(), cv::Scalar(0, 0, 0));
// Copy the resized image into the center of the square output image
resized_image.copyTo(padded_image(cv::Rect(pad_w, pad_h, new_width, new_height)));
// Return the resized and padded image
return padded_image;
}
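// Worked example: a 1280x720 input with size = 640 gives
// ratio = min(640/1280, 640/720) = 0.5, so the image is resized to 640x360 and
// centered with pad_w = (640 - 640) / 2 = 0 and pad_h = (640 - 360) / 2 = 140.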
/**
* @brief Preprocess an input image, run inference using an ONNX model, and process the results.
*
* This function resizes the input image while maintaining its aspect ratio, prepares it for inference,
* runs the inference using the specified ONNX Runtime session, and processes the output to draw
* bounding boxes and labels on the original image.
*
* @param session The ONNX Runtime session used to perform inference.
* @param image The input image (OpenCV Mat) to process.
* @return cv::Mat The result image with bounding boxes and labels drawn.
*/
cv::Mat processImage(Ort::Session& session, const cv::Mat& image) {
    float ratio;            // Scale factor applied when resizing the image.
    int pad_w, pad_h;       // Per-side padding added to keep the aspect ratio.
    int target_size = 640;  // Side length of the square model input.
// Step 1: Resize and pad the image to the target size while preserving the aspect ratio.
cv::Mat resized_image = resizeWithAspectRatio(image, target_size, ratio, pad_w, pad_h);
// Step 2: Convert the resized image to RGB format as required by the model.
cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGB);
// Step 3: Prepare the input tensor in NCHW format (channels-first).
std::vector<int64_t> input_dims = { 1, 3, target_size, target_size }; // Batch size = 1, Channels = 3, HxW = target_size.
std::vector<float> input_tensor_values(input_dims[1] * input_dims[2] * input_dims[3]);
// Populate the input tensor with normalized pixel values (range 0 to 1).
int index = 0;
for (int c = 0; c < 3; ++c) { // Loop through channels.
for (int i = 0; i < resized_image.rows; ++i) { // Loop through rows.
for (int j = 0; j < resized_image.cols; ++j) { // Loop through columns.
input_tensor_values[index++] = resized_image.at<cv::Vec3b>(i, j)[c] / 255.0f; // Normalize pixel value.
}
}
}
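    // Note: this triple loop is a plain HWC -> CHW conversion with [0, 1] scaling.
    // An equivalent and typically faster alternative (assuming <opencv2/dnn.hpp> is
    // available) is cv::dnn::blobFromImage(resized_image, 1.0 / 255.0), which yields
    // the same NCHW float blob, whose data pointer could be handed to CreateTensor
    // instead of input_tensor_values.data().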
// Step 4: Create ONNX Runtime input tensors.
Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
// Tensor for the preprocessed image.
Ort::Value input_tensor_images = Ort::Value::CreateTensor<float>(
memory_info, input_tensor_values.data(), input_tensor_values.size(),
input_dims.data(), input_dims.size()
);
    // Tensor holding the target size for the model's box outputs. We pass the
    // padded input size (target_size x target_size), so the returned boxes are in
    // letterbox coordinates and are mapped back to the original image in draw().
    std::vector<int64_t> orig_size_dims = { 1, 2 };
    std::vector<int64_t> orig_size_values = {
        static_cast<int64_t>(resized_image.rows),  // height
        static_cast<int64_t>(resized_image.cols)   // width
    };
Ort::Value input_tensor_orig_target_sizes = Ort::Value::CreateTensor<int64_t>(
memory_info, orig_size_values.data(), orig_size_values.size(),
orig_size_dims.data(), orig_size_dims.size()
);
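    // Note: CreateTensor wraps the caller's buffers without copying, so
    // input_tensor_values and orig_size_values must stay alive until Run() returns.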
    // Step 5: Run inference on the session.
    std::array<Ort::Value, 2> input_tensors{
        std::move(input_tensor_images),
        std::move(input_tensor_orig_target_sizes)
    };
    auto outputs = session.Run(
        Ort::RunOptions{ nullptr },  // Default run options.
        input_names.data(),          // Names of input nodes.
        input_tensors.data(),        // Input tensors (order must match input_names).
        input_names.size(),          // Number of inputs.
        output_names.data(),         // Names of output nodes.
        output_names.size()          // Number of outputs.
    );
    // Step 6: Extract and process model outputs.
    auto labels_ptr = outputs[0].GetTensorMutableData<int64_t>(); // Labels for detected objects.
    auto boxes_ptr = outputs[1].GetTensorMutableData<float>();    // Bounding boxes.
    auto scores_ptr = outputs[2].GetTensorMutableData<float>();   // Confidence scores.
    // The scores tensor is assumed to have shape [batch, num_boxes]; with batch = 1,
    // dimension 1 is the number of detections.
    size_t num_boxes = outputs[2].GetTensorTypeAndShapeInfo().GetShape()[1];
// Convert raw output to structured data.
std::vector<int64_t> labels(labels_ptr, labels_ptr + num_boxes);
std::vector<std::vector<float>> boxes;
std::vector<float> scores(scores_ptr, scores_ptr + num_boxes);
auto boxes_shape = outputs[1].GetTensorTypeAndShapeInfo().GetShape();
size_t num_coordinates = boxes_shape[2]; // Usually 4 coordinates: (x1, y1, x2, y2).
// Populate the `boxes` vector.
for (size_t i = 0; i < num_boxes; ++i) {
boxes.push_back({
boxes_ptr[i * num_coordinates + 0], // x1
boxes_ptr[i * num_coordinates + 1], // y1
boxes_ptr[i * num_coordinates + 2], // x2
boxes_ptr[i * num_coordinates + 3] // y2
});
}
// Step 7: Draw the results on the original image.
cv::Mat result_image = draw(image, labels, boxes, scores, ratio, pad_w, pad_h);
// Return the annotated image.
return result_image;
}
/**
* @brief Entry point of the application to perform object detection on an input source using a specified model.
*
 * The program loads a pre-trained model, processes an input source (image, video, or webcam), and performs object
 * detection using either the CPU or GPU for computation. The annotated results are displayed in an OpenCV window.
*
* @param argc The number of command-line arguments passed to the program.
* @param argv The array of command-line arguments:
* - argv[0]: The name of the executable.
* - argv[1]: The path to the pre-trained model file.
 * - argv[2]: The input source: an image file, a video file, or the literal string "webcam".
* - argv[3]: Flag to indicate whether to use GPU (1 for GPU, 0 for CPU).
* @return Exit status:
* - Returns 0 on success.
* - Returns -1 if incorrect arguments are provided.
*/
int main(int argc, char** argv) {
// Check if the required number of arguments is provided
if (argc < 4) {
// Display usage instructions if arguments are insufficient
std::cerr << "Usage: " << argv[0]
<< " <modelPath> <source[imagePath|videoPath|webcam]> <useGPU[1/0]>\n";
return -1;
}
// Parse arguments
std::string modelPath = argv[1];
std::string source = argv[2];
bool useGPU = std::stoi(argv[3]) != 0;
// Initialize ONNX Runtime environment
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "ONNXExample");
Ort::SessionOptions session_options;
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
if (useGPU) {
OrtCUDAProviderOptions cudaOptions;
cudaOptions.device_id = 0; // Default to GPU 0
session_options.AppendExecutionProvider_CUDA(cudaOptions);
std::cout << "Using GPU for inference.\n";
}
else {
std::cout << "Using CPU for inference.\n";
}
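    // Note: if this ONNX Runtime build lacks the CUDA execution provider,
    // AppendExecutionProvider_CUDA throws an Ort::Exception; wrapping the GPU branch
    // in a try/catch allows a graceful fall-back to CPU execution.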
    // Load the ONNX model. On Windows builds of ONNX Runtime the session expects a
    // wide-character path (ORTCHAR_T == wchar_t); this byte-wise widening is only
    // correct for ASCII paths. On Linux/macOS, pass modelPath.c_str() directly.
    std::wstring widestr(modelPath.begin(), modelPath.end());
    Ort::Session session(env, widestr.c_str(), session_options);
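    // Optional sanity check: verify the model's actual I/O names against the
    // hard-coded lists at the top of the file (see the printModelIO() sketch).
    // printModelIO(session);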
// Open source
cv::VideoCapture cap;
bool isVideo = false;
bool isWebcam = false;
bool isImage = false;
cv::Mat frame;
if (source == "webcam") {
isWebcam = true;
cap.open(0); // Open webcam
}
else if (source.find(".mp4") != std::string::npos ||
source.find(".avi") != std::string::npos ||
source.find(".mkv") != std::string::npos) {
isVideo = true;
cap.open(source); // Open video file
}
else {
isImage = true;
frame = cv::imread(source);
if (frame.empty()) {
std::cerr << "Error: Could not read image file.\n";
return -1;
}
}
if ((isVideo || isWebcam) && !cap.isOpened()) {
std::cerr << "Error: Could not open video source.\n";
return -1;
}
// Process source
do {
if (isWebcam || isVideo) {
cap >> frame;
if (frame.empty()) {
if (isVideo) {
std::cout << "End of video reached.\n";
}
break;
}
}
// Process the frame/image with ONNX model
auto result_image = processImage(session, frame);
cv::imshow("ONNX Result", result_image);
if (isImage) {
cv::waitKey(0); // Wait indefinitely for image
break;
}
else if (cv::waitKey(1) == 27) { // Exit on 'Esc' key for video/webcam
break;
}
// FPS calculation for video/webcam
static int frame_count = 0;
static auto last_time = std::chrono::high_resolution_clock::now();
frame_count++;
auto current_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = current_time - last_time;
if (elapsed.count() >= 1.0) {
std::cout << "FPS: " << frame_count / elapsed.count() << "\n";
frame_count = 0;
last_time = current_time;
}
    } while (isWebcam || isVideo);
    // Release the capture device (if any) and close all display windows.
    cap.release();
    cv::destroyAllWindows();
    return 0;
}