Laurent Berger committed on
Commit
cda4a9b
·
1 Parent(s): c5ec220

Text Detection model DB (#175)

Browse files

* Text Detection model DB

* review 1

models/text_detection_db/CMakeLists.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build script for the C++ demo of the DB text detection model.
cmake_minimum_required(VERSION 3.24)

set(project_name "opencv_zoo_text_detection_db")
project(${project_name})

# Locate OpenCV. If it is not found automatically, set OpenCV_DIR to the
# absolute path of the directory containing OpenCVConfig.cmake (via the
# command line or the GUI), or pass -D OPENCV_INSTALLATION_PATH=<prefix>.
set(OPENCV_VERSION "4.7.0")
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})

# Sources of the demo executable.
file(GLOB SourceFile
    "demo.cpp")

# Once the package has been found several variables are set; the full list
# with descriptions lives in OpenCVConfig.cmake. Print a few of them.
message(STATUS "OpenCV library status:")
message(STATUS " config: ${OpenCV_DIR}")
message(STATUS " version: ${OpenCV_VERSION}")
message(STATUS " libraries: ${OpenCV_LIBS}")
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")

# Executable target built from the demo sources, linked against OpenCV.
add_executable(${project_name} ${SourceFile})
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
models/text_detection_db/README.md CHANGED
@@ -11,6 +11,8 @@ Note:
11
 
12
  ## Demo
13
 
 
 
14
  Run the following command to try the demo:
15
 
16
  ```shell
@@ -23,6 +25,22 @@ python demo.py --input /path/to/image -v
23
  python demo.py --help
24
  ```
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  ### Example outputs
27
 
28
  ![mask](./example_outputs/mask.jpg)
 
11
 
12
  ## Demo
13
 
14
+ ### Python
15
+
16
  Run the following command to try the demo:
17
 
18
  ```shell
 
25
  python demo.py --help
26
  ```
27
 
28
+ ### C++
29
+
30
+ Install latest OpenCV and CMake >= 3.24.0 to get started with:
31
+
32
+ ```shell
33
+ # A typical and default installation path of OpenCV is /usr/local
34
+ cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
35
+ cmake --build build
36
+ # detect on camera input
37
+ ./build/opencv_zoo_text_detection_db -m=/path/to/model
38
+ # detect on an image
39
+ ./build/opencv_zoo_text_detection_db -m=/path/to/model -i=/path/to/image -v
40
+ # get help messages
41
+ ./build/opencv_zoo_text_detection_db -h
42
+ ```
43
+
44
  ### Example outputs
45
 
46
  ![mask](./example_outputs/mask.jpg)
models/text_detection_db/demo.cpp ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+
3
+ #include <opencv2/dnn.hpp>
4
+ #include <opencv2/imgproc.hpp>
5
+ #include <opencv2/highgui.hpp>
6
+
7
+ using namespace std;
8
+ using namespace cv;
9
+ using namespace dnn;
10
+
11
+ vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
12
+ std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
13
+ std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
14
+ std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
15
+ std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
16
+ std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU)};
17
+
18
+
19
// Option table handed to cv::CommandLineParser. Each entry has the form
// "{ names | default value | help text }".
std::string keys =
    "{ help h | | Print help message. }"
    "{ model m | text_detection_DB_IC15_resnet18_2021sep.onnx | Usage: Set model type, defaults to text_detection_DB_IC15_resnet18_2021sep.onnx }"
    "{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
    "{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
    "{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
    "{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
    "{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
    "{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
    "{ save s | true | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
    "{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
    // The backend index selects an entry of backendTargetPairs.
    "{ backend bt | 0 | Choose one of computation backends: "
    "0: (default) OpenCV implementation + CPU, "
    "1: CUDA + GPU (CUDA), "
    "2: CUDA + GPU (CUDA FP16), "
    "3: TIM-VX + NPU, "
    "4: CANN + NPU}";
37
+
38
+
39
+ class DB {
40
+ public:
41
+
42
+ DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
43
+ float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
44
+ dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
45
+ polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
46
+ backendId(bId), targetId(tId)
47
+ {
48
+ this->model = TextDetectionModel_DB(readNet(modelPath));
49
+ this->model.setPreferableBackend(backendId);
50
+ this->model.setPreferableTarget(targetId);
51
+
52
+ this->model.setBinaryThreshold(binaryThreshold);
53
+ this->model.setPolygonThreshold(polygonThreshold);
54
+ this->model.setUnclipRatio(unclipRatio);
55
+ this->model.setMaxCandidates(maxCandidates);
56
+
57
+ this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
58
+ }
59
+ pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
60
+ CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
61
+ CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");
62
+ vector<vector<Point>> pt;
63
+ vector<float> confidence;
64
+ this->model.detect(image, pt, confidence);
65
+ return make_pair< vector<vector<Point>> &, vector< float > &>(pt, confidence);
66
+ }
67
+
68
+ private:
69
+ string modelPath;
70
+ TextDetectionModel_DB model;
71
+ Size inputSize;
72
+ float binaryThreshold;
73
+ float polygonThreshold;
74
+ int maxCandidates;
75
+ double unclipRatio;
76
+ dnn::Backend backendId;
77
+ dnn::Target targetId;
78
+
79
+ };
80
+
81
+ Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
82
+ {
83
+ Mat output;
84
+ image.copyTo(output);
85
+ if (fps > 0)
86
+ putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
87
+ polylines(output, results.first, isClosed, boxColor, thickness);
88
+ return output;
89
+ }
90
+
91
+ int main(int argc, char** argv)
92
+ {
93
+ CommandLineParser parser(argc, argv, keys);
94
+
95
+ parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization in opencv Zoo using OpenCV.");
96
+ if (parser.has("help"))
97
+ {
98
+ parser.printMessage();
99
+ return 0;
100
+ }
101
+
102
+ int backendTargetid = parser.get<int>("backend");
103
+ String modelName = parser.get<String>("model");
104
+
105
+ if (modelName.empty())
106
+ {
107
+ CV_Error(Error::StsError, "Model file " + modelName + " not found");
108
+ }
109
+
110
+ Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
111
+ float binThresh = parser.get<float>("binary_threshold");
112
+ float polyThresh = parser.get<float>("polygon_threshold");
113
+ int maxCand = parser.get<int>("max_candidates");
114
+ double unRatio = parser.get<float>("unclip_ratio");
115
+ bool save = parser.get<bool>("save");
116
+ bool viz = parser.get<float>("viz");
117
+
118
+ DB model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
119
+
120
+ //! [Open a video file or an image file or a camera stream]
121
+ VideoCapture cap;
122
+ if (parser.has("input"))
123
+ cap.open(parser.get<String>("input"));
124
+ else
125
+ cap.open(0);
126
+ if (!cap.isOpened())
127
+ CV_Error(Error::StsError, "Cannot opend video or file");
128
+ Mat originalImage;
129
+ static const std::string kWinName = modelName;
130
+ while (waitKey(1) < 0)
131
+ {
132
+ cap >> originalImage;
133
+ if (originalImage.empty())
134
+ {
135
+ cout << "Frame is empty" << endl;
136
+ waitKey();
137
+ break;
138
+ }
139
+ int originalW = originalImage.cols;
140
+ int originalH = originalImage.rows;
141
+ double scaleHeight = originalH / double(inpSize.height);
142
+ double scaleWidth = originalW / double(inpSize.width);
143
+ Mat image;
144
+ resize(originalImage, image, inpSize);
145
+
146
+ // inference
147
+ TickMeter tm;
148
+ tm.start();
149
+ pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
150
+ tm.stop();
151
+ auto x = results.first;
152
+ // Scale the results bounding box
153
+ for (auto &pts : results.first)
154
+ {
155
+ for (int i = 0; i < 4; i++)
156
+ {
157
+ pts[i].x = int(pts[i].x * scaleWidth);
158
+ pts[i].y = int(pts[i].y * scaleHeight);
159
+ }
160
+ }
161
+ originalImage = visualize(originalImage, results, tm.getFPS());
162
+ tm.reset();
163
+ if (parser.has("input"))
164
+ {
165
+ if (save)
166
+ {
167
+ cout << "Result image saved to result.jpg\n";
168
+ imwrite("result.jpg", originalImage);
169
+ }
170
+ if (viz)
171
+ imshow(kWinName, originalImage);
172
+ }
173
+ else
174
+ imshow(kWinName, originalImage);
175
+ }
176
+ return 0;
177
+ }
178
+
179
+