Visual Servoing Platform version 3.7.0
Loading...
Searching...
No Matches
tutorial-megapose-live-single-object-tracking.cpp
1
2#include <iostream>
3
4#include <visp3/core/vpConfig.h>
5
6// Check if std:c++17 or higher
7#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) && \
8 defined(VISP_HAVE_NLOHMANN_JSON) && defined(VISP_HAVE_OPENCV) && defined(HAVE_OPENCV_VIDEOIO) && \
9 defined(HAVE_OPENCV_DNN) && defined(VISP_HAVE_DISPLAY) && \
10 defined(VISP_HAVE_THREADS)
11
12#include <optional>
13
14#include <visp3/core/vpIoTools.h>
15#include <visp3/detection/vpDetectorDNNOpenCV.h>
16#include <visp3/gui/vpDisplayFactory.h>
17#include <visp3/dnn_tracker/vpMegaPose.h>
18#include <visp3/dnn_tracker/vpMegaPoseTracker.h>
19#include <visp3/io/vpJsonArgumentParser.h>
20
21#include VISP_NLOHMANN_JSON(json.hpp)
22
23#include <opencv2/videoio.hpp>
24
25
26using json = nlohmann::json;
27
28#ifdef ENABLE_VISP_NAMESPACE
29using namespace VISP_NAMESPACE_NAME;
30#endif
31
32/*
33 * Interpolate two vpColors. Linear interpolation between each components (R, G, B)
34 *
35 * low starting color
36 * high ending color
37 * f interpolation factor, between 0 and 1
38 * Returns the interpolated color
39 */
40vpColor interpolate(const vpColor &low, const vpColor &high, const float f)
41{
42 const float r = (static_cast<float>(high.R) - static_cast<float>(low.R)) * f;
43 const float g = (static_cast<float>(high.G) - static_cast<float>(low.G)) * f;
44 const float b = (static_cast<float>(high.B) - static_cast<float>(low.B)) * f;
45 return vpColor((unsigned char)r, (unsigned char)g, (unsigned char)b);
46}
47
48/*
49 * Display the Megapose confidence score as a rectangle in the image.
50 * This rectangle becomes green when Megapose is "confident" about its prediction
51 * The confidence score measures whether Megapose can, from its pose estimation, recover the true pose in future pose refinement iterations
52 *
53 * \param[in] I : The image in which to display the confidence.
54 * \param[in] score : The confidence score of Megapose, between 0 and 1.
55 */
56void displayScore(const vpImage<vpRGBa> &I, float score)
57{
58 const unsigned top = static_cast<unsigned>(I.getHeight() * 0.85f);
59 const unsigned height = static_cast<unsigned>(I.getHeight() * 0.1f);
60 const unsigned left = static_cast<unsigned>(I.getWidth() * 0.05f);
61 const unsigned width = static_cast<unsigned>(I.getWidth() * 0.5f);
62 vpRect full(left, top, width, height);
63 vpRect scoreRect(left, top, width * score, height);
64 const vpColor low = vpColor::red;
65 const vpColor high = vpColor::green;
66 const vpColor c = interpolate(low, high, score);
67
68 vpDisplay::displayRectangle(I, full, c, false, 5);
69 vpDisplay::displayRectangle(I, scoreRect, c, true, 1);
70}
71
72/*
73 * Add the Megapose rendering on top of the actual image I.
74 * Require I and overlay to be of the same size.
75 * Note that a fully black object will not render
76*/
77void overlayRender(vpImage<vpRGBa> &I, const vpImage<vpRGBa> &overlay)
78{
79 const vpRGBa black = vpRGBa(0, 0, 0);
80 for (unsigned int i = 0; i < I.getHeight(); ++i) {
81 for (unsigned int j = 0; j < I.getWidth(); ++j) {
82 if (overlay[i][j] != black) {
83 I[i][j] = overlay[i][j];
84 }
85 }
86 }
87}
88
90/*
91 * Run the detection network on an image in order to find a specific object.
92 * The best matching detection is returned:
93 * - If a previous Megapose estimation is available, find the closest match in the image (Euclidean distance between centers)
94 * - Otherwise, take the detection with highest confidence
95 * If no detection corresponding to detectionLabel is found, then std::nullopt is returned
96 */
97std::optional<vpRect> detectObjectForInitMegaposeDnn(vpDetectorDNNOpenCV &detector, const cv::Mat &I,
98 const std::string &detectionLabel,
99 std::optional<vpMegaPoseEstimate> previousEstimate)
100{
101 std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
102 detector.detect(I, detections_vec);
103 std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> matchingDetections;
104 for (const auto &detection : detections_vec) {
105 std::optional<std::string> classnameOpt = detection.getClassName();
106 if (classnameOpt) {
107 if (*classnameOpt == detectionLabel) {
108 matchingDetections.push_back(detection);
109 }
110 }
111 }
112 if (matchingDetections.size() == 0) {
113 return std::nullopt;
114 }
115 else if (matchingDetections.size() == 1) {
116 return matchingDetections[0].getBoundingBox();
117 }
118 else {
119 // Get detection that is closest to previous object bounding box estimated by Megapose
120 if (previousEstimate) {
121 vpRect best;
122 double bestDist = 10000.f;
123 const vpImagePoint previousCenter = (*previousEstimate).boundingBox.getCenter();
124 for (const auto &detection : matchingDetections) {
125 const vpRect detectionBB = detection.getBoundingBox();
126 const vpImagePoint center = detectionBB.getCenter();
127 const double matchDist = vpImagePoint::distance(center, previousCenter);
128 if (matchDist < bestDist) {
129 bestDist = matchDist;
130 best = detectionBB;
131 }
132 }
133 return best;
134
135 }
136 else { // Get detection with highest confidence
137 vpRect best;
138 double highestConf = 0.0;
139 for (const auto &detection : matchingDetections) {
140 const double conf = detection.getConfidenceScore();
141 if (conf > highestConf) {
142 highestConf = conf;
143 best = detection.getBoundingBox();
144 }
145 }
146 return best;
147 }
148 }
149 return std::nullopt;
150}
151
152/*
153 * Ask user to provide the detection themselves. They must click to start labelling, then click on the top left and bottom right corner to create the detection.
154 */
155std::optional<vpRect> detectObjectForInitMegaposeClick(const vpImage<vpRGBa> &I)
156{
157 const bool startLabelling = vpDisplay::getClick(I, false);
158
159 const vpImagePoint textPosition(10.0, 20.0);
160
161 if (startLabelling) {
162 vpImagePoint topLeft, bottomRight;
163 vpDisplay::displayText(I, textPosition, "Click the upper left corner of the bounding box", vpColor::red);
165 vpDisplay::getClick(I, topLeft, true);
167 vpDisplay::displayCross(I, topLeft, 5, vpColor::red, 2);
168 vpDisplay::displayText(I, textPosition, "Click the bottom right corner of the bounding box", vpColor::red);
170 vpDisplay::getClick(I, bottomRight, true);
171 vpRect bb(topLeft, bottomRight);
172 return bb;
173 }
174 else {
176 vpDisplay::displayText(I, textPosition, "Click when the object is visible and static to start reinitializing megapose.", vpColor::red);
178 return std::nullopt;
179 }
180}
182
// How the object is initially detected in 2D before Megapose estimates its pose
enum DetectionMethod
{
  UNKNOWN, // Unrecognized value in the configuration; rejected at startup
  CLICK,   // The user manually clicks the two bounding-box corners
  DNN      // A DNN object detector provides the bounding box
};
189
#if defined(__clang__)
// Mute warning : declaration requires an exit-time destructor [-Wexit-time-destructors]
// message : expanded from macro 'NLOHMANN_JSON_SERIALIZE_ENUM'
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wexit-time-destructors"
#endif

// Map DetectionMethod values to/from their JSON string representation
NLOHMANN_JSON_SERIALIZE_ENUM(DetectionMethod, {
  {UNKNOWN, nullptr}, // Default value if the json string is neither "click" nor "dnn"
  {CLICK, "click"},
  {DNN, "dnn"} }
);

#if defined(__clang__)
# pragma clang diagnostic pop
#endif
206
207int main(int argc, const char *argv[])
208{
209 unsigned width = 640, height = 480;
211 std::string videoDevice = "0";
212 std::string megaposeAddress = "127.0.0.1";
213 unsigned megaposePort = 5555;
214 int refinerIterations = 1, coarseNumSamples = 576;
215 double reinitThreshold = 0.2;
216
217 DetectionMethod detectionMethod = DetectionMethod::UNKNOWN;
218
219 std::string detectorModelPath = "path/to/model.onnx", detectorConfig = "none";
220 std::string detectorFramework = "onnx", detectorTypeString = "yolov7";
221 std::string objectName = "cube";
222 std::vector<std::string> labels = { "cube" };
223 float detectorMeanR = 0.f, detectorMeanG = 0.f, detectorMeanB = 0.f;
224 float detectorConfidenceThreshold = 0.65f, detectorNmsThreshold = 0.5f, detectorFilterThreshold = -0.25f;
225 float detectorScaleFactor = 0.0039f;
226 bool detectorSwapRB = false;
228 vpJsonArgumentParser parser("Single object tracking with Megapose", "--config", "/");
229 parser.addArgument("width", width, true, "The image width")
230 .addArgument("height", height, true, "The image height")
231 .addArgument("camera", cam, true, "The camera intrinsic parameters. Should correspond to a perspective projection model without distortion.")
232 .addArgument("video-device", videoDevice, true, "Video device")
233 .addArgument("object", objectName, true, "Name of the object to track with megapose.")
234 .addArgument("detectionMethod", detectionMethod, true, "How to perform detection of the object to get the bounding box:"
235 " \"click\" for user labelling, \"dnn\" for dnn detection.")
236 .addArgument("reinitThreshold", reinitThreshold, false, "If the Megapose score falls below this threshold, then a reinitialization is be required."
237 " Should be between 0 and 1")
238 .addArgument("megapose/address", megaposeAddress, true, "IP address of the Megapose server.")
239 .addArgument("megapose/port", megaposePort, true, "Port on which the Megapose server listens for connections.")
240 .addArgument("megapose/refinerIterations", refinerIterations, false, "Number of Megapose refiner model iterations."
241 "A higher count may lead to better accuracy, at the cost of more processing time")
242 .addArgument("megapose/initialisationNumSamples", coarseNumSamples, false, "Number of Megapose renderings used for the initial pose estimation.")
243
244 .addArgument("detector/model-path", detectorModelPath, true, "Path to the model")
245 .addArgument("detector/config", detectorConfig, true, "Path to the model configuration. Set to none if config is not required.")
246 .addArgument("detector/framework", detectorFramework, true, "Detector framework")
247 .addArgument("detector/type", detectorTypeString, true, "Detector type")
248 .addArgument("detector/labels", labels, true, "Detection class labels")
249 .addArgument("detector/mean/red", detectorMeanR, false, "Detector mean red component. Used to normalize image")
250 .addArgument("detector/mean/green", detectorMeanG, false, "Detector mean green component. Used to normalize image")
251 .addArgument("detector/mean/blue", detectorMeanB, false, "Detector mean red component. Used to normalize image")
252 .addArgument("detector/confidenceThreshold", detectorConfidenceThreshold, false, "Detector confidence threshold. "
253 "When a detection with a confidence below this threshold, it is ignored")
254 .addArgument("detector/nmsThreshold", detectorNmsThreshold, false, "Detector non maximal suppression threshold.")
255 .addArgument("detector/filterThreshold", detectorFilterThreshold, false)
256 .addArgument("detector/scaleFactor", detectorScaleFactor, false, "Pixel intensity rescaling factor. If set to 1/255, then pixel values are between 0 and 1.")
257 .addArgument("detector/swapRedAndBlue", detectorSwapRB, false, "Whether to swap red and blue channels before feeding the image to the detector.");
258
259 parser.parse(argc, argv);
261
263 throw vpException(vpException::badValue, "The camera projection model should be without distortion, as other models are ignored by Megapose");
264 }
265
266 if (detectionMethod == DetectionMethod::UNKNOWN) {
267 throw vpException(vpException::badValue, "The specified detection method is incorrect: it should be either \"click\" or \"dnn\"");
268 }
269
270 cv::VideoCapture capture;
271 bool isLiveCapture;
272 bool hasCaptureOpeningSucceeded;
273 double videoFrametime = 0; // Only for prerecorded videos
274 if (vpMath::isNumber(videoDevice)) {
275 hasCaptureOpeningSucceeded = capture.open(std::atoi(videoDevice.c_str()));
276 isLiveCapture = true;
277 }
278 else {
279 hasCaptureOpeningSucceeded = capture.open(videoDevice);
280 isLiveCapture = false;
281 double fps = capture.get(cv::CAP_PROP_FPS);
282 videoFrametime = (1.0 / fps) * 1000.0;
283 }
284 if (!hasCaptureOpeningSucceeded) {
285 std::cout << "Capture from camera: " << videoDevice << " didn't work" << std::endl;
286 return EXIT_FAILURE;
287 }
288
290 std::shared_ptr<vpDisplay> display = vpDisplayFactory::createDisplay();
291
292 //d.setDownScalingFactor(vpDisplay::SCALE_AUTO);
293#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
294 ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
297 vpDetectorDNNOpenCV::NetConfig netConfig(detectorConfidenceThreshold, detectorNmsThreshold, labels,
298 cv::Size(width, height), detectorFilterThreshold);
299 vpDetectorDNNOpenCV dnn(netConfig, detectorType);
300 if (detectionMethod == DetectionMethod::DNN) {
301 dnn.readNet(detectorModelPath, detectorConfig, detectorFramework);
302 dnn.setMean(detectorMeanR, detectorMeanG, detectorMeanB);
303 dnn.setScaleFactor(detectorScaleFactor);
304 dnn.setSwapRB(detectorSwapRB);
305 }
306#endif
308 std::shared_ptr<vpMegaPose> megapose;
309 try {
310 megapose = std::make_shared<vpMegaPose>(megaposeAddress, megaposePort, cam, height, width);
311 }
312 catch (...) {
313 throw vpException(vpException::ioError, "Could not connect to Megapose server at " + megaposeAddress + " on port " + std::to_string(megaposePort));
314 }
315
316 vpMegaPoseTracker megaposeTracker(megapose, objectName, refinerIterations);
317 megapose->setCoarseNumSamples(coarseNumSamples);
318 const std::vector<std::string> allObjects = megapose->getObjectNames();
319 if (std::find(allObjects.begin(), allObjects.end(), objectName) == allObjects.end()) {
320 throw vpException(vpException::badValue, "Object " + objectName + " is not known by the Megapose server!");
321 }
322 std::future<vpMegaPoseEstimate> trackerFuture;
324
325 cv::Mat frame;
326 vpMegaPoseEstimate megaposeEstimate; // last Megapose estimation
327 vpRect lastDetection; // Last detection (initialization)
328 bool callMegapose = true; // Whether we should call Megapose this iteration
329 bool initialized = false; // Whether tracking should be initialized or reinitialized
330 bool tracking = false;
331
332 bool overlayModel = true;
333 vpImage<vpRGBa> overlayImage(height, width);
334 std::string overlayMode = "full";
335
336 std::vector<double> megaposeTimes;
337 std::vector<double> frameTimes;
338
339 double megaposeStartTime = 0.0;
340
342 while (true) {
343 const double frameStart = vpTime::measureTimeMs();
344 capture >> frame;
345 if (frame.empty())
346 break;
347
348 if (I.getSize() == 0) {
349 vpImageConvert::convert(frame, I);
350 display->init(I);
351 vpDisplay::setTitle(I, "Megapose object pose estimation");
352 }
353 else {
354 vpImageConvert::convert(frame, I);
355 }
358 // Check whether Megapose is still running
360 if (!callMegapose && trackerFuture.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
361 megaposeEstimate = trackerFuture.get();
362 if (tracking) {
363 megaposeTimes.push_back(vpTime::measureTimeMs() - megaposeStartTime);
364 }
365 callMegapose = true;
366 tracking = true;
367
368 if (overlayModel) {
369 overlayImage = megapose->viewObjects({ objectName }, { megaposeEstimate.cTo }, overlayMode);
370 }
371
372 if (megaposeEstimate.score < reinitThreshold) { // If confidence is low, require a reinitialisation with 2D detection
373 initialized = false;
374 }
375 }
378 if (callMegapose) {
379 if (!initialized) {
380 tracking = false;
381 std::optional<vpRect> detection = std::nullopt;
382#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
383 ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
384 if (detectionMethod == DetectionMethod::DNN) {
385 detection = detectObjectForInitMegaposeDnn(
386 dnn, frame, objectName, initialized ? std::optional(megaposeEstimate) : std::nullopt);
387 }
388#endif
389 if (detectionMethod == DetectionMethod::CLICK) {
390 detection = detectObjectForInitMegaposeClick(I);
391 }
392
393 if (detection) {
394 initialized = true;
395 lastDetection = *detection;
396 trackerFuture = megaposeTracker.init(I, lastDetection);
397 callMegapose = false;
398
399 }
400 }
401 else {
402 trackerFuture = megaposeTracker.track(I);
403 callMegapose = false;
404 megaposeStartTime = vpTime::measureTimeMs();
405 }
406 }
408
410 std::string keyboardEvent;
411 const bool keyPressed = vpDisplay::getKeyboardEvent(I, keyboardEvent, false);
412 if (keyPressed) {
413 if (keyboardEvent == "t") {
414 overlayModel = !overlayModel;
415 }
416 else if (keyboardEvent == "w") {
417 overlayMode = overlayMode == "full" ? "wireframe" : "full";
418 }
419 }
420
421 if (tracking) {
422 if (overlayModel) {
423 overlayRender(I, overlayImage);
425 }
426 vpDisplay::displayText(I, 20, 20, "Right click to quit", vpColor::red);
427 vpDisplay::displayText(I, 30, 20, "Press T: Toggle overlay", vpColor::red);
428 vpDisplay::displayText(I, 40, 20, "Press W: Toggle wireframe", vpColor::red);
429 vpDisplay::displayFrame(I, megaposeEstimate.cTo, cam, 0.05, vpColor::none, 3);
430 //vpDisplay::displayRectangle(I, lastDetection, vpColor::red);
431 displayScore(I, megaposeEstimate.score);
432 }
434
436
438 if (vpDisplay::getClick(I, button, false)) {
439 if (button == vpMouseButton::button3) {
440 break; // Right click to stop
441 }
442 }
443 const double frameEnd = vpTime::measureTimeMs();
444 if (!isLiveCapture) {
445 vpTime::wait(std::max<double>(0.0, videoFrametime - (frameEnd - frameStart)));
446 }
447 frameTimes.push_back(vpTime::measureTimeMs() - frameStart);
448 }
449 std::cout << "Average frame time: " << vpMath::getMean(frameTimes) << std::endl;
450 std::cout << "Average time between Megapose calls: " << vpMath::getMean(megaposeTimes) << std::endl;
451}
452
453#else
454int main()
455{
456 std::cout << "Compile ViSP with the DNN tracker module, the JSON 3rd party library and the OpenCV detection module" << std::endl;
457 return EXIT_SUCCESS;
458}
459
460#endif
Generic class defining intrinsic camera parameters.
@ perspectiveProjWithoutDistortion
Perspective projection without distortion model.
Class to define RGB colors available for display functionalities.
Definition vpColor.h:157
static const vpColor red
Definition vpColor.h:198
static const vpColor none
Definition vpColor.h:210
static const vpColor green
Definition vpColor.h:201
Structure containing some information required for the configuration of a vpDetectorDNNOpenCV object.
DNNResultsParsingType
Enumeration listing the types of DNN for which the vpDetectorDNNOpenCV furnishes the methods permitti...
static DNNResultsParsingType dnnResultsParsingTypeFromString(const std::string &name)
virtual bool detect(const vpImage< unsigned char > &I, std::vector< DetectedFeatures2D > &output)
Object detection using OpenCV DNN module.
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
static bool getKeyboardEvent(const vpImage< unsigned char > &I, bool blocking=true)
static void display(const vpImage< unsigned char > &I)
static void displayFrame(const vpImage< unsigned char > &I, const vpHomogeneousMatrix &cMo, const vpCameraParameters &cam, double size, const vpColor &color=vpColor::none, unsigned int thickness=1, const vpImagePoint &offset=vpImagePoint(0, 0), const std::string &frameName="", const vpColor &textColor=vpColor::black, const vpImagePoint &textOffset=vpImagePoint(15, 15))
static void displayCross(const vpImage< unsigned char > &I, const vpImagePoint &ip, unsigned int size, const vpColor &color, unsigned int thickness=1)
static void setTitle(const vpImage< unsigned char > &I, const std::string &windowtitle)
static void flush(const vpImage< unsigned char > &I)
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
error that can be emitted by ViSP classes.
Definition vpException.h:60
@ ioError
I/O error.
Definition vpException.h:67
@ badValue
Used to indicate that a value is not in the allowed range.
Definition vpException.h:73
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Class that defines a 2D point in an image. This class is useful for image processing and stores only ...
static double distance(const vpImagePoint &iP1, const vpImagePoint &iP2)
Definition of the vpImage class member functions.
Definition vpImage.h:131
Command line argument parsing with support for JSON files. If a JSON file is supplied,...
static double getMean(const std::vector< double > &v)
Definition vpMath.cpp:323
static bool isNumber(const std::string &str)
Definition vpMath.cpp:235
vpHomogeneousMatrix cTo
Definition vpMegaPose.h:76
A simplified interface to track a single object with MegaPose. This tracker works asynchronously: A c...
unsigned char B
Blue component.
Definition vpRGBa.h:327
unsigned char R
Red component.
Definition vpRGBa.h:325
unsigned char G
Green component.
Definition vpRGBa.h:326
Defines a rectangle in the plane.
Definition vpRect.h:79
void getCenter(double &x, double &y) const
Definition vpRect.h:136
std::shared_ptr< vpDisplay > createDisplay()
Return a smart pointer vpDisplay specialization if a GUI library is available or nullptr otherwise.
VISP_EXPORT double measureTimeMs()
VISP_EXPORT int wait(double t0, double t)