How to convert the C++ sample 'Hand Keypoint Detection' using OpenCVForUnity?
I am studying the 'Hand Keypoint Detection' sample at https://www.learnopencv.com/hand-keypoint-detection-using-deep-learning-and-opencv/, and it works in a C++ application. I am trying to port it to OpenCVForUnity. Everything works well except for converting the C++ line below:
//Mat probMap(H, W, CV_32F, output.ptr(0, n));
What is the equivalent code in OpenCVForUnity?
See my code snippet below:

Mat input = Dnn.blobFromImage(img, 1.0 / 255, new Size(inWidth, inHeight), new Scalar(0, 0, 0), false, false);
net.setInput(input);
// TickMeter tm = new TickMeter ();
// tm.start ();
Mat output = net.forward();

int H = output.size(2);
int W = output.size(3);

// Each probability map holds H * W floats, so the buffer must be H * W long, not W.
float[] data = new float[H * W];
// Flatten the 4D blob (1 x nPoints x H x W) into a 2D Mat with one heatmap per row,
// so that get(n, 0, data) reads the n-th keypoint's map.
Mat heatmaps = output.reshape(1, output.size(1));

List<Point> points = new List<Point>();
for (int n = 0; n < nPoints; n++)
{
    // Probability map of the corresponding keypoint.
    //Mat probMap(H, W, CV_32F, output.ptr(0, n));
    heatmaps.get(n, 0, data);
    Mat probMap = new Mat(H, W, CvType.CV_32FC1);
    probMap.put(0, 0, data);
    Imgproc.resize(probMap, probMap, new Size(frameWidth, frameHeight));

    //minMaxLoc(probMap, 0, &prob, 0, &maxLoc);
    Core.MinMaxLocResult result = Core.minMaxLoc(probMap);
    probMap.Dispose();

    double x = result.maxLoc.x;
    double y = result.maxLoc.y;
    Debug.Log(string.Format("x:{0},y:{1},maxVal:{2}", x, y, result.maxVal));

    if (result.maxVal > 0.1)
    {
        points.Add(new Point(x, y));
        Imgproc.circle(img, new Point((int)x, (int)y), 8, new Scalar(0, 255, 255), -1);
        Imgproc.putText(img, string.Format("{0}", n), new Point((int)x, (int)y), Imgproc.FONT_HERSHEY_COMPLEX, 1, new Scalar(0, 0, 255), 2);
        //circle(frameCopy, new Point(x, y), 8, Scalar(0, 255, 255), -1);
        //cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2);
    }
    else
    {
        points.Add(null);
    }
}
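An even closer analogue of the C++ `Mat probMap(H, W, CV_32F, output.ptr(0, n))`, which wraps the n-th map without copying through a float[] buffer, would be a reshaped row view. A minimal sketch, assuming the output blob has shape 1 x nPoints x H x W:

// Row n of the flattened blob is a 1 x (H*W) view of the n-th heatmap;
// reshaping it to H rows yields the H x W probability map without a copy.
Mat heatmaps = output.reshape(1, output.size(1));
Mat probMap = heatmaps.row(n).reshape(1, H);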
I added code to load a hand detection model in OpenPoseExample.
Attachment: OpenPoseExample.zip
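For reference, loading that kind of Caffe model with OpenCVForUnity typically looks like the sketch below. The file names are the ones used by the learnopencv tutorial, and the StreamingAssets subfolder is an assumption:

// Paths are illustrative: the .prototxt/.caffemodel from the learnopencv tutorial,
// copied under Assets/StreamingAssets/dnn/.
string protoPath = Utils.getFilePath("dnn/pose_deploy.prototxt");
string modelPath = Utils.getFilePath("dnn/pose_iter_102000.caffemodel");
// Create the network from the Caffe definition and weights.
Net net = Dnn.readNetFromCaffe(protoPath, modelPath);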
Great job! That works when detecting an image loaded from disk. However, my scenario runs on each frame from the webcam, and there I get an error saying:

"dnn::forward_11() : OpenCV(4.1.0-dev) C:\Users\satoo\Desktop\opencv\modules\dnn\src\layers\convolution_layer.cpp:282: error: (-2:Unspecified error) Number of input channels should be multiple of 3 but got 4 in function 'cv::dnn::ConvolutionLayerImpl::getMemoryShapes'"

Any idea how to fix this? The code snippet below is what I was testing:
void Update()
{
    if (webCamTextureToMatHelper.IsPlaying() && webCamTextureToMatHelper.DidUpdateThisFrame())
    {
        Mat rgbaMat = webCamTextureToMatHelper.GetMat();

        if (detectType == DATASET_TYPE.ObjectDetect)
        {
            RunDnnObjectDetection(rgbaMat);
        }
        else if (detectType == DATASET_TYPE.HAND)
        {
            RunHandPose(rgbaMat);
        }
    }
}
void RunHandPose(Mat img)
{
    // If true, the error log of the native OpenCV side will be displayed in the Unity Editor Console.
    Utils.setDebugMode(true);

    //Mat img = Imgcodecs.imread(image_filepath);
    if (img.empty())
    {
        Debug.LogError("dnn/COCO_val2014_000000000589.jpg is not loaded. The image file can be downloaded here: \"https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/examples/media/COCO_val2014_000000000589.jpg\" folder. ");
        img = new Mat(368, 368, CvType.CV_8UC3, new Scalar(0, 0, 0));
    }

    // Adjust Quad.transform.localScale.
    gameObject.transform.localScale = new Vector3(img.width(), img.height(), 1);
    Debug.Log("Screen.width " + Screen.width + " Screen.height " + Screen.height + " Screen.orientation " + Screen.orientation);

    float imageWidth = img.width();
    float imageHeight = img.height();
    float widthScale = (float)Screen.width / imageWidth;
    float heightScale = (float)Screen.height / imageHeight;
    if (widthScale < heightScale)
    {
        Camera.main.orthographicSize = (imageWidth * (float)Screen.height / (float)Screen.width) / 2;
    }
    else
    {
        Camera.main.orthographicSize = imageHeight / 2;
    }

    if (net == null)
    {
        Imgproc.putText(img, "model file is not loaded.", new Point(5, img.rows() - 30), Imgproc.FONT_HERSHEY_SIMPLEX, 0.7, new Scalar(255, 255, 255), 2, Imgproc.LINE_AA, false);
        Imgproc.putText(img, "Please read console message.", new Point(5, img.rows() - 10), Imgproc.FONT_HERSHEY_SIMPLEX, 0.7, new Scalar(255, 255, 255), 2, Imgproc.LINE_AA, false);
    }
    else
    {
        float frameWidth = img.cols();
        float frameHeight = img.rows();

        Mat input = Dnn.blobFromImage(img, inScale, new Size(inWidth, inHeight), new Scalar(0, 0, 0), false, false);
        Debug.Log(string.Format("channels:{0}", input.channels()));
        net.setInput(input);

        // TickMeter tm = new TickMeter ();
        // tm.start ();
        Mat output = net.forward();
        // tm.stop ();
        // Debug.Log ("Inference time, ms: " + tm.getTimeMilli ());

        Debug.Log("output.size(0) " + output.size(0));
        Debug.Log("output.size(1) " + output.size(1));
        Debug.Log("output.size(2) " + output.size(2));
        Debug.Log("output.size(3) " + output.size(3));

        float[] data = new float[output.size(2) * output.size(3)];
        output = output.reshape(1, output.size(1));

        List<Point> points = new List<Point>();
        for (int i = 0; i < BODY_PARTS.Count; i++)
        {
            output.get(i, 0, data);
            Mat heatMap = new Mat(1, data.Length, CvType.CV_32FC1);
            heatMap.put(0, 0, data);

            // Originally, we would find all the local maxima. To simplify the sample,
            // we just find a global one. However, only a single pose at a time
            // can be detected this way.
            Core.MinMaxLocResult result = Core.minMaxLoc(heatMap);
            heatMap.Dispose();

            double x = (frameWidth * (result.maxLoc.x % 46)) / 46;
            double y = (frameHeight * (result.maxLoc.x / 46)) / 46;
            if (result.maxVal > 0.1)
            {
                points.Add(new Point(x, y));
            }
            else
            {
                points.Add(null);
            }
        }

        for (int i = 0; i < POSE_PAIRS.GetLength(0); i++)
        {
            string partFrom = POSE_PAIRS[i, 0];
            string partTo = POSE_PAIRS[i, 1];
            int idFrom = BODY_PARTS[partFrom];
            int idTo = BODY_PARTS[partTo];

            if (points[idFrom] != null && points[idTo] != null)
            {
                Imgproc.line(img, points[idFrom], points[idTo], new Scalar(0, 255, 0), 3);
                Imgproc.ellipse(img, points[idFrom], new Size(3, 3), 0, 0, 360, new Scalar(0, 0, 255), Core.FILLED);
                Imgproc.ellipse(img, points[idTo], new Size(3, 3), 0, 0, 360, new Scalar(0, 0, 255), Core.FILLED);
            }
        }

        MatOfDouble timings = new MatOfDouble();
        long t = net.getPerfProfile(timings);
        Debug.Log("t: " + t);
        Debug.Log("timings.dump(): " + timings.dump());
        double freq = Core.getTickFrequency() / 1000;
        Debug.Log("freq: " + freq);
        Imgproc.putText(img, (t / freq) + "ms", new Point(10, img.height() - 10), Imgproc.FONT_HERSHEY_SIMPLEX, 0.6, new Scalar(0, 0, 255), 2);
    }

    Imgproc.cvtColor(img, img, Imgproc.COLOR_BGR2RGB);
    Texture2D texture = new Texture2D(img.cols(), img.rows(), TextureFormat.RGBA32, false);
    Utils.matToTexture2D(img, texture);
    gameObject.GetComponent<Renderer>().material.mainTexture = texture;

    Utils.setDebugMode(false);
}
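One more note on the snippet above: the hard-coded 46 is the heatmap resolution the model outputs for a 368 x 368 input. A sketch that derives it from the blob shape instead, assuming the usual 1 x C x H x W layout (the sizes must be read before the reshape call):

// Heatmap dimensions, read from the 4D blob before it is reshaped to 2D.
int heatH = output.size(2); // 46 for a 368 x 368 input
int heatW = output.size(3);
// result.maxLoc.x is a flat index into the 1 x (heatH * heatW) row Mat.
double x = (frameWidth * ((int)result.maxLoc.x % heatW)) / heatW;
double y = (frameHeight * ((int)result.maxLoc.x / heatW)) / heatH;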
I guess I fixed the error with the lines below:
var img = new Mat(webCamTextureMat.rows(), webCamTextureMat.cols(), CvType.CV_8UC3);
Imgproc.cvtColor(webCamTextureMat, img, Imgproc.COLOR_RGBA2BGR);
However, the performance is very slow. Any ideas for making the hand pose detection work in the webcam scenario?
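Not from this thread, just a common mitigation: OpenPose-style keypoint models are heavy, so running the full forward pass on every frame on the CPU will be slow. One sketch is to run the net only every few frames and redraw the last detected keypoints in between; the skip interval, RunHandPoseAndGetPoints, and DrawPoints below are hypothetical refactorings, not existing APIs:

int frameCount = 0;
const int inferEveryNFrames = 5; // illustrative; tune for the target hardware
List<Point> lastPoints = new List<Point>();

void Update()
{
    if (webCamTextureToMatHelper.IsPlaying() && webCamTextureToMatHelper.DidUpdateThisFrame())
    {
        Mat rgbaMat = webCamTextureToMatHelper.GetMat();
        frameCount++;

        if (frameCount % inferEveryNFrames == 0)
        {
            // Heavy path: blobFromImage + net.forward() + per-keypoint argmax.
            lastPoints = RunHandPoseAndGetPoints(rgbaMat); // hypothetical refactor of RunHandPose
        }

        // Cheap path: draw the cached keypoints on every frame.
        DrawPoints(rgbaMat, lastPoints); // hypothetical drawing helper
    }
}

Reducing inWidth/inHeight (for example from 368 toward 256) also cuts inference time roughly quadratically, at some accuracy cost, and net.setPreferableTarget(Dnn.DNN_TARGET_OPENCL) may help on supported GPUs.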