[QUESTION] How to use NMS operator to process yolov8 output?
Hi!
Having a hard time figuring out how to properly process YOLOv8 output with the Non-Maximum Suppression operator.
I've got the YOLO output in device memory. How would I go about creating the required tensors for the NMS operator, ideally without copying memory from device to host for processing and then back to device again for the NMS operator?
As far as I understand, the YOLOv8 output for batch size 1 is planar, data type f32: [x, x, x ... x] [y, y, y ... y] [w, w, w ... w] [h, h, h ... h] [cls, cls, cls ... cls]
And the NMS operator expects it interleaved for batch size 1, data type int16: [x, y, w, h ... x, y, w, h]
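So for box i, if I've got the layouts right, the same values live at these offsets in the two buffers:

// yolov8 planar f32 output (numBoxes = number of candidate boxes):
//   x = out[0 * numBoxes + i], y = out[1 * numBoxes + i]
//   w = out[2 * numBoxes + i], h = out[3 * numBoxes + i]
// NMS interleaved int16 input:
//   box[i] = { in[4 * i + 0], in[4 * i + 1], in[4 * i + 2], in[4 * i + 3] }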
The only solution I've come up with is to copy device memory to host, build an array of 4-component int16 vectors, copy it back to device memory, and pass that to the operator, which sounds really bad.
This is what I've currently got:
struct rect
{
int16_t x,y,w,h;
};
std::vector<float> x(outputWidth);
std::vector<float> y(outputWidth);
std::vector<float> w(outputWidth);
std::vector<float> h(outputWidth);
std::vector<float> s(outputWidth);
// basePtr() is a byte pointer, so cast to float* so the row offsets below count in floats
auto ptr = reinterpret_cast<const float*>(frame->outputLayer->basePtr());
cudaStream_t stream;
cudaStreamCreate(&stream);
CHECK_CUDA_ERROR(cudaMemcpyAsync(x.data(), ptr, outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(y.data(), ptr + (outputWidth * 1), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(w.data(), ptr + (outputWidth * 2), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(h.data(), ptr + (outputWidth * 3), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(s.data(), ptr + (outputWidth*4), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// convert centre-format float boxes to top-left int16 boxes for the NMS operator
std::vector<rect> boxes;
boxes.reserve(outputWidth);
for (int i = 0; i < outputWidth; i++)
{
    boxes.push_back({
        static_cast<int16_t>((x[i] - 0.5f * w[i]) * xFactor),
        static_cast<int16_t>((y[i] - 0.5f * h[i]) * yFactor),
        static_cast<int16_t>(w[i] * xFactor),
        static_cast<int16_t>(h[i] * yFactor)
    });
}
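What I'd actually like is to do that conversion on the GPU instead, something along these lines (just a rough sketch, the kernel name and launch config are made up and untested):

// same rect struct as above (int16_t from <cstdint>)
struct rect { int16_t x, y, w, h; };

// Converts the planar f32 yolov8 output [x...][y...][w...][h...] into
// interleaved int16 [x,y,w,h] boxes entirely in device memory,
// turning centre coordinates into top-left corners on the way.
__global__ void planarToBoxes(const float* out, rect* boxes, int numBoxes,
                              float xFactor, float yFactor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numBoxes)
        return;

    float cx = out[0 * numBoxes + i];
    float cy = out[1 * numBoxes + i];
    float w  = out[2 * numBoxes + i];
    float h  = out[3 * numBoxes + i];

    boxes[i].x = static_cast<int16_t>((cx - 0.5f * w) * xFactor);
    boxes[i].y = static_cast<int16_t>((cy - 0.5f * h) * yFactor);
    boxes[i].w = static_cast<int16_t>(w * xFactor);
    boxes[i].h = static_cast<int16_t>(h * yFactor);
}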
For context, frame->outputLayer is an nvcv::Optional<nvcv::TensorDataStridedCuda>, which is created like this:
nvcv::Tensor::Requirements reqsOutputLayer = nvcv::Tensor::CalcRequirements(1, { model.output.dims[2], model.output.dims[1] }, nvcv::FMT_RGBf32p);
nvcv::TensorDataStridedCuda::Buffer bufOutputLayer;
std::copy(reqsOutputLayer.strides, reqsOutputLayer.strides + NVCV_TENSOR_MAX_RANK, bufOutputLayer.strides);
CHECK_CUDA_ERROR(cudaMalloc((void**)&bufOutputLayer.basePtr, CalcTotalSizeBytes(nvcv::Requirements{reqsOutputLayer.mem}.cudaMem())));
nvcv::TensorDataStridedCuda outputLayerTensorData(
    nvcv::TensorShape{reqsOutputLayer.shape, reqsOutputLayer.rank, reqsOutputLayer.layout},
    nvcv::DataType{reqsOutputLayer.dtype},
    bufOutputLayer);
nvcv::Tensor outputLayerTensor = TensorWrapData(outputLayerTensorData);
and then passed to TensorRT inference:
// bindings for executeV2 (local array instead of leaking a new[] allocation)
void* bindings[] = { frame->inputLayer->basePtr(),
                     outputLayerTensor.exportData<nvcv::TensorDataStridedCuda>()->basePtr() };
context->executeV2(bindings);
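So ideally the whole flow would stay on the device, roughly like this (again just a sketch; planarToBoxes is the hypothetical kernel from above and stream is a CUDA stream I'd keep around):

// Device buffer for the interleaved int16 boxes
rect* dBoxes = nullptr;
CHECK_CUDA_ERROR(cudaMalloc(&dBoxes, outputWidth * sizeof(rect)));

// After inference, convert the planar f32 output straight into dBoxes on the GPU
auto outData = outputLayerTensor.exportData<nvcv::TensorDataStridedCuda>();
const float* outPtr = reinterpret_cast<const float*>(outData->basePtr());
planarToBoxes<<<(outputWidth + 255) / 256, 256, 0, stream>>>(
    outPtr, dBoxes, outputWidth, xFactor, yFactor);

// dBoxes could then be wrapped in an nvcv tensor and handed to the NMS operator
// without any device-to-host copies, but that wrapping is the part I'm unsure about.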