Streams are always blocking on my computer, but not on another

Description

Streams always block on my computer, but not on another one. I tried the same code on several computers, and only mine shows the blocking behavior.
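For reference, a small diagnostic worth running first (my own sketch, not part of the original report): overlap between cudaMemcpyAsync and kernel execution requires a GPU with at least one asynchronous copy engine, and setting the CUDA_LAUNCH_BLOCKING environment variable to 1 forces every launch to block, which would make streams serialize on one machine only.

// Diagnostic sketch (assumption: device 0 is the GPU under test).
#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// 0 async engines => host/device copies can never overlap with kernels.
	printf("asyncEngineCount:  %d\n", prop.asyncEngineCount);
	printf("concurrentKernels: %d\n", prop.concurrentKernels);
	// CUDA_LAUNCH_BLOCKING=1 serializes all kernel launches, regardless of streams.
	const char* lb = getenv("CUDA_LAUNCH_BLOCKING");
	printf("CUDA_LAUNCH_BLOCKING: %s\n", lb ? lb : "(unset)");
	return 0;
}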
Code
#include <iostream>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
	int i = threadIdx.x;
	c[i] = a[i] + b[i];
}
const int N = 1 << 20;
__global__ void kernel(float *x, int n)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
		x[i] = sqrt(pow(3.14159, i));
	}
}
__global__ void convertPixelFormat(uint8_t* inputBgra, uint8_t* outputYuv, int numPixels)
{
	for (int k = 0; k < 100; k++) {
		int stride = gridDim.x * blockDim.x;
		int idx = threadIdx.x + blockIdx.x * blockDim.x;
		short3 yuv16;
		char3 yuv8;
		while (idx <= numPixels) {
			if (idx < numPixels) {
				yuv16.x = 66 * inputBgra[idx * 4 + 2] + 129 * inputBgra[idx * 4 + 1] + 25 * inputBgra[idx * 4];
				yuv16.y = -38 * inputBgra[idx * 4 + 2] + -74 * inputBgra[idx * 4 + 1] + 112 * inputBgra[idx * 4];
				yuv16.z = 112 * inputBgra[idx * 4 + 2] + -94 * inputBgra[idx * 4 + 1] + -18 * inputBgra[idx * 4];
				yuv8.x = (yuv16.x >> 8) + 16;
				yuv8.y = (yuv16.y >> 8) + 128;
				yuv8.z = (yuv16.z >> 8) + 128;
				*(reinterpret_cast<char3*>(&outputYuv[idx * 3])) = yuv8;
			}
			idx += stride;
		}
	}
}
int main()
{
uint8_t* bgraBuffer;
uint8_t* yuvBuffer;
uint8_t* deviceBgraBuffer;
uint8_t* deviceYuvBuffer;
const int dataSizeBgra = 7680 * 4320 * 4;
const int dataSizeYuv = 7680 * 4320 * 3;
cudaMallocHost(&bgraBuffer, dataSizeBgra);
cudaMallocHost(&yuvBuffer, dataSizeYuv);
cudaMalloc(&deviceBgraBuffer, dataSizeBgra);
cudaMalloc(&deviceYuvBuffer, dataSizeYuv);
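// Note: the two cudaMallocHost buffers above are pinned (page-locked) host
// memory; cudaMemcpyAsync can only overlap with kernel execution when the
// host side of the copy is pinned like this.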
const int nStreams = 4;
std::cout << " " << std::endl;
std::cout << "Computing results using GPU, using " << nStreams << " streams." << std::endl;
std::cout << " " << std::endl;
cudaStream_t streams[nStreams];
std::cout << "    Creating " << nStreams << " CUDA streams." << std::endl;
for (int i = 0; i < nStreams; i++) {
	cudaStreamCreate(&streams[i]);
}
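// Note (my suggestion, not in the original code): streams created with
// cudaStreamCreate still synchronize with the legacy default stream;
// cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking) would remove
// that implicit serialization if anything else runs on stream 0.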
int brgaOffset = 0;
int yuvOffset = 0;
const int brgaChunkSize = dataSizeBgra / nStreams;
const int yuvChunkSize = dataSizeYuv / nStreams;
for (int i = 0; i < nStreams; i++)
{
	std::cout << "        Launching stream " << i << "." << std::endl;
	brgaOffset = brgaChunkSize * i;
	yuvOffset = yuvChunkSize * i;
	cudaMemcpyAsync(deviceBgraBuffer + brgaOffset,
		bgraBuffer + brgaOffset,
		brgaChunkSize,
		cudaMemcpyHostToDevice,
		streams[i]);
	convertPixelFormat<<<4096, 1024, 0, streams[i]>>>(deviceBgraBuffer + brgaOffset, deviceYuvBuffer + yuvOffset, brgaChunkSize / 4);
	cudaMemcpyAsync(yuvBuffer + yuvOffset,
		deviceYuvBuffer + yuvOffset,
		yuvChunkSize,
		cudaMemcpyDeviceToHost,
		streams[i]);
}
//for (int i = 0; i < nStreams; i++) {
//	cudaStreamSynchronize(streams[i]);
//}
cudaDeviceSynchronize();
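// cudaDeviceSynchronize waits on all streams at once; the commented-out
// per-stream cudaStreamSynchronize loop above would be equivalent here.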
for (int i = 0; i < nStreams; i++)
{
	cudaStreamDestroy(streams[i]);
}
//const int arraySize = 5;
//const int a[arraySize] = { 1, 2, 3, 4, 5 };
//const int b[arraySize] = { 10, 20, 30, 40, 50 };
//int c[arraySize] = { 0 };
//// Add vectors in parallel.
//cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
//if (cudaStatus != cudaSuccess) {
//    fprintf(stderr, "addWithCuda failed!");
//    return 1;
//}
//printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
//    c[0], c[1], c[2], c[3], c[4]);
//// cudaDeviceReset must be called before exiting in order for profiling and
//// tracing tools such as Nsight and Visual Profiler to show complete traces.
//cudaStatus = cudaDeviceReset();
//if (cudaStatus != cudaSuccess) {
//    fprintf(stderr, "cudaDeviceReset failed!");
//    return 1;
//}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
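One more thing worth ruling out (my addition, not part of the original code): every runtime call in main() above discards its return code, so a failure in an async copy or launch would go unnoticed and could look like serialization. A minimal check wrapper, assuming nothing beyond the standard CUDA runtime API:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Wrap each runtime call so any error aborts with file and line information.
#define CUDA_CHECK(call)                                              \
	do {                                                              \
		cudaError_t err_ = (call);                                    \
		if (err_ != cudaSuccess) {                                    \
			fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",           \
				cudaGetErrorString(err_), __FILE__, __LINE__);        \
			exit(1);                                                  \
		}                                                             \
	} while (0)

// Example usage with one of the copies from the loop above:
// CUDA_CHECK(cudaMemcpyAsync(deviceBgraBuffer + brgaOffset, bgraBuffer + brgaOffset,
//                            brgaChunkSize, cudaMemcpyHostToDevice, streams[i]));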