Streams are always blocking on my computer, but not on another

Description

Streams always block on my computer, but not on another one. I tried the same code on several computers, and only mine shows the blocking behavior.
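For reference, a small diagnostic worth running first (my own sketch, not part of the original report): overlap between cudaMemcpyAsync and kernel execution requires a GPU with at least one asynchronous copy engine, and setting the CUDA_LAUNCH_BLOCKING environment variable to 1 forces every launch to block, which would make streams serialize on one machine only.

// Diagnostic sketch (assumption: device 0 is the GPU under test).
#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// 0 async engines => host/device copies can never overlap with kernels.
	printf("asyncEngineCount:  %d\n", prop.asyncEngineCount);
	printf("concurrentKernels: %d\n", prop.concurrentKernels);
	// CUDA_LAUNCH_BLOCKING=1 serializes all kernel launches, regardless of streams.
	const char* lb = getenv("CUDA_LAUNCH_BLOCKING");
	printf("CUDA_LAUNCH_BLOCKING: %s\n", lb ? lb : "(unset)");
	return 0;
}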
Code
#include <iostream>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
	int i = threadIdx.x;
	c[i] = a[i] + b[i];
}
const int N = 1 << 20;
__global__ void kernel(float *x, int n)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
		x[i] = sqrt(pow(3.14159, i));
	}
}
__global__ void convertPixelFormat(uint8_t* inputBgra, uint8_t* outputYuv, int numPixels)
{
	for (int k = 0; k < 100; k++) {
		int stride = gridDim.x * blockDim.x;
		int idx = threadIdx.x + blockIdx.x * blockDim.x;
		short3 yuv16;
		char3 yuv8;
		while (idx <= numPixels) {
			if (idx < numPixels) {
				yuv16.x = 66 * inputBgra[idx * 4 + 2] + 129 * inputBgra[idx * 4 + 1] + 25 * inputBgra[idx * 4];
				yuv16.y = -38 * inputBgra[idx * 4 + 2] + -74 * inputBgra[idx * 4 + 1] + 112 * inputBgra[idx * 4];
				yuv16.z = 112 * inputBgra[idx * 4 + 2] + -94 * inputBgra[idx * 4 + 1] + -18 * inputBgra[idx * 4];
				yuv8.x = (yuv16.x >> 8) + 16;
				yuv8.y = (yuv16.y >> 8) + 128;
				yuv8.z = (yuv16.z >> 8) + 128;
				*(reinterpret_cast<char3*>(&outputYuv[idx * 3])) = yuv8;
			}
			idx += stride;
		}
	}
}
int main()
{
uint8_t* bgraBuffer;
uint8_t* yuvBuffer;
uint8_t* deviceBgraBuffer;
uint8_t* deviceYuvBuffer;
const int dataSizeBgra = 7680 * 4320 * 4;
const int dataSizeYuv = 7680 * 4320 * 3;
cudaMallocHost(&bgraBuffer, dataSizeBgra);
cudaMallocHost(&yuvBuffer, dataSizeYuv);
cudaMalloc(&deviceBgraBuffer, dataSizeBgra);
cudaMalloc(&deviceYuvBuffer, dataSizeYuv);
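// Note: the two cudaMallocHost buffers above are pinned (page-locked) host
// memory; cudaMemcpyAsync can only overlap with kernel execution when the
// host side of the copy is pinned like this.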
const int nStreams = 4;
std::cout << " " << std::endl;
std::cout << "Computing results using GPU, using " << nStreams << " streams." << std::endl;
std::cout << " " << std::endl;
cudaStream_t streams[nStreams];
std::cout << "    Creating " << nStreams << " CUDA streams." << std::endl;
for (int i = 0; i < nStreams; i++) {
	cudaStreamCreate(&streams[i]);
}
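// Note (my suggestion, not in the original code): streams created with
// cudaStreamCreate still synchronize with the legacy default stream;
// cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking) would remove
// that implicit serialization if anything else runs on stream 0.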
int brgaOffset = 0;
int yuvOffset = 0;
const int brgaChunkSize = dataSizeBgra / nStreams;
const int yuvChunkSize = dataSizeYuv / nStreams;
for (int i = 0; i < nStreams; i++)
{
	std::cout << "        Launching stream " << i << "." << std::endl;
	brgaOffset = brgaChunkSize * i;
	yuvOffset = yuvChunkSize * i;
	cudaMemcpyAsync(deviceBgraBuffer + brgaOffset,
		bgraBuffer + brgaOffset,
		brgaChunkSize,
		cudaMemcpyHostToDevice,
		streams[i]);
	convertPixelFormat<<<4096, 1024, 0, streams[i]>>>(deviceBgraBuffer + brgaOffset, deviceYuvBuffer + yuvOffset, brgaChunkSize / 4);
	cudaMemcpyAsync(yuvBuffer + yuvOffset,
		deviceYuvBuffer + yuvOffset,
		yuvChunkSize,
		cudaMemcpyDeviceToHost,
		streams[i]);
}
//for (int i = 0; i < nStreams; i++) {
//	cudaStreamSynchronize(streams[i]);
//}
cudaDeviceSynchronize();
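// cudaDeviceSynchronize waits on all streams at once; the commented-out
// per-stream cudaStreamSynchronize loop above would be equivalent here.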
for (int i = 0; i < nStreams; i++)
{
	cudaStreamDestroy(streams[i]);
}
//const int arraySize = 5;
//const int a[arraySize] = { 1, 2, 3, 4, 5 };
//const int b[arraySize] = { 10, 20, 30, 40, 50 };
//int c[arraySize] = { 0 };
//// Add vectors in parallel.
//cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
//if (cudaStatus != cudaSuccess) {
//    fprintf(stderr, "addWithCuda failed!");
//    return 1;
//}
//printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
//    c[0], c[1], c[2], c[3], c[4]);
//// cudaDeviceReset must be called before exiting in order for profiling and
//// tracing tools such as Nsight and Visual Profiler to show complete traces.
//cudaStatus = cudaDeviceReset();
//if (cudaStatus != cudaSuccess) {
//    fprintf(stderr, "cudaDeviceReset failed!");
//    return 1;
//}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
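One more thing worth ruling out (my addition, not part of the original code): every runtime call in main() above discards its return code, so a failure in an async copy or launch would go unnoticed and could look like serialization. A minimal check wrapper, assuming nothing beyond the standard CUDA runtime API:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Wrap each runtime call so any error aborts with file and line information.
#define CUDA_CHECK(call)                                              \
	do {                                                              \
		cudaError_t err_ = (call);                                    \
		if (err_ != cudaSuccess) {                                    \
			fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",           \
				cudaGetErrorString(err_), __FILE__, __LINE__);        \
			exit(1);                                                  \
		}                                                             \
	} while (0)

// Example usage with one of the copies from the loop above:
// CUDA_CHECK(cudaMemcpyAsync(deviceBgraBuffer + brgaOffset, bgraBuffer + brgaOffset,
//                            brgaChunkSize, cudaMemcpyHostToDevice, streams[i]));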