TensorRT
Asynchronous inference with multiple streams: the time increases in proportion to the number of streams
Hello, when I run the inference engine in parallel on multiple CUDA streams, the total time roughly multiplies instead of the speed improving. This is my code; thank you very much for your help.
const int nStreams = 3;
std::cout << "Creating " << nStreams << " CUDA streams." << std::endl;
cudaStream_t stream[nStreams];
for (int i = 0; i < nStreams; i++)
    cudaStreamCreate(&stream[i]);
//cudaStream_t stream;
//cudaStreamCreate(&stream);
// Time the whole multi-stream run with CUDA events
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < nStreams; i++) {
    // Copy the image data to the GPU
    cudaMemcpyAsync(buffers[inputIndex], data, m_nImgNum * m_nChannelNum * m_nImgSize * m_nImgSize * sizeof(float), cudaMemcpyHostToDevice, stream[i]);
    // Run inference
    context->enqueueV2(buffers, stream[i], nullptr);
    // Copy the results from the GPU back to the CPU
    cudaMemcpyAsync(outdata, buffers[outputIndex], m_nImgNum * m_nImgSize * m_nImgSize * sizeof(int), cudaMemcpyDeviceToHost, stream[i]);
    //cudaStreamSynchronize(stream[i]);
}
for (int i = 0; i < nStreams; ++i)
    cudaStreamSynchronize(stream[i]);
CUDA_CALL(cudaDeviceSynchronize());
CUDA_CALL(cudaEventRecord(stop, 0));
CUDA_CALL(cudaEventSynchronize(stop));
CUDA_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));
std::cout << "Whole process took " << elapsedTime << "ms." << std::endl;
// Destroy the streams
for (int i = 0; i < nStreams; ++i)
    cudaStreamDestroy(stream[i]);
- Please run at least 100-150 times to warm up the GPU.
- Locking the GPU frequency is a good practice.
- The inference time may fluctuate, so please measure the mean/median time over N runs (a minimal timing sketch follows below).
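For illustration only, a minimal timing sketch along those lines. `benchmark`, `runOnce`, and the iteration counts are placeholders, not part of the original code, and `runOnce()` is assumed to synchronize its streams before returning so the events bracket the full run:

#include <cuda_runtime_api.h>
#include <algorithm>
#include <iostream>
#include <vector>

// Warm up first, then time each run separately so mean/median can be reported.
// runOnce() stands for whatever enqueues the copies + enqueueV2 and then
// synchronizes its streams; it is a placeholder, not a function from the issue.
template <typename F>
void benchmark(F runOnce, int warmupIters = 150, int measureIters = 100)
{
    for (int i = 0; i < warmupIters; ++i)   // warm-up iterations, results discarded
        runOnce();
    cudaDeviceSynchronize();

    std::vector<float> times(measureIters);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int i = 0; i < measureIters; ++i) {
        cudaEventRecord(start, 0);
        runOnce();                          // assumed to synchronize internally
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&times[i], start, stop);
    }

    std::sort(times.begin(), times.end());
    float sum = 0.f;
    for (float t : times) sum += t;
    std::cout << "mean " << sum / measureIters
              << " ms, median " << times[measureIters / 2] << " ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}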
@nvpohanh for visibility
By the way, if you just want to check multi-stream execution, trtexec has an option for it; check it with trtexec -h.
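For example, something along these lines (the exact flag spelling can vary by TensorRT version, so verify it with trtexec -h; model.engine is a placeholder path):

trtexec --loadEngine=model.engine --streams=3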
Isn't this expected? With nStreams = 3, you now have 3x as many computations to do, so the latency will be roughly 3x (or slightly less than that, if some kernels happen to run in parallel).
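To actually benefit from multiple streams, the work has to be split across them rather than duplicated. A minimal sketch of that idea follows, assuming each stream gets its own IExecutionContext and its own device buffers, pinned host memory, and a batch divided into nStreams equal chunks; the function name, binding names ("input"/"output"), and sizes are illustrative, not taken from the code above:

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <vector>

// Sketch: each stream processes 1/nStreams of the batch, with its OWN execution
// context and its OWN device buffers (a single IExecutionContext must not be
// enqueued on several streams at once). Error checking omitted for brevity.
void inferMultiStream(nvinfer1::ICudaEngine* engine,
                      const float* hostInput,    // pinned host input, whole batch
                      float* hostOutput,         // pinned host output, whole batch
                      size_t inBytesPerChunk,    // illustrative: whole batch / nStreams
                      size_t outBytesPerChunk)
{
    const int nStreams = 3;
    const int inputIndex  = engine->getBindingIndex("input");   // placeholder names
    const int outputIndex = engine->getBindingIndex("output");

    std::vector<cudaStream_t> streams(nStreams);
    std::vector<nvinfer1::IExecutionContext*> contexts(nStreams);
    std::vector<void*> dIn(nStreams), dOut(nStreams);

    for (int i = 0; i < nStreams; ++i) {
        cudaStreamCreate(&streams[i]);
        contexts[i] = engine->createExecutionContext();
        cudaMalloc(&dIn[i],  inBytesPerChunk);
        cudaMalloc(&dOut[i], outBytesPerChunk);
    }

    for (int i = 0; i < nStreams; ++i) {
        std::vector<void*> bindings(engine->getNbBindings(), nullptr);
        bindings[inputIndex]  = dIn[i];
        bindings[outputIndex] = dOut[i];
        // Each stream copies, runs, and copies back only its own slice of the batch.
        cudaMemcpyAsync(dIn[i],
                        reinterpret_cast<const char*>(hostInput) + i * inBytesPerChunk,
                        inBytesPerChunk, cudaMemcpyHostToDevice, streams[i]);
        contexts[i]->enqueueV2(bindings.data(), streams[i], nullptr);
        cudaMemcpyAsync(reinterpret_cast<char*>(hostOutput) + i * outBytesPerChunk,
                        dOut[i], outBytesPerChunk, cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < nStreams; ++i)
        cudaStreamSynchronize(streams[i]);

    for (int i = 0; i < nStreams; ++i) {
        cudaStreamDestroy(streams[i]);
        contexts[i]->destroy();
        cudaFree(dIn[i]);
        cudaFree(dOut[i]);
    }
}

Even then, the speedup depends on whether the GPU has spare capacity; if one chunk already saturates it, overlapping streams will not help much.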
Closing since there has been no activity for more than 3 weeks; please reopen if you still have questions, thanks!