mmdeploy
mmdeploy copied to clipboard
Restorer post-processing: the time difference between the whole stream and a single kernel is too large
Checklist
- [ ] 1. I have searched related issues but cannot get the expected help.
- [ ] 2. I have read the FAQ documentation but cannot get the expected help.
- [ ] 3. The bug has not been fixed in the latest version.
Describe the bug
I changed the post-processing to a GPU implementation, but the total time measured on the stream is far larger than the execution time of the kernel function alone. 我将后处理改成gpu实现,stream耗时与单独核函数执行耗时相差太大
Post-processing time: 150 ms; Transpose kernel function: 1.5 ms
Reproduction
/**
 * Converts a planar float image (channel-major: C x H x W) into an
 * interleaved 8-bit image (H x W x C).
 *
 * Each thread handles one pixel (x, y) and loops over its channels. Every
 * sample is multiplied by 255 and clamped to [0, 255] before being narrowed
 * to uint8_t.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least width x height
 * threads; threads that fall outside the image return immediately. The kernel
 * uses no shared memory.
 *
 * @param src               planar input; channel c starts at c * src_width_stride
 * @param height            number of rows
 * @param width             number of columns
 * @param channels          number of channels
 * @param src_width_stride  elements between consecutive channel planes in src
 * @param dst               interleaved output
 * @param dst_width_stride  elements between consecutive rows in dst
 */
__global__ void transpose(const float* src, int height, int width, int channels,
                          int src_width_stride, uint8_t* dst, int dst_width_stride) {
  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;

  // Offsets are formed in size_t so large images cannot overflow 32-bit int.
  const size_t src_pixel = static_cast<size_t>(y) * width + x;
  const size_t dst_pixel =
      static_cast<size_t>(y) * dst_width_stride + static_cast<size_t>(x) * channels;

  for (int c = 0; c < channels; ++c) {
    const float scaled = src[static_cast<size_t>(c) * src_width_stride + src_pixel] * 255.f;
    // Clamp on both sides: converting a float outside [0, 255] to uint8_t is
    // undefined behaviour, and the original only bounded the upper end.
    dst[dst_pixel + c] = static_cast<uint8_t>(fminf(fmaxf(scaled, 0.f), 255.f));
  }
}
/**
 * Queues the planar-float to interleaved-uint8 conversion kernel on `stream`.
 *
 * The kernel launch is asynchronous: this function returns once the work has
 * been queued, so the caller must synchronize with `stream` before reading
 * `dst` from the host.
 *
 * @param src       planar (C x H x W) float input, device-accessible
 * @param height    number of rows
 * @param width     number of columns
 * @param channels  number of channels
 * @param dst       interleaved (H x W x C) uint8 output, device-accessible
 * @param stream    CUDA stream the kernel is queued on
 */
void Transpose(const float* src, int height, int width, int channels, uint8_t* dst,
               cudaStream_t stream) {
  // A grid with a zero-sized dimension is an invalid launch configuration, so
  // an empty image is treated as a no-op instead of raising a launch error.
  if (height <= 0 || width <= 0 || channels <= 0) return;

  const dim3 thread_block(32, 32);
  const dim3 block_num((width + thread_block.x - 1) / thread_block.x,
                       (height + thread_block.y - 1) / thread_block.y);

  const int src_width_stride = width * height;    // elements per channel plane in src
  const int dst_width_stride = width * channels;  // elements per row in dst

  transpose<<<block_num, thread_block, 0, stream>>>(src, height, width, channels,
                                                    src_width_stride, dst, dst_width_stride);
}
Result
// NOTE(review): the rest of this signature, and the declarations of `c`, `h`,
// `w`, `src`, `dst` and `stream`, are not present in the text as posted.
// In the visible text, c/h/w are only assigned when the shape has 5
// dimensions; they are read from dims 2, 3 and 4 respectively.
if(src.shape().size()==5){
c = src.shape(2);
h = src.shape(3);
w = src.shape(4);
}
// Build the output descriptor: device, shape and name are copied from the
// input; the element type is set to kINT8.
TensorDesc dst_desc;
dst_desc.device = src.desc().device;
// NOTE(review): the descriptor says kINT8 but the buffer is accessed as
// uint8_t below — confirm which element type is intended.
dst_desc.data_type = DataType::kINT8;
dst_desc.shape = src.desc().shape;
dst_desc.name = src.desc().name;
Tensor dst_tensor(dst_desc);
// Reshape the output to a single image laid out as {1, h, w, c}.
dst_tensor.Reshape({1, h, w, c});
auto cuda_stream = GetNative<cudaStream_t>(stream);
if(DataType::kFLOAT == src.data_type())
{
auto input = src.data<float>();
auto output = dst_tensor.data<uint8_t>();
// Queues the conversion on cuda_stream; nothing in this function waits for
// the kernel to finish.
Transpose(input, (int)h, (int)w, (int)c, output, cuda_stream);
}else{
// NOTE(review): assert is compiled out when NDEBUG is defined; in that case
// execution falls through and success() is returned for a non-float input.
assert(0);
}
dst = std::move(dst_tensor);
return success();
}
// Takes the "output" tensor from `input`, converts it with apply(), waits on
// the stream, and returns a Mat.
Result<Value> TensorToImg::operator()(const Value& input)
{
auto upscale = input["output"].get<Tensor>();
Tensor tensor_hwc;
// NOTE(review): the value returned by apply() is not inspected here.
apply(upscale, tensor_hwc, stream);
// Presumably blocks until the stream has finished its queued work.
// NOTE(review): if earlier work (e.g. model inference) was queued on this
// same stream, the wait would cover that work too, so wall-clock timing
// around this call would measure more than the transpose kernel — confirm
// with a profiler or CUDA events.
OUTCOME_TRY(stream.Wait());
// NOTE(review): `height`, `width` and `format` are not declared in the visible
// text, and the argument list is cut off after `std::shared_ptr` as posted.
// NOTE(review): kFLOAT is passed here although apply() writes uint8_t data —
// confirm the intended element type.
Mat mat(height,
width,
format,
DataType::kFLOAT,
std::shared_ptr
return mat; }
Environment
11/09 11:02:42 - mmengine - INFO -
11/09 11:02:42 - mmengine - INFO - **********Environmental information**********
11/09 11:02:46 - mmengine - INFO - sys.platform: win32
11/09 11:02:46 - mmengine - INFO - Python: 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
11/09 11:02:46 - mmengine - INFO - CUDA available: True
11/09 11:02:46 - mmengine - INFO - numpy_random_seed: 2147483648
11/09 11:02:46 - mmengine - INFO - GPU 0: NVIDIA GeForce RTX 4060 Ti
11/09 11:02:46 - mmengine - INFO - CUDA_HOME: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6
11/09 11:02:46 - mmengine - INFO - NVCC: Cuda compilation tools, release 11.6, V11.6.55
11/09 11:02:46 - mmengine - INFO - MSVC: 用于 x64 的 Microsoft (R) C/C++ 优化编译器 19.29.30151 版
11/09 11:02:46 - mmengine - INFO - GCC: n/a
11/09 11:02:46 - mmengine - INFO - PyTorch: 1.12.0+cu116
11/09 11:02:46 - mmengine - INFO - PyTorch compiling details: PyTorch built with:
- C++ Version: 199711
- MSVC 192829337
- Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
- Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)
- OpenMP 2019
- LAPACK is enabled (usually provided by MKL)
- CPU capability usage: AVX2
- CUDA Runtime 11.6
- NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37
- CuDNN 8.3.2 (built against CUDA 11.5)
- Magma 2.5.4
- Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.6, CUDNN_VERSION=8.3.2, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /w /bigobj -DUSE_PTHREADPOOL -openmp:experimental -IC:/actions-runner/_work/pytorch/pytorch/builder/windows/mkl/include -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.12.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, USE_ROCM=OFF,
11/09 11:02:46 - mmengine - INFO - TorchVision: 0.13.0+cu116
11/09 11:02:46 - mmengine - INFO - OpenCV: 4.8.1
11/09 11:02:46 - mmengine - INFO - MMEngine: 0.9.0
11/09 11:02:46 - mmengine - INFO - MMCV: 2.0.1
11/09 11:02:46 - mmengine - INFO - MMCV Compiler: MSVC 192930148
11/09 11:02:46 - mmengine - INFO - MMCV CUDA Compiler: 11.6
11/09 11:02:46 - mmengine - INFO - MMDeploy: 1.3.0+6edd802
11/09 11:02:46 - mmengine - INFO -
11/09 11:02:46 - mmengine - INFO - **********Backend information**********
11/09 11:02:46 - mmengine - INFO - tensorrt: 8.5.1.7
11/09 11:02:46 - mmengine - INFO - tensorrt custom ops: NotAvailable
d:\Anaconda3\lib\site-packages\pkg_resources\__init__.py:116: PkgResourcesDeprecationWarning: 4.0.0-unsupported is an invalid version and will not be supported in a future release
warnings.warn(
11/09 11:02:46 - mmengine - INFO - ONNXRuntime: 1.15.1
11/09 11:02:46 - mmengine - INFO - ONNXRuntime-gpu: None
11/09 11:02:46 - mmengine - INFO - ONNXRuntime custom ops: NotAvailable
11/09 11:02:46 - mmengine - INFO - pplnn: None
11/09 11:02:46 - mmengine - INFO - ncnn: None
11/09 11:02:46 - mmengine - INFO - snpe: None
11/09 11:02:46 - mmengine - INFO - openvino: None
11/09 11:02:46 - mmengine - INFO - torchscript: 1.12.0+cu116
11/09 11:02:46 - mmengine - INFO - torchscript custom ops: NotAvailable
11/09 11:02:46 - mmengine - INFO - rknn-toolkit: None
11/09 11:02:46 - mmengine - INFO - rknn-toolkit2: None
11/09 11:02:46 - mmengine - INFO - ascend: None
11/09 11:02:46 - mmengine - INFO - coreml: None
11/09 11:02:46 - mmengine - INFO - tvm: None
11/09 11:02:46 - mmengine - INFO - vacc: None
11/09 11:02:46 - mmengine - INFO -
11/09 11:02:46 - mmengine - INFO - **********Codebase information**********
11/09 11:02:46 - mmengine - INFO - mmdet: 3.0.0
11/09 11:02:46 - mmengine - INFO - mmseg: None
11/09 11:02:46 - mmengine - INFO - mmpretrain: 1.1.0
11/09 11:02:46 - mmengine - INFO - mmocr: None
11/09 11:02:46 - mmengine - INFO - mmagic: 1.0.2
11/09 11:02:46 - mmengine - INFO - mmdet3d: None
11/09 11:02:46 - mmengine - INFO - mmpose: 1.2.0
11/09 11:02:46 - mmengine - INFO - mmrotate: None
11/09 11:02:46 - mmengine - INFO - mmaction: 1.2.0
11/09 11:02:46 - mmengine - INFO - mmrazor: None
11/09 11:02:46 - mmengine - INFO - mmyolo: 0.5.0
Error traceback
No response