Is there a performance problem with resize under CUDA, or is my usage incorrect?
MNN version: MNN 3.0.1 with CUDA
GPU model: NVIDIA GeForce RTX 3060 Ti
MNN test code:
```cpp
#include <cstdio>
#include <windows.h>               // QueryPerformanceCounter / QueryPerformanceFrequency
#include <MNN/MNNSharedContext.h>  // MNNDeviceContext
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/ExprCreator.hpp>

void testMNN() {
    MNN::BackendConfig backend_config;
    MNNDeviceContext gpuDeviceConfig;
    gpuDeviceConfig.deviceId = 0;
    backend_config.sharedContext = &gpuDeviceConfig;
    MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CUDA, backend_config, MNN_GPU_TUNING_WIDE);
    MNN::Express::ExecutorScope::Current()->lazyEval = 0;

    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    for (int i = 0; i < 10; i++) {
        LARGE_INTEGER t0;
        QueryPerformanceCounter(&t0);
        float h_crop = 2944.f;
        float w_crop = 2520.f;
        MNN::Express::VARP inputVar = MNN::Express::_Input({1, 8, 2944, 2520}, MNN::Express::NCHW, halide_type_of<float>());
        float* fp = inputVar->writeMap<float>();
        for (int j = 0; j < 8 * 2944 * 2520; j++)
            fp[j] = 0.618f;
        LARGE_INTEGER t1;
        QueryPerformanceCounter(&t1);
        float m_size = 640.f;
        // Downscale to ~640x640, then upscale back to the original size.
        MNN::Express::VARP scaleVar = MNN::Express::_Resize(inputVar, m_size / w_crop, m_size / h_crop);
        LARGE_INTEGER t2;
        QueryPerformanceCounter(&t2);
        scaleVar = MNN::Express::_Resize(scaleVar, w_crop / m_size, h_crop / m_size);
        LARGE_INTEGER t3;
        QueryPerformanceCounter(&t3);
        // readMap forces execution and copies the result back to the host.
        scaleVar->readMap<float>();
        LARGE_INTEGER t4;
        QueryPerformanceCounter(&t4);
        //double elapsed1 = (double)(t1.QuadPart - t0.QuadPart) / frequency.QuadPart;
        double elapsed2 = (double)(t2.QuadPart - t1.QuadPart) / frequency.QuadPart;
        double elapsed3 = (double)(t3.QuadPart - t2.QuadPart) / frequency.QuadPart;
        double elapsed4 = (double)(t4.QuadPart - t3.QuadPart) / frequency.QuadPart;
        printf("===scale_down:%fs, scale_up:%fs, copy:%fs===\n", elapsed2, elapsed3, elapsed4);
    }
}
```
PyTorch comparison test code:
```python
import time

import torch
from torch.nn import functional as F

for i in range(10):
    #t0 = time.time()
    h_crop = 2944
    w_crop = 2520
    input_var = torch.randn(1, 8, h_crop, w_crop).cuda()
    t1 = time.time()
    m_size = 640
    # Downscale to 640x640, then upscale back to the original size.
    scale_var = F.interpolate(input_var, size=(m_size, m_size), mode='bilinear')
    torch.cuda.synchronize()
    t2 = time.time()
    scale_var = F.interpolate(scale_var, size=(h_crop, w_crop), mode='bilinear')
    torch.cuda.synchronize()
    t3 = time.time()
    # Copy the result back to host memory.
    res = scale_var.cpu().numpy()
    t4 = time.time()
    print("===scale_down:{}s, scale_up:{}s, copy_cpu:{}s===".format(round(t2 - t1, 4), round(t3 - t2, 4), round(t4 - t3, 4)))
```
Performance comparison data:
MNN's resize is implemented on the NC4HW4 layout, so an NCHW input and output are implicitly converted before and after the op. The layout-conversion time likely accounts for a large part of the measured cost.
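One way to test that hypothesis is to move the layout conversion out of the timed region: convert the input to NC4HW4 once with MNN Express's `_Convert` op, run both resizes in that layout, and convert back once at the end. A minimal sketch of that experiment (the timing scaffolding from the test above is omitted; whether this actually avoids the implicit per-op conversions is an assumption to be verified by the timings):

```cpp
// Convert the NCHW input to NC4HW4 once up front, resize twice in that
// layout, then convert back to NCHW once at the end. If the implicit
// per-op layout conversions dominate the cost, this variant should be
// noticeably faster than calling _Resize directly on the NCHW input.
MNN::Express::VARP packed = MNN::Express::_Convert(inputVar, MNN::Express::NC4HW4);
MNN::Express::VARP down   = MNN::Express::_Resize(packed, m_size / w_crop, m_size / h_crop);
MNN::Express::VARP up     = MNN::Express::_Resize(down, w_crop / m_size, h_crop / m_size);
MNN::Express::VARP result = MNN::Express::_Convert(up, MNN::Express::NCHW);
result->readMap<float>();  // force execution and copy the result to the host
```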