benchmark
benchmark copied to clipboard
MobilenetV1 profile for optimizing
Profile MobilenetV1 to optimize depthwise conv.
从profile 结果来看 conditional block消耗时间过长
test depthwise_conv op:
input dim:[32,32,224,224] filters_num:32 filter size:7 stride:1
| | paddle | tf |
|---|---|---|
| time cost(s) | 0.099 | 0.059 |
# Paddle side of the depthwise-conv comparison: times the forward pass of a
# 7x7, stride-1 conv2d with groups == channels == 32 on a [32, 32, 224, 224]
# NCHW float32 input, executed on GPU 0 with the fluid profiler enabled.
import time

import numpy as np
import paddle.fluid as fluid

data = fluid.data(name='input', shape=[32, 32, 224, 224], dtype='float32')
data.stop_gradient = False

# groups equals both the input channel count and num_filters, i.e. one filter
# per channel (depthwise). The filter is a parameter named 'filters' created
# by the layer itself, so only the input tensor needs to be fed.
result = fluid.layers.conv2d(
    input=data,
    num_filters=32,
    filter_size=[7, 7],
    stride=[1, 1],
    padding="SAME",
    groups=32,
    use_cudnn=False,
    param_attr='filters',
    bias_attr=False,
    act=None,
    data_format='NCHW',
)

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
# Run the startup program once so the 'filters' parameter is initialized.
exe.run(fluid.default_startup_program())

in_data1 = np.random.rand(32, 32, 224, 224).astype('float32')

all_time = 0
fluid.profiler.start_profiler('All')
for i in range(10):
    start = time.time()
    [out] = exe.run(fluid.default_main_program(),
                    fetch_list=[result],
                    feed={'input': in_data1})
    print(out.shape)
    # Iterations 0-4 are warm-up; only the last five are accumulated, which
    # matches the divisor used when the mean is reported below.
    if i >= 5:
        all_time += (time.time() - start)
fluid.profiler.stop_profiler('total')

# Mean wall-clock time per timed iteration (includes feed/fetch copies and the
# shape print). The label text is kept as it appears in the recorded output.
print('argsort time:', all_time / 5.0)
# tf:
# TensorFlow side of the depthwise-conv comparison: times
# tf.nn.depthwise_conv2d with a [7, 7, 32, 1] filter and stride 1 on a
# [32, 32, 224, 224] NCHW float32 input, using tf.placeholder / tf.Session.
import time

import numpy as np
import tensorflow as tf

data = tf.placeholder(name='input', shape=[32, 32, 224, 224], dtype=tf.float32)
filters = tf.placeholder(name='filters', shape=[7, 7, 32, 1], dtype=tf.float32)
result = tf.nn.depthwise_conv2d(
    input=data,
    filter=filters,
    strides=[1, 1, 1, 1],
    padding="SAME",
    data_format='NCHW',
)

in_data = np.random.rand(32, 32, 224, 224).astype('float32')
fdata = np.random.rand(7, 7, 32, 1).astype('float32')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    all_time = 0
    for i in range(10):
        start = time.time()
        # Both the input and the filter are fed from host memory every run.
        out = sess.run(fetches=[result],
                       feed_dict={data: in_data, filters: fdata})
        print(out[0].shape)
        # Iterations 0-4 are warm-up; the last five are accumulated so the
        # sample count matches the 5.0 divisor below. NOTE: a strict `i > 5`
        # here would sum only four iterations and understate the mean by 20%.
        if i >= 5:
            all_time += (time.time() - start)

# Mean wall-clock time per timed iteration (includes feed/fetch copies and the
# shape print). The label text is kept as it appears in the recorded output.
print('argsort time:', all_time / 5.0)
CUDNN 7.5.1
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
==6990== NVPROF is profiling process 6990, command: python mobilenet/test_paddle.py
W0107 10:05:59.245507 6990 device_context.cc:236] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W0107 10:05:59.250385 6990 device_context.cc:244] device: 0, cuDNN Version: 7.5.
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
('argsort time:', 0.10950679779052734)
==6990== Profiling application: python mobilenet/test_paddle.py
==6990== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 55.01% 569.54ms 10 56.954ms 39.257ms 209.97ms [CUDA memcpy DtoH]
40.63% 420.59ms 13 32.353ms 1.8560us 42.771ms [CUDA memcpy HtoD]
4.36% 45.109ms 10 4.5109ms 4.0903ms 4.7665ms void conv2d_grouped_direct_kernel<float, float, float, float, float, bool=1, bool=0, int=0, int=1, int=0>(cudnnTensorStruct, float const *, cudnnFilterStruct, float const *, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int, float const *, float const *, cudnnActivationStruct)
0.00% 7.9680us 1 7.9680us 7.9680us 7.9680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
0.00% 7.6160us 4 1.9040us 1.6960us 2.4640us [CUDA memset]
API calls: 45.92% 3.45417s 8 431.77ms 2.1990us 3.45382s cudaStreamCreateWithFlags
15.92% 1.19764s 1 1.19764s 1.19764s 1.19764s cudaStreamCreate
14.01% 1.05375s 419 2.5149ms 5.4240us 124.16ms cuModuleUnload
13.83% 1.04074s 23 45.250ms 13.969us 216.52ms cudaMemcpy
10.11% 760.84ms 4 190.21ms 724ns 760.83ms cudaFree
0.08% 5.7761ms 276 20.927us 221ns 1.6584ms cuDeviceGetAttribute
0.03% 2.2402ms 16 140.01us 6.5520us 741.09us cudaMalloc
0.03% 2.2016ms 3 733.87us 522.42us 897.81us cuDeviceTotalMem
0.03% 2.1134ms 1 2.1134ms 2.1134ms 2.1134ms cudaGetDeviceProperties
0.02% 1.7382ms 1 1.7382ms 1.7382ms 1.7382ms cudaHostAlloc
0.01% 611.44us 11 55.585us 36.725us 76.901us cudaLaunch
0.00% 369.27us 3 123.09us 71.022us 177.29us cuDeviceGetName
0.00% 150.38us 11 13.670us 9.3680us 23.237us cudaStreamSynchronize
0.00% 123.83us 96 1.2890us 566ns 12.699us cudaFuncSetAttribute
0.00% 111.51us 4 27.876us 12.211us 61.780us cudaMemsetAsync
0.00% 75.483us 41 1.8410us 545ns 6.0140us cudaSetDevice
0.00% 57.221us 60 953ns 613ns 8.2940us cudaEventCreateWithFlags
0.00% 52.634us 182 289ns 199ns 2.2340us cudaSetupArgument
0.00% 39.786us 61 652ns 393ns 4.4830us cudaDeviceGetAttribute
0.00% 34.926us 10 3.4920us 540ns 13.562us cudaGetDevice
0.00% 21.948us 11 1.9950us 1.0570us 2.4650us cudaConfigureCall
0.00% 12.861us 1 12.861us 12.861us 12.861us cudaFuncGetAttributes
0.00% 11.539us 4 2.8840us 2.5320us 3.0600us cudaStreamCreateWithPriority
0.00% 9.0260us 1 9.0260us 9.0260us 9.0260us cudaDeviceGetStreamPriorityRange
0.00% 8.3820us 1 8.3820us 8.3820us 8.3820us cudaEventRecord
0.00% 7.5960us 1 7.5960us 7.5960us 7.5960us cudaHostGetDevicePointer
0.00% 5.5220us 4 1.3800us 428ns 3.5920us cuDeviceGet
0.00% 4.1390us 3 1.3790us 844ns 2.2280us cudaGetDeviceCount
0.00% 3.8680us 5 773ns 267ns 2.3650us cuDeviceGetCount
0.00% 3.4090us 2 1.7040us 1.5110us 1.8980us cuDriverGetVersion
0.00% 2.9890us 10 298ns 272ns 319ns cudaGetLastError
0.00% 2.7090us 2 1.3540us 1.3270us 1.3820us cuInit
0.00% 2.5410us 2 1.2700us 1.2510us 1.2900us cuDevicePrimaryCtxRelease
0.00% 2.1710us 2 1.0850us 491ns 1.6800us cudaDriverGetVersion
0.00% 1.9570us 2 978ns 206ns 1.7510us cudaPeekAtLastError
0.00% 249ns 1 249ns 249ns 249ns cudaRuntimeGetVersion
develop cuda
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
==464== NVPROF is profiling process 464, command: python mobilenet/test_paddle.py
W0107 10:00:27.440114 464 device_context.cc:236] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W0107 10:00:27.444963 464 device_context.cc:244] device: 0, cuDNN Version: 7.5.
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
('argsort time:', 0.12051382064819335)
==464== Profiling application: python mobilenet/test_paddle.py
==464== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.88% 514.12ms 10 51.412ms 32.905ms 211.05ms [CUDA memcpy DtoH]
40.39% 385.42ms 13 29.647ms 1.8240us 39.965ms [CUDA memcpy HtoD]
5.72% 54.570ms 10 5.4570ms 5.0582ms 5.9272ms void paddle::operators::math::KernelDepthwiseConvSp<float, int=1, int=1, int=-1, bool=0>(float const *, float const , int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, paddle::operators::math::KernelDepthwiseConvSp<float, int=1, int=1, int=-1, bool=0>*, paddle::framework::DataLayout)
0.00% 7.9040us 1 7.9040us 7.9040us 7.9040us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
0.00% 7.4880us 4 1.8720us 1.7280us 2.3040us [CUDA memset]
API calls: 45.81% 3.47900s 8 434.88ms 2.3780us 3.47833s cudaStreamCreateWithFlags
18.42% 1.39882s 1 1.39882s 1.39882s 1.39882s cudaStreamCreate
12.76% 968.72ms 419 2.3120ms 5.6790us 75.024ms cuModuleUnload
12.63% 959.00ms 23 41.695ms 16.416us 218.76ms cudaMemcpy
10.21% 775.34ms 4 193.83ms 879ns 775.33ms cudaFree
0.05% 3.8967ms 276 14.118us 223ns 877.20us cuDeviceGetAttribute
0.03% 2.3235ms 1 2.3235ms 2.3235ms 2.3235ms cudaHostAlloc
0.03% 2.0878ms 16 130.49us 5.9890us 774.62us cudaMalloc
0.03% 1.9100ms 3 636.68us 523.78us 857.09us cuDeviceTotalMem
0.01% 910.65us 1 910.65us 910.65us 910.65us cudaGetDeviceProperties
0.01% 564.90us 11 51.354us 39.146us 56.915us cudaLaunch
0.00% 336.70us 3 112.23us 92.161us 147.56us cuDeviceGetName
0.00% 141.67us 96 1.4750us 583ns 13.641us cudaFuncSetAttribute
0.00% 134.09us 11 12.190us 10.588us 15.857us cudaStreamSynchronize
0.00% 103.78us 4 25.945us 11.404us 64.932us cudaMemsetAsync
0.00% 76.171us 41 1.8570us 563ns 6.2550us cudaSetDevice
0.00% 56.830us 60 947ns 631ns 8.0880us cudaEventCreateWithFlags
0.00% 51.972us 202 257ns 199ns 2.3020us cudaSetupArgument
0.00% 42.589us 61 698ns 392ns 4.0860us cudaDeviceGetAttribute
0.00% 38.928us 10 3.8920us 660ns 14.666us cudaGetDevice
0.00% 13.335us 1 13.335us 13.335us 13.335us cudaFuncGetAttributes
0.00% 13.045us 1 13.045us 13.045us 13.045us cudaDeviceGetStreamPriorityRange
0.00% 12.063us 4 3.0150us 2.5130us 3.5330us cudaStreamCreateWithPriority
0.00% 11.728us 11 1.0660us 888ns 1.3370us cudaConfigureCall
0.00% 10.993us 1 10.993us 10.993us 10.993us cudaEventRecord
0.00% 10.531us 1 10.531us 10.531us 10.531us cudaHostGetDevicePointer
0.00% 5.7760us 4 1.4440us 306ns 3.1790us cuDeviceGet
0.00% 4.3170us 5 863ns 329ns 2.7810us cuDeviceGetCount
0.00% 4.0860us 3 1.3620us 922ns 2.0180us cudaGetDeviceCount
0.00% 2.8520us 2 1.4260us 1.2230us 1.6290us cuDriverGetVersion
0.00% 2.8130us 2 1.4060us 1.3980us 1.4150us cuInit
0.00% 2.6260us 2 1.3130us 1.2380us 1.3880us cuDevicePrimaryCtxRelease
0.00% 2.2210us 2 1.1100us 474ns 1.7470us cudaDriverGetVersion
0.00% 523ns 2 261ns 197ns 326ns cudaPeekAtLastError
0.00% 272ns 1 272ns 272ns 272ns cudaRuntimeGetVersion
tensorflow
2020-01-07 10:01:19.671743: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
==524== NVPROF is profiling process 524, command: python mobilenet/test_tf.py
2020-01-07 10:01:20.377712: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-01-07 10:01:20.379133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0000:00:0d.0
totalMemory: 15.75GiB freeMemory: 15.41GiB
2020-01-07 10:01:20.379268: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2020-01-07 10:01:21.576634: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-01-07 10:01:21.576708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2020-01-07 10:01:21.576722: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2020-01-07 10:01:21.613054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14890 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:0d.0, compute capability: 7.0)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
(32, 32, 224, 224)
('argsort time:', 0.06668057441711425)
==524== Profiling application: python mobilenet/test_tf.py
==524== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.20% 383.74ms 20 19.187ms 2.4640us 43.166ms [CUDA memcpy HtoD]
41.88% 307.86ms 10 30.786ms 30.710ms 30.989ms [CUDA memcpy DtoH]
5.93% 43.566ms 10 4.3566ms 4.1745ms 4.3793ms void tensorflow::DepthwiseConv2dGPUKernelNCHW<float, int=-1, int=-1, int=1>(tensorflow::DepthwiseArgs, float const *, float const , tensorflow::DepthwiseArgs*, int)
0.00% 2.4960us 1 2.4960us 2.4960us 2.4960us [CUDA memset]
API calls: 42.58% 1.19562s 1 1.19562s 1.19562s 1.19562s cudaFree
14.99% 420.99ms 20 21.049ms 43.958us 43.292ms cuMemcpyHtoDAsync
12.58% 353.07ms 2 176.53ms 176.40ms 176.67ms cuMemHostAlloc
12.19% 342.28ms 12 28.523ms 14.610us 35.233ms cuCtxSynchronize
11.62% 326.23ms 1 326.23ms 326.23ms 326.23ms cuDevicePrimaryCtxRetain
1.53% 42.911ms 60 715.18us 701ns 42.389ms cuEventRecord
1.31% 36.771ms 2 18.385ms 565.50us 36.205ms cuMemGetInfo
1.28% 35.837ms 6 5.9728ms 481ns 35.829ms cuEventCreate
0.94% 26.478ms 4 6.6196ms 870.25us 22.515ms cudaGetDeviceProperties
0.62% 17.444ms 1 17.444ms 17.444ms 17.444ms cuMemAlloc
0.21% 5.8154ms 4730 1.2290us 905ns 34.767us cuEventQuery
0.04% 1.2110ms 106 11.424us 220ns 349.55us cuDeviceGetAttribute
0.04% 1.0348ms 2 517.39us 515.35us 519.43us cuDeviceTotalMem
0.02% 603.44us 10 60.343us 48.041us 80.534us cudaLaunch
0.01% 367.08us 1 367.08us 367.08us 367.08us cuDeviceGetProperties
0.01% 271.33us 10 27.133us 15.855us 46.861us cuMemcpyDtoHAsync
0.01% 153.63us 2 76.815us 75.887us 77.744us cuDeviceGetName
0.00% 135.74us 10 13.574us 11.259us 20.528us cudaFuncGetAttributes
0.00% 109.10us 30 3.6360us 1.1970us 9.3580us cuStreamWaitEvent
0.00% 62.024us 1 62.024us 62.024us 62.024us cuMemsetD32
0.00% 46.633us 40 1.1650us 496ns 6.0830us cudaDeviceGetAttribute
0.00% 46.459us 10 4.6450us 3.9240us 7.6660us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 38.513us 4 9.6280us 1.9800us 31.705us cuStreamCreate
0.00% 34.729us 11 3.1570us 2.6460us 3.9370us cudaGetDevice
0.00% 33.171us 8 4.1460us 473ns 9.1660us cuCtxSetCurrent
0.00% 21.392us 10 2.1390us 1.3510us 3.8830us cudaConfigureCall
0.00% 18.434us 50 368ns 199ns 1.2630us cudaSetupArgument
0.00% 7.2250us 2 3.6120us 1.2310us 5.9940us cuEventDestroy
0.00% 5.5560us 2 2.7780us 2.6950us 2.8610us cudaSetDevice
0.00% 4.9860us 7 712ns 223ns 2.4500us cuDeviceGetCount
0.00% 3.5040us 3 1.1680us 427ns 2.5390us cuDeviceGet
0.00% 2.3370us 1 2.3370us 2.3370us 2.3370us cuDeviceGetPCIBusId
0.00% 1.9110us 1 1.9110us 1.9110us 1.9110us cudaGetDeviceCount
0.00% 1.5280us 1 1.5280us 1.5280us 1.5280us cuInit
0.00% 1.2210us 2 610ns 374ns 847ns cuDriverGetVersion
0.00% 924ns 1 924ns 924ns 924ns cuDeviceComputeCapability
0.00% 650ns 1 650ns 650ns 650ns cuCtxGetCurrent
0.00% 608ns 1 608ns 608ns 608ns cuDevicePrimaryCtxGetState
Paddle和TensorFlow的API性能测试和profile框架可依据benchmark/api/paddle/abs.py和api/tensorflow/abs.py编写,并往benchmark里面提交测试脚本。
优化后
==127662== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 56.42% 512.18ms 10 51.218ms 32.783ms 213.03ms [CUDA memcpy DtoH]
38.71% 351.45ms 13 27.035ms 1.8240us 35.711ms [CUDA memcpy HtoD]
4.87% 44.191ms 10 4.4191ms 4.1470ms 4.8415ms void paddle::operators::math::KernelDepthwiseConvSp<float, int=1, int=1, int=-1, bool=0>(float const *, float const , int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, paddle::operators::math::KernelDepthwiseConvSp<float, int=1, int=1, int=-1, bool=0>*, paddle::framework::DataLayout)
0.00% 7.9040us 1 7.9040us 7.9040us 7.9040us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<unsigned int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, paddle::operators::GaussianGenerator<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
0.00% 7.6480us 4 1.9120us 1.7280us 2.4000us [CUDA memset]
API calls: 46.22% 3.46309s 8 432.89ms 2.5150us 3.46275s cudaStreamCreateWithFlags
18.54% 1.38919s 1 1.38919s 1.38919s 1.38919s cudaStreamCreate
12.70% 951.45ms 419 2.2708ms 4.8110us 101.89ms cuModuleUnload
12.17% 912.09ms 23 39.656ms 16.881us 219.56ms cudaMemcpy
10.22% 765.59ms 4 191.40ms 798ns 765.59ms cudaFree
0.04% 2.7120ms 16 169.50us 6.1800us 1.0358ms cudaMalloc
0.03% 2.6163ms 276 9.4790us 223ns 349.82us cuDeviceGetAttribute
0.02% 1.8365ms 1 1.8365ms 1.8365ms 1.8365ms cudaHostAlloc
0.02% 1.6061ms 3 535.36us 529.80us 541.93us cuDeviceTotalMem
0.01% 902.31us 1 902.31us 902.31us 902.31us cudaGetDeviceProperties
0.01% 600.67us 11 54.606us 35.317us 65.714us cudaLaunch
0.00% 265.64us 3 88.545us 71.491us 103.31us cuDeviceGetName
0.00% 135.80us 11 12.345us 10.261us 13.949us cudaStreamSynchronize
0.00% 129.39us 96 1.3470us 619ns 12.658us cudaFuncSetAttribute
0.00% 97.330us 4 24.332us 10.110us 60.240us cudaMemsetAsync
0.00% 77.505us 42 1.8450us 557ns 5.7560us cudaSetDevice
0.00% 57.883us 60 964ns 610ns 6.9650us cudaEventCreateWithFlags
0.00% 49.584us 202 245ns 193ns 2.0650us cudaSetupArgument
0.00% 41.905us 62 675ns 387ns 4.5420us cudaDeviceGetAttribute
0.00% 32.282us 10 3.2280us 480ns 12.638us cudaGetDevice
0.00% 13.628us 1 13.628us 13.628us 13.628us cudaFuncGetAttributes
0.00% 12.718us 11 1.1560us 1.0400us 1.5230us cudaConfigureCall
0.00% 11.020us 4 2.7550us 2.5240us 3.0900us cudaStreamCreateWithPriority
0.00% 9.3490us 1 9.3490us 9.3490us 9.3490us cudaEventRecord
0.00% 8.5710us 1 8.5710us 8.5710us 8.5710us cudaDeviceGetStreamPriorityRange
0.00% 8.2850us 1 8.2850us 8.2850us 8.2850us cudaHostGetDevicePointer
0.00% 6.2070us 4 1.5510us 290ns 3.8060us cuDeviceGet
0.00% 4.6410us 3 1.5470us 1.0010us 2.3840us cudaGetDeviceCount
0.00% 4.2080us 5 841ns 272ns 2.6130us cuDeviceGetCount
0.00% 3.5120us 2 1.7560us 1.6590us 1.8530us cuInit
0.00% 3.4230us 2 1.7110us 1.6280us 1.7950us cuDriverGetVersion
0.00% 2.9440us 2 1.4720us 1.2830us 1.6610us cuDevicePrimaryCtxRelease
0.00% 2.4490us 2 1.2240us 787ns 1.6620us cudaDriverGetVersion
0.00% 604ns 2 302ns 192ns 412ns cudaPeekAtLastError
0.00% 252ns 1 252ns 252ns 252ns cudaRuntimeGetVersion