knowhere
knowhere copied to clipboard
Recall of gpu_ivf_pq is lower than ivf_pq with dataset glove-200
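Comparing the logs below, there are actually two issues:
- GPU and CPU recall are not the same: gpu_ivf_pq recall is consistently lower than ivf_pq recall (e.g. 0.161 vs 0.176 in the final entries of the recall lists below)
- GPU is slower than CPU when nprobe=512 (compare the final entries of the latency lists below: 14.6296 vs 8.9149)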
nlist=1024
m=8
nbits=8
gpu_ivf_pq search on glove recalls: [0.1168, 0.1362, 0.1479, 0.1543, 0.1578, 0.1598, 0.1607, 0.161, 0.161, 0.161]
gpu_ivf_pq search on glove latency: [0.0278, 0.0368, 0.0539, 0.093, 0.1608, 0.9298, 1.9743, 3.7446, 7.3059, 14.6296]
ivf_pq search on glove recalls: [0.1357, 0.1549, 0.1657, 0.1709, 0.1736, 0.1752, 0.1758, 0.176, 0.176, 0.176]
ivf_pq search on glove latency: [0.2152, 0.1308, 0.2232, 0.3057, 0.4902, 0.6978, 1.3009, 2.3552, 4.5747, 8.9149]
I can reproduce @yanliang567's issue:
CPU run
[58.885 s] glove-200-angular | IVF_PQ | nlist=1024
================================================================================
nprobe = 1, nq = 10000, k = 100, elapse = 0.152s, R@ = 0.1351
nprobe = 2, nq = 10000, k = 100, elapse = 0.156s, R@ = 0.1545
nprobe = 4, nq = 10000, k = 100, elapse = 0.258s, R@ = 0.1654
nprobe = 8, nq = 10000, k = 100, elapse = 0.239s, R@ = 0.1707
nprobe = 16, nq = 10000, k = 100, elapse = 0.338s, R@ = 0.1733
nprobe = 32, nq = 10000, k = 100, elapse = 0.576s, R@ = 0.1750
nprobe = 64, nq = 10000, k = 100, elapse = 0.895s, R@ = 0.1757
nprobe = 128, nq = 10000, k = 100, elapse = 1.545s, R@ = 0.1758
nprobe = 256, nq = 10000, k = 100, elapse = 3.048s, R@ = 0.1758
nprobe = 512, nq = 10000, k = 100, elapse = 6.047s, R@ = 0.1758
================================================================================
[72.803 s] Test 'glove-200-angular/IVF_PQ' done
GPU run
[4.379 s] glove-200-angular | IVF_PQ | nlist=1024
================================================================================
nprobe = 1, nq = 10000, k = 100, elapse = 0.029s, R@ = 0.1170
nprobe = 2, nq = 10000, k = 100, elapse = 0.045s, R@ = 0.1367
nprobe = 4, nq = 10000, k = 100, elapse = 0.083s, R@ = 0.1483
nprobe = 8, nq = 10000, k = 100, elapse = 0.134s, R@ = 0.1548
nprobe = 16, nq = 10000, k = 100, elapse = 0.246s, R@ = 0.1582
nprobe = 32, nq = 10000, k = 100, elapse = 1.246s, R@ = 0.1601
nprobe = 64, nq = 10000, k = 100, elapse = 2.438s, R@ = 0.1610
nprobe = 128, nq = 10000, k = 100, elapse = 4.862s, R@ = 0.1613
nprobe = 256, nq = 10000, k = 100, elapse = 9.730s, R@ = 0.1613
nprobe = 512, nq = 10000, k = 100, elapse = 19.627s, R@ = 0.1613
================================================================================
[43.516 s] Test 'glove-200-angular/IVF_PQ' done
==34089== NVPROF is profiling process 34089, command: ./test
==34089== Profiling application: ./test
==34089== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 25.56% 2.32929s 1250 1.8634ms 1.5279ms 2.3547ms void faiss::gpu::pqCodeDistances<float, float, int=32, bool=1>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, int, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, int, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=4, bool=1, int, faiss::gpu::traits>)
17.12% 1.56048s 29608 52.704us 26.144us 81.952us void faiss::gpu::l2SelectMin1<float, int=8, int=256>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMin1<float, int=8, int=256, int, int=2, bool=1, int, faiss::gpu::traits>)
17.10% 1.55805s 29628 52.586us 11.360us 182.88us volta_sgemm_128x32_tn
13.76% 1.25412s 1250 1.0033ms 722.82us 1.5675ms void faiss::gpu::pass1SelectLists<int=128, int=128, int=3, bool=0>(void**, faiss::gpu::Tensor<int, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::traits::DefaultPtrTraits, void**, void**, int, int, faiss::gpu::IndicesOptions, void**, void**)
12.84% 1.17041s 1250 936.33us 890.98us 1.0710ms void faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, int, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=4, bool=1, int, faiss::gpu::traits>, void**, int*, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=1, bool=1, int, faiss::gpu::traits>)
7.00% 637.67ms 197892 3.2220us 543ns 11.942ms [CUDA memcpy HtoD]
1.64% 149.75ms 20380 7.3470us 2.4630us 42.080us void faiss::gpu::sumAlongRows<float, bool=1>(faiss::gpu::Tensor<float, int=1, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::sumAlongRows<float, bool=1, faiss::gpu::Tensor, int=2, bool=1, int, faiss::gpu::traits>)
1.49% 135.51ms 65573 2.0660us 1.6310us 531.84us void faiss::gpu::calcResidual<float, bool=0>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::calcResidual<float, bool=0, float, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::calcResidual<float, bool=0, int, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits)
1.14% 103.45ms 65536 1.5780us 1.4390us 13.120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<long const , int, thrust::cuda_cub::__transform::no_stencil_tag, faiss::gpu::Convert<long, int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<long const , int, thrust::cuda_cub::__transform::no_stencil_tag, faiss::gpu::Convert<long, int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.72% 66.048ms 66004 1.0000us 671ns 170.37us [CUDA memcpy DtoH]
0.62% 56.355ms 1074 52.471us 7.8080us 798.08us void faiss::gpu::l2NormRowMajor<float, float4, int, int=8, bool=0, bool=1>(faiss::gpu::Tensor<float4, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2NormRowMajor<float, float4, int, int=8, bool=0, bool=1, float, int=1, bool=1, float4, faiss::gpu::traits>)
0.26% 23.392ms 1250 18.713us 15.520us 25.312us void faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0, int, int=2, bool=1, int, faiss::gpu::traits>, void**, faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, int, faiss::gpu::IndicesOptions, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0, long, int=2, bool=1, int, faiss::gpu::traits>)
0.24% 21.808ms 37 589.40us 72.544us 621.60us void faiss::gpu::transposeOuter<float, int>(float const , faiss::gpu::transposeOuter<float, int>, int, faiss::gpu::transposeOuter<float, int>, faiss::gpu::transposeOuter<float, int>)
0.15% 13.569ms 16253 834ns 543ns 12.768us [CUDA memset]
0.12% 11.226ms 6396 1.7550us 1.5670us 13.184us [CUDA memcpy DtoD]
0.09% 8.0448ms 20 402.24us 228.64us 447.23us void faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, int, int=2, bool=1, int, faiss::gpu::traits>, int, faiss::gpu::Tensor)
0.05% 4.3242ms 1250 3.4590us 2.8480us 14.720us void cub::DeviceScanKernel<cub::DeviceScanPolicy
==34437== NVPROF is profiling process 34437, command: ./test
==34437== Profiling application: ./test
==34437== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 62.46% 13.6743s 67254 203.32us 672ns 11.811ms [CUDA memcpy DtoH]
7.19% 1.57450s 29608 53.178us 26.049us 75.840us void faiss::gpu::l2SelectMin1<float, int=8, int=256>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMin1<float, int=8, int=256, int, int=2, bool=1, int, faiss::gpu::traits>)
5.04% 1.10281s 1250 882.25us 874.05us 889.00us void faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, int, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=4, bool=1, int, faiss::gpu::traits>, void**, int*, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=1, bool=1, int, faiss::gpu::traits>)
4.38% 958.22ms 1250 766.58us 698.53us 858.66us void faiss::gpu::pass1SelectLists<int=128, int=128, int=3, bool=0>(void**, faiss::gpu::Tensor<int, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::traits::DefaultPtrTraits, void**, void**, int, int, faiss::gpu::IndicesOptions, void**, void**)
4.11% 899.52ms 7580 118.67us 81.505us 155.17us volta_sgemm_128x32_tn
2.70% 591.54ms 1250 473.23us 466.53us 485.25us void faiss::gpu::sumAlongRows<float, bool=0>(faiss::gpu::Tensor<float, int=1, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::sumAlongRows<float, bool=0, faiss::gpu::Tensor, int=2, bool=1, int, faiss::gpu::traits>)
2.59% 566.21ms 1287 439.94us 70.112us 686.66us void faiss::gpu::transposeOuter<float, int>(float const , faiss::gpu::transposeOuter<float, int>, int, faiss::gpu::transposeOuter<float, int>, faiss::gpu::transposeOuter<float, int>)
2.49% 544.30ms 197882 2.7500us 543ns 19.054ms [CUDA memcpy HtoD]
2.46% 538.48ms 1250 430.78us 427.20us 438.02us void faiss::gpu::sumAlongColumns<float4, int=16, int=4, int=4>(faiss::gpu::Tensor<float4, int=1, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::sumAlongColumns<float4, int=16, int=4, int=4, faiss::gpu::Tensor, int=2, bool=1, int, faiss::gpu::traits>)
2.22% 486.56ms 22048 22.068us 11.008us 43.008us volta_sgemm_32x128_tn
1.51% 331.15ms 1250 264.92us 256.10us 275.49us volta_sgemm_32x128_nn
0.68% 148.95ms 20380 7.3080us 2.4000us 50.048us void faiss::gpu::sumAlongRows<float, bool=1>(faiss::gpu::Tensor<float, int=1, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::sumAlongRows<float, bool=1, faiss::gpu::Tensor, int=2, bool=1, int, faiss::gpu::traits>)
0.60% 130.99ms 65573 1.9970us 1.6310us 485.06us void faiss::gpu::calcResidual<float, bool=0>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::calcResidual<float, bool=0, float, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::calcResidual<float, bool=0, int, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits)
0.47% 102.42ms 65536 1.5620us 1.4390us 13.088us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<long const , int, thrust::cuda_cub::__transform::no_stencil_tag, faiss::gpu::Convert<long, int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<long const , int, thrust::cuda_cub::__transform::no_stencil_tag, faiss::gpu::Convert<long, int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.42% 91.212ms 3500 26.060us 7.2320us 76.897us void faiss::gpu::l2NormRowMajor<float, float, int, int=8, bool=0, bool=1>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2NormRowMajor<float, float, int, int=8, bool=0, bool=1, float, int=1, bool=1, float, faiss::gpu::traits>)
0.22% 47.383ms 1250 37.906us 36.320us 46.976us void faiss::gpu::pqResidualVector<float, bool=1>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pqResidualVector<float, bool=1, float, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqResidualVector<float, bool=1, int, int=2, bool=1, int, faiss::gpu::traits>, int, faiss::gpu::pqResidualVector<float, bool=1, float, int=4, bool=1, int, faiss::gpu::traits>)
0.10% 22.816ms 1250 18.253us 17.408us 28.064us void faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0, int, int=2, bool=1, int, faiss::gpu::traits>, void**, faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, int, faiss::gpu::IndicesOptions, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::pass2SelectLists<int=128, int=128, int=3, bool=0, long, int=2, bool=1, int, faiss::gpu::traits>)
0.09% 18.748ms 64 292.95us 10.784us 1.2343ms void faiss::gpu::l2NormRowMajor<float, float4, int, int=8, bool=0, bool=1>(faiss::gpu::Tensor<float4, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2NormRowMajor<float, float4, int, int=8, bool=0, bool=1, float, int=1, bool=1, float4, faiss::gpu::traits>)
0.07% 14.318ms 16253 880ns 543ns 9.8240us [CUDA memset]
0.06% 12.170ms 6396 1.9020us 1.5680us 12.992us [CUDA memcpy DtoD]
0.05% 10.243ms 1252 8.1800us 6.4320us 17.952us void faiss::gpu::transposeAny<float, unsigned int, int=3, int=-1>(faiss::gpu::TensorInfo<float, unsigned int>, unsigned int, float)
0.04% 8.3235ms 20 416.17us 212.55us 470.72us void faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMinK<float, int=512, int=8, int=128, int, int=2, bool=1, int, faiss::gpu::traits>, int, faiss::gpu::Tensor)
0.03% 6.2340ms 1250 4.9870us 2.5600us 12.384us void cub::DeviceScanKernel<cub::DeviceScanPolicy
Update the baseline first.