raft
raft copied to clipboard
CAGRA graph pruning: fix 32/64-bit integer arithmetic
Fix 32/64-bit conversions in indexing within the CAGRA optimize routine, which may have caused occasional illegal-memory-access errors.
The error pops up very rarely and is hard to reproduce, so this fix is not guaranteed to resolve it. Below is the symptom (frame 3 indicates the error is caught during the first sync after the pruning message is printed):
...
[D] [15:42:39.832147] cpp/include/raft/neighbors/detail/cagra/graph_core.cuh:392 # Pruning kNN Graph on GPUs
...
CUDA error encountered at: file=cpp/include/raft/core/interruptible.hpp line=301: call='query_result', Reason=cudaErrorIllegalAddress:an illegal memory access was encountered
Obtained 19 stack frames
#1 in cpp/build/libraft_cagra_ann_bench.so: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) +0x5e [0x7eff508cf9de]
#2 in cpp/build/libraft_cagra_ann_bench.so: void raft::interruptible::synchronize_impl<cudaError (*)(CUstream_st*), rmm::cuda_stream_view>(cudaError (*)(CUstream_st*), rmm::cuda_stream_view) +0x1be [0x7eff508d733e]
#3 in cpp/build/libraft_cagra_ann_bench.so: void raft::neighbors::cagra::detail::graph::optimize<unsigned int, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >(raft::resources const&, std::experimental::mdspan<unsigned int, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >, std::experimental::mdspan<unsigned int, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >) +0x9eb [0x7eff50913f6b]
#4 in cpp/build/libraft_cagra_ann_bench.so: void raft::neighbors::cagra::detail::optimize<unsigned int, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >(raft::resources const&, std::experimental::mdspan<unsigned int, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >, std::experimental::mdspan<unsigned int, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<unsigned int>, (raft::memory_type)0> >) +0x7a [0x7eff5097cc8a]
#5 in cpp/build/libraft_cagra_ann_bench.so: raft::neighbors::cagra::index<float, unsigned int> raft::neighbors::cagra::detail::build<float, unsigned int, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)0> >(raft::resources const&, raft::neighbors::cagra::index_params const&, std::experimental::mdspan<float const, std::experimental::extents<long, 18446744073709551615ul, 18446744073709551615ul>, std::experimental::layout_right, raft::host_device_accessor<std::experimental::default_accessor<float const>, (raft::memory_type)0> >, std::optional<raft::neighbors::experimental::nn_descent::index_params>, std::optional<float>, std::optional<raft::neighbors::ivf_pq::index_params>, std::optional<raft::neighbors::ivf_pq::search_params>, bool) +0x14a4 [0x7eff509b4094]
#6 in cpp/build/libraft_cagra_ann_bench.so: raft::bench::ann::RaftCagra<float, unsigned int>::build(float const*, unsigned long) +0xd0 [0x7eff509b4fc0]
#7 in ./cpp/build/ANN_BENCH: void raft::bench::ann::bench_build<float>(benchmark::State&, std::shared_ptr<raft::bench::ann::Dataset<float> const>, raft::bench::ann::Configuration::Index, bool) +0x435 [0x55c3cfbe32c5]
#8 in ./cpp/build/ANN_BENCH: benchmark::internal::LambdaBenchmark<benchmark::RegisterBenchmark<void (&)(benchmark::State&, std::shared_ptr<raft::bench::ann::Dataset<float> const>, raft::bench::ann::Configuration::Index, bool), std::shared_ptr<raft::bench::ann::Dataset<float> const>&, raft::bench::ann::Configuration::Index&, bool&>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, void (&)(benchmark::State&, std::shared_ptr<raft::bench::ann::Dataset<float> const>, raft::bench::ann::Configuration::Index, bool), std::shared_ptr<raft::bench::ann::Dataset<float> const>&, raft::bench::ann::Configuration::Index&, bool&)::{lambda(benchmark::State&)#1}>::Run(benchmark::State&) +0x81 [0x55c3cfb9c501]
#9 in ./cpp/build/ANN_BENCH: benchmark::internal::BenchmarkInstance::Run(long, int, benchmark::internal::ThreadTimer*, benchmark::internal::ThreadManager*, benchmark::internal::PerfCountersMeasurement*) const +0x128 [0x55c3cfc0b578]
#10 in ./cpp/build/ANN_BENCH(+0x1391a7) [0x55c3cfbf11a7]
#11 in ./cpp/build/ANN_BENCH: benchmark::internal::BenchmarkRunner::DoNIterations() +0x376 [0x55c3cfbf2986]
#12 in ./cpp/build/ANN_BENCH: benchmark::internal::BenchmarkRunner::DoOneRepetition() +0x117 [0x55c3cfbf3647]
#13 in ./cpp/build/ANN_BENCH(+0x12e9cf) [0x55c3cfbe69cf]
#14 in ./cpp/build/ANN_BENCH: benchmark::RunSpecifiedBenchmarks(benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) +0x551 [0x55c3cfbe7b01]
#15 in ./cpp/build/ANN_BENCH: benchmark::RunSpecifiedBenchmarks() +0x3e [0x55c3cfbe7c7e]
#16 in ./cpp/build/ANN_BENCH: raft::bench::ann::run_main(int, char**) +0xeb6 [0x55c3cfbd9d26]
#17 in /usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7eff54d33d90]
#18 in /usr/lib/x86_64-linux-gnu/libc.so.6: __libc_start_main +0x80 [0x7eff54d33e40]
#19 in ./cpp/build/ANN_BENCH(+0xc456f) [0x55c3cfb7c56f]
https://github.com/rapidsai/raft/pull/2197/files#diff-7cbd141c76b6be244a7e4771ecbbb06c88f22b8c5bc80f8f49b516fcfc70bea0L387-L389 This one looks suspicious: it mixes uint32_t and int64_t in a few places.