Error in cub::DeviceHistogram::HistogramEven
The following code fails when invoking cub::DeviceHistogram::HistogramEven.

NOTE: It fails ONLY for some values of n and dim in the code below.
@danpovey
(py38) fangjun:~/open-source/k2/build_debug$ ./bin/cu_cub_test
Invoking DeviceHistogramInitKernel<<<38789, 256, 0, 0>>>()
Invoking histogram_sweep_kernel<<<{240, 1, 1}, 384, 0, 0>>>(), 16 pixels per thread, 3 SM occupancy
cu_cub_test: /root/fangjun/open-source/k2/k2/csrc/cub_test.cu:43: int main(): Assertion `ret == cudaSuccess' failed.
Aborted
#include <assert.h>

#include <vector>

#include "cub/cub.cuh"

int main() {
#if 1
  // these values trigger the assertion failure
  int32_t dim = 9939705;  // array dim (number of samples)
  int32_t n = 9929898;    // max value (number of histogram bins)
#else
  // these values are OK
  int32_t dim = 100;
  int32_t n = 1000;
#endif
  // copy a zero-filled array of dim samples to the device
  std::vector<int32_t> v(dim, 0);
  int32_t *src;
  cudaError_t ret = cudaMalloc(&src, dim * sizeof(int32_t));
  assert(ret == cudaSuccess);
  ret =
      cudaMemcpy(src, v.data(), dim * sizeof(int32_t), cudaMemcpyHostToDevice);
  assert(ret == cudaSuccess);

  // output histogram: n bins covering the range [0, n)
  int32_t *dst;
  ret = cudaMalloc(&dst, n * sizeof(int32_t));
  assert(ret == cudaSuccess);

  // first call: query the required temporary storage size
  std::size_t temp_storage_bytes = 0;
  ret = cub::DeviceHistogram::HistogramEven(
      nullptr, temp_storage_bytes, src, dst, n + 1, 0, n, dim, nullptr, true);
  assert(ret == cudaSuccess);

  int8_t *temp_storage;
  ret = cudaMalloc(&temp_storage, temp_storage_bytes);
  assert(ret == cudaSuccess);

  // second call: compute the histogram; the trailing `true` enables
  // debug_synchronous, which prints the "Invoking ..." lines shown above
  ret =
      cub::DeviceHistogram::HistogramEven(temp_storage, temp_storage_bytes, src,
                                          dst, n + 1, 0, n, dim, nullptr, true);
  assert(ret == cudaSuccess);  // <---- this is line 43 of cub_test.cu, it fails here

  ret = cudaFree(temp_storage);
  assert(ret == cudaSuccess);
  ret = cudaFree(dst);
  assert(ret == cudaSuccess);
  ret = cudaFree(src);
  assert(ret == cudaSuccess);
  return 0;
}
The output below, from a test case in our framework, will give you an idea of some dimensions that were OK, and one that was not, in case it might help to guess what is going wrong.
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 2302316, src_dim = 6360309, temp_storage_byte = 2210223871
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 5347040, src_dim = 5033009, temp_storage_byte = 5133158911
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 4370517, src_dim = 8294045, temp_storage_byte = 4195696895
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8657720, src_dim = 6608153, temp_storage_byte = 8311411711
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 6921405, src_dim = 1456215, temp_storage_byte = 6589178111
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 725544, src_dim = 548632, temp_storage_byte = 261196543
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 4713341, src_dim = 8486926, temp_storage_byte = 4524807935
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8678084, src_dim = 6684235, temp_storage_byte = 8330961151
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 9302809, src_dim = 69234, temp_storage_byte = 446535423
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8992522, src_dim = 3114920, temp_storage_byte = 8632821759
[F] /ceph-dan/k2/k2/csrc/pytorch_context.cu:190:virtual void k2::PytorchCudaContext::CopyDataTo(size_t, const void*, k2::ContextPtr, void*) Check failed: ret == cudaSuccess (700 vs. 0) Error: an illegal memory access was encountered.
As far as I can tell, it happens when temp_storage_bytes exceeds 2^33 (8,589,934,592) bytes.
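To sanity-check that observation, here is a quick standalone check; the temp_storage_byte values are copied from the log above, and the 2^33 threshold is our empirical guess, not a documented cub limit:

#include <cstdint>
#include <cstdio>

int main() {
  // temp_storage_byte values from the log above, in order
  const uint64_t bytes[] = {2210223871ULL, 5133158911ULL, 4195696895ULL,
                            8311411711ULL, 6589178111ULL, 261196543ULL,
                            4524807935ULL, 8330961151ULL, 446535423ULL,
                            8632821759ULL};
  const uint64_t threshold = 1ULL << 33;  // 8589934592
  for (uint64_t b : bytes) {
    std::printf("%20llu %s 2^33\n", static_cast<unsigned long long>(b),
                b > threshold ? ">" : "<=");
  }
  return 0;
}

Only the final value, from the call that crashed, exceeds the threshold.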
Thanks for the report. I can reproduce the error with this testcase on main, and cuda-memcheck found some out-of-bounds writes:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
========= at 0x00000240 in void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>)
========= by thread (383,0,0) in block (239,0,0)
========= Address 0x7fb12fd370d4 is out of bounds
========= Device Frame:void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>) (void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>) : 0x240)
This is likely an instance of #212 in the wild; the last int in cub::DispatchHistogram<int=1, int=1, int*, int, int, int> indicates that a 32-bit integer is being used for indexing into the dataset.
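For intuition, here is a rough illustration of that failure mode. The exact indexing expression inside cub is an assumption on our part, but the temp storage appears to hold one privatized copy of the histogram per sweep block (e.g. 8992522 bins x 240 blocks x 4 bytes is approximately the 8632821759 bytes logged above), and a signed 32-bit index into those counters overflows at 2^31 elements, i.e. 2^33 bytes for 4-byte counters, which matches the observed threshold:

#include <cstdint>
#include <cstdio>

int main() {
  // numbers taken from the repro and from the
  // histogram_sweep_kernel<<<{240, 1, 1}, ...>>> launch line above
  int num_bins = 9929898;  // n bins
  int num_blocks = 240;    // privatized histogram copies (sweep grid size)

  int bad = num_blocks * num_bins;                // 32-bit product: wraps around
  int64_t good = int64_t(num_blocks) * num_bins;  // 64-bit product: correct

  std::printf("32-bit counter index: %d\n", bad);  // negative: overflowed
  std::printf("64-bit counter index: %lld\n", static_cast<long long>(good));
  std::printf("temp storage implied: %lld bytes\n",
              static_cast<long long>(good * int64_t(sizeof(int32_t))));
  return 0;
}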
Fixing #212 is high on my priority list, but I likely won't be able to get to it for at least a couple of months. That issue has some suggestions for possible workarounds in the meantime.
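In case it helps others who hit this before #212 is fixed: below is a minimal workaround sketch of our own (ChunkedHistogramEven is a hypothetical helper, and this is not necessarily one of the workarounds suggested in #212). It splits the bin range into chunks so that each HistogramEven call needs far fewer privatized counters and stays below the 32-bit indexing limit; it relies on cub skipping samples that fall outside [lower_level, upper_level), which matches cub's behavior as far as we can tell, so each pass only counts its own bins.

#include <algorithm>
#include <cstdint>

#include "cub/cub.cuh"

// Hypothetical helper: histogram over [0, num_bins) computed in chunks of
// bins so each cub call needs far less temporary storage.
cudaError_t ChunkedHistogramEven(const int32_t *d_samples, int32_t num_samples,
                                 int32_t *d_histogram, int32_t num_bins,
                                 int32_t bins_per_pass = 1 << 20) {
  for (int32_t lo = 0; lo < num_bins; lo += bins_per_pass) {
    int32_t hi = std::min(lo + bins_per_pass, num_bins);
    int32_t num_levels = hi - lo + 1;  // bins in this pass, plus one

    // query pass: how much temp storage does this chunk need?
    std::size_t temp_bytes = 0;
    cudaError_t ret = cub::DeviceHistogram::HistogramEven(
        nullptr, temp_bytes, d_samples, d_histogram + lo, num_levels, lo, hi,
        num_samples);
    if (ret != cudaSuccess) return ret;

    void *d_temp = nullptr;
    ret = cudaMalloc(&d_temp, temp_bytes);
    if (ret != cudaSuccess) return ret;

    // compute pass: fills d_histogram[lo, hi) only
    ret = cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_samples,
                                              d_histogram + lo, num_levels, lo,
                                              hi, num_samples);
    cudaFree(d_temp);
    if (ret != cudaSuccess) return ret;
  }
  return cudaSuccess;
}

The trade-off is num_bins / bins_per_pass kernel launches over the full input instead of one; bins_per_pass can be tuned per GPU.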
I am closing this issue as a duplicate of #212 to keep the backlog tidy. Thanks a lot for the report!