cub icon indicating copy to clipboard operation
cub copied to clipboard

Error in cub::DeviceHistogram::HistogramEven

Open csukuangfj opened this issue 3 years ago • 3 comments

The following code fails when invoking cub::DeviceHistogram::HistogramEven.

NOTE: It fails ONLY for some values of n and dim in the code below.

@danpovey

(py38) fangjun:~/open-source/k2/build_debug$ ./bin/cu_cub_test
Invoking DeviceHistogramInitKernel<<<38789, 256, 0, 0>>>()
Invoking histogram_sweep_kernel<<<{240, 1, 1}, 384, 0, 0>>>(), 16 pixels per thread, 3 SM occupancy
cu_cub_test: /root/fangjun/open-source/k2/k2/csrc/cub_test.cu:43: int main(): Assertion `ret == cudaSuccess' failed.
Aborted
#include <assert.h>

#include <vector>

#include "cub/cub.cuh"

int main() {
#if 1
  // this causes assertion error
  int32_t dim = 9939705;  // array dim
  int32_t n = 9929898;    // max value
#else
  // this is OK
  int32_t dim = 100;
  int32_t n = 1000;
#endif

  std::vector<int32_t> v(dim, 0);
  int32_t *src;
  cudaError_t ret = cudaMalloc(&src, dim * sizeof(int32_t));
  assert(ret == cudaSuccess);

  ret =
      cudaMemcpy(src, v.data(), dim * sizeof(int32_t), cudaMemcpyHostToDevice);
  assert(ret == cudaSuccess);

  int32_t *dst;
  ret = cudaMalloc(&dst, n * sizeof(int32_t));
  assert(ret == cudaSuccess);

  std::size_t temp_storage_bytes = 0;
  ret = cub::DeviceHistogram::HistogramEven(
      nullptr, temp_storage_bytes, src, dst, n + 1, 0, n, dim, nullptr, true);
  assert(ret == cudaSuccess);

  int8_t *temp_storage;
  ret = cudaMalloc(&temp_storage, temp_storage_bytes);
  assert(ret == cudaSuccess);

  ret =
      cub::DeviceHistogram::HistogramEven(temp_storage, temp_storage_bytes, src,
                                          dst, n + 1, 0, n, dim, nullptr, true);
  assert(ret == cudaSuccess);     // <-------------------------  this is line 43, it fails here

  ret = cudaFree(temp_storage);
  assert(ret == cudaSuccess);

  ret = cudaFree(dst);
  assert(ret == cudaSuccess);

  ret = cudaFree(src);
  assert(ret == cudaSuccess);

  return 0;
}

csukuangfj avatar Apr 23 '21 11:04 csukuangfj

The above program output from a test case in our framework will give you an idea for some dimensions that were OK, and one that was not OK, in case it might be useful to guess what is going wrong.

[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 2302316, src_dim = 6360309, temp_storage_byte = 2210223871
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 5347040, src_dim = 5033009, temp_storage_byte = 5133158911
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 4370517, src_dim = 8294045, temp_storage_byte = 4195696895
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8657720, src_dim = 6608153, temp_storage_byte = 8311411711
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 6921405, src_dim = 1456215, temp_storage_byte = 6589178111
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 725544, src_dim = 548632, temp_storage_byte = 261196543
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 4713341, src_dim = 8486926, temp_storage_byte = 4524807935
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8678084, src_dim = 6684235, temp_storage_byte = 8330961151
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 9302809, src_dim = 69234, temp_storage_byte = 446535423
[I] /ceph-dan/k2/k2/csrc/array_ops.cu:307:k2::Array1<int> k2::GetCounts(k2::ContextPtr, const int32_t*, int32_t, int32_t) n = 8992522, src_dim = 3114920, temp_storage_byte = 8632821759
[F] /ceph-dan/k2/k2/csrc/pytorch_context.cu:190:virtual void k2::PytorchCudaContext::CopyDataTo(size_t, const void*, k2::ContextPtr, void*) Check failed: ret == cudaSuccess (700 vs. 0)  Error: an illegal memory access was encountered.

danpovey avatar Apr 23 '21 11:04 danpovey

As far as I can tell, it happens when temp_storage_bytes exceeds 2 to the power 33.

danpovey avatar Apr 23 '21 11:04 danpovey

Thanks for the report. I can reproduce the error with this testcase on main, and cuda-memcheck found some out-of-bounds writes:

========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
=========     at 0x00000240 in void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>)
=========     by thread (383,0,0) in block (239,0,0)
=========     Address 0x7fb12fd370d4 is out of bounds
=========     Device Frame:void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>) (void cub::DeviceHistogramSweepKernel<cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::Policy500, int=0, int=1, int=1, int*, int, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::ScaleTransform, cub::DispatchHistogram<int=1, int=1, int*, int, int, int>::PassThruTransform, int>(int, cub::ArrayWrapper<int, int>, cub::ArrayWrapper, int, int*, int, int, int=1, int, int, int, cub::GridQueue<int>) : 0x240)

This is likely an instance of #212 in the wild; the last int in cub::DispatchHistogram<int=1, int=1, int*, int, int, int> indicates that a 32-bit integer is being used for indexing into the dataset.

Fixing #212 is high on my priority list, but I likely won't be able to get to it for at least a couple of months. That issue has some suggestions for possible workarounds in the meantime.

alliepiper avatar Apr 23 '21 16:04 alliepiper

I am closing this issue as a duplicate of #212 to keep the backlog tidy Thanks a lot for the report

miscco avatar Feb 23 '23 16:02 miscco