Restrict in-place execution

Open gevtushenko opened this issue 3 years ago • 0 comments

There's a blind spot in the Thrust/CUB in-place execution guarantees that I believe should be addressed. Thrust/CUB allow iterators to point to the same memory meanwhile there's no restriction on these iterators value types.

For instance, it's legal to reinterpret iterators as pointing to char. The following code illustrates a useful scenario of converting masks into offsets. It's not at all clear from the documentation why this is not allowed:

  const bool alias = true;
  const int nc = sizeof(std::size_t) * 512 * 1024;
  const int ns = nc / sizeof(std::size_t); 

  char *d_in{};
  cudaMalloc(&d_in, nc);
  cudaMemset(d_in, 1, nc);

  std::size_t *d_out{};

  if (alias)
  {
    d_out = reinterpret_cast<std::size_t*>(d_in);
  }
  else
  {
    cudaMalloc(&d_out, nc);
    cudaMemset(d_out, 1, nc);
  }

  cub::Sum op{};
  std::size_t init{};

  std::size_t temp_storage_bytes{}; 
  std::uint8_t *d_temp_storage{};

  cub::DeviceScan::ExclusiveScan(
      d_temp_storage, temp_storage_bytes, 
      d_in, d_out, op, init, ns);

  cudaMalloc(&d_temp_storage, temp_storage_bytes); 

  cub::DeviceScan::ExclusiveScan(
      d_temp_storage, temp_storage_bytes, 
      d_in, d_out, op, init, ns);

  std::vector<std::size_t> h_out(ns);
  cudaMemcpy(h_out.data(), d_out, nc, cudaMemcpyDeviceToHost);

  for (int i = 0; i < ns; i++)
  {
    if (h_out[i] != i)
    {
      std::cout << "case 1 out[" << i << "] = " << h_out[i] << " != " << i << std::endl;
      break;
    }
  }

The code above introduces a data race leading to incorrect result in the case of aliasing. The more casual example would be usage of unions:

  union U
  {
    char in[sizeof(std::size_t) * 512 * 1024]; 
    std::size_t out[sizeof(in) / sizeof(std::size_t)]; 
  };

  const std::size_t nc = sizeof(U::in);
  const std::size_t ns = sizeof(U::out) / sizeof(std::size_t);

  U *d_u{};
  cudaMalloc(&d_u, sizeof(U));

  char *d_in = d_u->in;
  std::size_t *d_out = d_u->out;

  cudaMemset(d_in, 1, nc);

  cub::Sum op{};
  std::size_t init{};

  std::size_t temp_storage_bytes{}; 
  std::uint8_t *d_temp_storage{};

  cub::DeviceScan::ExclusiveScan(
      d_temp_storage, temp_storage_bytes, 
      d_in, d_out, op, init, ns);

  cudaMalloc(&d_temp_storage, temp_storage_bytes); 

  cub::DeviceScan::ExclusiveScan(
      d_temp_storage, temp_storage_bytes, 
      d_in, d_out, op, init, ns);

  std::vector<std::size_t> h_out(ns);
  cudaMemcpy(h_out.data(), d_out, nc, cudaMemcpyDeviceToHost);

  for (int i = 0; i < ns; i++)
  {
    if (h_out[i] != i)
    {
      std::cout << "case 2 out[" << i << "] = " << h_out[i] << " != " << i << std::endl;
      break;
    }
  }

  cudaFree(d_temp_storage);
  cudaFree(d_u);

This also leads to the previous issues. Thrust inherits the very same issues.

Since the change of documentation might be considered as a breaking change, I'd like to get this addressed in 2.0.

Jun 01 '22 09:06 gevtushenko