BGHT
CUDA Arch Error
When I use it in my project, I get:
/usr/local/cuda/include/cuda/std/detail/__atomic:11:4: error: #error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
11 | # error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
| ^~~~~
My GPU is an RTX 3060 on CUDA 11.4, Ubuntu 18.04. Here is my CMakeLists.txt configuration:
include(cmake/CPM.cmake)
CPMAddPackage(
NAME bght
URL "https://github.com/owensgroup/BGHT/archive/refs/heads/main.zip"
OPTIONS
"build_tests OFF"
"build_benchmarks OFF"
)
set(CUDA_ARCHS 70)
cuda_add_library(mylib SHARED mylib.cu mylib.cuh)
target_link_libraries(mylib bght)
set_target_properties(mylib PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCHS})
Hi @HakubunLuo, thanks for reporting this issue. Could you please share the complete log you get from both the CMake configure and build commands? It looks like CUDA_ARCHS is not being applied properly. Could you also share the rest of your CMakeLists.txt file?
CMakeLists.txt
cmake_minimum_required(VERSION 3.25)
set(CMAKE_CUDA_ARCHITECTURES 80 86)
set(CMAKE_CUDA_STANDARD 17)
SET(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
project(demo_test CUDA CXX)
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)
include_directories(/usr/local/cuda/include)
include_directories(/usr/include)
aux_source_directory(src SOURCES)
include(cmake/CPM.cmake)
CPMAddPackage(
NAME bght
URL "https://github.com/owensgroup/BGHT/archive/refs/heads/main.zip"
OPTIONS
"build_tests OFF"
"build_benchmarks OFF"
)
set(CUDA_ARCHS 86)
cuda_add_library(demo SHARED demo.cu demo.cuh)
target_link_libraries(demo bght)
set_target_properties(demo PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCHS})
add_executable(demo_test main.cpp)
target_link_libraries(demo_test PRIVATE demo)
target_link_libraries(demo_test PRIVATE CUDA::cudart)
set_target_properties(demo_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
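As an aside, the `cuda_add_library` command comes from the legacy `FindCUDA` module, which is deprecated in favor of CMake's first-class CUDA language support. A minimal sketch of the modern approach (target and file names are placeholders, sm_86 corresponds to the RTX 3060's Ampere architecture):

```cmake
cmake_minimum_required(VERSION 3.25)
project(demo_test LANGUAGES CXX CUDA)   # enable CUDA as a first-class language

include(cmake/CPM.cmake)
CPMAddPackage(
  NAME bght
  URL "https://github.com/owensgroup/BGHT/archive/refs/heads/main.zip"
  OPTIONS "build_tests OFF" "build_benchmarks OFF"
)

add_library(demo SHARED demo.cu demo.cuh)   # plain add_library, not cuda_add_library
target_link_libraries(demo PRIVATE bght)
set_target_properties(demo PROPERTIES
  CUDA_ARCHITECTURES 86                     # sm_86 for an RTX 3060
  CUDA_STANDARD 17)
```

With this setup, `find_package(CUDA)` is not needed at all; `find_package(CUDAToolkit)` is still fine for linking `CUDA::cudart`.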
I just solved this by replacing cuda_add_library with add_library. Now, when I use it in my kernel function, there is another problem. I have a kernel function:
template<class HashMap>
__global__ void createLookupHashKernel(HashMap lookupTable, const int *coordinates, int N, int max_x, int max_y)
The size of the table is about 100,000 entries. It was very slow when I passed the HashMap directly, so I used HashMap *lookupTable and passed a pointer instead. However, that gives a memory error like this:
========= Invalid __global__ read of size 4 bytes
========= at 0x280 in /home/dell/CLionProjects/NewSpconvOp/cmake-build-debug/_deps/bght-src/include/detail/pair.cuh:79:bght::equal_to<int>::operator ()(const int &, const int &) const
========= by thread (132,0,0) in block (0,0,0)
========= Address 0x7fff22e34a68 is out of bounds
The operations in my kernel function are:
int key = ...;
int idx = ...;
using key_type = int;
using value_type = int;
using pair_type = bght::pair<key_type, value_type>;
auto block = cooperative_groups::this_thread_block();
auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
pair_type pair{key, idx};
lookupTable->insert(pair, tile);
The hash map was created by:
std::size_t capacity = num_size * 2;
auto invalid_key = std::numeric_limits<key_type>::max();
auto invalid_value = std::numeric_limits<value_type>::max();
bght::bcht<key_type, value_type> hash_lookupTable(capacity, invalid_key, invalid_value);
You should not pass the hash tables by reference or as pointers to kernels. You should pass them by value to kernels.
It looks like you are dealing with a pointer here?
lookupTable->insert(pair, tile);
Here is an example: https://github.com/owensgroup/BGHT/blob/140b80f859efe08c1f7bcf95b459bb56500703bd/test/test_types.cu#L176-L210
Also if every thread is trying to insert a key, you will need to serialize them within a tile. See how we do it here:
https://github.com/owensgroup/BGHT/blob/140b80f859efe08c1f7bcf95b459bb56500703bd/include/detail/kernels.cuh#L26-L64
Yes, there are lots of threads trying to insert into the hash map. I used that code as a reference to edit my kernel:
template<class HashMap>
__global__ void createLookupHashKernel(HashMap lookupTable, const int *coordinates, int N, int max_x, int max_y) {
    auto idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int x = coordinates[idx * 4 + 1];
        int y = coordinates[idx * 4 + 2];
        int z = coordinates[idx * 4 + 3];
        int key = getIndex(x, y, z, max_x, max_y);
        using key_type = int;
        using value_type = int;
        using pair_type = bght::pair<key_type, value_type>;
        auto block = cooperative_groups::this_thread_block();
        auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
        bool do_op = true;
        pair_type insertion_pair{key, (int) idx};
        bool success = true;
        // Do the insertion
        auto work_queue = tile.ballot(do_op);
        while (work_queue) {
            auto cur_rank = __ffs(work_queue) - 1;
            auto cur_pair = tile.shfl(insertion_pair, cur_rank);
            bool insertion_success = lookupTable.insert(cur_pair, tile);
            if (tile.thread_rank() == cur_rank) {
                do_op = false;
                success = insertion_success;
            }
            work_queue = tile.ballot(do_op);
        }
    }
}
I did not add

if (!tile.all(success)) {
    *map.d_build_success_ = false;
}

because I get `'d_build_success_' is a private member of 'bght::bcht<int, int>'`. This edited kernel function also looks like it deadlocks.
If N is not a multiple of the bucket size you will run into issues. The insert function expects bucket_size threads calling it.
This if statement is problematic:
if (idx < N) {
}
You can replace it with a couple of lines to address this issue:
auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
// tile
auto block = cooperative_groups::this_thread_block();
auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
if ((thread_id - tile.thread_rank()) >= N) {
    return;
}
bool do_op = false;
int key = 0; // declared outside the if so the insertion loop can use it
// load the input
if (thread_id < N) {
    int x = coordinates[thread_id * 4 + 1];
    int y = coordinates[thread_id * 4 + 2];
    int z = coordinates[thread_id * 4 + 3];
    key = getIndex(x, y, z, max_x, max_y);
    do_op = true;
}
// the insertion loop.
Correct, that variable is hidden. What you could do is add another boolean argument to your kernel and set it to false if any of the insertions fail. In general, if you have a non-skewed distribution then insertion should succeed; otherwise you may need to decrease the load factor. Let me know if the modifications here work for you.
It works for the insert process. So if I also have a kernel function that reads the hash map from multiple threads, I need to use the same operations?
Great! Yes, you will need to follow the same steps for finds. Here is an example: https://github.com/owensgroup/BGHT/blob/140b80f859efe08c1f7bcf95b459bb56500703bd/include/detail/kernels.cuh#L67-L109
I use this to find values in the table we created before:

template<class HashMap>
__global__ void createRulesTableByHashKernel(HashMap lookupTable, const int *coordinates,
                                             int N, int *rulesTable, int max_x, int max_y, int max_z, int kernel_size) {
    auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    int half_kernel = kernel_size / 2;
    auto block = cooperative_groups::this_thread_block();
    auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
    if ((thread_id - tile.thread_rank()) >= N) {
        return;
    }
    bool do_op = false;
    typename HashMap::key_type find_key;
    typename HashMap::mapped_type result;
    auto idx = thread_id;
    if (idx < N) {
        int x_start = ...
        int y_start = ...
        int z_start = ...
        int x_end = ...
        int y_end = ...
        int z_end = ...
        for (int x = x_start; x <= x_end; x++)
            for (int y = y_start; y <= y_end; y++)
                for (int z = z_start; z <= z_end; z++) {
                    int lookup_idx = getIndex(x, y, z, max_x, max_y);
                    int target = lookupTable.find(lookup_idx, tile);
                    ...
                }
    }
}

It works, but it looks like it does not find the correct data by key.
You need to follow the insertion/find code. Again, this if statement is problematic:
if (idx < N) {
}
and if the values of lookup_idx are different per thread in the tile, you will need to use the work-queue loop just like you did for inserts.
I am confused by InputIt first and InputIt last in the examples; I only need to do one search at a time. I noticed this in kernels.cuh:

template <typename InputIt, typename OutputIt, typename HashMap>
__global__ void find_kernel(InputIt first, InputIt last, OutputIt output_begin, HashMap map) {
    auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    auto count = last - first;
    if (thread_id < count) {
        auto find_key = first[thread_id];
        auto result = map.find(find_key);
        output_begin[thread_id] = result;
    }
}

However, I cannot just use one parameter (find_key) to search.
I understand the keys you are using are different. You need to follow the same strategy you followed for insertion, which is similar for find as well. The two things you need to make sure happen are (1) all threads in the tile call the find function, and (2) within a tile you do the finds serially. See my comments here:
__global__
void createRulesTableByHashKernel(HashMap lookupTable, const int *coordinates,
                                  int N, int *rulesTable, int max_x, int max_y, int max_z, int kernel_size) {
    auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    int half_kernel = kernel_size / 2;
    auto block = cooperative_groups::this_thread_block();
    auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
    if ((thread_id - tile.thread_rank()) >= N) { // this if statement makes sure (1) happens
        return;
    }
    bool do_op = false;
    typename HashMap::key_type find_key;
    typename HashMap::mapped_type result;
    auto idx = thread_id;
    //if (idx < N) { // violates (1)
    int x_start = ...
    int y_start = ...
    int z_start = ...
    int x_end = ...
    int y_end = ...
    int z_end = ...
    for (int x = x_start; x <= x_end; x++)
        for (int y = y_start; y <= y_end; y++)
            for (int z = z_start; z <= z_end; z++) {
                int lookup_idx = getIndex(x, y, z, max_x, max_y);
                // since we removed the if (idx < N) we may need to make sure the index is valid
                do_op = is_valid_index(lookup_idx);
                // is lookup_idx different per thread in the tile of size bucket_size?
                // if yes, you need to do the following so that (2) is satisfied:
                auto work_queue = tile.ballot(do_op);
                while (work_queue) {
                    auto cur_rank = __ffs(work_queue) - 1;
                    auto cur_key = tile.shfl(lookup_idx, cur_rank);
                    int target = lookupTable.find(cur_key, tile);
                    if (tile.thread_rank() == cur_rank) {
                        do_op = false;
                    }
                    work_queue = tile.ballot(do_op);
                }
                ...
            }
    //}
}