SYCLomatic
SYCLomatic copied to clipboard
incorrect migration of __shfl_xor_sync CUDA API within a template function
Describe the bug
When I tried to migrate a template function with just 1 loc in the function body.
#include <cuda_runtime.h>
template <typename KeyT, typename ValueT, uint32_t WORKGROUP_SIZE>
class WarpSort{
__device__ static void swap(KeyT& key, ValueT& value,
uint32_t const& step, uint32_t const& activemask,
bool bDescending, std::true_type const& isKeyOnly)
{
__shfl_xor_sync(activemask, key, step, 32);
}
}
I expect the __shfl_xor_sync
would be migrated into dpct::permute_sub_group_by_xor
. However, SYCLomatic does nothing to the code.
after migration:
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
template <typename KeyT, typename ValueT, uint32_t WORKGROUP_SIZE>
class WarpSort{
static void swap(KeyT& key, ValueT& value,
uint32_t const& step, uint32_t const& activemask,
bool bDescending, std::true_type const& isKeyOnly)
{
__shfl_xor_sync(activemask, key, step, 32);
}
To reproduce
#include <cuda_runtime.h>
template <typename KeyT, typename ValueT, uint32_t WORKGROUP_SIZE>
class WarpSort{
__device__ static void swap(KeyT& key, ValueT& value,
uint32_t const& step, uint32_t const& activemask,
bool bDescending, std::true_type const& isKeyOnly)
{
__shfl_xor_sync(activemask, key, step, 32);
}
}
run the above code with dpct
Environment
- OS: Linux
- Target device and vendor: Nvidia GPU
- DPC++ version:Intel(R) oneAPI DPC++/C++ Compiler 2024.2.0 (2024.2.0.20240602)
Additional context
One interesting observation is when you change the type of key
into explicit type name such as int& key
, the migration success.
before migration:
#include <cuda_runtime.h>
template <typename KeyT, typename ValueT, uint32_t WORKGROUP_SIZE>
class WarpSort{
__device__ static void swap(int& key, ValueT& value,
uint32_t const& step, uint32_t const& activemask,
bool bDescending, std::true_type const& isKeyOnly)
{
__shfl_xor_sync(activemask, key, step, 32);
}
}
after migration:
template <typename KeyT, typename ValueT, uint32_t WORKGROUP_SIZE>
class WarpSort{
static void swap(int& key, ValueT& value,
uint32_t const& step, uint32_t const& activemask,
bool bDescending, std::true_type const& isKeyOnly,
const sycl::nd_item<3> &item_ct1)
{
/*
DPCT1023:0: The SYCL sub-group does not support mask options for
dpct::permute_sub_group_by_xor. You can specify
"--use-experimental-features=masked-sub-group-operation" to use the
experimental helper function to migrate __shfl_xor_sync.
*/
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), key, step);
}