TensorComprehensions
Spurious __syncthreads at the beginning of the kernel.
I noticed that in some cases the first statement after the variable definitions in the kernel is a __syncthreads(), which, if I am not mistaken, makes no sense: at that point no thread has written to shared memory yet, so the barrier has nothing to order.
For example, in GroupNormalizationSingleKernel with input sizes C=1024, G=4, N=32, H=34, W=41 and the following options:
tc::CudaMappingOptions::makeNaiveMappingOptions()
    .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
    .outerScheduleAllowSkewing(true)
    .outerSchedulePositiveOrthant(true)
    .intraTileScheduleFusionStrategy(tc::FusionStrategy::Max)
    .intraTileScheduleAllowSkewing(true)
    .intraTileSchedulePositiveOrthant(true)
    .fixParametersBeforeScheduling(false)
    .tile(1)
    .unroll(148)
    .tileImperfectlyNested(false)
    .matchLibraryCalls(false)
    .mapToThreads(1, 126, 3)
    .mapToBlocks(118, 61, 193)
    .useSharedMemory(true)
    .usePrivateMemory(false)
    .unrollCopyShared(true)
    .useReadOnlyCache(true);
The generated code is:
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
// #include <cuda_fp16.h>
// Halide type handling
typedef char int8;
typedef short int16;
typedef int int32;
typedef long int64;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long uint64;
// typedef half float16;
typedef float float32;
typedef double float64;
#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)
// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if __CUDACC_VER_MAJOR__ < 9
__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
#endif
namespace __tc {
template<typename T>
__device__ __forceinline__ T ldg(const T* ptr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(ptr);
#else
  return *ptr;
#endif
}
} // namespace __tc
__global__ __launch_bounds__(4) void group_normalization_single_kernel_256_4_34_32_41(int32 D, int32 G, int32 H, int32 N, int32 W, float32* pO, float32* psum, float32* psumSquares, const float32* pI, const float32* pgamma, const float32* pbeta) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*O)[4][256][34][41] = reinterpret_cast<float32 (*)[4][256][34][41]>(pO);
  float32 (*sum)[4] = reinterpret_cast<float32 (*)[4]>(psum);
  float32 (*sumSquares)[4] = reinterpret_cast<float32 (*)[4]>(psumSquares);
  const float32 (*I)[4][256][34][41] = reinterpret_cast<const float32 (*)[4][256][34][41]>(pI);
  const float32 (*gamma)[256] = reinterpret_cast<const float32 (*)[256]>(pgamma);
  const float32 (*beta)[256] = reinterpret_cast<const float32 (*)[256]>(pbeta);
  __shared__ float32 _sumSquares_0[1][5];
  __shared__ float32 _sum_0[1][5];
  __shared__ float32 _beta_0[4][257];
  __shared__ float32 _gamma_0[4][257];
  __syncthreads();
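  // ^ annotation, not part of the generated code: this is the spurious barrier.
  // Nothing has been written to shared memory yet in this block, so it synchronizes nothing.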
  if (t1 == 0) {
    _sumSquares_0[0][0] = sumSquares[b0][0];
    _sumSquares_0[0][1] = sumSquares[b0][1];
    _sumSquares_0[0][2] = sumSquares[b0][2];
    _sumSquares_0[0][3] = sumSquares[b0][3];
    _sum_0[0][0] = sum[b0][0];
    _sum_0[0][1] = sum[b0][1];
    _sum_0[0][2] = sum[b0][2];
    _sum_0[0][3] = sum[b0][3];
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _gamma_0[t1][c3] = __tc::ldg(&gamma[t1][c3]);
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _beta_0[t1][c3] = __tc::ldg(&beta[t1][c3]);
  }
  __syncthreads();
  if (t1 == 0) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      _sum_0[0][c4] = 0.000000f;
      for (int c5 = 0; c5 <= 255; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            _sum_0[0][c4] = (_sum_0[0][c4] + __tc::ldg(&I[b0][c4][c5][c6][c7]));
            if (c5 == 0 && c6 == 0 && c7 == 0) {
              _sumSquares_0[0][c4] = 0.000000f;
            }
            _sumSquares_0[0][c4] = (_sumSquares_0[0][c4] + (__tc::ldg(&I[b0][c4][c5][c6][c7])*__tc::ldg(&I[b0][c4][c5][c6][c7])));
          }
        }
      }
      for (int c5 = 256; c5 <= 511; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            O[b0][c4][(c5 - 256)][c6][c7] = (((_gamma_0[c4][c5 - 256]*(__tc::ldg(&I[b0][c4][(c5 - 256)][c6][c7]) - (_sum_0[0][c4]/float32(((256*34)*41)))))*rsqrt((((_sumSquares_0[0][c4]/float32(((256*34)*41))) - (((_sum_0[0][c4]*_sum_0[0][c4])/float32(((256*34)*41)))/float32(((256*34)*41)))) + 0.000010f))) + _beta_0[c4][c5 - 256]);
          }
        }
      }
    }
  }
  __syncthreads();
  if (t1 == 0) {
    sum[b0][0] = _sum_0[0][0];
    sum[b0][1] = _sum_0[0][1];
    sum[b0][2] = _sum_0[0][2];
    sum[b0][3] = _sum_0[0][3];
    sumSquares[b0][0] = _sumSquares_0[0][0];
    sumSquares[b0][1] = _sumSquares_0[0][1];
    sumSquares[b0][2] = _sumSquares_0[0][2];
    sumSquares[b0][3] = _sumSquares_0[0][3];
  }
  __syncthreads();
}
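For contrast, this is what the usual shared-memory staging discipline looks like in a minimal hand-written kernel (a sketch, not TC output; staged_block_sum, buf, and the block size of 256 are made-up assumptions): the barrier goes after the cooperative stores and before the first cross-thread read, never before the first write.

// Minimal hand-written sketch of the canonical pattern (not generated by TC).
__global__ void staged_block_sum(const float* in, float* out, int n) {
  __shared__ float buf[256];  // assumes blockDim.x == 256
  int t = threadIdx.x;
  int gid = blockIdx.x * blockDim.x + t;
  // No barrier needed here: no thread has written shared memory yet,
  // so a __syncthreads() at this point would order nothing.
  buf[t] = (gid < n) ? in[gid] : 0.0f;
  __syncthreads();  // needed: the reads below cross thread boundaries
  // Tree reduction over the shared buffer.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (t < s) buf[t] += buf[t + s];
    __syncthreads();  // needed between reduction steps
  }
  if (t == 0) out[blockIdx.x] = buf[0];
}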
I've also observed this with grouped convolution.
Does it cause slowdowns?
I know exactly where it comes from and have no intention of fixing this in the near future unless it improves performance.
I didn't measure.
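If someone wants to check, a simple CUDA-event harness would do. Below is a sketch; timeKernel, the grid/block shapes, and the iteration count are hypothetical, and the two kernels to compare would be the generated code as emitted versus a copy with the leading __syncthreads() deleted.

#include <cuda_runtime.h>

// Sketch of a timing harness (hypothetical helper, not part of TC):
// launches a kernel `iters` times and returns the average milliseconds.
template <typename Kernel, typename... Args>
float timeKernel(Kernel k, dim3 grid, dim3 block, int iters, Args... args) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  k<<<grid, block>>>(args...);  // warm-up launch
  cudaEventRecord(start);
  for (int i = 0; i < iters; ++i) {
    k<<<grid, block>>>(args...);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / iters;
}

// Usage: compare timeKernel(kernel_with_leading_sync, grid, block, 1000, ...)
// against timeKernel(kernel_without_leading_sync, grid, block, 1000, ...)
// on identical inputs.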