TensorComprehensions
Spurious __syncthreads at the beginning of the kernel.
I noticed that in some cases the first statement after the variable definitions in the kernel is a __syncthreads(), which, if I am not mistaken, makes no sense: at that point no thread has written to shared memory yet, so the barrier has nothing to order.
For example, in GroupNormalizationSingleKernel with input sizes C=1024, G=4, N=32, H=34, W=41 and the following options:
tc::CudaMappingOptions::makeNaiveMappingOptions()
    .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
    .outerScheduleAllowSkewing(true)
    .outerSchedulePositiveOrthant(true)
    .intraTileScheduleFusionStrategy(tc::FusionStrategy::Max)
    .intraTileScheduleAllowSkewing(true)
    .intraTileSchedulePositiveOrthant(true)
    .fixParametersBeforeScheduling(false)
    .tile(1)
    .unroll(148)
    .tileImperfectlyNested(false)
    .matchLibraryCalls(false)
    .mapToThreads(1, 126, 3)
    .mapToBlocks(118, 61, 193)
    .useSharedMemory(true)
    .usePrivateMemory(false)
    .unrollCopyShared(true)
    .useReadOnlyCache(true);
The generated code is:
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
// #include <cuda_fp16.h>
// Halide type handling
typedef char int8;
typedef short int16;
typedef int int32;
typedef long int64;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long uint64;
// typedef half float16;
typedef float float32;
typedef double float64;
#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)
// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if __CUDACC_VER_MAJOR__ < 9
__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
#endif
namespace __tc {
template<typename T>
__device__ __forceinline__ T ldg(const T* ptr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(ptr);
#else
  return *ptr;
#endif
}
} // namespace __tc
__global__ __launch_bounds__(4) void group_normalization_single_kernel_256_4_34_32_41(int32 D, int32 G, int32 H, int32 N, int32 W, float32* pO, float32* psum, float32* psumSquares, const float32* pI, const float32* pgamma, const float32* pbeta) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*O)[4][256][34][41] = reinterpret_cast<float32 (*)[4][256][34][41]>(pO);
  float32 (*sum)[4] = reinterpret_cast<float32 (*)[4]>(psum);
  float32 (*sumSquares)[4] = reinterpret_cast<float32 (*)[4]>(psumSquares);
  const float32 (*I)[4][256][34][41] = reinterpret_cast<const float32 (*)[4][256][34][41]>(pI);
  const float32 (*gamma)[256] = reinterpret_cast<const float32 (*)[256]>(pgamma);
  const float32 (*beta)[256] = reinterpret_cast<const float32 (*)[256]>(pbeta);
  __shared__ float32 _sumSquares_0[1][5];
  __shared__ float32 _sum_0[1][5];
  __shared__ float32 _beta_0[4][257];
  __shared__ float32 _gamma_0[4][257];
  __syncthreads();
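  // ^ annotation, not part of the generated code: this is the spurious barrier.
  // Nothing has been written to shared memory yet in this block, so it synchronizes nothing.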
  if (t1 == 0) {
    _sumSquares_0[0][0] = sumSquares[b0][0];
    _sumSquares_0[0][1] = sumSquares[b0][1];
    _sumSquares_0[0][2] = sumSquares[b0][2];
    _sumSquares_0[0][3] = sumSquares[b0][3];
    _sum_0[0][0] = sum[b0][0];
    _sum_0[0][1] = sum[b0][1];
    _sum_0[0][2] = sum[b0][2];
    _sum_0[0][3] = sum[b0][3];
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _gamma_0[t1][c3] = __tc::ldg(&gamma[t1][c3]);
  }
  for (int c3 = 0; c3 <= 255; c3 += 1) {
    _beta_0[t1][c3] = __tc::ldg(&beta[t1][c3]);
  }
  __syncthreads();
  if (t1 == 0) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      _sum_0[0][c4] = 0.000000f;
      for (int c5 = 0; c5 <= 255; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            _sum_0[0][c4] = (_sum_0[0][c4] + __tc::ldg(&I[b0][c4][c5][c6][c7]));
            if (c5 == 0 && c6 == 0 && c7 == 0) {
              _sumSquares_0[0][c4] = 0.000000f;
            }
            _sumSquares_0[0][c4] = (_sumSquares_0[0][c4] + (__tc::ldg(&I[b0][c4][c5][c6][c7])*__tc::ldg(&I[b0][c4][c5][c6][c7])));
          }
        }
      }
      for (int c5 = 256; c5 <= 511; c5 += 1) {
        for (int c6 = 0; c6 <= 33; c6 += 1) {
          for (int c7 = 0; c7 <= 40; c7 += 1) {
            O[b0][c4][(c5 - 256)][c6][c7] = (((_gamma_0[c4][c5 - 256]*(__tc::ldg(&I[b0][c4][(c5 - 256)][c6][c7]) - (_sum_0[0][c4]/float32(((256*34)*41)))))*rsqrt((((_sumSquares_0[0][c4]/float32(((256*34)*41))) - (((_sum_0[0][c4]*_sum_0[0][c4])/float32(((256*34)*41)))/float32(((256*34)*41)))) + 0.000010f))) + _beta_0[c4][c5 - 256]);
          }
        }
      }
    }
  }
  __syncthreads();
  if (t1 == 0) {
    sum[b0][0] = _sum_0[0][0];
    sum[b0][1] = _sum_0[0][1];
    sum[b0][2] = _sum_0[0][2];
    sum[b0][3] = _sum_0[0][3];
    sumSquares[b0][0] = _sumSquares_0[0][0];
    sumSquares[b0][1] = _sumSquares_0[0][1];
    sumSquares[b0][2] = _sumSquares_0[0][2];
    sumSquares[b0][3] = _sumSquares_0[0][3];
  }
  __syncthreads();
}
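For contrast, this is what the usual shared-memory staging discipline looks like in a minimal hand-written kernel (a sketch, not TC output; staged_block_sum, buf, and the block size of 256 are made-up assumptions): the barrier goes after the cooperative stores and before the first cross-thread read, never before the first write.

// Minimal hand-written sketch of the canonical pattern (not generated by TC).
__global__ void staged_block_sum(const float* in, float* out, int n) {
  __shared__ float buf[256];  // assumes blockDim.x == 256
  int t = threadIdx.x;
  int gid = blockIdx.x * blockDim.x + t;
  // No barrier needed here: no thread has written shared memory yet,
  // so a __syncthreads() at this point would order nothing.
  buf[t] = (gid < n) ? in[gid] : 0.0f;
  __syncthreads();  // needed: the reads below cross thread boundaries
  // Tree reduction over the shared buffer.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (t < s) buf[t] += buf[t + s];
    __syncthreads();  // needed between reduction steps
  }
  if (t == 0) out[blockIdx.x] = buf[0];
}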
I've also observed this with grouped convolution.
Does it cause slowdowns?
I know exactly where it comes from and have no intention of fixing this in the near future unless it improves performance.
I didn't measure.
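If someone wants to check, a simple CUDA-event harness would do. Below is a sketch; timeKernel, the grid/block shapes, and the iteration count are hypothetical, and the two kernels to compare would be the generated code as emitted versus a copy with the leading __syncthreads() deleted.

#include <cuda_runtime.h>

// Sketch of a timing harness (hypothetical helper, not part of TC):
// launches a kernel `iters` times and returns the average milliseconds.
template <typename Kernel, typename... Args>
float timeKernel(Kernel k, dim3 grid, dim3 block, int iters, Args... args) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  k<<<grid, block>>>(args...);  // warm-up launch
  cudaEventRecord(start);
  for (int i = 0; i < iters; ++i) {
    k<<<grid, block>>>(args...);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / iters;
}

// Usage: compare timeKernel(kernel_with_leading_sync, grid, block, 1000, ...)
// against timeKernel(kernel_without_leading_sync, grid, block, 1000, ...)
// on identical inputs.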