DirectXShaderCompiler [SPIR-V] Nested static 'for' loops with unroll translate to 'while( true )' loops with actual branches

Hi! We're using DXC version 1.6.2104.52. We target mobile devices, such as Adreno and Mali GPUs. Consider the following code:

float4 color = (float4)0;
float4 colors[4];
float luminances[4];
[unroll]
for ( int j = 0; j < (4 - 1); j++ )
{
	[unroll]
	for ( int k = 0; k < (4 - j - 1); k++ )
	{
		[flatten]
		if ( luminances[k] > luminances[k + 1] )
		{
			float tmpLum = luminances[k];
			luminances[k] = luminances[k + 1];
			luminances[k + 1] = tmpLum;

			float4 tmpColor = colors[k];
			colors[k] = colors[k + 1];
			colors[k + 1] = tmpColor;
		}
	}
}

All of the iteration bounds are known at compile time, so we expect this loop to be fully unrolled. But, when viewing shader disassembly, the final code looks like

  while(true) {
    int _117 = Phi(0, _118);
    bool _120 = _117 < 3;
    [[Unroll]]
    if(!_120) break;

    while(true) {
      int _124 = Phi(0, _125);
      int _127 = 3 - _117;
      bool _128 = _124 < _127;
      [[Unroll]]
      if(!_128) break;
      float* _131 = &_74[_124];
      float _132 = *_131;
      int _125 = _124 + 1;
      float* _133 = &_74[_125];
      float _134 = *_133;
      bool _135 = _132 > _134;
      [[Flatten]]
      if(_135) {
        float _138 = *_131;
        float _139 = *_133;
        *_131 = _139;
        *_133 = _138;
        float4* _140 = &_73[_124];
        float4 _141 = *_140;
        float4* _142 = &_73[_125];
        float4 _143 = *_142;
        *_140 = _143;
        *_142 = _141;
      }

    }

    int _118 = _117 + 1;
  }

When doing Mali GPU captures, this produces actual branches on GPU with warp divergence up to 30%. Running SPIR-V binaries through malioc shows that the resulting shader is disproportionally bottlenecked by Load/Store unit operations (1.8 cycles arithmetics, 15 cycles worst case Load/Store unit operations).

If we unroll the shader manually, like this

void boxMedianOneStep( int j, inout float4 colors[4], inout float luminances[4] )
{
	[unroll]
	for ( int k = 0; k < (4 - j - 1); k++ )
	{
		[flatten]
		if ( luminances[k] > luminances[k + 1] )
		{
			float tmpLum = luminances[k];
			luminances[k] = luminances[k + 1];
			luminances[k + 1] = tmpLum;

			float4 tmpColor = colors[k];
			colors[k] = colors[k + 1];
			colors[k + 1] = tmpColor;
		}
	}
}

boxMedianOneStep( 0, colors, luminances );
boxMedianOneStep( 1, colors, luminances );
boxMedianOneStep( 2, colors, luminances );

the resulting code is perfectly flat, L/S bottlenecks are gone and there is zero warp divergence. The unrolled shader is up to 8x faster than the original one. Our legacy GLES toolchain (FXC->HLSLcc) handles this case with no problem. The new DXC/SPV-Opt toolchain does not, and even running additional loop unroll passes on final SPV code does not help at all.

May 28 '22 11:05 OrangeeZ

Hi @OrangeeZ!

Can you give a little more information about how this shader is being compiled and fill in some missing elements? As written, the shader is missing the entry function, color is unused, and colors[] and luminance[] are undefined, so the loops aren't preserved. Additionally, can you confirm your interest is only in SPIRV output?

Jun 13 '22 21:06 pow2clk

Hi @pow2clk!

The shader looks kinda like:

struct VertexOut
{
	float4 pos : SV_POSITION;
	float2 tc : TEXCOORD;
};

void boxMedianOneStep( int j, inout float3 colors[4], inout float luminances[4] )
{
	[unroll]
	for ( int k = 0; k < (4 - j - 1); k++ )
	{
		[flatten]
		if ( luminances[k] > luminances[k + 1] )
		{
			float tmpLum = luminances[k];
			luminances[k] = luminances[k + 1];
			luminances[k + 1] = tmpLum;

			float3 tmpColor = colors[k];
			colors[k] = colors[k + 1];
			colors[k + 1] = tmpColor;
		}
	}
}

float3 boxMedian( uniform Texture2D<float4> tex, uniform SamplerState sampleState, float2 invTextureSize, float2 tc )
{
	float2 offsets[4] = 
	{
		float2(-1.0, -1.0),
		float2(1.0, -1.0),
		float2(1.0, 1.0),
		float2(-1.0, 1.0)
	};

	float3 colors[4];
	float luminances[4];

	[unroll]
	for ( int i = 0; i < 4; i++ )
	{
		const float2 currentTC = tc + offsets[i] * invTextureSize;
		colors[i] = float3( tex.SampleLevel( sampleState, currentTC, 0 ).rgb );
		luminances[i] = luminance( colors[i].rgb );
	}

	boxMedianOneStep( 0, colors, luminances );
	boxMedianOneStep( 1, colors, luminances );
	boxMedianOneStep( 2, colors, luminances );

	return float3( ( colors[1] + colors[2] ) * 0.5 );
}

float4 PS_bright_pass(const VertexOut i ) : SV_Target0
{
	float3 color = boxMedian(g_srcMap, g_RTLinearSampler, g_srcSize.zw, i.tc).rgb;
	return float4(color, 0.0f);
}

The original loop in boxMedian() looked like this:

	float max = 0;
	[unroll]
	for ( int j = 0; j < (4 - 1); j++ )
	{
		[unroll]
		for ( int k = 0; k < (4 - j - 1); k++ )
		{
			[flatten]
			if ( luminances[k] > luminances[k + 1] )
			{
				float tmpLum = luminances[k];
				luminances[k] = luminances[k + 1];
				luminances[k + 1] = tmpLum;

				float4 tmpColor = colors[k];
				colors[k] = colors[k + 1];
				colors[k + 1] = tmpColor;
			}
		}
	}

Yes, we're only intrested in SPIR-V output.

Jun 14 '22 09:06 OrangeeZ

Thanks @Orangeez!

Given that DXIL properly unrolls this loop, I'm marking this as a SPIRV issue and tagging @sudonatalie and @kuhar

Here is the above shader with some small additions so that it compiles without anything missing

struct VertexOut
{
	float4 pos : SV_POSITION;
	float2 tc : TEXCOORD;
};

Texture2D<float4> g_srcMap;
SamplerState g_RTLinearSampler;
float4 g_srcSize;
float luminance(float3 RGB) {
      return RGB.g * 2.0 + RGB.r + RGB.b;
}

void boxMedianOneStep( int j, inout float3 colors[4], inout float luminances[4] )
{
	[unroll]
	for ( int k = 0; k < (4 - j - 1); k++ )
	{
		[flatten]
		if ( luminances[k] > luminances[k + 1] )
		{
			float tmpLum = luminances[k];
			luminances[k] = luminances[k + 1];
			luminances[k + 1] = tmpLum;

			float3 tmpColor = colors[k];
			colors[k] = colors[k + 1];
			colors[k + 1] = tmpColor;
		}
	}
}

float3 boxMedian( uniform Texture2D<float4> tex, uniform SamplerState sampleState, float2 invTextureSize, float2 tc )
{
	float2 offsets[4] = 
	{
		float2(-1.0, -1.0),
		float2(1.0, -1.0),
		float2(1.0, 1.0),
		float2(-1.0, 1.0)
	};

	float3 colors[4];
	float luminances[4];

	[unroll]
	for ( int i = 0; i < 4; i++ )
	{
		const float2 currentTC = tc + offsets[i] * invTextureSize;
		colors[i] = float3( tex.SampleLevel( sampleState, currentTC, 0 ).rgb );
		luminances[i] = luminance( colors[i].rgb );
	}

	boxMedianOneStep( 0, colors, luminances );
	boxMedianOneStep( 1, colors, luminances );
	boxMedianOneStep( 2, colors, luminances );

	return float3( ( colors[1] + colors[2] ) * 0.5 );
}

float4 PS_bright_pass(const VertexOut i ) : SV_Target0
{
	float3 color = boxMedian(g_srcMap, g_RTLinearSampler, g_srcSize.zw, i.tc).rgb;
	return float4(color, 0.0f);
}

Jun 29 '22 18:06 pow2clk

The problem is that the loop unroller in spirv-tools is very limited. It cannot unroll a loop unless it is an inner-most loop with a known upper bound. In this example, the expectation is that the outerloop is unrolled first (which cannot be done), which will given the inner loops fixed iterations counts.

To fix this spirv-tools will have to be able to unroll outer loops, which is difficult because of the structured control flow rules in spir-v. At least, that is the reason it was not implemented in the first place.

Mar 10 '23 19:03 s-perron

DirectXShaderCompiler DirectXShaderCompiler copied to clipboard

[SPIR-V] Nested static 'for' loops with unroll translate to 'while( true )' loops with actual branches

DirectXShaderCompiler
DirectXShaderCompiler copied to clipboard