ispc
ispc copied to clipboard
Stack usage walking pointers with static loop counters
Found an issue where ISPC uses the stack whenever walking pointers inside a loop. When the loop is manually unrolled, this does not occur. In the example, the 4-wide target is fine because the loop runs only one iteration (programCount / 4 == 1); 8-wide or greater is not. https://ispc.godbolt.org/z/BVZLcU
Just in case the link becomes unavailable, here's the code:
// Copyright Epic Games, Inc. All Rights Reserved.
// Holds a single ISPC short vector of four floats.
struct FVector4
{
// The four float components, stored as one float<4> short vector.
float<4> V;
};
// Holds programCount floats, i.e. one float per program instance of the gang.
// AddWide/AddWide2 reinterpret this storage as consecutive FVector4 values via
// pointer casts.
struct WideFVector4
{
// One element per program instance; indexed with programIndex in operator+.
float V[programCount];
};
// Element-wise sum of two WideFVector4 values.
// Each program instance adds the element at its own programIndex from A and B
// and writes it to the same slot of the result, which is returned by value.
unmasked inline uniform WideFVector4 operator+(const uniform WideFVector4 &A, const uniform WideFVector4 &B)
{
uniform WideFVector4 Result;
// Per-instance lane: instance i reads A.V[i] and B.V[i] and writes Result.V[i].
Result.V[programIndex] = A.V[programIndex] + B.V[programIndex];
return Result;
}
// Manually unrolled copy of consecutive FVector4 values from SrcPtr to DstPtr.
// Copies 1 value unconditionally, a 2nd when TARGET_WIDTH is 8 or 16, and a
// 3rd and 4th when TARGET_WIDTH is 16 (i.e. TARGET_WIDTH / 4 values for widths
// 4, 8 and 16). This is the variant that, per the report, does not use stack.
// NOTE(review): for any other TARGET_WIDTH only the first value is copied.
unmasked inline void LoadWideFVector4(uniform FVector4 * uniform DstPtr, const uniform FVector4* uniform SrcPtr)
{
// First FVector4 is always copied.
*DstPtr = *SrcPtr;
#if TARGET_WIDTH == 8 || TARGET_WIDTH == 16
// Second FVector4 for 8-wide and 16-wide targets.
*(DstPtr + 1) = *(SrcPtr + 1);
#endif
#if TARGET_WIDTH == 16
// Third and fourth FVector4 for 16-wide targets.
*(DstPtr + 2) = *(SrcPtr + 2);
*(DstPtr + 3) = *(SrcPtr + 3);
#endif
}
// Manually unrolled copy of consecutive FVector4 values from SrcPtr to DstPtr;
// the body is identical to LoadWideFVector4 and is used by AddWide to write
// the result back. Copies 1, 2 or 4 values for TARGET_WIDTH 4, 8 or 16.
// NOTE(review): for any other TARGET_WIDTH only the first value is copied.
unmasked inline void StoreWideFVector4(uniform FVector4* uniform DstPtr, const uniform FVector4 * uniform SrcPtr)
{
// First FVector4 is always copied.
*DstPtr = *SrcPtr;
#if TARGET_WIDTH == 8 || TARGET_WIDTH == 16
// Second FVector4 for 8-wide and 16-wide targets.
*(DstPtr + 1) = *(SrcPtr + 1);
#endif
#if TARGET_WIDTH == 16
// Third and fourth FVector4 for 16-wide targets.
*(DstPtr + 2) = *(SrcPtr + 2);
*(DstPtr + 3) = *(SrcPtr + 3);
#endif
}
// Adds Source1 and Source2 element-wise into Result, processing
// programCount / 4 FVector4 values per loop iteration, using the manually
// unrolled LoadWideFVector4 / StoreWideFVector4 helpers.
//   Result    - destination array of FVector4.
//   Source1   - first input array of FVector4.
//   Source2   - second input array of FVector4.
//   NumToAdd  - element count; only indices below NumToAdd rounded down to a
//               multiple of programCount are processed, the rest is untouched.
unmasked void AddWide(uniform FVector4 Result[], uniform FVector4 Source1[], uniform FVector4 Source2[], const uniform int NumToAdd)
{
// Round NumToAdd down to a multiple of programCount; the mask form assumes
// programCount is a power of two.
uniform int Base = NumToAdd & ~(programCount-1);
// Not referenced anywhere else in this function.
uniform int BoneOffset = programCount / 4;
for(uniform int Index = 0; Index < Base; Index+=(programCount/4))
{
uniform WideFVector4 S1, S2;
// Reinterpret the wide temporaries as FVector4 storage and fill them from
// the inputs starting at Index.
LoadWideFVector4((uniform FVector4 *uniform)&S1, (uniform FVector4 *uniform)&Source1[Index]);
LoadWideFVector4((uniform FVector4 *uniform)&S2, (uniform FVector4 *uniform)&Source2[Index]);
// One gang-wide add via the WideFVector4 operator+.
const uniform WideFVector4 R = S1 + S2;
// Write the sum back to Result starting at Index.
StoreWideFVector4((uniform FVector4 *uniform)&Result[Index], (uniform FVector4 *uniform)&R);
}
}
// Loop-based copy of programCount / 4 consecutive FVector4 values from SrcPtr
// to DstPtr. This is the variant the report identifies as producing stack
// usage on targets wider than 4.
unmasked inline void LoadWideFVector4_2(uniform FVector4 * uniform DstPtr, const uniform FVector4* uniform SrcPtr)
{
// Uses stack; use other method until fixed
// The trip count programCount / 4 is a compile-time constant for a given target.
for(uniform int i = 0; i < (programCount / 4); i++)
{
*(DstPtr + i) = *(SrcPtr + i);
}
}
// Loop-based copy of programCount / 4 consecutive FVector4 values from SrcPtr
// to DstPtr; the body is identical to LoadWideFVector4_2 and is used by
// AddWide2 to write the result back.
unmasked inline void StoreWideFVector4_2(uniform FVector4* uniform DstPtr, const uniform FVector4 * uniform SrcPtr)
{
// Uses stack; use other method until fixed
// The trip count programCount / 4 is a compile-time constant for a given target.
for(uniform int i = 0; i < (programCount / 4); i++)
{
*(DstPtr + i) = *(SrcPtr + i);
}
}
// Same computation as AddWide, but built on the loop-based
// LoadWideFVector4_2 / StoreWideFVector4_2 helpers; this is the function that
// exhibits the stack usage described in the report.
//   Result    - destination array of FVector4.
//   Source1   - first input array of FVector4.
//   Source2   - second input array of FVector4.
//   NumToAdd  - element count; only indices below NumToAdd rounded down to a
//               multiple of programCount are processed, the rest is untouched.
unmasked void AddWide2(uniform FVector4 Result[], uniform FVector4 Source1[], uniform FVector4 Source2[], const uniform int NumToAdd)
{
// Round NumToAdd down to a multiple of programCount; the mask form assumes
// programCount is a power of two.
uniform int Base = NumToAdd & ~(programCount-1);
// Not referenced anywhere else in this function.
uniform int BoneOffset = programCount / 4;
for(uniform int Index = 0; Index < Base; Index+=(programCount/4))
{
uniform WideFVector4 S1, S2;
// Reinterpret the wide temporaries as FVector4 storage and fill them from
// the inputs starting at Index.
LoadWideFVector4_2((uniform FVector4 *uniform)&S1, (uniform FVector4 *uniform)&Source1[Index]);
LoadWideFVector4_2((uniform FVector4 *uniform)&S2, (uniform FVector4 *uniform)&Source2[Index]);
// One gang-wide add via the WideFVector4 operator+.
const uniform WideFVector4 R = S1 + S2;
// Write the sum back to Result starting at Index.
StoreWideFVector4_2((uniform FVector4 *uniform)&Result[Index], (uniform FVector4 *uniform)&R);
}
}
The problem here is that unrolling runs too late. Specifically, there's no SROA pass to clean up after the unrolling.
I've tried two approaches.
- Run SROA after the unrolling. This produced good code, but the main loop was not unrolled by 2, as it is in the `AddWide` version. Basically, unrolling was applied to the inner loop and not to the outer one (keep in mind that it was running after inlining).
- Introduce one more unrolling pass before `ReplaceStdlibShiftPass`, so that it happens early and before inlining. In this case `AddWide` and `AddWide2` produce identical code. This is basically the same patch I tried for #2468.