llvm-project
llvm-project copied to clipboard
AArch32 FP16 neon average function produces incorrect result when optimized
An average function written with intrinsics produces inaccurate values when optimized.
It works with -O0 but fails with all levels of optimization -Os, -Oz, -O1, -O2
It also works when built for AArch64 with optimization on, but fails with AArch32
The inner loop produces 4 sums
const __fp16* i0 = input;
const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + elements);
const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + elements);
const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + elements);
size_t n = elements;
while (n >= 8 * sizeof(__fp16)) {
const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
vsum0 = vaddq_f16(vsum0, vi0);
vsum1 = vaddq_f16(vsum1, vi1);
vsum2 = vaddq_f16(vsum2, vi2);
vsum3 = vaddq_f16(vsum3, vi3);
n -= 8 * sizeof(__fp16);
}
The results are later combined and output as 4 fp16 values:
float16x4_t vout = vmul_f16(vsum, vmultiplier);
vout = vmax_f16(vout, voutput_min);
vout = vmin_f16(vout, voutput_max);
vst1_f16(o, vout); o += 4;
If the code is simplified to do 1 average at a time instead of 4, it works. If the code computes 4 averages but only outputs 1, it fails.
When tested with 4 rows of 8 elements with random inputs from 0.1 to 10.0, the average is a little off:
[ RUN ] F16_GAVGPOOL_CW__NEONFP16ARITH_X8.elements_eq_8
third_party/XNNPACK/test/gavgpool-cw-microkernel-tester.h:190: Failure
The difference between fp16_ieee_to_fp32_value(y[i]) and y_ref[i] is 0.4029541015625, which exceeds 1.0e-2f * std::abs(y_ref[i]), where
fp16_ieee_to_fp32_value(y[i]) evaluates to 5.375,
y_ref[i] evaluates to 4.9720458984375, and
1.0e-2f * std::abs(y_ref[i]) evaluates to 0.049720458686351776.
at position 1, elements = 8, channels = 4
[ FAILED ] F16_GAVGPOOL_CW__NEONFP16ARITH_X8.elements_eq_8 (2 ms)
@llvm/issue-subscribers-backend-arm
I've narrowed it down to the params pointer. Here is a godbolt reproducible https://godbolt.org/z/aY4cx8vv3
The function is reduced to 2 channels. The reason that matters is vmultiplier will have the wrong value and the 2nd channel will scale by garbage.
#include <stdio.h>
#include <arm_neon.h>
#include <xnnpack/gavgpool.h>
// Reproducer: channel-wise global average pooling over fp16 data, 2 channels
// per outer iteration. Sums `elements` bytes of fp16 values per channel, then
// scales by `multiplier` and clamps to [output_min, output_max].
// NOTE(review): the original issue text had `&params` mojibake-encoded as
// `¶ms` in the three vld1_dup_u16 calls below; restored to `&params`.
void xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8(
size_t elements,
size_t channels,
const void* input,
void* output,
const union xnn_f16_gavgpool_params params[static 1]) // <- fails
// const union xnn_f16_gavgpool_params *params) // <- works
{
const __fp16* i0 = input;
__fp16* o = (__fp16*) output;
// Broadcast the three fp16 scalar parameters into vectors. With the
// [static 1] declaration above, these loads are miscompiled (see issue).
const float16x4_t vmultiplier = vreinterpret_f16_u16(vld1_dup_u16(&params->neonfp16arith.multiplier));
const float16x4_t voutput_min = vreinterpret_f16_u16(vld1_dup_u16(&params->neonfp16arith.output_min));
const float16x4_t voutput_max = vreinterpret_f16_u16(vld1_dup_u16(&params->neonfp16arith.output_max));
while (channels >= 2) {
// `elements` is a byte count, so i1 is the start of the next channel's row.
const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + elements);
float16x8_t vsum0 = vmovq_n_f16(0);
float16x8_t vsum1 = vmovq_n_f16(0);
size_t n = elements;
while (n >= 8 * sizeof(__fp16)) {
const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
vsum0 = vaddq_f16(vsum0, vi0);
vsum1 = vaddq_f16(vsum1, vi1);
n -= 8 * sizeof(__fp16);
}
// Combine 2 rows into 2 values in same vector.
const float16x4_t vsum0_lo = vadd_f16(vget_low_f16(vsum0), vget_high_f16(vsum0));
const float16x4_t vsum1_lo = vadd_f16(vget_low_f16(vsum1), vget_high_f16(vsum1));
const float16x4_t vsum01_lo = vpadd_f16(vsum0_lo, vsum1_lo);
const float16x4_t vsum = vpadd_f16(vsum01_lo, vsum01_lo);
// Scale by the averaging multiplier, then clamp.
float16x4_t vout = vmul_f16(vsum, vmultiplier);
vout = vmax_f16(vout, voutput_min);
vout = vmin_f16(vout, voutput_max);
// Store both channel results (2 x fp16 = one 32-bit lane).
vst1_lane_u32(o, vreinterpret_u32_f16(vout), 0); o += 2;
i0 = i1;
channels -= 2;
}
}
If I switch the params to a simple pointer, the 3 vld1_dup_u16 instructions generate this:
ldr ip, [sp, #8]
add ip, ip, #16
vld1.16 {d16[]}, [ip :16]!
vld1.16 {d17[]}, [ip :16]!
vld1.16 {d18[]}, [ip :16]
But failing code produces this:
ldr ip, [sp, #8]
mov r4, #2
add lr, ip, #16
vld2.16 {d16-d17}, [lr :128], r4
add r4, ip, #20
vdup.32 d16, d16[0]
vld1.16 {d17[]}, [r4 :16]
vld1.16 {d18[]}, [lr :16]
The union is defined as
// Parameter block for the fp16 global-average-pool microkernel. The fields
// are uint16_t bit patterns reinterpreted as fp16 by the kernel (via
// vreinterpret_f16_u16 after vld1_dup_u16).
union xnn_f16_gavgpool_params {
char _; // Dummy member variable to comply with the C standard
struct {
// 16-byte alignment on the first member is what triggers the
// miscompiled vld2.16/vdup.32 sequence described in this issue.
__attribute__((__aligned__(16))) uint16_t mask[8];
uint16_t multiplier; // fp16 bits: scale applied to the per-channel sum
uint16_t output_min; // fp16 bits: lower clamp bound
uint16_t output_max; // fp16 bits: upper clamp bound
} neonfp16arith;
};
The bad code is somehow related to the attribute((aligned(16))). In the example, mask is unused, but the vld1_dup_u16(&params->neonfp16arith.multiplier) did an unexpected
vld2.16 {d16-d17}, [lr :128], r4
vdup.32 d16, d16[0]
The test can be simplified to: https://godbolt.org/z/Yz44hncq7
#include <arm_neon.h>
// Minimal reproducer: a single over-aligned fp16 field whose broadcast load
// is miscompiled on AArch32. NOTE(review): the issue text had `&params`
// mojibake-encoded as `¶ms`; restored here.
struct f16_params{
__attribute__((__aligned__(16))) __fp16 multiplier;
};
// Broadcast-load the fp16 scalar into all four lanes of a float16x4_t.
// Expected codegen: vld1.16 {d0[]}, [r0:16]
float16x4_t f16_dup(const struct f16_params params[static 1])
{
return vld1_dup_f16(&params->multiplier);
}
Which produces:
vld2.16 {d16, d17}, [r0:128]
vdup.32 d0, d16[0]
Expected result is:
vld1.16 {d0[]}, [r0:16]
@lenary @davemgreen could you take a look? This is clearly a miscompilation bug with a simple repro.
I believe that with #58512 this is fixed, even if not optimal. Please let me know if not
vld2.16 {d16, d17}, [r0:128] copies the word 2 in the memory into word 1 in d16, then vdup.32 d0, d16[0] replicates (word 0, word 1) tuple into d0. To make this code minimally correct (yet very inefficient), vdup.32 should have been vdup.16.
@EugeneZelenko could you re-open this issue? This is an active miscompilation bug
godbolt with clang trunk is doing the correct sized dup now:
vldr d16, [r0]
vdup.16 d0, d16[0]
so the fix to the dup works, but it would be better if it produced the same code as -O0
vld1.16 {d16[]}, [r0:16]