i686 gcc 12: regression at -O1 or -O2
test: x86/sse/emul/c
start time: 16:39:16
duration: 0.03s
result: exit status 1
command: MALLOC_PERTURB_=148 '/<<PKGBUILDDIR>>/gcc_test/test/x86/sse-emul-c'
----------------------------------- stderr -----------------------------------
../test/x86/sse.c:4307: assertion failed: r[0] == test_vec[i].r[0] (19623 == 292)
../test/x86/sse.c:4348: assertion failed: r[0] == test_vec[i].r[0] (19623 == 292)
To reproduce
cd docker
./simde-dev.sh
# now inside the container
simde-reset-build.sh i686-gcc-12-debflags
meson compile -C i686-gcc-12-debflags
meson test -C i686-gcc-12-debflags --print-errorlogs
Confirmed with https://github.com/simd-everywhere/simde/releases/tag/v0.7.4-rc3
Will probably be helped by https://github.com/simd-everywhere/simde/issues/1025
I'll investigate. It is very much possible that this is due to MMX.
Ok, this is not related to MMX.
Doing some tests, it seems that this is a GCC bug specific to GCC 12 that has been fixed in GCC 12.2.1. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322
Baaasically, it seems that there was an incorrect guard for whether the MULH pattern was autovectorizable and when it was targeting an unsupported instruction set it just shrugged and tried to do a MULH on each machine word in the vector:
# x86_64 -mno-mmx -mno-sse (artificial)
mulhi_pu16:
mov rax, rsi
mul rdi
mov rax, rdx
ret
# i686 -mno-mmx -mno-sse
mulhi_pu16:
push ebx
mov eax, DWORD PTR [esp+12]
mul DWORD PTR [esp+20]
mov eax, DWORD PTR [esp+16]
mov ecx, DWORD PTR [esp+8]
mov ebx, edx
mul DWORD PTR [esp+24]
mov eax, ecx
mov DWORD PTR [ecx], ebx
mov DWORD PTR [ecx+4], edx
pop ebx
ret 4
# arm -mfloat-abi=soft
mulhi_pu16:
ldrd ip, r1, [sp]
umull ip, r2, ip, r2
umull r1, r3, r1, r3
strd r2, r3, [r0]
bx lr
It seems that we just need to break up the autovec pattern, which amusingly is as simple as a backwards loop :trollface:
@easyaspi314 thanks for the research!
Can you be more specific as to how we can workaround this for those using the affected versions of GCC?
Ah, I'll try __attribute__((optimize("no-tree-vectorize")))
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322#c14
Ah, I'll try
__attribute__((optimize("no-tree-vectorize")))https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322#c14
attribute optimize does work temporarily but it does disable inlining which emits a warn/error.
/* GCC 12.1-12.2.0 emit garbage if vector.umulh is not
* a legal operation. Looping backwards breaks the
* vectorization pattern. */
#if defined(SIMDE_GCC_BUG_106322)
for (int i = num; i-- > 0; ) {
#else
for (int i = 0; i < num; i++) {
#endif
should in theory work but I haven't tested thoroughly
Hmm. Neither reversing the loop nor using __attribute__((optimize("no-tree-vectorize"))) fixed the tests under i686 GCC-12 with -O2 (via https://github.com/simd-everywhere/simde/blob/master/docker/cross-files/i686-gcc-12-debflags.cross )
diff --git a/simde/x86/sse.h b/simde/x86/sse.h
index 80d4c6b3..2250e66d 100644
--- a/simde/x86/sse.h
+++ b/simde/x86/sse.h
@@ -3435,7 +3435,8 @@ simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) {
r_.neon_u16 = t3;
#else
SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ //for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ for (size_t i = (sizeof(r_.u16) / sizeof(r_.u16[0])); i > 0 ; i--) {
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16)));
}
#endif
That backwards loop is off by one.