simple strided access to input buffers generates terrible asm
This simple 2x downsample program:
#include "Halide.h"
using namespace Halide;
int main(int argc, char **argv) {
Func f, g;
Var x, y;
ImageParam im(UInt(8), 1);
g(x) = im(2 * x) + im(2 * x + 1);
g.vectorize(x, 32, TailStrategy::RoundUp);
g.compile_to_assembly("/dev/stdout", {im}, Target{"x86-64-avx2-linux-no_runtime-no_bounds_query-no_asserts"});
return 0;
}
generates this horrifying inner loop:
.LBB0_2:
movslq %edx, %rdx
vpinsrw $6, 60(%rsi,%rdx), %xmm0, %xmm5
vmovdqu (%rsi,%rdx), %ymm3
vpshufb %ymm2, %ymm3, %ymm3
movq 48(%rsi,%rdx), %rcx
vmovq %rcx, %xmm4
vpinsrd $2, 56(%rsi,%rdx), %xmm4, %xmm6
vmovdqu (%rsi,%rdx), %xmm7
vmovdqu 32(%rsi,%rdx), %xmm4
vpshufb %xmm0, %xmm4, %xmm4
vpinsrb $8, %ecx, %xmm4, %xmm1
vmovdqu 16(%rsi,%rdx), %xmm4
movl %ecx, %eax
shrl $16, %eax
vpinsrb $9, %eax, %xmm1, %xmm1
movq %rcx, %rax
shrq $32, %rax
vpinsrb $10, %eax, %xmm1, %xmm1
shrq $48, %rcx
vpinsrb $11, %ecx, %xmm1, %xmm1
vpextrb $8, %xmm6, %eax
vpinsrb $12, %eax, %xmm1, %xmm1
vpextrb $10, %xmm6, %eax
vpinsrb $13, %eax, %xmm1, %xmm1
vpextrb $12, %xmm5, %eax
vpinsrb $14, %eax, %xmm1, %xmm1
vpinsrb $15, 62(%rsi,%rdx), %xmm1, %xmm1
vpshufb %xmm0, %xmm7, %xmm5
vmovd %xmm4, %eax
vpinsrb $8, %eax, %xmm5, %xmm5
vpextrb $2, %xmm4, %eax
vpinsrb $9, %eax, %xmm5, %xmm5
vpextrb $4, %xmm4, %eax
vpinsrb $10, %eax, %xmm5, %xmm5
vpextrb $6, %xmm4, %eax
vpinsrb $11, %eax, %xmm5, %xmm5
vpextrb $8, %xmm4, %eax
vpinsrb $12, %eax, %xmm5, %xmm5
vpextrb $10, %xmm4, %eax
vpinsrb $13, %eax, %xmm5, %xmm5
vpextrb $12, %xmm4, %eax
vpinsrb $14, %eax, %xmm5, %xmm5
vpextrb $14, %xmm4, %eax
vpinsrb $15, %eax, %xmm5, %xmm4
vinserti128 $1, %xmm1, %ymm4, %ymm1
vmovdqu 32(%rsi,%rdx), %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpblendd $204, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
vpermq $216, %ymm3, %ymm3 # ymm3 = ymm3[0,2,1,3]
vpaddb %ymm1, %ymm3, %ymm1
vmovdqu %ymm1, (%r8,%rdi)
addq $32, %rdi
addl $64, %edx
cmpq %rdi, %r9
jne .LBB0_2
The reason for this is complex, and comes down to reasoning only locally about each strided load, not wanting to read out of bounds on input buffers, and then having to load non-power-of-two vectors. A workaround is to stage the input explicitly with a .in(), but that runs into issue #6816.
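For reference, the staging workaround looks roughly like this. This is a sketch of my own, not a schedule from the issue: the wrapper name and the vector widths are one plausible way to stage it, and as noted above it trips #6816.

#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    Func g;
    Var x;
    ImageParam im(UInt(8), 1);
    g(x) = im(2 * x) + im(2 * x + 1);

    // Stage the input through a wrapper Func, so g's vector loads read
    // dense data from a small intermediate instead of doing stride-2
    // loads on the input buffer directly.
    Func staged = im.in();
    g.vectorize(x, 32, TailStrategy::RoundUp);
    staged.compute_at(g, x).vectorize(_0, 64, TailStrategy::RoundUp);

    g.compile_to_assembly("/dev/stdout", {im},
                          Target{"x86-64-avx2-linux-no_runtime-no_bounds_query-no_asserts"});
    return 0;
}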
So where does this stand -- is it affecting x86 only, or is it more general? Do we need a revert? An urgent fix?
It's more general. There's a workaround, but I think a timely fix would be good. I don't think we should revert, because the PR that introduced this also fixed an out-of-bounds read on ARM.
This is also a problem for internal buffers, because it means we can't use aligned_alloc (#7190) without adding a whole extra alignment's worth of padding bytes at the start and end.
Edit: After more careful examination of the status quo, I believe the allocator never has to worry about over- or under-reads. Base pointers of loads are only ever subtracted to bring them into alignment with the min coordinate, which can't under-read, and any over-read off the end is handled by the padding added in CodeGen_Posix.
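To illustrate that conclusion, here is a minimal allocator sketch consistent with it (my own illustration, not Halide code): tail padding absorbs vector over-reads off the end, and since load base pointers are only ever adjusted downward to an alignment boundary, handing out an already-aligned base makes that adjustment a no-op, so there is no under-read to protect against.

#include <cstdlib>
#include <cstddef>

// Illustration only: an aligned allocation with vector-width tail slack.
// Note aligned_alloc requires the size to be a multiple of the alignment.
void *allocate_buffer(size_t size, size_t alignment, size_t vector_bytes) {
    size_t padded = size + vector_bytes;  // slack for over-reads off the end
    padded = (padded + alignment - 1) / alignment * alignment;
    return std::aligned_alloc(alignment, padded);
}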
@abadams commented in chat: "We really need a compiler pass that can take a global view of what loads are happening in a stmt... StageStridedLoads or some such thing, then CodeGen_LLVM should just do the naive thing".
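For concreteness, here is roughly the inner loop one would hope "the naive thing" produces for the program above, written as a standalone AVX2 intrinsics sketch (the function and its names are mine, not Halide output): two dense loads per 32 outputs, a byte de-interleave, one add, one permute, one store.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Process n_vectors blocks of 32 outputs. `in` must have 64 * n_vectors
// readable bytes, `out` 32 * n_vectors writable bytes.
void downsample2x(const uint8_t *in, uint8_t *out, size_t n_vectors) {
    const __m256i lo_mask = _mm256_set1_epi16(0x00FF);
    for (size_t i = 0; i < n_vectors; i++) {
        // Two dense 32-byte loads cover the 64 input bytes for 32 outputs.
        __m256i a = _mm256_loadu_si256((const __m256i *)(in + 64 * i));
        __m256i b = _mm256_loadu_si256((const __m256i *)(in + 64 * i + 32));
        // Even bytes (im(2*x)) via masking each 16-bit lane...
        __m256i even = _mm256_packus_epi16(_mm256_and_si256(a, lo_mask),
                                           _mm256_and_si256(b, lo_mask));
        // ...odd bytes (im(2*x + 1)) via a 16-bit right shift.
        __m256i odd = _mm256_packus_epi16(_mm256_srli_epi16(a, 8),
                                          _mm256_srli_epi16(b, 8));
        // Wrapping 8-bit add, matching Halide's UInt(8) semantics.
        __m256i sum = _mm256_add_epi8(even, odd);
        // packus packs within 128-bit lanes; restore element order.
        sum = _mm256_permute4x64_epi64(sum, 0xD8);
        _mm256_storeu_si256((__m256i *)(out + 32 * i), sum);
    }
}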