simple strided access to input buffers generates terrible asm
This simple 2x downsample program:
#include "Halide.h"
using namespace Halide;
int main(int argc, char **argv) {
Func f, g;
Var x, y;
ImageParam im(UInt(8), 1);
g(x) = im(2 * x) + im(2 * x + 1);
g.vectorize(x, 32, TailStrategy::RoundUp);
g.compile_to_assembly("/dev/stdout", {im}, Target{"x86-64-avx2-linux-no_runtime-no_bounds_query-no_asserts"});
return 0;
}
generates this horrifying inner loop:
.LBB0_2:
movslq %edx, %rdx
vpinsrw $6, 60(%rsi,%rdx), %xmm0, %xmm5
vmovdqu (%rsi,%rdx), %ymm3
vpshufb %ymm2, %ymm3, %ymm3
movq 48(%rsi,%rdx), %rcx
vmovq %rcx, %xmm4
vpinsrd $2, 56(%rsi,%rdx), %xmm4, %xmm6
vmovdqu (%rsi,%rdx), %xmm7
vmovdqu 32(%rsi,%rdx), %xmm4
vpshufb %xmm0, %xmm4, %xmm4
vpinsrb $8, %ecx, %xmm4, %xmm1
vmovdqu 16(%rsi,%rdx), %xmm4
movl %ecx, %eax
shrl $16, %eax
vpinsrb $9, %eax, %xmm1, %xmm1
movq %rcx, %rax
shrq $32, %rax
vpinsrb $10, %eax, %xmm1, %xmm1
shrq $48, %rcx
vpinsrb $11, %ecx, %xmm1, %xmm1
vpextrb $8, %xmm6, %eax
vpinsrb $12, %eax, %xmm1, %xmm1
vpextrb $10, %xmm6, %eax
vpinsrb $13, %eax, %xmm1, %xmm1
vpextrb $12, %xmm5, %eax
vpinsrb $14, %eax, %xmm1, %xmm1
vpinsrb $15, 62(%rsi,%rdx), %xmm1, %xmm1
vpshufb %xmm0, %xmm7, %xmm5
vmovd %xmm4, %eax
vpinsrb $8, %eax, %xmm5, %xmm5
vpextrb $2, %xmm4, %eax
vpinsrb $9, %eax, %xmm5, %xmm5
vpextrb $4, %xmm4, %eax
vpinsrb $10, %eax, %xmm5, %xmm5
vpextrb $6, %xmm4, %eax
vpinsrb $11, %eax, %xmm5, %xmm5
vpextrb $8, %xmm4, %eax
vpinsrb $12, %eax, %xmm5, %xmm5
vpextrb $10, %xmm4, %eax
vpinsrb $13, %eax, %xmm5, %xmm5
vpextrb $12, %xmm4, %eax
vpinsrb $14, %eax, %xmm5, %xmm5
vpextrb $14, %xmm4, %eax
vpinsrb $15, %eax, %xmm5, %xmm4
vinserti128 $1, %xmm1, %ymm4, %ymm1
vmovdqu 32(%rsi,%rdx), %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpblendd $204, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
vpermq $216, %ymm3, %ymm3 # ymm3 = ymm3[0,2,1,3]
vpaddb %ymm1, %ymm3, %ymm1
vmovdqu %ymm1, (%r8,%rdi)
addq $32, %rdi
addl $64, %edx
cmpq %rdi, %r9
jne .LBB0_2
The reason for this is complex, and comes down to reasoning only locally about each strided load, not wanting to read out of bounds on input buffers, and then having to load non-power-of-two vectors. A workaround is to stage the input explicitly with a .in(), but that runs into issue #6816.
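For reference, the staging workaround looks roughly like this. This is a sketch of my own, not a schedule from the issue: the wrapper name and the vector widths are one plausible way to stage it, and as noted above it trips #6816.

#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    Func g;
    Var x;
    ImageParam im(UInt(8), 1);
    g(x) = im(2 * x) + im(2 * x + 1);

    // Stage the input through a wrapper Func, so g's vector loads read
    // dense data from a small intermediate instead of doing stride-2
    // loads on the input buffer directly.
    Func staged = im.in();
    g.vectorize(x, 32, TailStrategy::RoundUp);
    staged.compute_at(g, x).vectorize(_0, 64, TailStrategy::RoundUp);

    g.compile_to_assembly("/dev/stdout", {im},
                          Target{"x86-64-avx2-linux-no_runtime-no_bounds_query-no_asserts"});
    return 0;
}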
So where does this stand -- is it affecting x86 only, or is it more general? Do we need a revert? An urgent fix?
It's more general. There's a workaround, but I think a timely fix would be good. I don't think we should revert, because the PR that introduced this also fixed an out-of-bounds read on ARM.
This is also a problem for internal buffers, because it means we can't use aligned_alloc (#7190) without adding a whole extra alignment's worth of padding bytes at the start and end.
Edit: After more careful examination of the status quo, I believe the allocator never has to worry about over- or under-reads. Base pointers of loads are only ever subtracted to bring them into alignment with the min coordinate, which can't under-read, and any over-read off the end is handled by the padding added in CodeGen_Posix.
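To illustrate that conclusion, here is a minimal allocator sketch consistent with it (my own illustration, not Halide code): tail padding absorbs vector over-reads off the end, and since load base pointers are only ever adjusted downward to an alignment boundary, handing out an already-aligned base makes that adjustment a no-op, so there is no under-read to protect against.

#include <cstdlib>
#include <cstddef>

// Illustration only: an aligned allocation with vector-width tail slack.
// Note aligned_alloc requires the size to be a multiple of the alignment.
void *allocate_buffer(size_t size, size_t alignment, size_t vector_bytes) {
    size_t padded = size + vector_bytes;  // slack for over-reads off the end
    padded = (padded + alignment - 1) / alignment * alignment;
    return std::aligned_alloc(alignment, padded);
}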
@abadams commented in chat: "We really need a compiler pass that can take a global view of what loads are happening in a stmt... StageStridedLoads or some such thing, then CodeGen_LLVM should just do the naive thing".
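For concreteness, here is roughly the inner loop one would hope "the naive thing" produces for the program above, written as a standalone AVX2 intrinsics sketch (the function and its names are mine, not Halide output): two dense loads per 32 outputs, a byte de-interleave, one add, one permute, one store.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Process n_vectors blocks of 32 outputs. `in` must have 64 * n_vectors
// readable bytes, `out` 32 * n_vectors writable bytes.
void downsample2x(const uint8_t *in, uint8_t *out, size_t n_vectors) {
    const __m256i lo_mask = _mm256_set1_epi16(0x00FF);
    for (size_t i = 0; i < n_vectors; i++) {
        // Two dense 32-byte loads cover the 64 input bytes for 32 outputs.
        __m256i a = _mm256_loadu_si256((const __m256i *)(in + 64 * i));
        __m256i b = _mm256_loadu_si256((const __m256i *)(in + 64 * i + 32));
        // Even bytes (im(2*x)) via masking each 16-bit lane...
        __m256i even = _mm256_packus_epi16(_mm256_and_si256(a, lo_mask),
                                           _mm256_and_si256(b, lo_mask));
        // ...odd bytes (im(2*x + 1)) via a 16-bit right shift.
        __m256i odd = _mm256_packus_epi16(_mm256_srli_epi16(a, 8),
                                          _mm256_srli_epi16(b, 8));
        // Wrapping 8-bit add, matching Halide's UInt(8) semantics.
        __m256i sum = _mm256_add_epi8(even, odd);
        // packus packs within 128-bit lanes; restore element order.
        sum = _mm256_permute4x64_epi64(sum, 0xD8);
        _mm256_storeu_si256((__m256i *)(out + 32 * i), sum);
    }
}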