Fast ISel on AArch64 produces an invalid relocation/relaxation sequence
When using -O1, fast ISel, and sanitizers on ARM64 / macOS it seems that Clang 15 miscompiles Abseil's flag parsing code.
There is code to set a global variable to a specific value which looks like this:
adrp x10, __ZN4absl14flags_internal12_GLOBAL__N_115specified_flagsE@PAGE
add x8, x10, __ZN4absl14flags_internal12_GLOBAL__N_115specified_flagsE@PAGEOFF
str x20, [x8]
While superficially this looks correct, when linked (both with LLD and LD64 as far as I can tell) the instruction sequence becomes:
0x10078a25c <+2812>: adrp x10, 3749
0x10078a260 <+2816>: nop
0x10078a264 <+2820>: str x20, [x8, #0x9c0]
Because the str still uses x8 as the base register rather than x10, the address is based on a stale value left over from somewhere else.
My assumption is that this sequence of relocations are intended to enable this linker relaxation and are supposed to use a single register in order to make the relaxation simple to apply. At least, that seems likely given that both LLD and LD64 don't do any validity checks and produce the same result.
Unsure what the best reproducer would be here as reducing this is tricky without disturbing the pattern that allows the relaxation to apply or the registers used. I have the IR input for the whole function from Abseil that produces it, although that's somewhat large. I'm trying some ideas for further minimizing it.
Reproduction steps:
> llc -fast-isel --relocation-model=pic parse.onefunction.ll -o - | grep -C3 'add\s*x8, x10, .*specified_flagsE@PAGEOFF'
Lloh26:
adrp x10, __ZN4absl14flags_internal12_GLOBAL__N_115specified_flagsE@PAGE
Lloh27:
add x8, x10, __ZN4absl14flags_internal12_GLOBAL__N_115specified_flagsE@PAGEOFF
Lloh28:
str x20, [x8]
I tried to build a minimal reproduction, but while it produces a similar assembly sequence, the link step doesn't do the same relaxation I observe in the real world test case. Unsure what else I can provide.
I can give steps to reproduce this with an open source project and on an ARM Mac, but it's pretty involved. Let me know what would help.
My minimal repro attempt that doesn't work:
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx12.0.0"
@my_global2 = global { [32 x i8] } zeroinitializer
@my_global = global { ptr, [24 x i8] } zeroinitializer
@my_global3 = global { [32 x i8] } zeroinitializer
define void @my_function(ptr %0, i1 %1) personality ptr null {
store ptr %0, ptr @my_global, align 8
br label %3
3: ; preds = %2
br i1 %1, label %4, label %5
4: ; preds = %3
call void @__asan_report_load8()
unreachable
5: ; preds = %3
br label %6
6: ; preds = %5
%7 = load ptr, ptr @my_global, align 8
br label %8
8: ; preds = %6
call void @my_function2(ptr %7)
br label %9
9: ; preds = %8
%10 = call ptr null(ptr null)
ret void
}
declare void @_exit(i64)
define void @my_function2(ptr) {
call void @_exit(i64 0)
ret void
}
declare void @abort()
define void @__asan_report_load8() {
call void @abort()
ret void
}
define void @main() {
%p = alloca { ptr, [24 x i8] }
call void @my_function(ptr @my_global2, i1 false)
ret void
}
Running this command on the above:
> llc -fast-isel --function-sections --data-sections --relocation-model=pic reduced.test.ll -o -
Produces:
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 12, 0
.globl _my_function ; -- Begin function my_function
.p2align 2
_my_function: ; @my_function
Lfunc_begin0:
.cfi_startproc
; %bb.0:
stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
.cfi_def_cfa_offset 16
.cfi_offset w30, -8
.cfi_offset w29, -16
adrp x8, _my_global@PAGE
add x9, x8, _my_global@PAGEOFF
str x0, [x9]
tbnz w1, #0, LBB0_2
; %bb.1:
ldr x0, [x8, _my_global@PAGEOFF]
bl _my_function2
brk #0x1
LBB0_2:
bl ___asan_report_load8
brk #0x1
Lfunc_end0:
.cfi_endproc
; -- End function
.globl _my_function2 ; -- Begin function my_function2
.p2align 2
_my_function2: ; @my_function2
.cfi_startproc
; %bb.0:
stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
.cfi_def_cfa_offset 16
.cfi_offset w30, -8
.cfi_offset w29, -16
mov x0, xzr
bl __exit
ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
ret
.cfi_endproc
; -- End function
.globl ___asan_report_load8 ; -- Begin function __asan_report_load8
.p2align 2
___asan_report_load8: ; @__asan_report_load8
.cfi_startproc
; %bb.0:
stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
.cfi_def_cfa_offset 16
.cfi_offset w30, -8
.cfi_offset w29, -16
bl _abort
ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
ret
.cfi_endproc
; -- End function
.globl _main ; -- Begin function main
.p2align 2
_main: ; @main
.cfi_startproc
; %bb.0:
sub sp, sp, #48
.cfi_def_cfa_offset 48
stp x29, x30, [sp, #32] ; 16-byte Folded Spill
.cfi_offset w30, -8
.cfi_offset w29, -16
Lloh0:
adrp x8, _my_global2@PAGE
Lloh1:
add x0, x8, _my_global2@PAGEOFF
and w1, wzr, #0x1
bl _my_function
ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
add sp, sp, #48
ret
.loh AdrpAdd Lloh0, Lloh1
.cfi_endproc
; -- End function
.globl _my_global2 ; @my_global2
.zerofill __DATA,__common,_my_global2,32,4
.globl _my_global ; @my_global
.zerofill __DATA,__common,_my_global,32,4
.globl _my_global3 ; @my_global3
.zerofill __DATA,__common,_my_global3,32,4
.subsections_via_symbols
But linking it doesn't reproduce the weird relaxation:
> clang -fpie -fdata-sections -ffunction-sections -O1 -o reduced.test reduced.test.s; and otool -t -v reduced.test
reduced.test:
(__TEXT,__text) section
_my_function:
0000000100003e78 stp x29, x30, [sp, #-0x10]!
0000000100003e7c adrp x8, 5 ; 0x100008000
0000000100003e80 add x9, x8, #0x20
0000000100003e84 str x0, [x9]
0000000100003e88 tbnz w1, #0x0, 0x100003e98
0000000100003e8c ldr x0, [x8, #0x20]
0000000100003e90 bl _my_function2
0000000100003e94 brk #0x1
0000000100003e98 bl ___asan_report_load8
0000000100003e9c brk #0x1
_my_function2:
0000000100003ea0 stp x29, x30, [sp, #-0x10]!
0000000100003ea4 mov x0, xzr
0000000100003ea8 bl 0x100003ee8 ; symbol stub for: __exit
0000000100003eac ldp x29, x30, [sp], #0x10
0000000100003eb0 ret
___asan_report_load8:
0000000100003eb4 stp x29, x30, [sp, #-0x10]!
0000000100003eb8 bl 0x100003ef4 ; symbol stub for: _abort
0000000100003ebc ldp x29, x30, [sp], #0x10
0000000100003ec0 ret
_main:
0000000100003ec4 sub sp, sp, #0x30
0000000100003ec8 stp x29, x30, [sp, #0x20]
0000000100003ecc adr x0, #0x4134
0000000100003ed0 nop
0000000100003ed4 and w1, wzr, #0x1
0000000100003ed8 bl _my_function
0000000100003edc ldp x29, x30, [sp, #0x20]
0000000100003ee0 add sp, sp, #0x30
0000000100003ee4 ret
@llvm/issue-subscribers-backend-aarch64
As pointed out on Twitter, LD64 doesn't seem to assert that the adrp destination and add destination match, but it also doesn't update them to match, just folds the offset into the addressing mode:
https://github.com/apple-opensource/ld64/blob/609/src/ld/OutputFile.cpp#L2509,L2513
Unclear what the fix should be here:
- Avoid emitting this in LLVM
- Teach the linker to check and skip transforming the pattern
- Teach the linker to update the base register
The problem is likely to be in AArch64CollectLOH.cpp, where it tries to find these patterns and emit the linker hints. ld64 assumes that the compiler has done all the necessary checks.
We've had problems recently with this pass before, I have no idea why it never came up in all these years and within the last year or so we've hit multiple.
@chandlerc Are you sure that the bug reproduces when linking with ld64.lld?
Your IR gets generated to asm with an AddrpAddStr linker optimization hint, but even on trunk that particular hint is not implemented in lld
https://github.com/llvm/llvm-project/blob/08901c8a980d98672d456558fac3b2bee990bf61/lld/MachO/Arch/ARM64.cpp#L668-L671
https://github.com/llvm/llvm-project/issues/50399#issuecomment-1194143464
@chandlerc Are you sure that the bug reproduces when linking with ld64.lld?
It did with -fuse-ld=lld, but I can check later to see if this was actually still using ld64 for some reason or something stripped out that flag without my realizing; it's entirely possible.
The simplest way to check if it was indeed linked with ld64 is to run otool -l on the linked binary and check the version field belonging to tool 3 in the LC_BUILD_VERSION load command. If it is LLVM's version, the binary used LLD; otherwise it should match the number in ld -v.
(IIRC Apple has a tool constant for LLD (not sure what the deal with that is), but we still specify the same TOOL_LD in the build_tool_version struct as ld64)
Several folks on Twitter helped me learn about the LOH hints that need to be present for the linker behavior I saw to fire, and so took another shot at reducing this from the original test case, mostly just for fun...
I got the following reduction that preserves the LOH stuff that seems the root of the issue: https://gist.github.com/chandlerc/48a4b7c99797a1f55653f63fece3f30e
With that:
> llc -fast-isel --relocation-model=pic reduced.test.ll -o - | rg -U -C5 '^\s+adrp.*\nLloh.*\n\s+add.*global\.1.*PAGEOFF\nLloh.*\n\s+str'
mov x0, xzr
bl _bar
Ltmp6:
; %bb.6: ; %bb57
Lloh6:
adrp x6, _global.1@PAGE
Lloh7:
add x8, x6, _global.1@PAGEOFF
Lloh8:
str x20, [x8]
LBB2_7: ; %bb62
ldrb w12, [x29, #57]
mov x10, xzr
mov x9, xzr
add x10, x10, #8
I've think I've contrived to make this linkable as well, and when I link it and disassemble you can see the misfiring linker transformation:
> llc -fast-isel --relocation-model=pic reduced.test.ll -o - | clang -fpie -O1 -o reduced.test -x assembler -; and otool -t -v reduced.test | rg -A3 'bl\s*_bar$'
0000000100003ac4 bl _bar
0000000100003ac8 adrp x6, 513 ; 0x100204000
0000000100003acc nop
0000000100003ad0 str x20, [x8, #0xe0]
@chandlerc Are you sure that the bug reproduces when linking with ld64.lld?
It did with -fuse-ld=lld, but I can check later to see if this was actually still using ld64 for some reason or something stripped out that flag without my realizing; it's entirely possible.
Now that I have a stand-alone reproduction, indeed, I only see this with ld64 itself. =] The flag to use LLD wasn't actually being passed in the build system in the original case where I thought it was.