nmodl [LLVM] Basic scalable vector support draft

This is a draft PR that adds SVE support to the LLVM code generation backend. To use scalable vectors, a new --scalable option is added.

Currently, we support basic code generation, including all points mentioned in #637. Note that this has not been properly checked (predication was definitely not checked), and even valid LLVM IR may not generate correct assembly due to LLVM limitations (for example, the maximum alignment for scalable vectors is 16).

Example using a constant (https://godbolt.org/z/nK9j3sdM6):

// test.mod
NEURON {
    SUFFIX test
    RANGE x, y
}

ASSIGNED { x y }

STATE { m }

BREAKPOINT {
    SOLVE states METHOD cnexp
}

DERIVATIVE states {
   m = y + 2
}

;bin/nmodl -o llvm  ../../nmodl/test/integration/mod/test.mod llvm --ir --single-precision --vector-width 2 --scalable --disable-debug-info
; ModuleID = 'test'
source_filename = "test"

%test__instance_var__type = type { float*, float*, float*, float*, float*, float*, float*, i32*, float, float, float, i32, i32 }

; Function Attrs: nofree nounwind
define void @nrn_state_test(%test__instance_var__type* noalias nocapture readonly %mech1) #0 {
  %mech = alloca %test__instance_var__type*, align 8
  %id = alloca i32, align 4
  %node_id = alloca <vscale x 2 x i32>, align 8
  %v = alloca <vscale x 2 x float>, align 8
  %epilogue_node_id = alloca i32, align 4
  %epilogue_v = alloca float, align 4
  store %test__instance_var__type* %mech1, %test__instance_var__type** %mech, align 8
  store i32 0, i32* %id, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %0
  %1 = call i32 @llvm.vscale.i32()
  %2 = mul i32 %1, 2
  %3 = sub i32 %2, 1
  %4 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %5 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %4, i32 0, i32 12
  %6 = load i32, i32* %5, align 4
  %7 = sub i32 %6, %3
  %8 = load i32, i32* %id, align 4
  %9 = icmp slt i32 %8, %7
  br i1 %9, label %for.body, label %for.exit

for.body:                                         ; preds = %for.cond
  %10 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %11 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %10, i32 0, i32 7
  %12 = load i32, i32* %id, align 4
  %13 = sext i32 %12 to i64
  %14 = load i32*, i32** %11, align 8
  %15 = getelementptr inbounds i32, i32* %14, i64 %13
  %16 = bitcast i32* %15 to <vscale x 2 x i32>*
  %17 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %16, align 8
  store <vscale x 2 x i32> %17, <vscale x 2 x i32>* %node_id, align 8
  %18 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %19 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %18, i32 0, i32 6
  %20 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %node_id, align 8
  %21 = sext <vscale x 2 x i32> %20 to <vscale x 2 x i64>
  %22 = load float*, float** %19, align 8
  %23 = getelementptr inbounds float, float* %22, <vscale x 2 x i64> %21
  %24 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> %23, i32 1, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> undef)
  store <vscale x 2 x float> %24, <vscale x 2 x float>* %v, align 8
  %25 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %26 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %25, i32 0, i32 1
  %27 = load i32, i32* %id, align 4
  %28 = sext i32 %27 to i64
  %29 = load float*, float** %26, align 8
  %30 = getelementptr inbounds float, float* %29, i64 %28
  %31 = bitcast float* %30 to <vscale x 2 x float>*
  %32 = load <vscale x 2 x float>, <vscale x 2 x float>* %31, align 8
  %33 = fadd <vscale x 2 x float> %32, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 2.000000e+00, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
  %34 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %35 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %34, i32 0, i32 2
  %36 = load i32, i32* %id, align 4
  %37 = sext i32 %36 to i64
  %38 = load float*, float** %35, align 8
  %39 = getelementptr inbounds float, float* %38, i64 %37
  %40 = bitcast float* %39 to <vscale x 2 x float>*
  store <vscale x 2 x float> %33, <vscale x 2 x float>* %40, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %41 = call i32 @llvm.vscale.i32()
  %42 = mul i32 %41, 2
  %43 = load i32, i32* %id, align 4
  %44 = add i32 %43, %42
  store i32 %44, i32* %id, align 4
  br label %for.cond

for.exit:                                         ; preds = %for.cond
  br label %for.cond2

for.cond2:                                        ; preds = %for.inc4, %for.exit
  %45 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %46 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %45, i32 0, i32 12
  %47 = load i32, i32* %46, align 4
  %48 = load i32, i32* %id, align 4
  %49 = icmp slt i32 %48, %47
  br i1 %49, label %for.body3, label %for.exit5

for.body3:                                        ; preds = %for.cond2
  %50 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %51 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %50, i32 0, i32 7
  %52 = load i32, i32* %id, align 4
  %53 = sext i32 %52 to i64
  %54 = load i32*, i32** %51, align 8
  %55 = getelementptr inbounds i32, i32* %54, i64 %53
  %56 = load i32, i32* %55, align 4
  store i32 %56, i32* %epilogue_node_id, align 4
  %57 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %58 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %57, i32 0, i32 6
  %59 = load i32, i32* %epilogue_node_id, align 4
  %60 = sext i32 %59 to i64
  %61 = load float*, float** %58, align 8
  %62 = getelementptr inbounds float, float* %61, i64 %60
  %63 = load float, float* %62, align 4
  store float %63, float* %epilogue_v, align 4
  %64 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %65 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %64, i32 0, i32 1
  %66 = load i32, i32* %id, align 4
  %67 = sext i32 %66 to i64
  %68 = load float*, float** %65, align 8
  %69 = getelementptr inbounds float, float* %68, i64 %67
  %70 = load float, float* %69, align 4
  %71 = fadd float %70, 2.000000e+00
  %72 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %73 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %72, i32 0, i32 2
  %74 = load i32, i32* %id, align 4
  %75 = sext i32 %74 to i64
  %76 = load float*, float** %73, align 8
  %77 = getelementptr inbounds float, float* %76, i64 %75
  store float %71, float* %77, align 4
  br label %for.inc4

for.inc4:                                         ; preds = %for.body3
  %78 = load i32, i32* %id, align 4
  %79 = add i32 %78, 1
  store i32 %79, i32* %id, align 4
  br label %for.cond2

for.exit5:                                        ; preds = %for.cond2
  ret void
}

; Function Attrs: nofree nosync nounwind readnone willreturn
declare i32 @llvm.vscale.i32() #1

; Function Attrs: nofree nosync nounwind readonly willreturn
declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*>, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x float>) #2

attributes #0 = { nofree nounwind "target-features"="+sve,+sve"}
; we actually generate attributes #0 = { nofree nounwind }, see todos
attributes #1 = { nofree nosync nounwind readnone willreturn }
attributes #2 = { nofree nosync nounwind readonly willreturn }

; llc -O3 -mtriple=aarch64--linux-gnu
nrn_state_test:                         // @nrn_state_test
        str     x29, [sp, #-16]!                // 8-byte Folded Spill
        addvl   sp, sp, #-1
        sub     sp, sp, #16                     // =16
        addvl   x8, sp, #1
        str     x0, [x8, #24]
        cntd    x8
        ptrue   p0.d
        neg     x9, x8
        fmov    z0.s, #2.00000000
        str     wzr, [sp, #12]
.LBB0_1:                                // %for.cond
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldr     w11, [sp, #12]
        ldr     w10, [x10, #80]
        add     w10, w10, w9
        add     w10, w10, #1                    // =1
        cmp     w11, w10
        b.ge    .LBB0_3
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldrsw   x11, [sp, #12]
        ldp     x10, x12, [x10, #48]
        ld1sw   { z1.d }, p0/z, [x12, x11, lsl #2]
        add     x11, sp, #16                    // =16
        st1w    { z1.d }, p0, [x11, #1, mul vl]
        ld1w    { z1.d }, p0/z, [x10, z1.d, lsl #2]
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldrsw   x12, [sp, #12]
        ldr     w13, [sp, #12]
        ldp     x11, x10, [x10, #8]
        ld1w    { z2.d }, p0/z, [x11, x12, lsl #2]
        add     x11, sp, #16                    // =16
        st1w    { z1.d }, p0, [x11]
        add     w11, w13, w8
        movprfx z1, z2
        fadd    z1.s, p0/m, z1.s, z0.s
        st1w    { z1.d }, p0, [x10, x12, lsl #2]
        str     w11, [sp, #12]
        b       .LBB0_1
.LBB0_3:
        fmov    s0, #2.00000000
        addvl   x8, sp, #1
        ldr     x8, [x8, #24]
        ldr     w9, [sp, #12]
        ldr     w8, [x8, #80]
        cmp     w9, w8
        b.ge    .LBB0_5
.LBB0_4:                                // %for.body3
        addvl   x9, sp, #1
        ldr     x9, [x9, #24]
        ldrsw   x8, [sp, #12]
        ldp     x13, x11, [x9, #48]
        lsl     x10, x8, #2
        ldp     x12, x9, [x9, #8]
        add     w8, w8, #1                      // =1
        ldrsw   x11, [x11, x10]
        ldr     s1, [x12, x10]
        ldr     s2, [x13, x11, lsl #2]
        stp     w11, w8, [sp, #8]
        fadd    s1, s1, s0
        str     s1, [x9, x10]
        str     s2, [sp, #4]
        addvl   x8, sp, #1
        ldr     x8, [x8, #24]
        ldr     w9, [sp, #12]
        ldr     w8, [x8, #80]
        cmp     w9, w8
        b.lt    .LBB0_4
.LBB0_5:                                // %for.exit5
        addvl   sp, sp, #1
        add     sp, sp, #16                     // =16
        ldr     x29, [sp], #16                  // 8-byte Folded Reload
        ret

Problems: if we use a simple math function such as exp in the kernel (see https://godbolt.org/z/18jEeP65G), then llc fails with:

ERROR: Invalid size request on a scalable vector.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /opt/compiler-explorer/clang-trunk/bin/llc -o /app/output.s -x86-asm-syntax=intel -O3 -mtriple=aarch64--linux-gnu <source>
1.	Running pass 'Function Pass Manager' on module '<source>'.
2.	Running pass 'AArch64 Instruction Selection' on function '@nrn_state_test'
 #0 0x000055d6059b7a0c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/opt/compiler-explorer/clang-trunk/bin/llc+0x2900a0c)
 #1 0x000055d6059b5994 llvm::sys::RunSignalHandlers() (/opt/compiler-explorer/clang-trunk/bin/llc+0x28fe994)
 #2 0x000055d6059b5b03 SignalHandler(int) Signals.cpp:0:0
 #3 0x00007feef253a3c0 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x153c0)
 #4 0x00007feef200a18b raise (/lib/x86_64-linux-gnu/libc.so.6+0x4618b)
 #5 0x00007feef1fe9859 abort (/lib/x86_64-linux-gnu/libc.so.6+0x25859)
 #6 0x000055d605924536 llvm::report_fatal_error(llvm::Twine const&, bool) (/opt/compiler-explorer/clang-trunk/bin/llc+0x286d536)
 #7 0x000055d605924668 (/opt/compiler-explorer/clang-trunk/bin/llc+0x286d668)
 #8 0x000055d60596a06d (/opt/compiler-explorer/clang-trunk/bin/llc+0x28b306d)
 #9 0x000055d6057eefdd llvm::SelectionDAG::UnrollVectorOp(llvm::SDNode*, unsigned int) (/opt/compiler-explorer/clang-trunk/bin/llc+0x2737fdd)
#10 0x000055d60587106d (anonymous namespace)::VectorLegalizer::Expand(llvm::SDNode*, llvm::SmallVectorImpl<llvm::SDValue>&) LegalizeVectorOps.cpp:0:0
#11 0x000055d605875b44 (anonymous namespace)::VectorLegalizer::LegalizeOp(llvm::SDValue) LegalizeVectorOps.cpp:0:0
#12 0x000055d6058778bb llvm::SelectionDAG::LegalizeVectors() (/opt/compiler-explorer/clang-trunk/bin/llc+0x27c08bb)
#13 0x000055d605803f7b llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/opt/compiler-explorer/clang-trunk/bin/llc+0x274cf7b)
#14 0x000055d6058075e4 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x27505e4)
#15 0x000055d605809602 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.859) SelectionDAGISel.cpp:0:0
#16 0x000055d604ea1578 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x1dea578)
#17 0x000055d60529ae37 llvm::FPPassManager::runOnFunction(llvm::Function&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e3e37)
#18 0x000055d60529b551 llvm::FPPassManager::runOnModule(llvm::Module&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e4551)
#19 0x000055d60529a08f llvm::legacy::PassManagerImpl::run(llvm::Module&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e308f)
#20 0x000055d603919ab1 compileModule(char**, llvm::LLVMContext&) llc.cpp:0:0
#21 0x000055d603848446 main (/opt/compiler-explorer/clang-trunk/bin/llc+0x791446)
#22 0x00007feef1feb0b3 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x270b3)
#23 0x000055d60391186a _start (/opt/compiler-explorer/clang-trunk/bin/llc+0x85a86a)
Compiler returned: 139

This is likely a bug on the LLVM side (or exp simply should not be used with scalable vectors). If the exponential call is replaced with some other function call (i.e. llvm.exp.nxv2f32 -> something that takes a <vscale x 2 x float> argument), then no error occurs.

Plan:

Implementation & Testing

[x] Support ScalableVectorType
[x] Support induction variable increments with llvm.vscale.i32() call
[x] Support scalable constants
[x] Add target features (+sve) [See pramodk/llvm-scalable-vectors]
[ ] Implement IR tests
[ ] Implement execution tests (if using SVE platform)
[ ] Remove hardcoded llvm.vscale.i32() from LLVM helper visitor

Questions

[ ] Problem: math intrinsics do not work with scalable vectors
[ ] How do we call SIMD math functions if the vector width is unknown at compile time?

May 31 '21 14:05 georgemitenkov

Can one of the admins verify this patch?

May 31 '21 14:05 bbpbuildbot

If I understand correctly, <n x m x type> means an unknown multiple (n times) of m x type, where m is the minimum number of elements. Since the minimum for SVE is 128 bits, shouldn't it be <vscale x 4 x float> or <vscale x 2 x double>? If the above is correct, we should make this automatic: you should be able to specify either --vector-width <m> or --scalable, and in the latter case the minimum width is selected based on the type.

Jun 08 '21 10:06 castigli