Enzyme.jl icon indicating copy to clipboard operation
Enzyme.jl copied to clipboard

Slow performance on simple element-wise array multiplication

Open oschulz opened this issue 2 years ago • 7 comments

`f!(C, A, B)` runs in about 90 ns, but `Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)` takes about 2000 ns with Enzyme v0.10.18:

using Enzyme, Random, BenchmarkTools
using Base.Experimental: @aliasscope, Const

#Enzyme.API.printperf!(true)
#Enzyme.API.printall!(true)

# Problem size: every array below is an n-by-m (12-by-15) matrix.
n, m = 12, 15
# Two random input matrices.
A = rand(n, m)
B = rand(n, m)
# Output buffer: zeros with the same shape and element type as A.
C = zero(A)

"""
    f!(C, A, B)

Overwrite `C` with the element-wise product of `A` and `B` and return `nothing`.

`A` and `B` are wrapped in `Base.Experimental.Const` inside an `@aliasscope`, so they
are only read through those wrappers. The loops run over the axes of `A` with bounds
checking disabled, so `B` and `C` must be indexable at every index of `A`.
"""
function f!(C, A, B)
    @aliasscope let lhs = Const(A), rhs = Const(B)
        # Column index outermost, row index innermost.
        @inbounds for col in axes(lhs, 2)
            for row in axes(lhs, 1)
                C[row, col] = lhs[row, col] * rhs[row, col]
            end
        end
    end
    return nothing
end

# Run the primal function once on the plain arrays (warm-up before any timing).
f!(C, A, B)
# @benchmark f!($C, $A, $B)

# Pair each array with a second array of the same shape: zeros for A and B,
# random values for C.
# NOTE(review): presumably the second array is Enzyme's shadow (derivative) buffer,
# with the random one for C acting as the reverse-mode seed; confirm against the
# Enzyme documentation.
dpl_A = Duplicated(A, zero(A))
dpl_B = Duplicated(B, zero(B))
dpl_C = Duplicated(C, rand!(copy(C)))

# Very slow - why?
Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)
# @benchmark Enzyme.autodiff(Reverse, f!, $dpl_C, $dpl_A, $dpl_B)

With `Enzyme.API.printperf!(true)` enabled, I get:

Load may need caching   %19 = load i64, i64 addrspace(11)* %18, align 8 due to   store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Load may need caching   %27 = load i64, i64 addrspace(11)* %26, align 8 due to   store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Load must be recomputed   %27 = load i64, i64 addrspace(11)* %26, align 8 in reverse_invertL28.i due to   store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Caching instruction   %27 = load i64, i64 addrspace(11)* %26, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
Load must be recomputed   %19 = load i64, i64 addrspace(11)* %18, align 8 in reverse_invertL28.i due to   store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Caching instruction   %19 = load i64, i64 addrspace(11)* %18, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1

@vchuravy suggested I open an issue.

oschulz avatar Feb 13 '23 13:02 oschulz

Pre opt:

after simplification :
; Function Attrs: mustprogress nofree nosync willreturn
define void @preprocess_julia_f__3847_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #3 !dbg !59 {
entry:
  %3 = call {}*** @julia.get_pgcstack() #4
  %4 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !60
  %5 = addrspacecast {} addrspace(10)* addrspace(10)* %4 to {} addrspace(10)* addrspace(11)*, !dbg !60
  %6 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %5, i64 4, !dbg !60
  %7 = bitcast {} addrspace(10)* addrspace(11)* %6 to i64 addrspace(11)*, !dbg !60
  %8 = load i64, i64 addrspace(11)* %7, align 16, !dbg !60, !tbaa !12, !range !16, !invariant.load !4
  %9 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %5, i64 3, !dbg !63
  %10 = bitcast {} addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*, !dbg !63
  %11 = load i64, i64 addrspace(11)* %10, align 8, !dbg !63, !tbaa !12, !range !16, !invariant.load !4
  %.not.not = icmp eq i64 %8, 0, !dbg !65
  br i1 %.not.not, label %julia_f__3847_inner.exit, label %L21.i.preheader, !dbg !72

L21.i.preheader:                                  ; preds = %entry
  %.not.not7 = icmp eq i64 %11, 0
  %12 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %13 = addrspacecast double addrspace(13)* addrspace(10)* %12 to double addrspace(13)* addrspace(11)*
  %14 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %13, align 16
  %15 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
  %16 = addrspacecast {} addrspace(10)* addrspace(10)* %15 to {} addrspace(10)* addrspace(11)*
  %17 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %16, i64 3
  %18 = bitcast {} addrspace(10)* addrspace(11)* %17 to i64 addrspace(11)*
  %19 = load i64, i64 addrspace(11)* %18, align 8
  %20 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %21 = addrspacecast double addrspace(13)* addrspace(10)* %20 to double addrspace(13)* addrspace(11)*
  %22 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %21, align 16
  %23 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %24 = addrspacecast {} addrspace(10)* addrspace(10)* %23 to {} addrspace(10)* addrspace(11)*
  %25 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %24, i64 3
  %26 = bitcast {} addrspace(10)* addrspace(11)* %25 to i64 addrspace(11)*
  %27 = load i64, i64 addrspace(11)* %26, align 8
  %28 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
  %29 = addrspacecast double addrspace(13)* addrspace(10)* %28 to double addrspace(13)* addrspace(11)*
  %30 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %29, align 16
  br i1 %.not.not7, label %julia_f__3847_inner.exit, label %L40.i.preheader.preheader, !dbg !72

L40.i.preheader.preheader:                        ; preds = %L21.i.preheader
  br label %L40.i.preheader, !dbg !73

L40.i.preheader:                                  ; preds = %L40.i.preheader.preheader, %L59.i
  %iv = phi i64 [ %iv.next, %L59.i ], [ 0, %L40.i.preheader.preheader ]
  %iv.next = add nuw nsw i64 %iv, 1
  %31 = add nsw i64 %iv.next, -1
  %32 = mul i64 %11, %31
  %33 = mul i64 %19, %31
  %34 = mul i64 %27, %31
  br label %L40.i, !dbg !73

L40.i:                                            ; preds = %L40.i, %L40.i.preheader
  %iv1 = phi i64 [ %iv.next2, %L40.i ], [ 0, %L40.i.preheader ]
  %iv.next2 = add nuw nsw i64 %iv1, 1, !dbg !74
  %35 = add nsw i64 %iv.next2, -1, !dbg !74
  %36 = add i64 %35, %32, !dbg !74
  %37 = getelementptr inbounds double, double addrspace(13)* %14, i64 %36, !dbg !74
  %38 = load double, double addrspace(13)* %37, align 8, !dbg !74, !tbaa !41, !alias.scope !44
  %39 = add i64 %35, %33, !dbg !74
  %40 = getelementptr inbounds double, double addrspace(13)* %22, i64 %39, !dbg !74
  %41 = load double, double addrspace(13)* %40, align 8, !dbg !74, !tbaa !41, !alias.scope !44
  %42 = fmul double %38, %41, !dbg !76
  %43 = add i64 %35, %34, !dbg !77
  %44 = getelementptr inbounds double, double addrspace(13)* %30, i64 %43, !dbg !77
  store double %42, double addrspace(13)* %44, align 8, !dbg !77, !tbaa !41, !noalias !44
  %.not.not8 = icmp eq i64 %iv.next2, %11, !dbg !78
  %45 = add nuw nsw i64 %iv.next2, 1, !dbg !81
  br i1 %.not.not8, label %L59.i, label %L40.i, !dbg !73

L59.i:                                            ; preds = %L40.i
  %.not = icmp eq i64 %iv.next, %8, !dbg !78
  %46 = add nuw nsw i64 %iv.next, 1, !dbg !81
  br i1 %.not, label %julia_f__3847_inner.exit.loopexit, label %L40.i.preheader, !dbg !73

julia_f__3847_inner.exit.loopexit:                ; preds = %L59.i
  br label %julia_f__3847_inner.exit, !dbg !82

julia_f__3847_inner.exit:                         ; preds = %julia_f__3847_inner.exit.loopexit, %L21.i.preheader, %entry
  ret void, !dbg !82
}

Load may need caching   %19 = load i64, i64 addrspace(11)* %18, align 8 due to   store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Load may need caching   %27 = load i64, i64 addrspace(11)* %26, align 8 due to   store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Load must be recomputed   %27 = load i64, i64 addrspace(11)* %26, align 8 in reverse_invertL40.i due to   store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Caching instruction   %27 = load i64, i64 addrspace(11)* %26, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
Load must be recomputed   %19 = load i64, i64 addrspace(11)* %18, align 8 in reverse_invertL40.i due to   store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Caching instruction   %19 = load i64, i64 addrspace(11)* %18, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
; Function Attrs: mustprogress nofree nosync willreturn
define internal void @diffejulia_f__3847_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture %"'", {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture %"'1", {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2, {} addrspace(10)* nocapture %"'2") local_unnamed_addr #3 !dbg !83 {
entry:
  %"iv'ac" = alloca i64, align 8
  %"iv1'ac" = alloca i64, align 8
  %_cache = alloca i64, align 8
  store i64 0, i64* %_cache, align 8
  %"'de" = alloca double, align 8
  %3 = getelementptr double, double* %"'de", i64 0
  store double 0.000000e+00, double* %3, align 8
  %_cache23 = alloca i64, align 8
  store i64 0, i64* %_cache23, align 8
  %"'de35" = alloca double, align 8
  %4 = getelementptr double, double* %"'de35", i64 0
  store double 0.000000e+00, double* %4, align 8
  %"'de36" = alloca double, align 8
  %5 = getelementptr double, double* %"'de36", i64 0
  store double 0.000000e+00, double* %5, align 8
  %6 = call {}*** @julia.get_pgcstack() #4
  %7 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !84
  %8 = addrspacecast {} addrspace(10)* addrspace(10)* %7 to {} addrspace(10)* addrspace(11)*, !dbg !84
  %9 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8, i64 4, !dbg !84
  %10 = bitcast {} addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*, !dbg !84
  %11 = load i64, i64 addrspace(11)* %10, align 16, !dbg !84, !tbaa !12, !range !16, !invariant.load !4
  %12 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8, i64 3, !dbg !87
  %13 = bitcast {} addrspace(10)* addrspace(11)* %12 to i64 addrspace(11)*, !dbg !87
  %14 = load i64, i64 addrspace(11)* %13, align 8, !dbg !87, !tbaa !12, !range !16, !invariant.load !4
  %.not.not = icmp eq i64 %11, 0, !dbg !89
  br i1 %.not.not, label %julia_f__3847_inner.exit, label %L21.i.preheader, !dbg !96

L21.i.preheader:                                  ; preds = %entry
  %.not.not7 = icmp eq i64 %14, 0
  %"'ipc11" = bitcast {} addrspace(10)* %"'1" to double addrspace(13)* addrspace(10)*
  %15 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %"'ipc12" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc11" to double addrspace(13)* addrspace(11)*
  %16 = addrspacecast double addrspace(13)* addrspace(10)* %15 to double addrspace(13)* addrspace(11)*
  %"'ipl13" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc12", align 16
  %17 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %16, align 16
  %18 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
  %19 = addrspacecast {} addrspace(10)* addrspace(10)* %18 to {} addrspace(10)* addrspace(11)*
  %20 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %19, i64 3
  %21 = bitcast {} addrspace(10)* addrspace(11)* %20 to i64 addrspace(11)*
  %22 = load i64, i64 addrspace(11)* %21, align 8
  %"'ipc8" = bitcast {} addrspace(10)* %"'2" to double addrspace(13)* addrspace(10)*
  %23 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %"'ipc9" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc8" to double addrspace(13)* addrspace(11)*
  %24 = addrspacecast double addrspace(13)* addrspace(10)* %23 to double addrspace(13)* addrspace(11)*
  %"'ipl10" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc9", align 16
  %25 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %24, align 16
  %26 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %27 = addrspacecast {} addrspace(10)* addrspace(10)* %26 to {} addrspace(10)* addrspace(11)*
  %28 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %27, i64 3
  %29 = bitcast {} addrspace(10)* addrspace(11)* %28 to i64 addrspace(11)*
  %30 = load i64, i64 addrspace(11)* %29, align 8
  store i64 %30, i64* %_cache, align 8, !invariant.group !97
  store i64 %22, i64* %_cache23, align 8, !invariant.group !98
  %"'ipc" = bitcast {} addrspace(10)* %"'" to double addrspace(13)* addrspace(10)*
  %"'ipc5" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc" to double addrspace(13)* addrspace(11)*
  %"'ipl" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5", align 16
  br i1 %.not.not7, label %julia_f__3847_inner.exit, label %L40.i.preheader.preheader, !dbg !96

L40.i.preheader.preheader:                        ; preds = %L21.i.preheader
  %31 = add nsw i64 %11, -1, !dbg !99
  %32 = add nsw i64 %14, -1, !dbg !99
  br label %L40.i.preheader, !dbg !99

L40.i.preheader:                                  ; preds = %L59.i, %L40.i.preheader.preheader
  %iv = phi i64 [ %iv.next, %L59.i ], [ 0, %L40.i.preheader.preheader ]
  %iv.next = add nuw nsw i64 %iv, 1
  %33 = add nsw i64 %iv.next, -1
  %34 = mul i64 %14, %33
  %35 = mul i64 %22, %33
  %36 = mul i64 %30, %33
  br label %L40.i, !dbg !99

L40.i:                                            ; preds = %L40.i, %L40.i.preheader
  %iv1 = phi i64 [ %iv.next2, %L40.i ], [ 0, %L40.i.preheader ]
  %iv.next2 = add nuw nsw i64 %iv1, 1, !dbg !100
  %37 = add nsw i64 %iv.next2, -1, !dbg !100
  %38 = add i64 %37, %34, !dbg !100
  %"'ipg38" = getelementptr inbounds double, double addrspace(13)* %"'ipl13", i64 %38, !dbg !100
  %39 = getelementptr inbounds double, double addrspace(13)* %17, i64 %38, !dbg !100
  %40 = load double, double addrspace(13)* %39, align 8, !dbg !100, !tbaa !41, !alias.scope !44
  %41 = add i64 %37, %35, !dbg !100
  %"'ipg37" = getelementptr inbounds double, double addrspace(13)* %"'ipl10", i64 %41, !dbg !100
  %42 = getelementptr inbounds double, double addrspace(13)* %25, i64 %41, !dbg !100
  %43 = load double, double addrspace(13)* %42, align 8, !dbg !100, !tbaa !41, !alias.scope !44
  %44 = add i64 %37, %36, !dbg !102
  %"'ipg" = getelementptr inbounds double, double addrspace(13)* %"'ipl", i64 %44, !dbg !102
  %.not.not8 = icmp eq i64 %iv.next2, %14, !dbg !103
  br i1 %.not.not8, label %L59.i, label %L40.i, !dbg !99

L59.i:                                            ; preds = %L40.i
  %.not = icmp eq i64 %iv.next, %11, !dbg !103
  br i1 %.not, label %julia_f__3847_inner.exit.loopexit, label %L40.i.preheader, !dbg !99

julia_f__3847_inner.exit.loopexit:                ; preds = %L59.i
  br label %julia_f__3847_inner.exit, !dbg !106

julia_f__3847_inner.exit:                         ; preds = %julia_f__3847_inner.exit.loopexit, %L21.i.preheader, %entry
  br label %invertjulia_f__3847_inner.exit, !dbg !106

invertentry:                                      ; preds = %invertjulia_f__3847_inner.exit, %invertL21.i.preheader
  ret void

invertL21.i.preheader:                            ; preds = %staging, %invertL40.i.preheader.preheader
  br label %invertentry

invertL40.i.preheader.preheader:                  ; preds = %invertL40.i.preheader
  br label %invertL21.i.preheader

invertL40.i.preheader:                            ; preds = %invertL40.i
  %45 = load i64, i64* %"iv'ac", align 8
  %46 = icmp eq i64 %45, 0
  %47 = xor i1 %46, true
  br i1 %46, label %invertL40.i.preheader.preheader, label %incinvertL40.i.preheader

incinvertL40.i.preheader:                         ; preds = %invertL40.i.preheader
  %48 = load i64, i64* %"iv'ac", align 8
  %49 = add nsw i64 %48, -1
  store i64 %49, i64* %"iv'ac", align 8
  br label %invertL59.i

invertL40.i:                                      ; preds = %mergeinvertL40.i_L59.i, %incinvertL40.i
  %50 = load i64, i64* %"iv1'ac", align 8
  %51 = load i64, i64* %"iv'ac", align 8
  %"'ipc_unwrap" = bitcast {} addrspace(10)* %"'" to double addrspace(13)* addrspace(10)*
  %"'ipc5_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap" to double addrspace(13)* addrspace(11)*
  %"'ipl_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap", align 16
  %iv.next2_unwrap = add nuw nsw i64 %50, 1
  %_unwrap = add nsw i64 %iv.next2_unwrap, -1
  %52 = load i64, i64* %_cache, align 8, !invariant.group !97
  %iv.next_unwrap = add nuw nsw i64 %51, 1
  %_unwrap15 = add nsw i64 %iv.next_unwrap, -1
  %_unwrap16 = mul i64 %52, %_unwrap15
  %_unwrap17 = add i64 %_unwrap, %_unwrap16
  %"'ipg_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap", i64 %_unwrap17
  %53 = load double, double addrspace(13)* %"'ipg_unwrap", align 8, !tbaa !41, !noalias !44
  store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap", align 8, !dbg !102, !tbaa !41, !alias.scope !107, !noalias !110
  %54 = load double, double* %"'de", align 8
  %55 = fadd fast double %54, %53
  store double %55, double* %"'de", align 8
  %56 = load double, double* %"'de", align 8
  %57 = load i64, i64* %"iv1'ac", align 8
  %58 = load i64, i64* %"iv'ac", align 8
  %_unwrap20 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %_unwrap21 = addrspacecast double addrspace(13)* addrspace(10)* %_unwrap20 to double addrspace(13)* addrspace(11)*
  %_unwrap22 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %_unwrap21, align 16
  %59 = load i64, i64* %_cache23, align 8, !invariant.group !98
  %_unwrap24 = mul i64 %59, %_unwrap15
  %_unwrap25 = add i64 %_unwrap, %_unwrap24
  %_unwrap26 = getelementptr inbounds double, double addrspace(13)* %_unwrap22, i64 %_unwrap25
  %_unwrap27 = load double, double addrspace(13)* %_unwrap26, align 8, !dbg !100, !tbaa !41, !alias.scope !44
  %m0diffe = fmul fast double %56, %_unwrap27
  %60 = load i64, i64* %"iv1'ac", align 8
  %61 = load i64, i64* %"iv'ac", align 8
  %_unwrap28 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %_unwrap29 = addrspacecast double addrspace(13)* addrspace(10)* %_unwrap28 to double addrspace(13)* addrspace(11)*
  %_unwrap30 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %_unwrap29, align 16
  %_unwrap31 = mul i64 %14, %_unwrap15
  %_unwrap32 = add i64 %_unwrap, %_unwrap31
  %_unwrap33 = getelementptr inbounds double, double addrspace(13)* %_unwrap30, i64 %_unwrap32
  %_unwrap34 = load double, double addrspace(13)* %_unwrap33, align 8, !dbg !100, !tbaa !41, !alias.scope !44
  %m1diffe = fmul fast double %56, %_unwrap34
  store double 0.000000e+00, double* %"'de", align 8
  %62 = load double, double* %"'de35", align 8
  %63 = fadd fast double %62, %m0diffe
  store double %63, double* %"'de35", align 8
  %64 = load double, double* %"'de36", align 8
  %65 = fadd fast double %64, %m1diffe
  store double %65, double* %"'de36", align 8
  %66 = load double, double* %"'de36", align 8
  store double 0.000000e+00, double* %"'de36", align 8
  %67 = load i64, i64* %"iv1'ac", align 8
  %68 = load i64, i64* %"iv'ac", align 8
  %"'ipc8_unwrap" = bitcast {} addrspace(10)* %"'2" to double addrspace(13)* addrspace(10)*
  %"'ipc9_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc8_unwrap" to double addrspace(13)* addrspace(11)*
  %"'ipl10_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc9_unwrap", align 16
  %"'ipg37_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl10_unwrap", i64 %_unwrap25
  %69 = load double, double addrspace(13)* %"'ipg37_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !112, !noalias !115
  %70 = fadd fast double %69, %66
  store double %70, double addrspace(13)* %"'ipg37_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !112, !noalias !115
  %71 = load double, double* %"'de35", align 8
  store double 0.000000e+00, double* %"'de35", align 8
  %72 = load i64, i64* %"iv1'ac", align 8
  %73 = load i64, i64* %"iv'ac", align 8
  %"'ipc11_unwrap" = bitcast {} addrspace(10)* %"'1" to double addrspace(13)* addrspace(10)*
  %"'ipc12_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc11_unwrap" to double addrspace(13)* addrspace(11)*
  %"'ipl13_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc12_unwrap", align 16
  %"'ipg38_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl13_unwrap", i64 %_unwrap32
  %74 = load double, double addrspace(13)* %"'ipg38_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !117, !noalias !120
  %75 = fadd fast double %74, %71
  store double %75, double addrspace(13)* %"'ipg38_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !117, !noalias !120
  %76 = load i64, i64* %"iv1'ac", align 8
  %77 = icmp eq i64 %76, 0
  %78 = xor i1 %77, true
  br i1 %77, label %invertL40.i.preheader, label %incinvertL40.i

incinvertL40.i:                                   ; preds = %invertL40.i
  %79 = load i64, i64* %"iv1'ac", align 8
  %80 = add nsw i64 %79, -1
  store i64 %80, i64* %"iv1'ac", align 8
  br label %invertL40.i

invertL59.i:                                      ; preds = %mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit, %incinvertL40.i.preheader
  %81 = load i64, i64* %"iv'ac", align 8
  %_unwrap40 = add nsw i64 %14, -1
  br label %mergeinvertL40.i_L59.i

mergeinvertL40.i_L59.i:                           ; preds = %invertL59.i
  store i64 %_unwrap40, i64* %"iv1'ac", align 8
  br label %invertL40.i

invertjulia_f__3847_inner.exit.loopexit:          ; preds = %staging
  %_unwrap41 = add nsw i64 %11, -1
  br label %mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit

mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit: ; preds = %invertjulia_f__3847_inner.exit.loopexit
  store i64 %_unwrap41, i64* %"iv'ac", align 8
  br label %invertL59.i

invertjulia_f__3847_inner.exit:                   ; preds = %julia_f__3847_inner.exit
  %.not.not7_unwrap = icmp eq i64 %14, 0
  br i1 %.not.not, label %invertentry, label %staging

staging:                                          ; preds = %invertjulia_f__3847_inner.exit
  br i1 %.not.not7_unwrap, label %invertL21.i.preheader, label %invertjulia_f__3847_inner.exit.loopexit
}

Post optimization:

julia> Enzyme.autodiff(Reverse, f!, dpl_dA, dpl_A, dpl_B)
mod = ; ModuleID = 'text'
source_filename = "text"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin21.4.0"

; Function Attrs: nofree nosync
define private void @julia_f__1991_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #0 !dbg !5 {
entry:
  %3 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !7
  %4 = addrspacecast {} addrspace(10)* addrspace(10)* %3 to {} addrspace(10)* addrspace(11)*, !dbg !7
  %5 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 4, !dbg !7
  %6 = bitcast {} addrspace(10)* addrspace(11)* %5 to i64 addrspace(11)*, !dbg !7
  %7 = load i64, i64 addrspace(11)* %6, align 16, !dbg !7, !tbaa !23, !range !27
  %.not.not = icmp eq i64 %7, 0, !dbg !28
  br i1 %.not.not, label %julia_f__1991_inner.exit, label %L16.i.preheader, !dbg !39

L16.i.preheader:                                  ; preds = %entry
  %8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 3
  %9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*
  %10 = load i64, i64 addrspace(11)* %9, align 8, !tbaa !23, !range !27
  %.not.not7 = icmp eq i64 %10, 0
  %11 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %12 = addrspacecast double addrspace(13)* addrspace(10)* %11 to double addrspace(13)* addrspace(11)*
  %13 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %12, align 16
  %14 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
  %15 = addrspacecast {} addrspace(10)* addrspace(10)* %14 to {} addrspace(10)* addrspace(11)*
  %16 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %15, i64 3
  %17 = bitcast {} addrspace(10)* addrspace(11)* %16 to i64 addrspace(11)*
  %18 = load i64, i64 addrspace(11)* %17, align 8
  %19 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %20 = addrspacecast double addrspace(13)* addrspace(10)* %19 to double addrspace(13)* addrspace(11)*
  %21 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %20, align 16
  %22 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %23 = addrspacecast {} addrspace(10)* addrspace(10)* %22 to {} addrspace(10)* addrspace(11)*
  %24 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %23, i64 3
  %25 = bitcast {} addrspace(10)* addrspace(11)* %24 to i64 addrspace(11)*
  %26 = load i64, i64 addrspace(11)* %25, align 8
  %27 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
  %28 = addrspacecast double addrspace(13)* addrspace(10)* %27 to double addrspace(13)* addrspace(11)*
  %29 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %28, align 16
  br i1 %.not.not7, label %julia_f__1991_inner.exit, label %L32.i.preheader, !dbg !39

L32.i.preheader:                                  ; preds = %L16.i.preheader, %L51.i
  %value_phi3.i = phi i64 [ %63, %L51.i ], [ 1, %L16.i.preheader ]
  %30 = add nsw i64 %value_phi3.i, -1
  %31 = mul i64 %10, %30
  %32 = mul i64 %18, %30
  %33 = mul i64 %26, %30
  %min.iters.check = icmp ult i64 %10, 4, !dbg !40
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !40

vector.ph:                                        ; preds = %L32.i.preheader
  %n.vec = and i64 %10, 9223372036854775804, !dbg !40
  %ind.end = or i64 %n.vec, 1, !dbg !40
  br label %vector.body, !dbg !40

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %34 = add i64 %index, %31, !dbg !41
  %35 = getelementptr inbounds double, double addrspace(13)* %13, i64 %34, !dbg !41
  %36 = bitcast double addrspace(13)* %35 to <2 x double> addrspace(13)*, !dbg !41
  %wide.load = load <2 x double>, <2 x double> addrspace(13)* %36, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %37 = getelementptr inbounds double, double addrspace(13)* %35, i64 2, !dbg !41
  %38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !41
  %wide.load9 = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %39 = add i64 %index, %32, !dbg !41
  %40 = getelementptr inbounds double, double addrspace(13)* %21, i64 %39, !dbg !41
  %41 = bitcast double addrspace(13)* %40 to <2 x double> addrspace(13)*, !dbg !41
  %wide.load10 = load <2 x double>, <2 x double> addrspace(13)* %41, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %42 = getelementptr inbounds double, double addrspace(13)* %40, i64 2, !dbg !41
  %43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !41
  %wide.load11 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %44 = fmul <2 x double> %wide.load, %wide.load10, !dbg !50
  %45 = fmul <2 x double> %wide.load9, %wide.load11, !dbg !50
  %46 = add i64 %index, %33, !dbg !53
  %47 = getelementptr inbounds double, double addrspace(13)* %29, i64 %46, !dbg !53
  %48 = bitcast double addrspace(13)* %47 to <2 x double> addrspace(13)*, !dbg !53
  store <2 x double> %44, <2 x double> addrspace(13)* %48, align 8, !dbg !53, !tbaa !44, !noalias !47
  %49 = getelementptr inbounds double, double addrspace(13)* %47, i64 2, !dbg !53
  %50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !53
  store <2 x double> %45, <2 x double> addrspace(13)* %50, align 8, !dbg !53, !tbaa !44, !noalias !47
  %index.next = add nuw i64 %index, 4
  %51 = icmp eq i64 %index.next, %n.vec
  br i1 %51, label %middle.block, label %vector.body, !llvm.loop !55

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %10, %n.vec, !dbg !40
  br i1 %cmp.n, label %L51.i, label %scalar.ph, !dbg !40

scalar.ph:                                        ; preds = %L32.i.preheader, %middle.block
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L32.i.preheader ]
  br label %L32.i, !dbg !40

L32.i:                                            ; preds = %scalar.ph, %L32.i
  %value_phi8.i = phi i64 [ %62, %L32.i ], [ %bc.resume.val, %scalar.ph ]
  %52 = add nsw i64 %value_phi8.i, -1, !dbg !41
  %53 = add i64 %52, %31, !dbg !41
  %54 = getelementptr inbounds double, double addrspace(13)* %13, i64 %53, !dbg !41
  %55 = load double, double addrspace(13)* %54, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %56 = add i64 %52, %32, !dbg !41
  %57 = getelementptr inbounds double, double addrspace(13)* %21, i64 %56, !dbg !41
  %58 = load double, double addrspace(13)* %57, align 8, !dbg !41, !tbaa !44, !alias.scope !47
  %59 = fmul double %55, %58, !dbg !50
  %60 = add i64 %52, %33, !dbg !53
  %61 = getelementptr inbounds double, double addrspace(13)* %29, i64 %60, !dbg !53
  store double %59, double addrspace(13)* %61, align 8, !dbg !53, !tbaa !44, !noalias !47
  %.not.not8 = icmp eq i64 %value_phi8.i, %10, !dbg !57
  %62 = add nuw nsw i64 %value_phi8.i, 1, !dbg !62
  br i1 %.not.not8, label %L51.i, label %L32.i, !dbg !40, !llvm.loop !63

L51.i:                                            ; preds = %middle.block, %L32.i
  %.not = icmp eq i64 %value_phi3.i, %7, !dbg !57
  %63 = add nuw nsw i64 %value_phi3.i, 1, !dbg !62
  br i1 %.not, label %julia_f__1991_inner.exit, label %L32.i.preheader, !dbg !40

julia_f__1991_inner.exit:                         ; preds = %L51.i, %L16.i.preheader, %entry
  ret void, !dbg !65
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

; Reverse-mode derivative wrapper for the element-wise product kernel. The body
; runs the original (forward) computation first and then the reverse sweep
; (the `invert*` blocks), all inside this one function.
;
; Argument roles, as established by how the body uses them:
;   %0 : output array   - its data pointer (%32) is the target of the forward stores
;   %1 : shadow of %0   - read in the reverse sweep, then overwritten with 0.0
;   %2 : first input    - its header supplies both loop trip counts (%10, %13)
;   %3 : shadow of %2   - gradient accumulated at the first input's index
;   %4 : second input
;   %5 : shadow of %4   - gradient accumulated at the second input's index
; Function Attrs: alwaysinline
define void @diffejulia_f__1991_inner_1wrap({} addrspace(10)* %0, {} addrspace(10)* %1, {} addrspace(10)* %2, {} addrspace(10)* %3, {} addrspace(10)* %4, {} addrspace(10)* %5) #2 !dbg !66 {
entry:
  ; Load the i64 stored in pointer-sized slot 4 of %2's header. The !dbg chain
  ; points at `size` inlined into `axes`, so this is presumably the extent of
  ; the second dimension; it is used as the outer-loop trip count.
  %6 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*, !dbg !67
  %7 = addrspacecast {} addrspace(10)* addrspace(10)* %6 to {} addrspace(10)* addrspace(11)*, !dbg !67
  %8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 4, !dbg !67
  %9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*, !dbg !67
  %10 = load i64, i64 addrspace(11)* %9, align 16, !dbg !67, !tbaa !23, !range !27
  ; Zero outer extent: neither sweep has any work to do.
  %.not.not.i = icmp eq i64 %10, 0, !dbg !76
  br i1 %.not.not.i, label %diffejulia_f__1991_inner.1.exit, label %L16.i.preheader.i, !dbg !80

L16.i.preheader.i:                                ; preds = %entry
  ; Slot 3 of %2's header: inner-loop trip count, also used as %2's column stride.
  %11 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 3
  %12 = bitcast {} addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
  %13 = load i64, i64 addrspace(11)* %12, align 8, !tbaa !23, !range !27
  %.not.not7.i = icmp eq i64 %13, 0
  ; Data pointer of %2 (first word of the header).
  %14 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %15 = addrspacecast double addrspace(13)* addrspace(10)* %14 to double addrspace(13)* addrspace(11)*
  %16 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16
  ; Slot 3 and data pointer of %4.
  %17 = bitcast {} addrspace(10)* %4 to {} addrspace(10)* addrspace(10)*
  %18 = addrspacecast {} addrspace(10)* addrspace(10)* %17 to {} addrspace(10)* addrspace(11)*
  %19 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %18, i64 3
  %20 = bitcast {} addrspace(10)* addrspace(11)* %19 to i64 addrspace(11)*
  %21 = load i64, i64 addrspace(11)* %20, align 8
  %22 = bitcast {} addrspace(10)* %4 to double addrspace(13)* addrspace(10)*
  %23 = addrspacecast double addrspace(13)* addrspace(10)* %22 to double addrspace(13)* addrspace(11)*
  %24 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16
  ; Slot 3 and data pointer of %0.
  %25 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %26 = addrspacecast {} addrspace(10)* addrspace(10)* %25 to {} addrspace(10)* addrspace(11)*
  %27 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %26, i64 3
  %28 = bitcast {} addrspace(10)* addrspace(11)* %27 to i64 addrspace(11)*
  %29 = load i64, i64 addrspace(11)* %28, align 8
  %30 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
  %31 = addrspacecast double addrspace(13)* addrspace(10)* %30 to double addrspace(13)* addrspace(11)*
  %32 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %31, align 16
  ; Zero inner extent: skip both sweeps.
  br i1 %.not.not7.i, label %diffejulia_f__1991_inner.1.exit, label %L32.i.preheader.i, !dbg !80

L32.i.preheader.i:                                ; preds = %L16.i.preheader.i, %L51.i.i
  ; Forward outer loop header: %iv.i counts 0 .. %10-1.
  %iv.i = phi i64 [ %iv.next.i, %L51.i.i ], [ 0, %L16.i.preheader.i ]
  ; Linear offset of the current outer index in each array (%2, %4, %0).
  %33 = mul i64 %iv.i, %13
  %34 = mul i64 %iv.i, %21
  %35 = mul i64 %iv.i, %29
  ; Fewer than 4 inner elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i64 %13, 4, !dbg !81
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !81

vector.ph:                                        ; preds = %L32.i.preheader.i
  ; %n.vec = %13 rounded down to a multiple of 4 (low two bits masked off).
  %n.vec = and i64 %13, 9223372036854775804, !dbg !81
  br label %vector.body, !dbg !81

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Vectorized forward body: 4 elements per iteration, as two <2 x double> halves.
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !82
  ; Two vector loads from %2's data.
  %36 = add i64 %index, %33, !dbg !82
  %37 = getelementptr inbounds double, double addrspace(13)* %16, i64 %36, !dbg !82
  %38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !82
  %wide.load = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %39 = getelementptr inbounds double, double addrspace(13)* %37, i64 2, !dbg !82
  %40 = bitcast double addrspace(13)* %39 to <2 x double> addrspace(13)*, !dbg !82
  %wide.load22 = load <2 x double>, <2 x double> addrspace(13)* %40, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  ; Two vector loads from %4's data.
  %41 = add i64 %index, %34, !dbg !82
  %42 = getelementptr inbounds double, double addrspace(13)* %24, i64 %41, !dbg !82
  %43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !82
  %wide.load23 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %44 = getelementptr inbounds double, double addrspace(13)* %42, i64 2, !dbg !82
  %45 = bitcast double addrspace(13)* %44 to <2 x double> addrspace(13)*, !dbg !82
  %wide.load24 = load <2 x double>, <2 x double> addrspace(13)* %45, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  ; Element-wise products, stored into %0's data.
  %46 = fmul <2 x double> %wide.load, %wide.load23, !dbg !84
  %47 = fmul <2 x double> %wide.load22, %wide.load24, !dbg !84
  %48 = add i64 %index, %35, !dbg !82
  %49 = getelementptr inbounds double, double addrspace(13)* %32, i64 %48, !dbg !82
  %50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !85
  store <2 x double> %46, <2 x double> addrspace(13)* %50, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
  %51 = getelementptr inbounds double, double addrspace(13)* %49, i64 2, !dbg !85
  %52 = bitcast double addrspace(13)* %51 to <2 x double> addrspace(13)*, !dbg !85
  store <2 x double> %47, <2 x double> addrspace(13)* %52, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
  %index.next = add nuw i64 %index, 4, !dbg !82
  %53 = icmp eq i64 %index.next, %n.vec, !dbg !82
  br i1 %53, label %middle.block, label %vector.body, !dbg !82, !llvm.loop !91

middle.block:                                     ; preds = %vector.body
  ; If the vector loop covered every element, skip the scalar remainder.
  %cmp.n = icmp eq i64 %13, %n.vec, !dbg !81
  br i1 %cmp.n, label %L51.i.i, label %scalar.ph, !dbg !81

scalar.ph:                                        ; preds = %L32.i.preheader.i, %middle.block
  ; The scalar loop starts at %n.vec after the vector loop, or at 0 if it was skipped.
  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L32.i.preheader.i ]
  br label %L32.i.i, !dbg !81

L32.i.i:                                          ; preds = %L32.i.i, %scalar.ph
  ; Scalar forward body: one product per iteration, stored into %0's data.
  %iv1.i = phi i64 [ %iv.next2.i, %L32.i.i ], [ %bc.resume.val, %scalar.ph ]
  %iv.next2.i = add nuw nsw i64 %iv1.i, 1, !dbg !82
  %54 = add i64 %iv1.i, %33, !dbg !82
  %55 = getelementptr inbounds double, double addrspace(13)* %16, i64 %54, !dbg !82
  %56 = load double, double addrspace(13)* %55, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %57 = add i64 %iv1.i, %34, !dbg !82
  %58 = getelementptr inbounds double, double addrspace(13)* %24, i64 %57, !dbg !82
  %59 = load double, double addrspace(13)* %58, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %60 = fmul double %56, %59, !dbg !84
  %61 = add i64 %iv1.i, %35, !dbg !85
  %62 = getelementptr inbounds double, double addrspace(13)* %32, i64 %61, !dbg !85
  store double %60, double addrspace(13)* %62, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
  %.not.not8.i = icmp eq i64 %iv.next2.i, %13, !dbg !92
  br i1 %.not.not8.i, label %L51.i.i, label %L32.i.i, !dbg !81, !llvm.loop !95

L51.i.i:                                          ; preds = %middle.block, %L32.i.i
  ; Forward outer latch. After the last outer iteration, control falls into
  ; the reverse sweep instead of returning.
  %iv.next.i = add nuw nsw i64 %iv.i, 1
  %.not.i = icmp eq i64 %iv.next.i, %10, !dbg !92
  br i1 %.not.i, label %invertL51.i.i.preheader, label %L32.i.preheader.i, !dbg !81

invertL51.i.i.preheader:                          ; preds = %L51.i.i
  ; Reverse-sweep setup: form the addresses from which the data pointers of
  ; the three shadow arrays (%1, %5, %3) are loaded in the reverse inner loop.
  %"'ipc_unwrap.i" = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %"'ipc5_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap.i" to double addrspace(13)* addrspace(11)*
  %"'ipc6_unwrap.i" = bitcast {} addrspace(10)* %5 to double addrspace(13)* addrspace(10)*
  %"'ipc7_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc6_unwrap.i" to double addrspace(13)* addrspace(11)*
  %"'ipc9_unwrap.i" = bitcast {} addrspace(10)* %3 to double addrspace(13)* addrspace(10)*
  %"'ipc10_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc9_unwrap.i" to double addrspace(13)* addrspace(11)*
  br label %invertL51.i.i

invertL32.i.preheader.i:                          ; preds = %invertL32.i.i
  ; Reverse outer latch: finished once the outer counter has reached 0.
  %63 = icmp eq i64 %"iv'ac.i.0", 0
  br i1 %63, label %diffejulia_f__1991_inner.1.exit, label %invertL51.i.i

invertL32.i.i:                                    ; preds = %invertL32.i.i, %invertL51.i.i
  ; Reverse inner loop: scalar, one element per iteration, counting down from
  ; %13-1 to 0. Unlike the forward sweep there is no vectorized version, and
  ; the five data-pointer loads below (%"'ipl_unwrap.i", %_unwrap17.i,
  ; %_unwrap25.i, %"'ipl8_unwrap.i", %"'ipl11_unwrap.i") are executed again on
  ; every iteration rather than once before the loop.
  %"iv1'ac.i.0.in" = phi i64 [ %13, %invertL51.i.i ], [ %"iv1'ac.i.0", %invertL32.i.i ]
  %"iv1'ac.i.0" = add nsw i64 %"iv1'ac.i.0.in", -1
  ; Read the output's shadow element (%64) and reset it to zero.
  %"'ipl_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap.i", align 16
  %_unwrap14.i = add i64 %"iv1'ac.i.0", %_unwrap13.i
  %"'ipg_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap.i", i64 %_unwrap14.i
  %64 = load double, double addrspace(13)* %"'ipg_unwrap.i", align 8, !tbaa !44, !noalias !47
  store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap.i", align 8, !dbg !85, !tbaa !44, !alias.scope !96, !noalias !97
  ; %m0diffe.i = (element of %4) * %64 : contribution to the shadow of %2.
  %_unwrap17.i = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16
  %_unwrap20.i = add i64 %"iv1'ac.i.0", %_unwrap19.i
  %_unwrap21.i = getelementptr inbounds double, double addrspace(13)* %_unwrap17.i, i64 %_unwrap20.i
  %_unwrap22.i = load double, double addrspace(13)* %_unwrap21.i, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %m0diffe.i = fmul fast double %_unwrap22.i, %64
  ; %m1diffe.i = (element of %2) * %64 : contribution to the shadow of %4.
  %_unwrap25.i = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16
  %_unwrap30.i = add i64 %_unwrap29.i, %"iv1'ac.i.0"
  %_unwrap31.i = getelementptr inbounds double, double addrspace(13)* %_unwrap25.i, i64 %_unwrap30.i
  %_unwrap32.i = load double, double addrspace(13)* %_unwrap31.i, align 8, !dbg !82, !tbaa !44, !alias.scope !47
  %m1diffe.i = fmul fast double %_unwrap32.i, %64
  ; Accumulate %m1diffe.i into the shadow passed as %5 (load, add, store).
  %"'ipl8_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc7_unwrap.i", align 16
  %"'ipg35_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl8_unwrap.i", i64 %_unwrap20.i
  %65 = load double, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !98, !noalias !101
  %66 = fadd fast double %65, %m1diffe.i
  store double %66, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !98, !noalias !101
  ; Accumulate %m0diffe.i into the shadow passed as %3 (load, add, store).
  %"'ipl11_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc10_unwrap.i", align 16
  %"'ipg36_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl11_unwrap.i", i64 %_unwrap30.i
  %67 = load double, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !103, !noalias !106
  %68 = fadd fast double %67, %m0diffe.i
  store double %68, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !103, !noalias !106
  %69 = icmp eq i64 %"iv1'ac.i.0", 0
  br i1 %69, label %invertL32.i.preheader.i, label %invertL32.i.i

invertL51.i.i:                                    ; preds = %invertL51.i.i.preheader, %invertL32.i.preheader.i
  ; Reverse outer header: the counter runs %10-1 down to 0. Recompute the
  ; per-array linear offsets for this outer index (mirrors %33/%34/%35 of the
  ; forward sweep: %_unwrap13.i for %0, %_unwrap19.i for %4, %_unwrap29.i for %2).
  %"iv'ac.i.0.in" = phi i64 [ %"iv'ac.i.0", %invertL32.i.preheader.i ], [ %10, %invertL51.i.i.preheader ]
  %"iv'ac.i.0" = add nsw i64 %"iv'ac.i.0.in", -1
  %_unwrap13.i = mul i64 %"iv'ac.i.0", %29
  %_unwrap19.i = mul i64 %"iv'ac.i.0", %21
  %_unwrap29.i = mul i64 %13, %"iv'ac.i.0"
  br label %invertL32.i.i

diffejulia_f__1991_inner.1.exit:                  ; preds = %invertL32.i.preheader.i, %L16.i.preheader.i, %entry
  ; Single exit: reached after the reverse sweep or when either extent is zero.
  ret void
}

; External declarations with the `ijl_gc_` prefix (presumably Julia runtime GC
; entry points - confirm against the Julia runtime headers). They are declared
; only; no call to them appears in the wrapper function defined above.
; Function Attrs: inaccessiblemem_or_argmemonly
declare void @ijl_gc_queue_root({} addrspace(10)*) #3

; `allocsize(1)` marks parameter index 1 (zero-based) as the allocation size;
; the result is declared `noalias nonnull`.
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #4

; Same attribute group as above; here parameter index 1 is the i64 argument.
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #4

; Attribute groups referenced by `#N` on the definitions and declarations of this module.
; #0: not referenced by any definition visible in this part of the dump
;     (presumably attached to the primal function defined earlier - confirm).
attributes #0 = { nofree nosync "enzymejl_world"="32451" "probe-stack"="inline-asm" }
; #1: the two llvm.lifetime.* intrinsic declarations.
attributes #1 = { argmemonly nofree nosync nounwind willreturn "enzymejl_world"="32451" }
; #2: the derivative wrapper @diffejulia_f__1991_inner_1wrap (always inlined into its caller).
attributes #2 = { alwaysinline "probe-stack"="inline-asm" }
; #3: @ijl_gc_queue_root.
attributes #3 = { inaccessiblemem_or_argmemonly }
; #4: @ijl_gc_pool_alloc and @ijl_gc_big_alloc.
attributes #4 = { allocsize(1) }

!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DICompileUnit(language: DW_LANG_Julia, file: !3, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !4, nameTableKind: None)
!3 = !DIFile(filename: "REPL[7]", directory: ".")
!4 = !{}
!5 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_1991", scope: null, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!6 = !DISubroutineType(types: !4)
!7 = !DILocation(line: 152, scope: !8, inlinedAt: !10)
!8 = distinct !DISubprogram(name: "size;", linkageName: "size", scope: !9, file: !9, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!9 = !DIFile(filename: "array.jl", directory: ".")
!10 = distinct !DILocation(line: 95, scope: !11, inlinedAt: !13)
!11 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !12, file: !12, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!12 = !DIFile(filename: "abstractarray.jl", directory: ".")
!13 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !16)
!14 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!15 = !DIFile(filename: "experimental.jl", directory: ".")
!16 = distinct !DILocation(line: 74, scope: !11, inlinedAt: !17)
!17 = distinct !DILocation(line: 3, scope: !18, inlinedAt: !19)
!18 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !3, file: !3, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!19 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !21)
!20 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!21 = distinct !DILocation(line: 2, scope: !5, inlinedAt: !22)
!22 = distinct !DILocation(line: 0, scope: !5)
!23 = !{!24, !24, i64 0}
!24 = !{!"jtbaa_const", !25, i64 0}
!25 = !{!"jtbaa", !26, i64 0}
!26 = !{!"jtbaa"}
!27 = !{i64 0, i64 9223372036854775807}
!28 = !DILocation(line: 83, scope: !29, inlinedAt: !31)
!29 = distinct !DISubprogram(name: "<;", linkageName: "<", scope: !30, file: !30, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!30 = !DIFile(filename: "int.jl", directory: ".")
!31 = distinct !DILocation(line: 382, scope: !32, inlinedAt: !34)
!32 = distinct !DISubprogram(name: ">;", linkageName: ">", scope: !33, file: !33, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!33 = !DIFile(filename: "operators.jl", directory: ".")
!34 = distinct !DILocation(line: 654, scope: !35, inlinedAt: !37)
!35 = distinct !DISubprogram(name: "isempty;", linkageName: "isempty", scope: !36, file: !36, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!36 = !DIFile(filename: "range.jl", directory: ".")
!37 = distinct !DILocation(line: 879, scope: !38, inlinedAt: !17)
!38 = distinct !DISubprogram(name: "iterate;", linkageName: "iterate", scope: !36, file: !36, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!39 = !DILocation(line: 3, scope: !18, inlinedAt: !19)
!40 = !DILocation(line: 5, scope: !18, inlinedAt: !19)
!41 = !DILocation(line: 34, scope: !42, inlinedAt: !43)
!42 = distinct !DISubprogram(name: "getindex;", linkageName: "getindex", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!43 = distinct !DILocation(line: 4, scope: !18, inlinedAt: !19)
!44 = !{!45, !45, i64 0}
!45 = !{!"jtbaa_arraybuf", !46, i64 0}
!46 = !{!"jtbaa_data", !25, i64 0}
!47 = !{!48}
!48 = !{!"aliasscope", !49}
!49 = !{!"f!"}
!50 = !DILocation(line: 385, scope: !51, inlinedAt: !43)
!51 = distinct !DISubprogram(name: "*;", linkageName: "*", scope: !52, file: !52, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!52 = !DIFile(filename: "float.jl", directory: ".")
!53 = !DILocation(line: 968, scope: !54, inlinedAt: !43)
!54 = distinct !DISubprogram(name: "setindex!;", linkageName: "setindex!", scope: !9, file: !9, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!55 = distinct !{!55, !56}
!56 = !{!"llvm.loop.isvectorized", i32 1}
!57 = !DILocation(line: 477, scope: !58, inlinedAt: !60)
!58 = distinct !DISubprogram(name: "==;", linkageName: "==", scope: !59, file: !59, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!59 = !DIFile(filename: "promotion.jl", directory: ".")
!60 = distinct !DILocation(line: 883, scope: !38, inlinedAt: !61)
!61 = distinct !DILocation(line: 5, scope: !18, inlinedAt: !19)
!62 = !DILocation(line: 883, scope: !38, inlinedAt: !61)
!63 = distinct !{!63, !64, !56}
!64 = !{!"llvm.loop.unroll.runtime.disable"}
!65 = !DILocation(line: 0, scope: !5)
!66 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_1991", scope: null, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!67 = !DILocation(line: 152, scope: !8, inlinedAt: !68)
!68 = distinct !DILocation(line: 95, scope: !11, inlinedAt: !69)
!69 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !70)
!70 = distinct !DILocation(line: 74, scope: !11, inlinedAt: !71)
!71 = distinct !DILocation(line: 3, scope: !18, inlinedAt: !72)
!72 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !73)
!73 = distinct !DILocation(line: 2, scope: !66, inlinedAt: !74)
!74 = distinct !DILocation(line: 0, scope: !66, inlinedAt: !75)
!75 = distinct !DILocation(line: 0, scope: !66)
!76 = !DILocation(line: 83, scope: !29, inlinedAt: !77)
!77 = distinct !DILocation(line: 382, scope: !32, inlinedAt: !78)
!78 = distinct !DILocation(line: 654, scope: !35, inlinedAt: !79)
!79 = distinct !DILocation(line: 879, scope: !38, inlinedAt: !71)
!80 = !DILocation(line: 3, scope: !18, inlinedAt: !72)
!81 = !DILocation(line: 5, scope: !18, inlinedAt: !72)
!82 = !DILocation(line: 34, scope: !42, inlinedAt: !83)
!83 = distinct !DILocation(line: 4, scope: !18, inlinedAt: !72)
!84 = !DILocation(line: 385, scope: !51, inlinedAt: !83)
!85 = !DILocation(line: 968, scope: !54, inlinedAt: !83)
!86 = !{!87}
!87 = distinct !{!87, !88, !"primal"}
!88 = distinct !{!88, !" diff: %"}
!89 = !{!90, !48}
!90 = distinct !{!90, !88, !"shadow_0"}
!91 = distinct !{!91, !56}
!92 = !DILocation(line: 477, scope: !58, inlinedAt: !93)
!93 = distinct !DILocation(line: 883, scope: !38, inlinedAt: !94)
!94 = distinct !DILocation(line: 5, scope: !18, inlinedAt: !72)
!95 = distinct !{!95, !64, !56}
!96 = !{!90}
!97 = !{!87, !48}
!98 = !{!99}
!99 = distinct !{!99, !100, !"shadow_0"}
!100 = distinct !{!100, !" diff: %"}
!101 = !{!102}
!102 = distinct !{!102, !100, !"primal"}
!103 = !{!104}
!104 = distinct !{!104, !105, !"shadow_0"}
!105 = distinct !{!105, !" diff: %"}
!106 = !{!107}
!107 = distinct !{!107, !105, !"primal"}

wsmoses avatar Feb 13 '23 22:02 wsmoses

Looks like the reverse pass isn't getting vectorized. @vchuravy

wsmoses avatar Feb 13 '23 22:02 wsmoses

Two patches get to the heart of this: https://github.com/EnzymeAD/Enzyme/pull/996 in Enzyme proper and https://reviews.llvm.org/D144053 in LLVM proper.

wsmoses avatar Feb 15 '23 04:02 wsmoses

Is there any way to work around this for now, e.g. by passing Enzyme.jl some manual hints about aliasing?

oschulz avatar Feb 15 '23 08:02 oschulz

With the addrspace(13) PR, the loads of the arrays' inner data pointers are at least being hoisted out of the loops (LICM). The backward (reverse) pass still isn't being vectorized for some reason, though (perhaps because the output isn't noalias with respect to the other arrays?)

; Header of the second IR dump: module name, data layout string and target
; triple (x86_64 Linux).
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-linux-gnu"

; Primal (non-differentiated) element-wise product kernel.
; Argument roles, as established by how the body uses them:
;   %0 : output array  - products are stored through its data pointer (%29)
;   %1 : first input   - its header supplies both loop trip counts (%7, %10)
;   %2 : second input
; Both loops use 1-based counters (%value_phi3.i, %value_phi8.i); the 0-based
; linear index is formed by subtracting 1 (%30, %52).
; Function Attrs: nofree nosync
define private void @julia_f__742_inner.1({} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #0 !dbg !4 {
entry:
  ; Load the i64 in pointer-sized slot 4 of %1's header (the !dbg chain points
  ; at `size` inlined into `axes`); it is the outer-loop trip count.
  %3 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !7
  %4 = addrspacecast {} addrspace(10)* addrspace(10)* %3 to {} addrspace(10)* addrspace(11)*, !dbg !7
  %5 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 4, !dbg !7
  %6 = bitcast {} addrspace(10)* addrspace(11)* %5 to i64 addrspace(11)*, !dbg !7
  %7 = load i64, i64 addrspace(11)* %6, align 16, !dbg !7, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
  ; Zero outer extent: nothing to compute.
  %.not = icmp eq i64 %7, 0, !dbg !36
  br i1 %.not, label %julia_f__742_inner.exit, label %L14.i.preheader, !dbg !47

L14.i.preheader:                                  ; preds = %entry
  ; Slot 3 of %1's header: inner-loop trip count, also used as %1's column stride.
  %8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 3
  %9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*
  %10 = load i64, i64 addrspace(11)* %9, align 8, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
  %.not7 = icmp eq i64 %10, 0
  ; Data pointer of %1; all three data-pointer loads happen here, before the loops.
  %11 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %12 = addrspacecast double addrspace(13)* addrspace(10)* %11 to double addrspace(13)* addrspace(11)*
  %13 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %12, align 16, !alias.scope !48
  ; Slot 3 and data pointer of %2.
  %14 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
  %15 = addrspacecast {} addrspace(10)* addrspace(10)* %14 to {} addrspace(10)* addrspace(11)*
  %16 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %15, i64 3
  %17 = bitcast {} addrspace(10)* addrspace(11)* %16 to i64 addrspace(11)*
  %18 = load i64, i64 addrspace(11)* %17, align 8
  %19 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %20 = addrspacecast double addrspace(13)* addrspace(10)* %19 to double addrspace(13)* addrspace(11)*
  %21 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %20, align 16, !alias.scope !48
  ; Slot 3 and data pointer of %0.
  %22 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %23 = addrspacecast {} addrspace(10)* addrspace(10)* %22 to {} addrspace(10)* addrspace(11)*
  %24 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %23, i64 3
  %25 = bitcast {} addrspace(10)* addrspace(11)* %24 to i64 addrspace(11)*
  %26 = load i64, i64 addrspace(11)* %25, align 8
  %27 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
  %28 = addrspacecast double addrspace(13)* addrspace(10)* %27 to double addrspace(13)* addrspace(11)*
  %29 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %28, align 16, !alias.scope !48
  ; Zero inner extent: nothing to compute.
  br i1 %.not7, label %julia_f__742_inner.exit, label %L28.i.preheader, !dbg !47

L28.i.preheader:                                  ; preds = %L14.i.preheader, %L45.i
  ; Outer loop header: %value_phi3.i counts 1 .. %7.
  %value_phi3.i = phi i64 [ %63, %L45.i ], [ 1, %L14.i.preheader ]
  %30 = add nsw i64 %value_phi3.i, -1
  ; Linear offset of the current outer index in each array (%1, %2, %0).
  %31 = mul i64 %10, %30
  %32 = mul i64 %18, %30
  %33 = mul i64 %26, %30
  ; Fewer than 4 inner elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i64 %10, 4, !dbg !51
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !51

vector.ph:                                        ; preds = %L28.i.preheader
  ; %n.vec = %10 rounded down to a multiple of 4. %ind.end is the 1-based
  ; resume index for the scalar loop; since %n.vec is a multiple of 4, the
  ; `or` with 1 equals %n.vec + 1.
  %n.vec = and i64 %10, 9223372036854775804, !dbg !51
  %ind.end = or i64 %n.vec, 1, !dbg !51
  br label %vector.body, !dbg !51

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Vectorized body: 4 elements per iteration, as two <2 x double> halves.
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Two vector loads from %1's data.
  %34 = add i64 %index, %31, !dbg !52
  %35 = getelementptr inbounds double, double addrspace(13)* %13, i64 %34, !dbg !52
  %36 = bitcast double addrspace(13)* %35 to <2 x double> addrspace(13)*, !dbg !52
  %wide.load = load <2 x double>, <2 x double> addrspace(13)* %36, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  %37 = getelementptr inbounds double, double addrspace(13)* %35, i64 2, !dbg !52
  %38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !52
  %wide.load10 = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  ; Two vector loads from %2's data.
  %39 = add i64 %index, %32, !dbg !52
  %40 = getelementptr inbounds double, double addrspace(13)* %21, i64 %39, !dbg !52
  %41 = bitcast double addrspace(13)* %40 to <2 x double> addrspace(13)*, !dbg !52
  %wide.load11 = load <2 x double>, <2 x double> addrspace(13)* %41, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  %42 = getelementptr inbounds double, double addrspace(13)* %40, i64 2, !dbg !52
  %43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !52
  %wide.load12 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  ; Element-wise products, stored into %0's data.
  %44 = fmul <2 x double> %wide.load, %wide.load11, !dbg !62
  %45 = fmul <2 x double> %wide.load10, %wide.load12, !dbg !62
  %46 = add i64 %index, %33, !dbg !65
  %47 = getelementptr inbounds double, double addrspace(13)* %29, i64 %46, !dbg !65
  %48 = bitcast double addrspace(13)* %47 to <2 x double> addrspace(13)*, !dbg !65
  store <2 x double> %44, <2 x double> addrspace(13)* %48, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
  %49 = getelementptr inbounds double, double addrspace(13)* %47, i64 2, !dbg !65
  %50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !65
  store <2 x double> %45, <2 x double> addrspace(13)* %50, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
  %index.next = add nuw i64 %index, 4
  %51 = icmp eq i64 %index.next, %n.vec
  br i1 %51, label %middle.block, label %vector.body, !llvm.loop !71

middle.block:                                     ; preds = %vector.body
  ; If the vector loop covered every element, skip the scalar remainder.
  %cmp.n = icmp eq i64 %10, %n.vec, !dbg !51
  br i1 %cmp.n, label %L45.i, label %scalar.ph, !dbg !51

scalar.ph:                                        ; preds = %L28.i.preheader, %middle.block
  ; 1-based start of the scalar loop: %ind.end after the vector loop, else 1.
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L28.i.preheader ]
  br label %L28.i, !dbg !51

L28.i:                                            ; preds = %scalar.ph, %L28.i
  ; Scalar body: one product per iteration, stored into %0's data.
  %value_phi8.i = phi i64 [ %62, %L28.i ], [ %bc.resume.val, %scalar.ph ]
  %52 = add nsw i64 %value_phi8.i, -1, !dbg !52
  %53 = add i64 %52, %31, !dbg !52
  %54 = getelementptr inbounds double, double addrspace(13)* %13, i64 %53, !dbg !52
  %55 = load double, double addrspace(13)* %54, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  %56 = add i64 %52, %32, !dbg !52
  %57 = getelementptr inbounds double, double addrspace(13)* %21, i64 %56, !dbg !52
  %58 = load double, double addrspace(13)* %57, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
  %59 = fmul double %55, %58, !dbg !62
  %60 = add i64 %52, %33, !dbg !65
  %61 = getelementptr inbounds double, double addrspace(13)* %29, i64 %60, !dbg !65
  store double %59, double addrspace(13)* %61, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
  ; Exit once the current 1-based index equals the inner extent.
  %.not8 = icmp eq i64 %value_phi8.i, %10, !dbg !73
  %62 = add nuw nsw i64 %value_phi8.i, 1, !dbg !78
  br i1 %.not8, label %L45.i, label %L28.i, !dbg !51, !llvm.loop !79

L45.i:                                            ; preds = %middle.block, %L28.i
  ; Outer latch: exit once the current 1-based outer index equals %7.
  %.not9 = icmp eq i64 %value_phi3.i, %7, !dbg !73
  %63 = add nuw nsw i64 %value_phi3.i, 1, !dbg !78
  br i1 %.not9, label %julia_f__742_inner.exit, label %L28.i.preheader, !dbg !51

julia_f__742_inner.exit:                          ; preds = %L45.i, %L14.i.preheader, %entry
  ret void, !dbg !81
}

; LLVM lifetime-marker intrinsics. Each takes a constant size (`immarg`) and an
; i8* to the memory object whose lifetime begins / ends. They are only declared
; here; no call to them appears in the primal kernel defined above.
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

; Function Attrs: alwaysinline
define void @diffejulia_f__742_inner_1wrap({} addrspace(10)* %0, {} addrspace(10)* %1, {} addrspace(10)* %2, {} addrspace(10)* %3, {} addrspace(10)* %4, {} addrspace(10)* %5) #2 !dbg !82 {
entry:
  %6 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*, !dbg !83
  %7 = addrspacecast {} addrspace(10)* addrspace(10)* %6 to {} addrspace(10)* addrspace(11)*, !dbg !83
  %8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 4, !dbg !83
  %9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*, !dbg !83
  %10 = load i64, i64 addrspace(11)* %9, align 16, !dbg !83, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
  %.not.i = icmp eq i64 %10, 0, !dbg !92
  br i1 %.not.i, label %diffejulia_f__742_inner.1.exit, label %L14.i.preheader.i, !dbg !96

L14.i.preheader.i:                                ; preds = %entry
  %11 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 3
  %12 = bitcast {} addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
  %13 = load i64, i64 addrspace(11)* %12, align 8, !tbaa !23, !range !27, !alias.scope !28, !noalias !31, !invariant.group !97
  %.not7.i = icmp eq i64 %13, 0
  %14 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
  %15 = addrspacecast double addrspace(13)* addrspace(10)* %14 to double addrspace(13)* addrspace(11)*
  %16 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16, !alias.scope !48, !invariant.group !98
  %17 = bitcast {} addrspace(10)* %4 to {} addrspace(10)* addrspace(10)*
  %18 = addrspacecast {} addrspace(10)* addrspace(10)* %17 to {} addrspace(10)* addrspace(11)*
  %19 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %18, i64 3
  %20 = bitcast {} addrspace(10)* addrspace(11)* %19 to i64 addrspace(11)*
  %21 = load i64, i64 addrspace(11)* %20, align 8
  %22 = bitcast {} addrspace(10)* %4 to double addrspace(13)* addrspace(10)*
  %23 = addrspacecast double addrspace(13)* addrspace(10)* %22 to double addrspace(13)* addrspace(11)*
  %24 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16, !alias.scope !48, !invariant.group !99
  %25 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
  %26 = addrspacecast {} addrspace(10)* addrspace(10)* %25 to {} addrspace(10)* addrspace(11)*
  %27 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %26, i64 3
  %28 = bitcast {} addrspace(10)* addrspace(11)* %27 to i64 addrspace(11)*
  %29 = load i64, i64 addrspace(11)* %28, align 8
  %30 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
  %31 = addrspacecast double addrspace(13)* addrspace(10)* %30 to double addrspace(13)* addrspace(11)*
  %32 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %31, align 16, !alias.scope !48
  br i1 %.not7.i, label %diffejulia_f__742_inner.1.exit, label %L28.i.preheader.i, !dbg !96

L28.i.preheader.i:                                ; preds = %L14.i.preheader.i, %L45.i.i
  %iv.i = phi i64 [ %iv.next.i, %L45.i.i ], [ 0, %L14.i.preheader.i ]
  %33 = mul i64 %iv.i, %13
  %34 = mul i64 %iv.i, %21
  %35 = mul i64 %iv.i, %29
  %min.iters.check = icmp ult i64 %13, 4, !dbg !100
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !100

vector.ph:                                        ; preds = %L28.i.preheader.i
  %n.vec = and i64 %13, 9223372036854775804, !dbg !100
  br label %vector.body, !dbg !100

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !101
  %36 = add i64 %index, %33, !dbg !101
  %37 = getelementptr inbounds double, double addrspace(13)* %16, i64 %36, !dbg !101
  %38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !101
  %wide.load = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
  %39 = getelementptr inbounds double, double addrspace(13)* %37, i64 2, !dbg !101
  %40 = bitcast double addrspace(13)* %39 to <2 x double> addrspace(13)*, !dbg !101
  %wide.load22 = load <2 x double>, <2 x double> addrspace(13)* %40, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
  %41 = add i64 %index, %34, !dbg !101
  %42 = getelementptr inbounds double, double addrspace(13)* %24, i64 %41, !dbg !101
  %43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !101
  %wide.load23 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
  %44 = getelementptr inbounds double, double addrspace(13)* %42, i64 2, !dbg !101
  %45 = bitcast double addrspace(13)* %44 to <2 x double> addrspace(13)*, !dbg !101
  %wide.load24 = load <2 x double>, <2 x double> addrspace(13)* %45, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
  %46 = fmul <2 x double> %wide.load, %wide.load23, !dbg !103
  %47 = fmul <2 x double> %wide.load22, %wide.load24, !dbg !103
  %48 = add i64 %index, %35, !dbg !101
  %49 = getelementptr inbounds double, double addrspace(13)* %32, i64 %48, !dbg !101
  %50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !104
  store <2 x double> %46, <2 x double> addrspace(13)* %50, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
  %51 = getelementptr inbounds double, double addrspace(13)* %49, i64 2, !dbg !104
  %52 = bitcast double addrspace(13)* %51 to <2 x double> addrspace(13)*, !dbg !104
  store <2 x double> %47, <2 x double> addrspace(13)* %52, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
  %index.next = add nuw i64 %index, 4, !dbg !101
  %53 = icmp eq i64 %index.next, %n.vec, !dbg !101
  br i1 %53, label %middle.block, label %vector.body, !dbg !101, !llvm.loop !112

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %13, %n.vec, !dbg !100
  br i1 %cmp.n, label %L45.i.i, label %scalar.ph, !dbg !100

scalar.ph:                                        ; preds = %L28.i.preheader.i, %middle.block
  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L28.i.preheader.i ]
  br label %L28.i.i, !dbg !100

L28.i.i:                                          ; preds = %L28.i.i, %scalar.ph
  %iv1.i = phi i64 [ %iv.next2.i, %L28.i.i ], [ %bc.resume.val, %scalar.ph ]
  %iv.next2.i = add nuw nsw i64 %iv1.i, 1, !dbg !101
  %54 = add i64 %iv1.i, %33, !dbg !101
  %55 = getelementptr inbounds double, double addrspace(13)* %16, i64 %54, !dbg !101
  %56 = load double, double addrspace(13)* %55, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !113
  %57 = add i64 %iv1.i, %34, !dbg !101
  %58 = getelementptr inbounds double, double addrspace(13)* %24, i64 %57, !dbg !101
  %59 = load double, double addrspace(13)* %58, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !114
  %60 = fmul double %56, %59, !dbg !103
  %61 = add i64 %iv1.i, %35, !dbg !104
  %62 = getelementptr inbounds double, double addrspace(13)* %32, i64 %61, !dbg !104
  store double %60, double addrspace(13)* %62, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
  %.not8.i = icmp eq i64 %iv.next2.i, %13, !dbg !115
  br i1 %.not8.i, label %L45.i.i, label %L28.i.i, !dbg !100, !llvm.loop !118

L45.i.i:                                          ; preds = %middle.block, %L28.i.i
  %iv.next.i = add nuw nsw i64 %iv.i, 1
  %.not9.i = icmp eq i64 %iv.next.i, %10, !dbg !115
  br i1 %.not9.i, label %invertL45.i.i.preheader, label %L28.i.preheader.i, !dbg !100

invertL45.i.i.preheader:                          ; preds = %L45.i.i
  %"'ipc_unwrap.i" = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
  %"'ipc5_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap.i" to double addrspace(13)* addrspace(11)*
  %"'ipl_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap.i", align 16, !alias.scope !48, !invariant.group !119
  %"'ipc6_unwrap.i" = bitcast {} addrspace(10)* %5 to double addrspace(13)* addrspace(10)*
  %"'ipc7_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc6_unwrap.i" to double addrspace(13)* addrspace(11)*
  %"'ipl8_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc7_unwrap.i", align 16, !alias.scope !48, !invariant.group !120
  %"'ipc9_unwrap.i" = bitcast {} addrspace(10)* %3 to double addrspace(13)* addrspace(10)*
  %"'ipc10_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc9_unwrap.i" to double addrspace(13)* addrspace(11)*
  %"'ipl11_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc10_unwrap.i", align 16, !alias.scope !48, !invariant.group !121
  br label %invertL45.i.i

invertL28.i.preheader.i:                          ; preds = %invertL28.i.i
  %63 = icmp eq i64 %"iv'ac.i.0", 0
  br i1 %63, label %diffejulia_f__742_inner.1.exit, label %invertL45.i.i

invertL28.i.i:                                    ; preds = %invertL28.i.i, %invertL45.i.i
  %"iv1'ac.i.0.in" = phi i64 [ %13, %invertL45.i.i ], [ %"iv1'ac.i.0", %invertL28.i.i ]
  %"iv1'ac.i.0" = add nsw i64 %"iv1'ac.i.0.in", -1
  %_unwrap14.i = add i64 %"iv1'ac.i.0", %_unwrap13.i
  %"'ipg_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap.i", i64 %_unwrap14.i
  %64 = load double, double addrspace(13)* %"'ipg_unwrap.i", align 8, !tbaa !55, !noalias !122
  store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap.i", align 8, !dbg !104, !tbaa !55, !alias.scope !123, !noalias !124
  %_unwrap20.i = add i64 %"iv1'ac.i.0", %_unwrap19.i
  %_unwrap21.i = getelementptr inbounds double, double addrspace(13)* %24, i64 %_unwrap20.i
  %_unwrap22.i = load double, double addrspace(13)* %_unwrap21.i, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !114
  %m0diffe.i = fmul fast double %_unwrap22.i, %64
  %_unwrap30.i = add i64 %_unwrap29.i, %"iv1'ac.i.0"
  %_unwrap31.i = getelementptr inbounds double, double addrspace(13)* %16, i64 %_unwrap30.i
  %_unwrap32.i = load double, double addrspace(13)* %_unwrap31.i, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !113
  %m1diffe.i = fmul fast double %_unwrap32.i, %64
  %"'ipg35_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl8_unwrap.i", i64 %_unwrap20.i
  %65 = load double, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !125, !noalias !128
  %66 = fadd fast double %65, %m1diffe.i
  store double %66, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !125, !noalias !130
  %"'ipg36_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl11_unwrap.i", i64 %_unwrap30.i
  %67 = load double, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !131, !noalias !134
  %68 = fadd fast double %67, %m0diffe.i
  store double %68, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !131, !noalias !136
  %69 = icmp eq i64 %"iv1'ac.i.0", 0
  br i1 %69, label %invertL28.i.preheader.i, label %invertL28.i.i

invertL45.i.i:                                    ; preds = %invertL45.i.i.preheader, %invertL28.i.preheader.i
  %"iv'ac.i.0.in" = phi i64 [ %"iv'ac.i.0", %invertL28.i.preheader.i ], [ %10, %invertL45.i.i.preheader ]
  %"iv'ac.i.0" = add nsw i64 %"iv'ac.i.0.in", -1
  %_unwrap13.i = mul i64 %"iv'ac.i.0", %29
  %_unwrap19.i = mul i64 %"iv'ac.i.0", %21
  %_unwrap29.i = mul i64 %13, %"iv'ac.i.0"
  br label %invertL28.i.i

diffejulia_f__742_inner.1.exit:                   ; preds = %invertL28.i.preheader.i, %L14.i.preheader.i, %entry
  ret void
}

; Function Attrs: inaccessiblemem_or_argmemonly
declare void @ijl_gc_queue_root({} addrspace(10)*) #3

; Function Attrs: inaccessiblemem_or_argmemonly
declare void @jl_gc_queue_binding({} addrspace(10)*) #3

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #4

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #4

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8*, i64, i8*) #4

attributes #0 = { nofree nosync "enzymejl_world"="33430" "probe-stack"="inline-asm" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn "enzymejl_world"="33430" }
attributes #2 = { alwaysinline "probe-stack"="inline-asm" }
attributes #3 = { inaccessiblemem_or_argmemonly }
attributes #4 = { allocsize(1) }

!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DICompileUnit(language: DW_LANG_Julia, file: !3, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, nameTableKind: None)
!3 = !DIFile(filename: "/home/wmoses/git/Enzyme.jl/slw.jl", directory: ".")
!4 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_742", scope: null, file: !3, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!5 = !DISubroutineType(types: !6)
!6 = !{}
!7 = !DILocation(line: 150, scope: !8, inlinedAt: !10)
!8 = distinct !DISubprogram(name: "size;", linkageName: "size", scope: !9, file: !9, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!9 = !DIFile(filename: "array.jl", directory: ".")
!10 = distinct !DILocation(line: 98, scope: !11, inlinedAt: !13)
!11 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !12, file: !12, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!12 = !DIFile(filename: "abstractarray.jl", directory: ".")
!13 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !16)
!14 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!15 = !DIFile(filename: "experimental.jl", directory: ".")
!16 = distinct !DILocation(line: 77, scope: !11, inlinedAt: !17)
!17 = distinct !DILocation(line: 14, scope: !18, inlinedAt: !19)
!18 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!19 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !21)
!20 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!21 = distinct !DILocation(line: 13, scope: !4, inlinedAt: !22)
!22 = distinct !DILocation(line: 0, scope: !4)
!23 = !{!24, !24, i64 0}
!24 = !{!"jtbaa_const", !25, i64 0}
!25 = !{!"jtbaa", !26, i64 0}
!26 = !{!"jtbaa"}
!27 = !{i64 0, i64 9223372036854775807}
!28 = !{!29}
!29 = !{!"jnoalias_const", !30}
!30 = !{!"jnoalias"}
!31 = !{!32, !33, !34, !35}
!32 = !{!"jnoalias_gcframe", !30}
!33 = !{!"jnoalias_stack", !30}
!34 = !{!"jnoalias_data", !30}
!35 = !{!"jnoalias_typemd", !30}
!36 = !DILocation(line: 83, scope: !37, inlinedAt: !39)
!37 = distinct !DISubprogram(name: "<;", linkageName: "<", scope: !38, file: !38, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!38 = !DIFile(filename: "int.jl", directory: ".")
!39 = distinct !DILocation(line: 369, scope: !40, inlinedAt: !42)
!40 = distinct !DISubprogram(name: ">;", linkageName: ">", scope: !41, file: !41, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!41 = !DIFile(filename: "operators.jl", directory: ".")
!42 = distinct !DILocation(line: 662, scope: !43, inlinedAt: !45)
!43 = distinct !DISubprogram(name: "isempty;", linkageName: "isempty", scope: !44, file: !44, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!44 = !DIFile(filename: "range.jl", directory: ".")
!45 = distinct !DILocation(line: 887, scope: !46, inlinedAt: !17)
!46 = distinct !DISubprogram(name: "iterate;", linkageName: "iterate", scope: !44, file: !44, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!47 = !DILocation(line: 14, scope: !18, inlinedAt: !19)
!48 = !{!49}
!49 = distinct !{!49, !50, !"na_addr13"}
!50 = distinct !{!50, !"addr13"}
!51 = !DILocation(line: 16, scope: !18, inlinedAt: !19)
!52 = !DILocation(line: 34, scope: !53, inlinedAt: !54)
!53 = distinct !DISubprogram(name: "getindex;", linkageName: "getindex", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!54 = distinct !DILocation(line: 15, scope: !18, inlinedAt: !19)
!55 = !{!56, !56, i64 0}
!56 = !{!"jtbaa_arraybuf", !57, i64 0}
!57 = !{!"jtbaa_data", !25, i64 0}
!58 = !{!59, !34}
!59 = !{!"aliasscope", !60}
!60 = !{!"f!"}
!61 = !{!32, !33, !35, !29}
!62 = !DILocation(line: 410, scope: !63, inlinedAt: !54)
!63 = distinct !DISubprogram(name: "*;", linkageName: "*", scope: !64, file: !64, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!64 = !DIFile(filename: "float.jl", directory: ".")
!65 = !DILocation(line: 971, scope: !66, inlinedAt: !54)
!66 = distinct !DISubprogram(name: "setindex!;", linkageName: "setindex!", scope: !9, file: !9, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!67 = !{!34}
!68 = !{!49, !69, !59, !32, !33, !35, !29}
!69 = distinct !{!69, !70, !"na_addr13"}
!70 = distinct !{!70, !"addr13"}
!71 = distinct !{!71, !72}
!72 = !{!"llvm.loop.isvectorized", i32 1}
!73 = !DILocation(line: 499, scope: !74, inlinedAt: !76)
!74 = distinct !DISubprogram(name: "==;", linkageName: "==", scope: !75, file: !75, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!75 = !DIFile(filename: "promotion.jl", directory: ".")
!76 = distinct !DILocation(line: 891, scope: !46, inlinedAt: !77)
!77 = distinct !DILocation(line: 16, scope: !18, inlinedAt: !19)
!78 = !DILocation(line: 891, scope: !46, inlinedAt: !77)
!79 = distinct !{!79, !80, !72}
!80 = !{!"llvm.loop.unroll.runtime.disable"}
!81 = !DILocation(line: 0, scope: !4)
!82 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_742", scope: null, file: !3, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!83 = !DILocation(line: 150, scope: !8, inlinedAt: !84)
!84 = distinct !DILocation(line: 98, scope: !11, inlinedAt: !85)
!85 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !86)
!86 = distinct !DILocation(line: 77, scope: !11, inlinedAt: !87)
!87 = distinct !DILocation(line: 14, scope: !18, inlinedAt: !88)
!88 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !89)
!89 = distinct !DILocation(line: 13, scope: !82, inlinedAt: !90)
!90 = distinct !DILocation(line: 0, scope: !82, inlinedAt: !91)
!91 = distinct !DILocation(line: 0, scope: !82)
!92 = !DILocation(line: 83, scope: !37, inlinedAt: !93)
!93 = distinct !DILocation(line: 369, scope: !40, inlinedAt: !94)
!94 = distinct !DILocation(line: 662, scope: !43, inlinedAt: !95)
!95 = distinct !DILocation(line: 887, scope: !46, inlinedAt: !87)
!96 = !DILocation(line: 14, scope: !18, inlinedAt: !88)
!97 = distinct !{}
!98 = distinct !{}
!99 = distinct !{}
!100 = !DILocation(line: 16, scope: !18, inlinedAt: !88)
!101 = !DILocation(line: 34, scope: !53, inlinedAt: !102)
!102 = distinct !DILocation(line: 15, scope: !18, inlinedAt: !88)
!103 = !DILocation(line: 410, scope: !63, inlinedAt: !102)
!104 = !DILocation(line: 971, scope: !66, inlinedAt: !102)
!105 = !{!106, !34}
!106 = distinct !{!106, !107, !"primal"}
!107 = distinct !{!107, !" diff: %"}
!108 = !{!49, !109, !110, !59, !32, !33, !35, !29}
!109 = distinct !{!109, !107, !"shadow_0"}
!110 = distinct !{!110, !111, !"na_addr13"}
!111 = distinct !{!111, !"addr13"}
!112 = distinct !{!112, !72}
!113 = distinct !{}
!114 = distinct !{}
!115 = !DILocation(line: 499, scope: !74, inlinedAt: !116)
!116 = distinct !DILocation(line: 891, scope: !46, inlinedAt: !117)
!117 = distinct !DILocation(line: 16, scope: !18, inlinedAt: !88)
!118 = distinct !{!118, !80, !72}
!119 = distinct !{}
!120 = distinct !{}
!121 = distinct !{}
!122 = !{!110, !59, !32, !33, !35, !29}
!123 = !{!109}
!124 = !{!49, !106, !110, !59, !32, !33, !35, !29}
!125 = !{!126}
!126 = distinct !{!126, !127, !"shadow_0"}
!127 = distinct !{!127, !" diff: %"}
!128 = !{!129, !32, !33, !35, !29}
!129 = distinct !{!129, !127, !"primal"}
!130 = !{!49, !129, !32, !33, !35, !29}
!131 = !{!132}
!132 = distinct !{!132, !133, !"shadow_0"}
!133 = distinct !{!133, !" diff: %"}
!134 = !{!135, !32, !33, !35, !29}
!135 = distinct !{!135, !133, !"primal"}
!136 = !{!49, !135, !32, !33, !35, !29}

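Another weird thing is that even the vanilla (non-differentiated, let alone forward-pass) code is vectorized worse than the `@code_llvm` version (the loop above uses `<2 x double>` vectors and advances 4 elements per iteration, while the `@code_llvm` output below uses `<4 x double>` vectors and advances 16), which also implies there may be a target-injection/pipeline issue to investigate.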

julia> @code_llvm f!(dA, A, B)
;  @ REPL[9]:1 within `f!`
define nonnull {}* @"japi1_f!_2602"({}* %0, {}** noalias nocapture noundef readonly %1, i32 %2) #0 {
top:
  %3 = alloca {}**, align 8
  store volatile {}** %1, {}*** %3, align 8
  %4 = load {}*, {}** %1, align 8
  %5 = getelementptr inbounds {}*, {}** %1, i64 1
  %6 = load {}*, {}** %5, align 8
  %7 = getelementptr inbounds {}*, {}** %1, i64 2
  %8 = load {}*, {}** %7, align 8
;  @ REPL[9]:2 within `f!`
; ┌ @ experimental.jl:49 within `macro expansion` @ REPL[9]:3
; │┌ @ abstractarray.jl:77 within `axes` @ experimental.jl:30 @ abstractarray.jl:98
; ││┌ @ array.jl:150 within `size`
     %9 = bitcast {}* %6 to {}**
     %10 = getelementptr inbounds {}*, {}** %9, i64 4
     %11 = bitcast {}** %10 to i64*
     %12 = load i64, i64* %11, align 8
; │└└
; │┌ @ range.jl:887 within `iterate`
; ││┌ @ range.jl:662 within `isempty`
; │││┌ @ operators.jl:369 within `>`
; ││││┌ @ int.jl:83 within `<`
       %.not.not = icmp eq i64 %12, 0
; │└└└└
   br i1 %.not.not, label %L56, label %L14.preheader

L14.preheader:                                    ; preds = %top
   %13 = getelementptr inbounds {}*, {}** %9, i64 3
   %14 = bitcast {}** %13 to i64*
   %15 = load i64, i64* %14, align 8
   %.not.not19 = icmp eq i64 %15, 0
   %16 = bitcast {}* %6 to double**
   %17 = load double*, double** %16, align 8
   %18 = bitcast {}* %8 to {}**
   %19 = getelementptr inbounds {}*, {}** %18, i64 3
   %20 = bitcast {}** %19 to i64*
   %21 = load i64, i64* %20, align 8
   %22 = bitcast {}* %8 to double**
   %23 = load double*, double** %22, align 8
   %24 = bitcast {}* %4 to {}**
   %25 = getelementptr inbounds {}*, {}** %24, i64 3
   %26 = bitcast {}** %25 to i64*
   %27 = load i64, i64* %26, align 8
   %28 = bitcast {}* %4 to double**
   %29 = load double*, double** %28, align 8
   br i1 %.not.not19, label %L56, label %L28.preheader

L28.preheader:                                    ; preds = %L45, %L14.preheader
   %value_phi3 = phi i64 [ %77, %L45 ], [ 1, %L14.preheader ]
   %30 = add nsw i64 %value_phi3, -1
   %31 = mul i64 %15, %30
   %32 = mul i64 %21, %30
   %33 = mul i64 %27, %30
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
   %min.iters.check = icmp ult i64 %15, 16
   br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L28.preheader
   %n.vec = and i64 %15, 9223372036854775792
   %ind.end = or i64 %n.vec, 1
   br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:4
; │┌ @ experimental.jl:34 within `getindex`
    %34 = add i64 %index, %31
    %35 = getelementptr inbounds double, double* %17, i64 %34
    %36 = bitcast double* %35 to <4 x double>*
    %wide.load = load <4 x double>, <4 x double>* %36, align 8
    %37 = getelementptr inbounds double, double* %35, i64 4
    %38 = bitcast double* %37 to <4 x double>*
    %wide.load21 = load <4 x double>, <4 x double>* %38, align 8
    %39 = getelementptr inbounds double, double* %35, i64 8
    %40 = bitcast double* %39 to <4 x double>*
    %wide.load22 = load <4 x double>, <4 x double>* %40, align 8
    %41 = getelementptr inbounds double, double* %35, i64 12
    %42 = bitcast double* %41 to <4 x double>*
    %wide.load23 = load <4 x double>, <4 x double>* %42, align 8
    %43 = add i64 %index, %32
    %44 = getelementptr inbounds double, double* %23, i64 %43
    %45 = bitcast double* %44 to <4 x double>*
    %wide.load24 = load <4 x double>, <4 x double>* %45, align 8
    %46 = getelementptr inbounds double, double* %44, i64 4
    %47 = bitcast double* %46 to <4 x double>*
    %wide.load25 = load <4 x double>, <4 x double>* %47, align 8
    %48 = getelementptr inbounds double, double* %44, i64 8
    %49 = bitcast double* %48 to <4 x double>*
    %wide.load26 = load <4 x double>, <4 x double>* %49, align 8
    %50 = getelementptr inbounds double, double* %44, i64 12
    %51 = bitcast double* %50 to <4 x double>*
    %wide.load27 = load <4 x double>, <4 x double>* %51, align 8
; │└
; │┌ @ float.jl:410 within `*`
    %52 = fmul <4 x double> %wide.load, %wide.load24
    %53 = fmul <4 x double> %wide.load21, %wide.load25
    %54 = fmul <4 x double> %wide.load22, %wide.load26
    %55 = fmul <4 x double> %wide.load23, %wide.load27
; │└
; │┌ @ array.jl:971 within `setindex!`
    %56 = add i64 %index, %33
    %57 = getelementptr inbounds double, double* %29, i64 %56
    %58 = bitcast double* %57 to <4 x double>*
    store <4 x double> %52, <4 x double>* %58, align 8
    %59 = getelementptr inbounds double, double* %57, i64 4
    %60 = bitcast double* %59 to <4 x double>*
    store <4 x double> %53, <4 x double>* %60, align 8
    %61 = getelementptr inbounds double, double* %57, i64 8
    %62 = bitcast double* %61 to <4 x double>*
    store <4 x double> %54, <4 x double>* %62, align 8
    %63 = getelementptr inbounds double, double* %57, i64 12
    %64 = bitcast double* %63 to <4 x double>*
    store <4 x double> %55, <4 x double>* %64, align 8
    %index.next = add nuw i64 %index, 16
    %65 = icmp eq i64 %index.next, %n.vec
    br i1 %65, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; │└
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
   %cmp.n = icmp eq i64 %15, %n.vec
   br i1 %cmp.n, label %L45, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L28.preheader
   %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L28.preheader ]
   br label %L28

L28:                                              ; preds = %L28, %scalar.ph
   %value_phi8 = phi i64 [ %76, %L28 ], [ %bc.resume.val, %scalar.ph ]
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:4
; │┌ @ experimental.jl:34 within `getindex`
    %66 = add nsw i64 %value_phi8, -1
    %67 = add i64 %66, %31
    %68 = getelementptr inbounds double, double* %17, i64 %67
    %69 = load double, double* %68, align 8
    %70 = add i64 %66, %32
    %71 = getelementptr inbounds double, double* %23, i64 %70
    %72 = load double, double* %71, align 8
; │└
; │┌ @ float.jl:410 within `*`
    %73 = fmul double %69, %72
; │└
; │┌ @ array.jl:971 within `setindex!`
    %74 = add i64 %66, %33
    %75 = getelementptr inbounds double, double* %29, i64 %74
    store double %73, double* %75, align 8
; │└
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
; │┌ @ range.jl:891 within `iterate`
; ││┌ @ promotion.jl:499 within `==`
     %.not.not20 = icmp eq i64 %value_phi8, %15
; ││└
    %76 = add nuw nsw i64 %value_phi8, 1
; │└
   br i1 %.not.not20, label %L45, label %L28

L45:                                              ; preds = %L28, %middle.block
; │┌ @ range.jl:891 within `iterate`
; ││┌ @ promotion.jl:499 within `==`
     %.not = icmp eq i64 %value_phi3, %12
; ││└
    %77 = add nuw nsw i64 %value_phi3, 1
; │└
   br i1 %.not, label %L56, label %L28.preheader

L56:                                              ; preds = %L45, %L14.preheader, %top
; └
;  @ REPL[9]:7 within `f!`
  ret {}* inttoptr (i64 140233368764424 to {}*)
}

wsmoses avatar Apr 10 '23 02:04 wsmoses

It looks like the first reason why vectorization isn't happening is a lack of noalias info.

Concretely, the reverse pass looks like:

load dout
store dout

load A
load B

dA += ...
dB += ...

The current alias info believes that:

  • dout could alias A, B, dA, dB.
    • This is fixed by marking the output as noalias with respect to the inputs [right now this is one-directional: writes to the output are marked noalias against the inputs, but we need the other direction as well].
  • A aliases dB, and B aliases dA.
    • This is fixed by marking A and B as noalias with respect to each other.

wsmoses avatar Apr 10 '23 04:04 wsmoses

@wsmoses I tried using UnsafeArrays to work around the Enzyme/Julia double-pointered-arrays problem (so Enzyme should be able to infer aliasing correctly?):

using UnsafeArrays

# Extend `uview` to Enzyme's `Duplicated` wrapper: take an unsafe view of both
# wrapped arrays (`val` and `dval`) and re-wrap the pair in a new `Duplicated`.
function UnsafeArrays.uview(A::Duplicated)
    val_view = uview(A.val)
    dval_view = uview(A.dval)
    return Duplicated(val_view, dval_view)
end

"""
    f!(dA, A, B)

Store the element-wise product `A[i, j] * B[i, j]` in `dA[i, j]` for every
index pair of `A`, and return `nothing`.

Bounds checks are disabled with `@inbounds`, so `dA` and `B` must cover the
axes of `A`. The `@aliasscope let A = Const(A), B = Const(B)` wrapper is left
out of this variant because `Const` can't be used with UnsafeArray.
"""
function f!(dA, A, B)
    cols = axes(A, 2)
    rows = axes(A, 1)
    @inbounds for j in cols
        for i in rows
            dA[i, j] = A[i, j] * B[i, j]
        end
    end
    return nothing
end

# Run the primal kernel once with C, A and B passed through `@uviews`
# (presumably rebinding them to unsafe views inside the block — confirm
# against the UnsafeArrays docs).
@uviews C A B begin
    f!(C, A, B)
end

# Benchmark the primal kernel under the same `@uviews` wrapping.
@uviews C A B begin
    @benchmark f!($C, $A, $B)
end

# Run reverse-mode autodiff once on the `Duplicated` arguments; this relies on
# a `uview` method that accepts `Duplicated`.
@uviews dpl_C dpl_A dpl_B begin
    Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)
end

# Benchmark the reverse-mode autodiff call.
@uviews dpl_C dpl_A dpl_B begin
    # Everything should be single-pointer in here:
    @benchmark Enzyme.autodiff(Reverse, f!, $dpl_C, $dpl_A, $dpl_B)
end

Doesn't seem to work though — the speed of the primal computation is the same, but reverse diff becomes even slower (about 4 µs instead of 2 µs). Any idea why?

oschulz avatar Jan 06 '24 22:01 oschulz