Slow performance on simple element-wise array multiplication
`f!(C, A, B)` runs in 90 ns, but `Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)` takes about 2000 ns with Enzyme v0.10.18:
using Enzyme, Random, BenchmarkTools
using Base.Experimental: @aliasscope, Const
# Optional Enzyme diagnostics; uncomment to reproduce the log output quoted further down.
# Enzyme.API.printperf!(true)
# Enzyme.API.printall!(true)

# Problem size: n rows by m columns.
n = 12
m = 15

# Two random inputs and a zero-filled output with the same shape and element type.
A = rand(n, m)
B = rand(n, m)
C = zeros(eltype(A), size(A))
"""
    f!(C, A, B)

Store the element-wise product of `A` and `B` into `C` and return `nothing`.

For every `(i, j)` with `i in axes(A, 1)` and `j in axes(A, 2)` the loop writes
`C[i, j] = A[i, j] * B[i, j]`. Entries of `C` outside that index range are untouched.

`A` and `B` are wrapped in `Const` inside an `@aliasscope`; per the
`Base.Experimental.Const` contract they must not be modified through another
reference (for example by also passing one of them as `C`) while the loop runs.

# Throws
- `BoundsError`: if an index taken from the axes of `A` is not a valid index of `B`
  or `C`.
"""
function f!(C, A, B)
    rows, cols = axes(A, 1), axes(A, 2)
    # The loop runs under `@inbounds` with indices derived from `A` only, so verify
    # once, up front, that the same index ranges are valid for `B` and `C`.
    checkbounds(B, rows, cols)
    checkbounds(C, rows, cols)
    @aliasscope let A = Const(A), B = Const(B)
        # `j` is the outer loop and `i` the inner one, so the first index varies fastest.
        @inbounds for j in cols, i in rows
            C[i, j] = A[i, j] * B[i, j]
        end
    end
    return nothing
end
# Evaluate the primal once; the commented line below benchmarks the same call.
f!(C, A, B)
# @benchmark f!($C, $A, $B)

# Pair each array with a second array of the same shape via `Duplicated`:
# zeros for A and B, random values for C.
dpl_A, dpl_B = Duplicated(A, zero(A)), Duplicated(B, zero(B))
dpl_C = Duplicated(C, rand!(similar(C)))

# Very slow - why?
Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)
# @benchmark Enzyme.autodiff(Reverse, f!, $dpl_C, $dpl_A, $dpl_B)
With `Enzyme.API.printperf!(true)` I get:
Load may need caching %19 = load i64, i64 addrspace(11)* %18, align 8 due to store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Load may need caching %27 = load i64, i64 addrspace(11)* %26, align 8 due to store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Load must be recomputed %27 = load i64, i64 addrspace(11)* %26, align 8 in reverse_invertL28.i due to store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Caching instruction %27 = load i64, i64 addrspace(11)* %26, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
Load must be recomputed %19 = load i64, i64 addrspace(11)* %18, align 8 in reverse_invertL28.i due to store double %42, double addrspace(13)* %44, align 8, !dbg !53, !tbaa !44, !noalias !47
Caching instruction %19 = load i64, i64 addrspace(11)* %18, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
@vchuravy suggested I open an issue.
Pre opt:
after simplification :
; Function Attrs: mustprogress nofree nosync willreturn
define void @preprocess_julia_f__3847_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #3 !dbg !59 {
entry:
%3 = call {}*** @julia.get_pgcstack() #4
%4 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !60
%5 = addrspacecast {} addrspace(10)* addrspace(10)* %4 to {} addrspace(10)* addrspace(11)*, !dbg !60
%6 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %5, i64 4, !dbg !60
%7 = bitcast {} addrspace(10)* addrspace(11)* %6 to i64 addrspace(11)*, !dbg !60
%8 = load i64, i64 addrspace(11)* %7, align 16, !dbg !60, !tbaa !12, !range !16, !invariant.load !4
%9 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %5, i64 3, !dbg !63
%10 = bitcast {} addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*, !dbg !63
%11 = load i64, i64 addrspace(11)* %10, align 8, !dbg !63, !tbaa !12, !range !16, !invariant.load !4
%.not.not = icmp eq i64 %8, 0, !dbg !65
br i1 %.not.not, label %julia_f__3847_inner.exit, label %L21.i.preheader, !dbg !72
L21.i.preheader: ; preds = %entry
%.not.not7 = icmp eq i64 %11, 0
%12 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%13 = addrspacecast double addrspace(13)* addrspace(10)* %12 to double addrspace(13)* addrspace(11)*
%14 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %13, align 16
%15 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
%16 = addrspacecast {} addrspace(10)* addrspace(10)* %15 to {} addrspace(10)* addrspace(11)*
%17 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %16, i64 3
%18 = bitcast {} addrspace(10)* addrspace(11)* %17 to i64 addrspace(11)*
%19 = load i64, i64 addrspace(11)* %18, align 8
%20 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%21 = addrspacecast double addrspace(13)* addrspace(10)* %20 to double addrspace(13)* addrspace(11)*
%22 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %21, align 16
%23 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%24 = addrspacecast {} addrspace(10)* addrspace(10)* %23 to {} addrspace(10)* addrspace(11)*
%25 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %24, i64 3
%26 = bitcast {} addrspace(10)* addrspace(11)* %25 to i64 addrspace(11)*
%27 = load i64, i64 addrspace(11)* %26, align 8
%28 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
%29 = addrspacecast double addrspace(13)* addrspace(10)* %28 to double addrspace(13)* addrspace(11)*
%30 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %29, align 16
br i1 %.not.not7, label %julia_f__3847_inner.exit, label %L40.i.preheader.preheader, !dbg !72
L40.i.preheader.preheader: ; preds = %L21.i.preheader
br label %L40.i.preheader, !dbg !73
L40.i.preheader: ; preds = %L40.i.preheader.preheader, %L59.i
%iv = phi i64 [ %iv.next, %L59.i ], [ 0, %L40.i.preheader.preheader ]
%iv.next = add nuw nsw i64 %iv, 1
%31 = add nsw i64 %iv.next, -1
%32 = mul i64 %11, %31
%33 = mul i64 %19, %31
%34 = mul i64 %27, %31
br label %L40.i, !dbg !73
L40.i: ; preds = %L40.i, %L40.i.preheader
%iv1 = phi i64 [ %iv.next2, %L40.i ], [ 0, %L40.i.preheader ]
%iv.next2 = add nuw nsw i64 %iv1, 1, !dbg !74
%35 = add nsw i64 %iv.next2, -1, !dbg !74
%36 = add i64 %35, %32, !dbg !74
%37 = getelementptr inbounds double, double addrspace(13)* %14, i64 %36, !dbg !74
%38 = load double, double addrspace(13)* %37, align 8, !dbg !74, !tbaa !41, !alias.scope !44
%39 = add i64 %35, %33, !dbg !74
%40 = getelementptr inbounds double, double addrspace(13)* %22, i64 %39, !dbg !74
%41 = load double, double addrspace(13)* %40, align 8, !dbg !74, !tbaa !41, !alias.scope !44
%42 = fmul double %38, %41, !dbg !76
%43 = add i64 %35, %34, !dbg !77
%44 = getelementptr inbounds double, double addrspace(13)* %30, i64 %43, !dbg !77
store double %42, double addrspace(13)* %44, align 8, !dbg !77, !tbaa !41, !noalias !44
%.not.not8 = icmp eq i64 %iv.next2, %11, !dbg !78
%45 = add nuw nsw i64 %iv.next2, 1, !dbg !81
br i1 %.not.not8, label %L59.i, label %L40.i, !dbg !73
L59.i: ; preds = %L40.i
%.not = icmp eq i64 %iv.next, %8, !dbg !78
%46 = add nuw nsw i64 %iv.next, 1, !dbg !81
br i1 %.not, label %julia_f__3847_inner.exit.loopexit, label %L40.i.preheader, !dbg !73
julia_f__3847_inner.exit.loopexit: ; preds = %L59.i
br label %julia_f__3847_inner.exit, !dbg !82
julia_f__3847_inner.exit: ; preds = %julia_f__3847_inner.exit.loopexit, %L21.i.preheader, %entry
ret void, !dbg !82
}
Load may need caching %19 = load i64, i64 addrspace(11)* %18, align 8 due to store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Load may need caching %27 = load i64, i64 addrspace(11)* %26, align 8 due to store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Load must be recomputed %27 = load i64, i64 addrspace(11)* %26, align 8 in reverse_invertL40.i due to store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Caching instruction %27 = load i64, i64 addrspace(11)* %26, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
Load must be recomputed %19 = load i64, i64 addrspace(11)* %18, align 8 in reverse_invertL40.i due to store double %42, double addrspace(13)* %44, align 8, !dbg !50, !tbaa !41, !noalias !44
Caching instruction %19 = load i64, i64 addrspace(11)* %18, align 8 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1
; Function Attrs: mustprogress nofree nosync willreturn
define internal void @diffejulia_f__3847_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture %"'", {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture %"'1", {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2, {} addrspace(10)* nocapture %"'2") local_unnamed_addr #3 !dbg !83 {
entry:
%"iv'ac" = alloca i64, align 8
%"iv1'ac" = alloca i64, align 8
%_cache = alloca i64, align 8
store i64 0, i64* %_cache, align 8
%"'de" = alloca double, align 8
%3 = getelementptr double, double* %"'de", i64 0
store double 0.000000e+00, double* %3, align 8
%_cache23 = alloca i64, align 8
store i64 0, i64* %_cache23, align 8
%"'de35" = alloca double, align 8
%4 = getelementptr double, double* %"'de35", i64 0
store double 0.000000e+00, double* %4, align 8
%"'de36" = alloca double, align 8
%5 = getelementptr double, double* %"'de36", i64 0
store double 0.000000e+00, double* %5, align 8
%6 = call {}*** @julia.get_pgcstack() #4
%7 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !84
%8 = addrspacecast {} addrspace(10)* addrspace(10)* %7 to {} addrspace(10)* addrspace(11)*, !dbg !84
%9 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8, i64 4, !dbg !84
%10 = bitcast {} addrspace(10)* addrspace(11)* %9 to i64 addrspace(11)*, !dbg !84
%11 = load i64, i64 addrspace(11)* %10, align 16, !dbg !84, !tbaa !12, !range !16, !invariant.load !4
%12 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8, i64 3, !dbg !87
%13 = bitcast {} addrspace(10)* addrspace(11)* %12 to i64 addrspace(11)*, !dbg !87
%14 = load i64, i64 addrspace(11)* %13, align 8, !dbg !87, !tbaa !12, !range !16, !invariant.load !4
%.not.not = icmp eq i64 %11, 0, !dbg !89
br i1 %.not.not, label %julia_f__3847_inner.exit, label %L21.i.preheader, !dbg !96
L21.i.preheader: ; preds = %entry
%.not.not7 = icmp eq i64 %14, 0
%"'ipc11" = bitcast {} addrspace(10)* %"'1" to double addrspace(13)* addrspace(10)*
%15 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%"'ipc12" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc11" to double addrspace(13)* addrspace(11)*
%16 = addrspacecast double addrspace(13)* addrspace(10)* %15 to double addrspace(13)* addrspace(11)*
%"'ipl13" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc12", align 16
%17 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %16, align 16
%18 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
%19 = addrspacecast {} addrspace(10)* addrspace(10)* %18 to {} addrspace(10)* addrspace(11)*
%20 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %19, i64 3
%21 = bitcast {} addrspace(10)* addrspace(11)* %20 to i64 addrspace(11)*
%22 = load i64, i64 addrspace(11)* %21, align 8
%"'ipc8" = bitcast {} addrspace(10)* %"'2" to double addrspace(13)* addrspace(10)*
%23 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%"'ipc9" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc8" to double addrspace(13)* addrspace(11)*
%24 = addrspacecast double addrspace(13)* addrspace(10)* %23 to double addrspace(13)* addrspace(11)*
%"'ipl10" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc9", align 16
%25 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %24, align 16
%26 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%27 = addrspacecast {} addrspace(10)* addrspace(10)* %26 to {} addrspace(10)* addrspace(11)*
%28 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %27, i64 3
%29 = bitcast {} addrspace(10)* addrspace(11)* %28 to i64 addrspace(11)*
%30 = load i64, i64 addrspace(11)* %29, align 8
store i64 %30, i64* %_cache, align 8, !invariant.group !97
store i64 %22, i64* %_cache23, align 8, !invariant.group !98
%"'ipc" = bitcast {} addrspace(10)* %"'" to double addrspace(13)* addrspace(10)*
%"'ipc5" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc" to double addrspace(13)* addrspace(11)*
%"'ipl" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5", align 16
br i1 %.not.not7, label %julia_f__3847_inner.exit, label %L40.i.preheader.preheader, !dbg !96
L40.i.preheader.preheader: ; preds = %L21.i.preheader
%31 = add nsw i64 %11, -1, !dbg !99
%32 = add nsw i64 %14, -1, !dbg !99
br label %L40.i.preheader, !dbg !99
L40.i.preheader: ; preds = %L59.i, %L40.i.preheader.preheader
%iv = phi i64 [ %iv.next, %L59.i ], [ 0, %L40.i.preheader.preheader ]
%iv.next = add nuw nsw i64 %iv, 1
%33 = add nsw i64 %iv.next, -1
%34 = mul i64 %14, %33
%35 = mul i64 %22, %33
%36 = mul i64 %30, %33
br label %L40.i, !dbg !99
L40.i: ; preds = %L40.i, %L40.i.preheader
%iv1 = phi i64 [ %iv.next2, %L40.i ], [ 0, %L40.i.preheader ]
%iv.next2 = add nuw nsw i64 %iv1, 1, !dbg !100
%37 = add nsw i64 %iv.next2, -1, !dbg !100
%38 = add i64 %37, %34, !dbg !100
%"'ipg38" = getelementptr inbounds double, double addrspace(13)* %"'ipl13", i64 %38, !dbg !100
%39 = getelementptr inbounds double, double addrspace(13)* %17, i64 %38, !dbg !100
%40 = load double, double addrspace(13)* %39, align 8, !dbg !100, !tbaa !41, !alias.scope !44
%41 = add i64 %37, %35, !dbg !100
%"'ipg37" = getelementptr inbounds double, double addrspace(13)* %"'ipl10", i64 %41, !dbg !100
%42 = getelementptr inbounds double, double addrspace(13)* %25, i64 %41, !dbg !100
%43 = load double, double addrspace(13)* %42, align 8, !dbg !100, !tbaa !41, !alias.scope !44
%44 = add i64 %37, %36, !dbg !102
%"'ipg" = getelementptr inbounds double, double addrspace(13)* %"'ipl", i64 %44, !dbg !102
%.not.not8 = icmp eq i64 %iv.next2, %14, !dbg !103
br i1 %.not.not8, label %L59.i, label %L40.i, !dbg !99
L59.i: ; preds = %L40.i
%.not = icmp eq i64 %iv.next, %11, !dbg !103
br i1 %.not, label %julia_f__3847_inner.exit.loopexit, label %L40.i.preheader, !dbg !99
julia_f__3847_inner.exit.loopexit: ; preds = %L59.i
br label %julia_f__3847_inner.exit, !dbg !106
julia_f__3847_inner.exit: ; preds = %julia_f__3847_inner.exit.loopexit, %L21.i.preheader, %entry
br label %invertjulia_f__3847_inner.exit, !dbg !106
invertentry: ; preds = %invertjulia_f__3847_inner.exit, %invertL21.i.preheader
ret void
invertL21.i.preheader: ; preds = %staging, %invertL40.i.preheader.preheader
br label %invertentry
invertL40.i.preheader.preheader: ; preds = %invertL40.i.preheader
br label %invertL21.i.preheader
invertL40.i.preheader: ; preds = %invertL40.i
%45 = load i64, i64* %"iv'ac", align 8
%46 = icmp eq i64 %45, 0
%47 = xor i1 %46, true
br i1 %46, label %invertL40.i.preheader.preheader, label %incinvertL40.i.preheader
incinvertL40.i.preheader: ; preds = %invertL40.i.preheader
%48 = load i64, i64* %"iv'ac", align 8
%49 = add nsw i64 %48, -1
store i64 %49, i64* %"iv'ac", align 8
br label %invertL59.i
invertL40.i: ; preds = %mergeinvertL40.i_L59.i, %incinvertL40.i
%50 = load i64, i64* %"iv1'ac", align 8
%51 = load i64, i64* %"iv'ac", align 8
%"'ipc_unwrap" = bitcast {} addrspace(10)* %"'" to double addrspace(13)* addrspace(10)*
%"'ipc5_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap" to double addrspace(13)* addrspace(11)*
%"'ipl_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap", align 16
%iv.next2_unwrap = add nuw nsw i64 %50, 1
%_unwrap = add nsw i64 %iv.next2_unwrap, -1
%52 = load i64, i64* %_cache, align 8, !invariant.group !97
%iv.next_unwrap = add nuw nsw i64 %51, 1
%_unwrap15 = add nsw i64 %iv.next_unwrap, -1
%_unwrap16 = mul i64 %52, %_unwrap15
%_unwrap17 = add i64 %_unwrap, %_unwrap16
%"'ipg_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap", i64 %_unwrap17
%53 = load double, double addrspace(13)* %"'ipg_unwrap", align 8, !tbaa !41, !noalias !44
store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap", align 8, !dbg !102, !tbaa !41, !alias.scope !107, !noalias !110
%54 = load double, double* %"'de", align 8
%55 = fadd fast double %54, %53
store double %55, double* %"'de", align 8
%56 = load double, double* %"'de", align 8
%57 = load i64, i64* %"iv1'ac", align 8
%58 = load i64, i64* %"iv'ac", align 8
%_unwrap20 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%_unwrap21 = addrspacecast double addrspace(13)* addrspace(10)* %_unwrap20 to double addrspace(13)* addrspace(11)*
%_unwrap22 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %_unwrap21, align 16
%59 = load i64, i64* %_cache23, align 8, !invariant.group !98
%_unwrap24 = mul i64 %59, %_unwrap15
%_unwrap25 = add i64 %_unwrap, %_unwrap24
%_unwrap26 = getelementptr inbounds double, double addrspace(13)* %_unwrap22, i64 %_unwrap25
%_unwrap27 = load double, double addrspace(13)* %_unwrap26, align 8, !dbg !100, !tbaa !41, !alias.scope !44
%m0diffe = fmul fast double %56, %_unwrap27
%60 = load i64, i64* %"iv1'ac", align 8
%61 = load i64, i64* %"iv'ac", align 8
%_unwrap28 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%_unwrap29 = addrspacecast double addrspace(13)* addrspace(10)* %_unwrap28 to double addrspace(13)* addrspace(11)*
%_unwrap30 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %_unwrap29, align 16
%_unwrap31 = mul i64 %14, %_unwrap15
%_unwrap32 = add i64 %_unwrap, %_unwrap31
%_unwrap33 = getelementptr inbounds double, double addrspace(13)* %_unwrap30, i64 %_unwrap32
%_unwrap34 = load double, double addrspace(13)* %_unwrap33, align 8, !dbg !100, !tbaa !41, !alias.scope !44
%m1diffe = fmul fast double %56, %_unwrap34
store double 0.000000e+00, double* %"'de", align 8
%62 = load double, double* %"'de35", align 8
%63 = fadd fast double %62, %m0diffe
store double %63, double* %"'de35", align 8
%64 = load double, double* %"'de36", align 8
%65 = fadd fast double %64, %m1diffe
store double %65, double* %"'de36", align 8
%66 = load double, double* %"'de36", align 8
store double 0.000000e+00, double* %"'de36", align 8
%67 = load i64, i64* %"iv1'ac", align 8
%68 = load i64, i64* %"iv'ac", align 8
%"'ipc8_unwrap" = bitcast {} addrspace(10)* %"'2" to double addrspace(13)* addrspace(10)*
%"'ipc9_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc8_unwrap" to double addrspace(13)* addrspace(11)*
%"'ipl10_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc9_unwrap", align 16
%"'ipg37_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl10_unwrap", i64 %_unwrap25
%69 = load double, double addrspace(13)* %"'ipg37_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !112, !noalias !115
%70 = fadd fast double %69, %66
store double %70, double addrspace(13)* %"'ipg37_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !112, !noalias !115
%71 = load double, double* %"'de35", align 8
store double 0.000000e+00, double* %"'de35", align 8
%72 = load i64, i64* %"iv1'ac", align 8
%73 = load i64, i64* %"iv'ac", align 8
%"'ipc11_unwrap" = bitcast {} addrspace(10)* %"'1" to double addrspace(13)* addrspace(10)*
%"'ipc12_unwrap" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc11_unwrap" to double addrspace(13)* addrspace(11)*
%"'ipl13_unwrap" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc12_unwrap", align 16
%"'ipg38_unwrap" = getelementptr inbounds double, double addrspace(13)* %"'ipl13_unwrap", i64 %_unwrap32
%74 = load double, double addrspace(13)* %"'ipg38_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !117, !noalias !120
%75 = fadd fast double %74, %71
store double %75, double addrspace(13)* %"'ipg38_unwrap", align 8, !dbg !100, !tbaa !41, !alias.scope !117, !noalias !120
%76 = load i64, i64* %"iv1'ac", align 8
%77 = icmp eq i64 %76, 0
%78 = xor i1 %77, true
br i1 %77, label %invertL40.i.preheader, label %incinvertL40.i
incinvertL40.i: ; preds = %invertL40.i
%79 = load i64, i64* %"iv1'ac", align 8
%80 = add nsw i64 %79, -1
store i64 %80, i64* %"iv1'ac", align 8
br label %invertL40.i
invertL59.i: ; preds = %mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit, %incinvertL40.i.preheader
%81 = load i64, i64* %"iv'ac", align 8
%_unwrap40 = add nsw i64 %14, -1
br label %mergeinvertL40.i_L59.i
mergeinvertL40.i_L59.i: ; preds = %invertL59.i
store i64 %_unwrap40, i64* %"iv1'ac", align 8
br label %invertL40.i
invertjulia_f__3847_inner.exit.loopexit: ; preds = %staging
%_unwrap41 = add nsw i64 %11, -1
br label %mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit
mergeinvertL40.i.preheader_julia_f__3847_inner.exit.loopexit: ; preds = %invertjulia_f__3847_inner.exit.loopexit
store i64 %_unwrap41, i64* %"iv'ac", align 8
br label %invertL59.i
invertjulia_f__3847_inner.exit: ; preds = %julia_f__3847_inner.exit
%.not.not7_unwrap = icmp eq i64 %14, 0
br i1 %.not.not, label %invertentry, label %staging
staging: ; preds = %invertjulia_f__3847_inner.exit
br i1 %.not.not7_unwrap, label %invertL21.i.preheader, label %invertjulia_f__3847_inner.exit.loopexit
}
Post optimization:
julia> Enzyme.autodiff(Reverse, f!, dpl_dA, dpl_A, dpl_B)
mod = ; ModuleID = 'text'
source_filename = "text"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin21.4.0"
; Function Attrs: nofree nosync
define private void @julia_f__1991_inner.1({} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #0 !dbg !5 {
entry:
%3 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !7
%4 = addrspacecast {} addrspace(10)* addrspace(10)* %3 to {} addrspace(10)* addrspace(11)*, !dbg !7
%5 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 4, !dbg !7
%6 = bitcast {} addrspace(10)* addrspace(11)* %5 to i64 addrspace(11)*, !dbg !7
%7 = load i64, i64 addrspace(11)* %6, align 16, !dbg !7, !tbaa !23, !range !27
%.not.not = icmp eq i64 %7, 0, !dbg !28
br i1 %.not.not, label %julia_f__1991_inner.exit, label %L16.i.preheader, !dbg !39
L16.i.preheader: ; preds = %entry
%8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 3
%9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*
%10 = load i64, i64 addrspace(11)* %9, align 8, !tbaa !23, !range !27
%.not.not7 = icmp eq i64 %10, 0
%11 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%12 = addrspacecast double addrspace(13)* addrspace(10)* %11 to double addrspace(13)* addrspace(11)*
%13 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %12, align 16
%14 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
%15 = addrspacecast {} addrspace(10)* addrspace(10)* %14 to {} addrspace(10)* addrspace(11)*
%16 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %15, i64 3
%17 = bitcast {} addrspace(10)* addrspace(11)* %16 to i64 addrspace(11)*
%18 = load i64, i64 addrspace(11)* %17, align 8
%19 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%20 = addrspacecast double addrspace(13)* addrspace(10)* %19 to double addrspace(13)* addrspace(11)*
%21 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %20, align 16
%22 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%23 = addrspacecast {} addrspace(10)* addrspace(10)* %22 to {} addrspace(10)* addrspace(11)*
%24 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %23, i64 3
%25 = bitcast {} addrspace(10)* addrspace(11)* %24 to i64 addrspace(11)*
%26 = load i64, i64 addrspace(11)* %25, align 8
%27 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
%28 = addrspacecast double addrspace(13)* addrspace(10)* %27 to double addrspace(13)* addrspace(11)*
%29 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %28, align 16
br i1 %.not.not7, label %julia_f__1991_inner.exit, label %L32.i.preheader, !dbg !39
L32.i.preheader: ; preds = %L16.i.preheader, %L51.i
%value_phi3.i = phi i64 [ %63, %L51.i ], [ 1, %L16.i.preheader ]
%30 = add nsw i64 %value_phi3.i, -1
%31 = mul i64 %10, %30
%32 = mul i64 %18, %30
%33 = mul i64 %26, %30
%min.iters.check = icmp ult i64 %10, 4, !dbg !40
br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !40
vector.ph: ; preds = %L32.i.preheader
%n.vec = and i64 %10, 9223372036854775804, !dbg !40
%ind.end = or i64 %n.vec, 1, !dbg !40
br label %vector.body, !dbg !40
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%34 = add i64 %index, %31, !dbg !41
%35 = getelementptr inbounds double, double addrspace(13)* %13, i64 %34, !dbg !41
%36 = bitcast double addrspace(13)* %35 to <2 x double> addrspace(13)*, !dbg !41
%wide.load = load <2 x double>, <2 x double> addrspace(13)* %36, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%37 = getelementptr inbounds double, double addrspace(13)* %35, i64 2, !dbg !41
%38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !41
%wide.load9 = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%39 = add i64 %index, %32, !dbg !41
%40 = getelementptr inbounds double, double addrspace(13)* %21, i64 %39, !dbg !41
%41 = bitcast double addrspace(13)* %40 to <2 x double> addrspace(13)*, !dbg !41
%wide.load10 = load <2 x double>, <2 x double> addrspace(13)* %41, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%42 = getelementptr inbounds double, double addrspace(13)* %40, i64 2, !dbg !41
%43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !41
%wide.load11 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%44 = fmul <2 x double> %wide.load, %wide.load10, !dbg !50
%45 = fmul <2 x double> %wide.load9, %wide.load11, !dbg !50
%46 = add i64 %index, %33, !dbg !53
%47 = getelementptr inbounds double, double addrspace(13)* %29, i64 %46, !dbg !53
%48 = bitcast double addrspace(13)* %47 to <2 x double> addrspace(13)*, !dbg !53
store <2 x double> %44, <2 x double> addrspace(13)* %48, align 8, !dbg !53, !tbaa !44, !noalias !47
%49 = getelementptr inbounds double, double addrspace(13)* %47, i64 2, !dbg !53
%50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !53
store <2 x double> %45, <2 x double> addrspace(13)* %50, align 8, !dbg !53, !tbaa !44, !noalias !47
%index.next = add nuw i64 %index, 4
%51 = icmp eq i64 %index.next, %n.vec
br i1 %51, label %middle.block, label %vector.body, !llvm.loop !55
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %10, %n.vec, !dbg !40
br i1 %cmp.n, label %L51.i, label %scalar.ph, !dbg !40
scalar.ph: ; preds = %L32.i.preheader, %middle.block
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L32.i.preheader ]
br label %L32.i, !dbg !40
L32.i: ; preds = %scalar.ph, %L32.i
%value_phi8.i = phi i64 [ %62, %L32.i ], [ %bc.resume.val, %scalar.ph ]
%52 = add nsw i64 %value_phi8.i, -1, !dbg !41
%53 = add i64 %52, %31, !dbg !41
%54 = getelementptr inbounds double, double addrspace(13)* %13, i64 %53, !dbg !41
%55 = load double, double addrspace(13)* %54, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%56 = add i64 %52, %32, !dbg !41
%57 = getelementptr inbounds double, double addrspace(13)* %21, i64 %56, !dbg !41
%58 = load double, double addrspace(13)* %57, align 8, !dbg !41, !tbaa !44, !alias.scope !47
%59 = fmul double %55, %58, !dbg !50
%60 = add i64 %52, %33, !dbg !53
%61 = getelementptr inbounds double, double addrspace(13)* %29, i64 %60, !dbg !53
store double %59, double addrspace(13)* %61, align 8, !dbg !53, !tbaa !44, !noalias !47
%.not.not8 = icmp eq i64 %value_phi8.i, %10, !dbg !57
%62 = add nuw nsw i64 %value_phi8.i, 1, !dbg !62
br i1 %.not.not8, label %L51.i, label %L32.i, !dbg !40, !llvm.loop !63
L51.i: ; preds = %middle.block, %L32.i
%.not = icmp eq i64 %value_phi3.i, %7, !dbg !57
%63 = add nuw nsw i64 %value_phi3.i, 1, !dbg !62
br i1 %.not, label %julia_f__1991_inner.exit, label %L32.i.preheader, !dbg !40
julia_f__1991_inner.exit: ; preds = %L51.i, %L16.i.preheader, %entry
ret void, !dbg !65
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: alwaysinline
define void @diffejulia_f__1991_inner_1wrap({} addrspace(10)* %0, {} addrspace(10)* %1, {} addrspace(10)* %2, {} addrspace(10)* %3, {} addrspace(10)* %4, {} addrspace(10)* %5) #2 !dbg !66 {
entry:
%6 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*, !dbg !67
%7 = addrspacecast {} addrspace(10)* addrspace(10)* %6 to {} addrspace(10)* addrspace(11)*, !dbg !67
%8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 4, !dbg !67
%9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*, !dbg !67
%10 = load i64, i64 addrspace(11)* %9, align 16, !dbg !67, !tbaa !23, !range !27
%.not.not.i = icmp eq i64 %10, 0, !dbg !76
br i1 %.not.not.i, label %diffejulia_f__1991_inner.1.exit, label %L16.i.preheader.i, !dbg !80
L16.i.preheader.i: ; preds = %entry
%11 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 3
%12 = bitcast {} addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
%13 = load i64, i64 addrspace(11)* %12, align 8, !tbaa !23, !range !27
%.not.not7.i = icmp eq i64 %13, 0
%14 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%15 = addrspacecast double addrspace(13)* addrspace(10)* %14 to double addrspace(13)* addrspace(11)*
%16 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16
%17 = bitcast {} addrspace(10)* %4 to {} addrspace(10)* addrspace(10)*
%18 = addrspacecast {} addrspace(10)* addrspace(10)* %17 to {} addrspace(10)* addrspace(11)*
%19 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %18, i64 3
%20 = bitcast {} addrspace(10)* addrspace(11)* %19 to i64 addrspace(11)*
%21 = load i64, i64 addrspace(11)* %20, align 8
%22 = bitcast {} addrspace(10)* %4 to double addrspace(13)* addrspace(10)*
%23 = addrspacecast double addrspace(13)* addrspace(10)* %22 to double addrspace(13)* addrspace(11)*
%24 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16
%25 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%26 = addrspacecast {} addrspace(10)* addrspace(10)* %25 to {} addrspace(10)* addrspace(11)*
%27 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %26, i64 3
%28 = bitcast {} addrspace(10)* addrspace(11)* %27 to i64 addrspace(11)*
%29 = load i64, i64 addrspace(11)* %28, align 8
%30 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
%31 = addrspacecast double addrspace(13)* addrspace(10)* %30 to double addrspace(13)* addrspace(11)*
%32 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %31, align 16
br i1 %.not.not7.i, label %diffejulia_f__1991_inner.1.exit, label %L32.i.preheader.i, !dbg !80
L32.i.preheader.i: ; preds = %L16.i.preheader.i, %L51.i.i
%iv.i = phi i64 [ %iv.next.i, %L51.i.i ], [ 0, %L16.i.preheader.i ]
%33 = mul i64 %iv.i, %13
%34 = mul i64 %iv.i, %21
%35 = mul i64 %iv.i, %29
%min.iters.check = icmp ult i64 %13, 4, !dbg !81
br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !81
vector.ph: ; preds = %L32.i.preheader.i
%n.vec = and i64 %13, 9223372036854775804, !dbg !81
br label %vector.body, !dbg !81
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !82
%36 = add i64 %index, %33, !dbg !82
%37 = getelementptr inbounds double, double addrspace(13)* %16, i64 %36, !dbg !82
%38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !82
%wide.load = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%39 = getelementptr inbounds double, double addrspace(13)* %37, i64 2, !dbg !82
%40 = bitcast double addrspace(13)* %39 to <2 x double> addrspace(13)*, !dbg !82
%wide.load22 = load <2 x double>, <2 x double> addrspace(13)* %40, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%41 = add i64 %index, %34, !dbg !82
%42 = getelementptr inbounds double, double addrspace(13)* %24, i64 %41, !dbg !82
%43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !82
%wide.load23 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%44 = getelementptr inbounds double, double addrspace(13)* %42, i64 2, !dbg !82
%45 = bitcast double addrspace(13)* %44 to <2 x double> addrspace(13)*, !dbg !82
%wide.load24 = load <2 x double>, <2 x double> addrspace(13)* %45, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%46 = fmul <2 x double> %wide.load, %wide.load23, !dbg !84
%47 = fmul <2 x double> %wide.load22, %wide.load24, !dbg !84
%48 = add i64 %index, %35, !dbg !82
%49 = getelementptr inbounds double, double addrspace(13)* %32, i64 %48, !dbg !82
%50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !85
store <2 x double> %46, <2 x double> addrspace(13)* %50, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
%51 = getelementptr inbounds double, double addrspace(13)* %49, i64 2, !dbg !85
%52 = bitcast double addrspace(13)* %51 to <2 x double> addrspace(13)*, !dbg !85
store <2 x double> %47, <2 x double> addrspace(13)* %52, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
%index.next = add nuw i64 %index, 4, !dbg !82
%53 = icmp eq i64 %index.next, %n.vec, !dbg !82
br i1 %53, label %middle.block, label %vector.body, !dbg !82, !llvm.loop !91
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %13, %n.vec, !dbg !81
br i1 %cmp.n, label %L51.i.i, label %scalar.ph, !dbg !81
scalar.ph: ; preds = %L32.i.preheader.i, %middle.block
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L32.i.preheader.i ]
br label %L32.i.i, !dbg !81
L32.i.i: ; preds = %L32.i.i, %scalar.ph
%iv1.i = phi i64 [ %iv.next2.i, %L32.i.i ], [ %bc.resume.val, %scalar.ph ]
%iv.next2.i = add nuw nsw i64 %iv1.i, 1, !dbg !82
%54 = add i64 %iv1.i, %33, !dbg !82
%55 = getelementptr inbounds double, double addrspace(13)* %16, i64 %54, !dbg !82
%56 = load double, double addrspace(13)* %55, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%57 = add i64 %iv1.i, %34, !dbg !82
%58 = getelementptr inbounds double, double addrspace(13)* %24, i64 %57, !dbg !82
%59 = load double, double addrspace(13)* %58, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%60 = fmul double %56, %59, !dbg !84
%61 = add i64 %iv1.i, %35, !dbg !85
%62 = getelementptr inbounds double, double addrspace(13)* %32, i64 %61, !dbg !85
store double %60, double addrspace(13)* %62, align 8, !dbg !85, !tbaa !44, !alias.scope !86, !noalias !89
%.not.not8.i = icmp eq i64 %iv.next2.i, %13, !dbg !92
br i1 %.not.not8.i, label %L51.i.i, label %L32.i.i, !dbg !81, !llvm.loop !95
L51.i.i: ; preds = %middle.block, %L32.i.i
%iv.next.i = add nuw nsw i64 %iv.i, 1
%.not.i = icmp eq i64 %iv.next.i, %10, !dbg !92
br i1 %.not.i, label %invertL51.i.i.preheader, label %L32.i.preheader.i, !dbg !81
invertL51.i.i.preheader: ; preds = %L51.i.i
%"'ipc_unwrap.i" = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%"'ipc5_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap.i" to double addrspace(13)* addrspace(11)*
%"'ipc6_unwrap.i" = bitcast {} addrspace(10)* %5 to double addrspace(13)* addrspace(10)*
%"'ipc7_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc6_unwrap.i" to double addrspace(13)* addrspace(11)*
%"'ipc9_unwrap.i" = bitcast {} addrspace(10)* %3 to double addrspace(13)* addrspace(10)*
%"'ipc10_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc9_unwrap.i" to double addrspace(13)* addrspace(11)*
br label %invertL51.i.i
invertL32.i.preheader.i: ; preds = %invertL32.i.i
%63 = icmp eq i64 %"iv'ac.i.0", 0
br i1 %63, label %diffejulia_f__1991_inner.1.exit, label %invertL51.i.i
invertL32.i.i: ; preds = %invertL32.i.i, %invertL51.i.i
%"iv1'ac.i.0.in" = phi i64 [ %13, %invertL51.i.i ], [ %"iv1'ac.i.0", %invertL32.i.i ]
%"iv1'ac.i.0" = add nsw i64 %"iv1'ac.i.0.in", -1
%"'ipl_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap.i", align 16
%_unwrap14.i = add i64 %"iv1'ac.i.0", %_unwrap13.i
%"'ipg_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap.i", i64 %_unwrap14.i
%64 = load double, double addrspace(13)* %"'ipg_unwrap.i", align 8, !tbaa !44, !noalias !47
store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap.i", align 8, !dbg !85, !tbaa !44, !alias.scope !96, !noalias !97
%_unwrap17.i = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16
%_unwrap20.i = add i64 %"iv1'ac.i.0", %_unwrap19.i
%_unwrap21.i = getelementptr inbounds double, double addrspace(13)* %_unwrap17.i, i64 %_unwrap20.i
%_unwrap22.i = load double, double addrspace(13)* %_unwrap21.i, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%m0diffe.i = fmul fast double %_unwrap22.i, %64
%_unwrap25.i = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16
%_unwrap30.i = add i64 %_unwrap29.i, %"iv1'ac.i.0"
%_unwrap31.i = getelementptr inbounds double, double addrspace(13)* %_unwrap25.i, i64 %_unwrap30.i
%_unwrap32.i = load double, double addrspace(13)* %_unwrap31.i, align 8, !dbg !82, !tbaa !44, !alias.scope !47
%m1diffe.i = fmul fast double %_unwrap32.i, %64
%"'ipl8_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc7_unwrap.i", align 16
%"'ipg35_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl8_unwrap.i", i64 %_unwrap20.i
%65 = load double, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !98, !noalias !101
%66 = fadd fast double %65, %m1diffe.i
store double %66, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !98, !noalias !101
%"'ipl11_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc10_unwrap.i", align 16
%"'ipg36_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl11_unwrap.i", i64 %_unwrap30.i
%67 = load double, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !103, !noalias !106
%68 = fadd fast double %67, %m0diffe.i
store double %68, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !82, !tbaa !44, !alias.scope !103, !noalias !106
%69 = icmp eq i64 %"iv1'ac.i.0", 0
br i1 %69, label %invertL32.i.preheader.i, label %invertL32.i.i
invertL51.i.i: ; preds = %invertL51.i.i.preheader, %invertL32.i.preheader.i
%"iv'ac.i.0.in" = phi i64 [ %"iv'ac.i.0", %invertL32.i.preheader.i ], [ %10, %invertL51.i.i.preheader ]
%"iv'ac.i.0" = add nsw i64 %"iv'ac.i.0.in", -1
%_unwrap13.i = mul i64 %"iv'ac.i.0", %29
%_unwrap19.i = mul i64 %"iv'ac.i.0", %21
%_unwrap29.i = mul i64 %13, %"iv'ac.i.0"
br label %invertL32.i.i
diffejulia_f__1991_inner.1.exit: ; preds = %invertL32.i.preheader.i, %L16.i.preheader.i, %entry
ret void
}
; Function Attrs: inaccessiblemem_or_argmemonly
declare void @ijl_gc_queue_root({} addrspace(10)*) #3
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #4
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #4
attributes #0 = { nofree nosync "enzymejl_world"="32451" "probe-stack"="inline-asm" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn "enzymejl_world"="32451" }
attributes #2 = { alwaysinline "probe-stack"="inline-asm" }
attributes #3 = { inaccessiblemem_or_argmemonly }
attributes #4 = { allocsize(1) }
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DICompileUnit(language: DW_LANG_Julia, file: !3, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !4, nameTableKind: None)
!3 = !DIFile(filename: "REPL[7]", directory: ".")
!4 = !{}
!5 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_1991", scope: null, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!6 = !DISubroutineType(types: !4)
!7 = !DILocation(line: 152, scope: !8, inlinedAt: !10)
!8 = distinct !DISubprogram(name: "size;", linkageName: "size", scope: !9, file: !9, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!9 = !DIFile(filename: "array.jl", directory: ".")
!10 = distinct !DILocation(line: 95, scope: !11, inlinedAt: !13)
!11 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !12, file: !12, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!12 = !DIFile(filename: "abstractarray.jl", directory: ".")
!13 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !16)
!14 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!15 = !DIFile(filename: "experimental.jl", directory: ".")
!16 = distinct !DILocation(line: 74, scope: !11, inlinedAt: !17)
!17 = distinct !DILocation(line: 3, scope: !18, inlinedAt: !19)
!18 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !3, file: !3, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!19 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !21)
!20 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!21 = distinct !DILocation(line: 2, scope: !5, inlinedAt: !22)
!22 = distinct !DILocation(line: 0, scope: !5)
!23 = !{!24, !24, i64 0}
!24 = !{!"jtbaa_const", !25, i64 0}
!25 = !{!"jtbaa", !26, i64 0}
!26 = !{!"jtbaa"}
!27 = !{i64 0, i64 9223372036854775807}
!28 = !DILocation(line: 83, scope: !29, inlinedAt: !31)
!29 = distinct !DISubprogram(name: "<;", linkageName: "<", scope: !30, file: !30, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!30 = !DIFile(filename: "int.jl", directory: ".")
!31 = distinct !DILocation(line: 382, scope: !32, inlinedAt: !34)
!32 = distinct !DISubprogram(name: ">;", linkageName: ">", scope: !33, file: !33, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!33 = !DIFile(filename: "operators.jl", directory: ".")
!34 = distinct !DILocation(line: 654, scope: !35, inlinedAt: !37)
!35 = distinct !DISubprogram(name: "isempty;", linkageName: "isempty", scope: !36, file: !36, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!36 = !DIFile(filename: "range.jl", directory: ".")
!37 = distinct !DILocation(line: 879, scope: !38, inlinedAt: !17)
!38 = distinct !DISubprogram(name: "iterate;", linkageName: "iterate", scope: !36, file: !36, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!39 = !DILocation(line: 3, scope: !18, inlinedAt: !19)
!40 = !DILocation(line: 5, scope: !18, inlinedAt: !19)
!41 = !DILocation(line: 34, scope: !42, inlinedAt: !43)
!42 = distinct !DISubprogram(name: "getindex;", linkageName: "getindex", scope: !15, file: !15, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!43 = distinct !DILocation(line: 4, scope: !18, inlinedAt: !19)
!44 = !{!45, !45, i64 0}
!45 = !{!"jtbaa_arraybuf", !46, i64 0}
!46 = !{!"jtbaa_data", !25, i64 0}
!47 = !{!48}
!48 = !{!"aliasscope", !49}
!49 = !{!"f!"}
!50 = !DILocation(line: 385, scope: !51, inlinedAt: !43)
!51 = distinct !DISubprogram(name: "*;", linkageName: "*", scope: !52, file: !52, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!52 = !DIFile(filename: "float.jl", directory: ".")
!53 = !DILocation(line: 968, scope: !54, inlinedAt: !43)
!54 = distinct !DISubprogram(name: "setindex!;", linkageName: "setindex!", scope: !9, file: !9, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!55 = distinct !{!55, !56}
!56 = !{!"llvm.loop.isvectorized", i32 1}
!57 = !DILocation(line: 477, scope: !58, inlinedAt: !60)
!58 = distinct !DISubprogram(name: "==;", linkageName: "==", scope: !59, file: !59, type: !6, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!59 = !DIFile(filename: "promotion.jl", directory: ".")
!60 = distinct !DILocation(line: 883, scope: !38, inlinedAt: !61)
!61 = distinct !DILocation(line: 5, scope: !18, inlinedAt: !19)
!62 = !DILocation(line: 883, scope: !38, inlinedAt: !61)
!63 = distinct !{!63, !64, !56}
!64 = !{!"llvm.loop.unroll.runtime.disable"}
!65 = !DILocation(line: 0, scope: !5)
!66 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_1991", scope: null, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
!67 = !DILocation(line: 152, scope: !8, inlinedAt: !68)
!68 = distinct !DILocation(line: 95, scope: !11, inlinedAt: !69)
!69 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !70)
!70 = distinct !DILocation(line: 74, scope: !11, inlinedAt: !71)
!71 = distinct !DILocation(line: 3, scope: !18, inlinedAt: !72)
!72 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !73)
!73 = distinct !DILocation(line: 2, scope: !66, inlinedAt: !74)
!74 = distinct !DILocation(line: 0, scope: !66, inlinedAt: !75)
!75 = distinct !DILocation(line: 0, scope: !66)
!76 = !DILocation(line: 83, scope: !29, inlinedAt: !77)
!77 = distinct !DILocation(line: 382, scope: !32, inlinedAt: !78)
!78 = distinct !DILocation(line: 654, scope: !35, inlinedAt: !79)
!79 = distinct !DILocation(line: 879, scope: !38, inlinedAt: !71)
!80 = !DILocation(line: 3, scope: !18, inlinedAt: !72)
!81 = !DILocation(line: 5, scope: !18, inlinedAt: !72)
!82 = !DILocation(line: 34, scope: !42, inlinedAt: !83)
!83 = distinct !DILocation(line: 4, scope: !18, inlinedAt: !72)
!84 = !DILocation(line: 385, scope: !51, inlinedAt: !83)
!85 = !DILocation(line: 968, scope: !54, inlinedAt: !83)
!86 = !{!87}
!87 = distinct !{!87, !88, !"primal"}
!88 = distinct !{!88, !" diff: %"}
!89 = !{!90, !48}
!90 = distinct !{!90, !88, !"shadow_0"}
!91 = distinct !{!91, !56}
!92 = !DILocation(line: 477, scope: !58, inlinedAt: !93)
!93 = distinct !DILocation(line: 883, scope: !38, inlinedAt: !94)
!94 = distinct !DILocation(line: 5, scope: !18, inlinedAt: !72)
!95 = distinct !{!95, !64, !56}
!96 = !{!90}
!97 = !{!87, !48}
!98 = !{!99}
!99 = distinct !{!99, !100, !"shadow_0"}
!100 = distinct !{!100, !" diff: %"}
!101 = !{!102}
!102 = distinct !{!102, !100, !"primal"}
!103 = !{!104}
!104 = distinct !{!104, !105, !"shadow_0"}
!105 = distinct !{!105, !" diff: %"}
!106 = !{!107}
!107 = distinct !{!107, !105, !"primal"}
Looks like the reverse pass isn't getting vectorized. @vchuravy
A collection of PRs that go to the heart of this: https://github.com/EnzymeAD/Enzyme/pull/996 in Enzyme proper and https://reviews.llvm.org/D144053 in LLVM proper.
Is there any way to work around this for now - by passing Enzyme.jl some manual hints on aliasing or so?
With the addrspace13 PR, the loads of the arrays' inner data pointers are at least being hoisted out of the loops by LICM. The backward (reverse) pass still isn't being vectorized, though, for some reason (perhaps because the output isn't marked noalias with respect to the other things?)
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-linux-gnu"
; Function Attrs: nofree nosync
; Kernel @julia_f__742_inner.1: a two-level loop nest that, for every index, loads one
; double from the data buffer of %1 and one from the data buffer of %2, multiplies
; them, and stores the product into the data buffer of %0.
; NOTE(review): presumably %0 = C, %1 = A, %2 = B of f!(C, A, B) from the issue — confirm.
define private void @julia_f__742_inner.1({} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %0, {} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %1, {} addrspace(10)* nocapture nofree noundef nonnull readonly align 16 dereferenceable(40) %2) local_unnamed_addr #0 !dbg !4 {
entry:
; Read the i64 stored in pointer-sized slot 4 of object %1; it is the outer trip
; count (%7). Return immediately when it is zero.
; NOTE(review): presumably size(A, 2) in the Julia array header — confirm.
%3 = bitcast {} addrspace(10)* %1 to {} addrspace(10)* addrspace(10)*, !dbg !7
%4 = addrspacecast {} addrspace(10)* addrspace(10)* %3 to {} addrspace(10)* addrspace(11)*, !dbg !7
%5 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 4, !dbg !7
%6 = bitcast {} addrspace(10)* addrspace(11)* %5 to i64 addrspace(11)*, !dbg !7
%7 = load i64, i64 addrspace(11)* %6, align 16, !dbg !7, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
%.not = icmp eq i64 %7, 0, !dbg !36
br i1 %.not, label %julia_f__742_inner.exit, label %L14.i.preheader, !dbg !47
; Loop-invariant setup. Slot 3 of %1 is the inner trip count (%10). Slot 0 of each
; object is loaded as the pointer to its double data (%13, %21, %29). Slot 3 of %2
; and of %0 (%18, %26) are used below as the per-outer-iteration strides.
L14.i.preheader: ; preds = %entry
%8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %4, i64 3
%9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*
%10 = load i64, i64 addrspace(11)* %9, align 8, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
%.not7 = icmp eq i64 %10, 0
%11 = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%12 = addrspacecast double addrspace(13)* addrspace(10)* %11 to double addrspace(13)* addrspace(11)*
%13 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %12, align 16, !alias.scope !48
%14 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*
%15 = addrspacecast {} addrspace(10)* addrspace(10)* %14 to {} addrspace(10)* addrspace(11)*
%16 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %15, i64 3
%17 = bitcast {} addrspace(10)* addrspace(11)* %16 to i64 addrspace(11)*
%18 = load i64, i64 addrspace(11)* %17, align 8
%19 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%20 = addrspacecast double addrspace(13)* addrspace(10)* %19 to double addrspace(13)* addrspace(11)*
%21 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %20, align 16, !alias.scope !48
%22 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%23 = addrspacecast {} addrspace(10)* addrspace(10)* %22 to {} addrspace(10)* addrspace(11)*
%24 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %23, i64 3
%25 = bitcast {} addrspace(10)* addrspace(11)* %24 to i64 addrspace(11)*
%26 = load i64, i64 addrspace(11)* %25, align 8
%27 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
%28 = addrspacecast double addrspace(13)* addrspace(10)* %27 to double addrspace(13)* addrspace(11)*
%29 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %28, align 16, !alias.scope !48
br i1 %.not7, label %julia_f__742_inner.exit, label %L28.i.preheader, !dbg !47
; Outer loop header: %value_phi3.i counts 1..%7. %31/%32/%33 are the element offsets
; of this outer iteration into the buffers of %1, %2 and %0 respectively.
; With fewer than 4 inner iterations the vectorized loop is skipped entirely.
L28.i.preheader: ; preds = %L14.i.preheader, %L45.i
%value_phi3.i = phi i64 [ %63, %L45.i ], [ 1, %L14.i.preheader ]
%30 = add nsw i64 %value_phi3.i, -1
%31 = mul i64 %10, %30
%32 = mul i64 %18, %30
%33 = mul i64 %26, %30
%min.iters.check = icmp ult i64 %10, 4, !dbg !51
br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !51
; %n.vec is %10 rounded down to a multiple of 4; %ind.end (= %n.vec + 1, since the low
; bits of %n.vec are clear) is the 1-based index at which the scalar loop resumes.
vector.ph: ; preds = %L28.i.preheader
%n.vec = and i64 %10, 9223372036854775804, !dbg !51
%ind.end = or i64 %n.vec, 1, !dbg !51
br label %vector.body, !dbg !51
; Vectorized inner loop: each iteration handles 4 doubles as two <2 x double> halves
; (load from %13, load from %21, fmul, store to %29), then advances %index by 4.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%34 = add i64 %index, %31, !dbg !52
%35 = getelementptr inbounds double, double addrspace(13)* %13, i64 %34, !dbg !52
%36 = bitcast double addrspace(13)* %35 to <2 x double> addrspace(13)*, !dbg !52
%wide.load = load <2 x double>, <2 x double> addrspace(13)* %36, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%37 = getelementptr inbounds double, double addrspace(13)* %35, i64 2, !dbg !52
%38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !52
%wide.load10 = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%39 = add i64 %index, %32, !dbg !52
%40 = getelementptr inbounds double, double addrspace(13)* %21, i64 %39, !dbg !52
%41 = bitcast double addrspace(13)* %40 to <2 x double> addrspace(13)*, !dbg !52
%wide.load11 = load <2 x double>, <2 x double> addrspace(13)* %41, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%42 = getelementptr inbounds double, double addrspace(13)* %40, i64 2, !dbg !52
%43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !52
%wide.load12 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%44 = fmul <2 x double> %wide.load, %wide.load11, !dbg !62
%45 = fmul <2 x double> %wide.load10, %wide.load12, !dbg !62
%46 = add i64 %index, %33, !dbg !65
%47 = getelementptr inbounds double, double addrspace(13)* %29, i64 %46, !dbg !65
%48 = bitcast double addrspace(13)* %47 to <2 x double> addrspace(13)*, !dbg !65
store <2 x double> %44, <2 x double> addrspace(13)* %48, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
%49 = getelementptr inbounds double, double addrspace(13)* %47, i64 2, !dbg !65
%50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !65
store <2 x double> %45, <2 x double> addrspace(13)* %50, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
%index.next = add nuw i64 %index, 4
%51 = icmp eq i64 %index.next, %n.vec
br i1 %51, label %middle.block, label %vector.body, !llvm.loop !71
; If the vector loop already covered all %10 inner iterations, skip the scalar remainder.
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %10, %n.vec, !dbg !51
br i1 %cmp.n, label %L45.i, label %scalar.ph, !dbg !51
; Scalar loop entry: start at 1 when the vector loop was skipped, else at %ind.end.
scalar.ph: ; preds = %L28.i.preheader, %middle.block
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L28.i.preheader ]
br label %L28.i, !dbg !51
; Scalar inner loop (the whole inner loop when %10 < 4, otherwise the remainder).
; %value_phi8.i is the 1-based inner index; %52 is the matching 0-based offset.
L28.i: ; preds = %scalar.ph, %L28.i
%value_phi8.i = phi i64 [ %62, %L28.i ], [ %bc.resume.val, %scalar.ph ]
%52 = add nsw i64 %value_phi8.i, -1, !dbg !52
%53 = add i64 %52, %31, !dbg !52
%54 = getelementptr inbounds double, double addrspace(13)* %13, i64 %53, !dbg !52
%55 = load double, double addrspace(13)* %54, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%56 = add i64 %52, %32, !dbg !52
%57 = getelementptr inbounds double, double addrspace(13)* %21, i64 %56, !dbg !52
%58 = load double, double addrspace(13)* %57, align 8, !dbg !52, !tbaa !55, !alias.scope !58, !noalias !61
%59 = fmul double %55, %58, !dbg !62
%60 = add i64 %52, %33, !dbg !65
%61 = getelementptr inbounds double, double addrspace(13)* %29, i64 %60, !dbg !65
store double %59, double addrspace(13)* %61, align 8, !dbg !65, !tbaa !55, !alias.scope !67, !noalias !68
%.not8 = icmp eq i64 %value_phi8.i, %10, !dbg !73
%62 = add nuw nsw i64 %value_phi8.i, 1, !dbg !78
br i1 %.not8, label %L45.i, label %L28.i, !dbg !51, !llvm.loop !79
; Outer-loop latch: leave after the iteration in which %value_phi3.i equals %7.
L45.i: ; preds = %middle.block, %L28.i
%.not9 = icmp eq i64 %value_phi3.i, %7, !dbg !73
%63 = add nuw nsw i64 %value_phi3.i, 1, !dbg !78
br i1 %.not9, label %julia_f__742_inner.exit, label %L28.i.preheader, !dbg !51
julia_f__742_inner.exit: ; preds = %L45.i, %L14.i.preheader, %entry
ret void, !dbg !81
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: alwaysinline
define void @diffejulia_f__742_inner_1wrap({} addrspace(10)* %0, {} addrspace(10)* %1, {} addrspace(10)* %2, {} addrspace(10)* %3, {} addrspace(10)* %4, {} addrspace(10)* %5) #2 !dbg !82 {
entry:
%6 = bitcast {} addrspace(10)* %2 to {} addrspace(10)* addrspace(10)*, !dbg !83
%7 = addrspacecast {} addrspace(10)* addrspace(10)* %6 to {} addrspace(10)* addrspace(11)*, !dbg !83
%8 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 4, !dbg !83
%9 = bitcast {} addrspace(10)* addrspace(11)* %8 to i64 addrspace(11)*, !dbg !83
%10 = load i64, i64 addrspace(11)* %9, align 16, !dbg !83, !tbaa !23, !range !27, !alias.scope !28, !noalias !31
%.not.i = icmp eq i64 %10, 0, !dbg !92
br i1 %.not.i, label %diffejulia_f__742_inner.1.exit, label %L14.i.preheader.i, !dbg !96
L14.i.preheader.i: ; preds = %entry
%11 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7, i64 3
%12 = bitcast {} addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
%13 = load i64, i64 addrspace(11)* %12, align 8, !tbaa !23, !range !27, !alias.scope !28, !noalias !31, !invariant.group !97
%.not7.i = icmp eq i64 %13, 0
%14 = bitcast {} addrspace(10)* %2 to double addrspace(13)* addrspace(10)*
%15 = addrspacecast double addrspace(13)* addrspace(10)* %14 to double addrspace(13)* addrspace(11)*
%16 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %15, align 16, !alias.scope !48, !invariant.group !98
%17 = bitcast {} addrspace(10)* %4 to {} addrspace(10)* addrspace(10)*
%18 = addrspacecast {} addrspace(10)* addrspace(10)* %17 to {} addrspace(10)* addrspace(11)*
%19 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %18, i64 3
%20 = bitcast {} addrspace(10)* addrspace(11)* %19 to i64 addrspace(11)*
%21 = load i64, i64 addrspace(11)* %20, align 8
%22 = bitcast {} addrspace(10)* %4 to double addrspace(13)* addrspace(10)*
%23 = addrspacecast double addrspace(13)* addrspace(10)* %22 to double addrspace(13)* addrspace(11)*
%24 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %23, align 16, !alias.scope !48, !invariant.group !99
%25 = bitcast {} addrspace(10)* %0 to {} addrspace(10)* addrspace(10)*
%26 = addrspacecast {} addrspace(10)* addrspace(10)* %25 to {} addrspace(10)* addrspace(11)*
%27 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %26, i64 3
%28 = bitcast {} addrspace(10)* addrspace(11)* %27 to i64 addrspace(11)*
%29 = load i64, i64 addrspace(11)* %28, align 8
%30 = bitcast {} addrspace(10)* %0 to double addrspace(13)* addrspace(10)*
%31 = addrspacecast double addrspace(13)* addrspace(10)* %30 to double addrspace(13)* addrspace(11)*
%32 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %31, align 16, !alias.scope !48
br i1 %.not7.i, label %diffejulia_f__742_inner.1.exit, label %L28.i.preheader.i, !dbg !96
L28.i.preheader.i: ; preds = %L14.i.preheader.i, %L45.i.i
%iv.i = phi i64 [ %iv.next.i, %L45.i.i ], [ 0, %L14.i.preheader.i ]
%33 = mul i64 %iv.i, %13
%34 = mul i64 %iv.i, %21
%35 = mul i64 %iv.i, %29
%min.iters.check = icmp ult i64 %13, 4, !dbg !100
br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !100
vector.ph: ; preds = %L28.i.preheader.i
%n.vec = and i64 %13, 9223372036854775804, !dbg !100
br label %vector.body, !dbg !100
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !101
%36 = add i64 %index, %33, !dbg !101
%37 = getelementptr inbounds double, double addrspace(13)* %16, i64 %36, !dbg !101
%38 = bitcast double addrspace(13)* %37 to <2 x double> addrspace(13)*, !dbg !101
%wide.load = load <2 x double>, <2 x double> addrspace(13)* %38, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
%39 = getelementptr inbounds double, double addrspace(13)* %37, i64 2, !dbg !101
%40 = bitcast double addrspace(13)* %39 to <2 x double> addrspace(13)*, !dbg !101
%wide.load22 = load <2 x double>, <2 x double> addrspace(13)* %40, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
%41 = add i64 %index, %34, !dbg !101
%42 = getelementptr inbounds double, double addrspace(13)* %24, i64 %41, !dbg !101
%43 = bitcast double addrspace(13)* %42 to <2 x double> addrspace(13)*, !dbg !101
%wide.load23 = load <2 x double>, <2 x double> addrspace(13)* %43, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
%44 = getelementptr inbounds double, double addrspace(13)* %42, i64 2, !dbg !101
%45 = bitcast double addrspace(13)* %44 to <2 x double> addrspace(13)*, !dbg !101
%wide.load24 = load <2 x double>, <2 x double> addrspace(13)* %45, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61
%46 = fmul <2 x double> %wide.load, %wide.load23, !dbg !103
%47 = fmul <2 x double> %wide.load22, %wide.load24, !dbg !103
%48 = add i64 %index, %35, !dbg !101
%49 = getelementptr inbounds double, double addrspace(13)* %32, i64 %48, !dbg !101
%50 = bitcast double addrspace(13)* %49 to <2 x double> addrspace(13)*, !dbg !104
store <2 x double> %46, <2 x double> addrspace(13)* %50, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
%51 = getelementptr inbounds double, double addrspace(13)* %49, i64 2, !dbg !104
%52 = bitcast double addrspace(13)* %51 to <2 x double> addrspace(13)*, !dbg !104
store <2 x double> %47, <2 x double> addrspace(13)* %52, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
%index.next = add nuw i64 %index, 4, !dbg !101
%53 = icmp eq i64 %index.next, %n.vec, !dbg !101
br i1 %53, label %middle.block, label %vector.body, !dbg !101, !llvm.loop !112
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %13, %n.vec, !dbg !100
br i1 %cmp.n, label %L45.i.i, label %scalar.ph, !dbg !100
scalar.ph: ; preds = %L28.i.preheader.i, %middle.block
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L28.i.preheader.i ]
br label %L28.i.i, !dbg !100
L28.i.i: ; preds = %L28.i.i, %scalar.ph
%iv1.i = phi i64 [ %iv.next2.i, %L28.i.i ], [ %bc.resume.val, %scalar.ph ]
%iv.next2.i = add nuw nsw i64 %iv1.i, 1, !dbg !101
%54 = add i64 %iv1.i, %33, !dbg !101
%55 = getelementptr inbounds double, double addrspace(13)* %16, i64 %54, !dbg !101
%56 = load double, double addrspace(13)* %55, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !113
%57 = add i64 %iv1.i, %34, !dbg !101
%58 = getelementptr inbounds double, double addrspace(13)* %24, i64 %57, !dbg !101
%59 = load double, double addrspace(13)* %58, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !114
%60 = fmul double %56, %59, !dbg !103
%61 = add i64 %iv1.i, %35, !dbg !104
%62 = getelementptr inbounds double, double addrspace(13)* %32, i64 %61, !dbg !104
store double %60, double addrspace(13)* %62, align 8, !dbg !104, !tbaa !55, !alias.scope !105, !noalias !108
%.not8.i = icmp eq i64 %iv.next2.i, %13, !dbg !115
br i1 %.not8.i, label %L45.i.i, label %L28.i.i, !dbg !100, !llvm.loop !118
L45.i.i: ; preds = %middle.block, %L28.i.i
%iv.next.i = add nuw nsw i64 %iv.i, 1
%.not9.i = icmp eq i64 %iv.next.i, %10, !dbg !115
br i1 %.not9.i, label %invertL45.i.i.preheader, label %L28.i.preheader.i, !dbg !100
invertL45.i.i.preheader: ; preds = %L45.i.i
%"'ipc_unwrap.i" = bitcast {} addrspace(10)* %1 to double addrspace(13)* addrspace(10)*
%"'ipc5_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc_unwrap.i" to double addrspace(13)* addrspace(11)*
%"'ipl_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc5_unwrap.i", align 16, !alias.scope !48, !invariant.group !119
%"'ipc6_unwrap.i" = bitcast {} addrspace(10)* %5 to double addrspace(13)* addrspace(10)*
%"'ipc7_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc6_unwrap.i" to double addrspace(13)* addrspace(11)*
%"'ipl8_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc7_unwrap.i", align 16, !alias.scope !48, !invariant.group !120
%"'ipc9_unwrap.i" = bitcast {} addrspace(10)* %3 to double addrspace(13)* addrspace(10)*
%"'ipc10_unwrap.i" = addrspacecast double addrspace(13)* addrspace(10)* %"'ipc9_unwrap.i" to double addrspace(13)* addrspace(11)*
%"'ipl11_unwrap.i" = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %"'ipc10_unwrap.i", align 16, !alias.scope !48, !invariant.group !121
br label %invertL45.i.i
invertL28.i.preheader.i: ; preds = %invertL28.i.i
%63 = icmp eq i64 %"iv'ac.i.0", 0
br i1 %63, label %diffejulia_f__742_inner.1.exit, label %invertL45.i.i
invertL28.i.i: ; preds = %invertL28.i.i, %invertL45.i.i
%"iv1'ac.i.0.in" = phi i64 [ %13, %invertL45.i.i ], [ %"iv1'ac.i.0", %invertL28.i.i ]
%"iv1'ac.i.0" = add nsw i64 %"iv1'ac.i.0.in", -1
%_unwrap14.i = add i64 %"iv1'ac.i.0", %_unwrap13.i
%"'ipg_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl_unwrap.i", i64 %_unwrap14.i
%64 = load double, double addrspace(13)* %"'ipg_unwrap.i", align 8, !tbaa !55, !noalias !122
store double 0.000000e+00, double addrspace(13)* %"'ipg_unwrap.i", align 8, !dbg !104, !tbaa !55, !alias.scope !123, !noalias !124
%_unwrap20.i = add i64 %"iv1'ac.i.0", %_unwrap19.i
%_unwrap21.i = getelementptr inbounds double, double addrspace(13)* %24, i64 %_unwrap20.i
%_unwrap22.i = load double, double addrspace(13)* %_unwrap21.i, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !114
%m0diffe.i = fmul fast double %_unwrap22.i, %64
%_unwrap30.i = add i64 %_unwrap29.i, %"iv1'ac.i.0"
%_unwrap31.i = getelementptr inbounds double, double addrspace(13)* %16, i64 %_unwrap30.i
%_unwrap32.i = load double, double addrspace(13)* %_unwrap31.i, align 8, !dbg !101, !tbaa !55, !alias.scope !58, !noalias !61, !invariant.group !113
%m1diffe.i = fmul fast double %_unwrap32.i, %64
%"'ipg35_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl8_unwrap.i", i64 %_unwrap20.i
%65 = load double, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !125, !noalias !128
%66 = fadd fast double %65, %m1diffe.i
store double %66, double addrspace(13)* %"'ipg35_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !125, !noalias !130
%"'ipg36_unwrap.i" = getelementptr inbounds double, double addrspace(13)* %"'ipl11_unwrap.i", i64 %_unwrap30.i
%67 = load double, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !131, !noalias !134
%68 = fadd fast double %67, %m0diffe.i
store double %68, double addrspace(13)* %"'ipg36_unwrap.i", align 8, !dbg !101, !tbaa !55, !alias.scope !131, !noalias !136
%69 = icmp eq i64 %"iv1'ac.i.0", 0
br i1 %69, label %invertL28.i.preheader.i, label %invertL28.i.i
invertL45.i.i: ; preds = %invertL45.i.i.preheader, %invertL28.i.preheader.i
%"iv'ac.i.0.in" = phi i64 [ %"iv'ac.i.0", %invertL28.i.preheader.i ], [ %10, %invertL45.i.i.preheader ]
%"iv'ac.i.0" = add nsw i64 %"iv'ac.i.0.in", -1
%_unwrap13.i = mul i64 %"iv'ac.i.0", %29
%_unwrap19.i = mul i64 %"iv'ac.i.0", %21
%_unwrap29.i = mul i64 %13, %"iv'ac.i.0"
br label %invertL28.i.i
diffejulia_f__742_inner.1.exit: ; preds = %invertL28.i.preheader.i, %L14.i.preheader.i, %entry
ret void
}
; Function Attrs: inaccessiblemem_or_argmemonly
declare void @ijl_gc_queue_root({} addrspace(10)*) #3
; Function Attrs: inaccessiblemem_or_argmemonly
declare void @jl_gc_queue_binding({} addrspace(10)*) #3
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #4
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #4
; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8*, i64, i8*) #4
attributes #0 = { nofree nosync "enzymejl_world"="33430" "probe-stack"="inline-asm" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn "enzymejl_world"="33430" }
attributes #2 = { alwaysinline "probe-stack"="inline-asm" }
attributes #3 = { inaccessiblemem_or_argmemonly }
attributes #4 = { allocsize(1) }
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DICompileUnit(language: DW_LANG_Julia, file: !3, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, nameTableKind: None)
!3 = !DIFile(filename: "/home/wmoses/git/Enzyme.jl/slw.jl", directory: ".")
!4 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_742", scope: null, file: !3, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!5 = !DISubroutineType(types: !6)
!6 = !{}
!7 = !DILocation(line: 150, scope: !8, inlinedAt: !10)
!8 = distinct !DISubprogram(name: "size;", linkageName: "size", scope: !9, file: !9, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!9 = !DIFile(filename: "array.jl", directory: ".")
!10 = distinct !DILocation(line: 98, scope: !11, inlinedAt: !13)
!11 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !12, file: !12, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!12 = !DIFile(filename: "abstractarray.jl", directory: ".")
!13 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !16)
!14 = distinct !DISubprogram(name: "axes;", linkageName: "axes", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!15 = !DIFile(filename: "experimental.jl", directory: ".")
!16 = distinct !DILocation(line: 77, scope: !11, inlinedAt: !17)
!17 = distinct !DILocation(line: 14, scope: !18, inlinedAt: !19)
!18 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!19 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !21)
!20 = distinct !DISubprogram(name: "macro expansion;", linkageName: "macro expansion", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!21 = distinct !DILocation(line: 13, scope: !4, inlinedAt: !22)
!22 = distinct !DILocation(line: 0, scope: !4)
!23 = !{!24, !24, i64 0}
!24 = !{!"jtbaa_const", !25, i64 0}
!25 = !{!"jtbaa", !26, i64 0}
!26 = !{!"jtbaa"}
!27 = !{i64 0, i64 9223372036854775807}
!28 = !{!29}
!29 = !{!"jnoalias_const", !30}
!30 = !{!"jnoalias"}
!31 = !{!32, !33, !34, !35}
!32 = !{!"jnoalias_gcframe", !30}
!33 = !{!"jnoalias_stack", !30}
!34 = !{!"jnoalias_data", !30}
!35 = !{!"jnoalias_typemd", !30}
!36 = !DILocation(line: 83, scope: !37, inlinedAt: !39)
!37 = distinct !DISubprogram(name: "<;", linkageName: "<", scope: !38, file: !38, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!38 = !DIFile(filename: "int.jl", directory: ".")
!39 = distinct !DILocation(line: 369, scope: !40, inlinedAt: !42)
!40 = distinct !DISubprogram(name: ">;", linkageName: ">", scope: !41, file: !41, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!41 = !DIFile(filename: "operators.jl", directory: ".")
!42 = distinct !DILocation(line: 662, scope: !43, inlinedAt: !45)
!43 = distinct !DISubprogram(name: "isempty;", linkageName: "isempty", scope: !44, file: !44, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!44 = !DIFile(filename: "range.jl", directory: ".")
!45 = distinct !DILocation(line: 887, scope: !46, inlinedAt: !17)
!46 = distinct !DISubprogram(name: "iterate;", linkageName: "iterate", scope: !44, file: !44, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!47 = !DILocation(line: 14, scope: !18, inlinedAt: !19)
!48 = !{!49}
!49 = distinct !{!49, !50, !"na_addr13"}
!50 = distinct !{!50, !"addr13"}
!51 = !DILocation(line: 16, scope: !18, inlinedAt: !19)
!52 = !DILocation(line: 34, scope: !53, inlinedAt: !54)
!53 = distinct !DISubprogram(name: "getindex;", linkageName: "getindex", scope: !15, file: !15, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!54 = distinct !DILocation(line: 15, scope: !18, inlinedAt: !19)
!55 = !{!56, !56, i64 0}
!56 = !{!"jtbaa_arraybuf", !57, i64 0}
!57 = !{!"jtbaa_data", !25, i64 0}
!58 = !{!59, !34}
!59 = !{!"aliasscope", !60}
!60 = !{!"f!"}
!61 = !{!32, !33, !35, !29}
!62 = !DILocation(line: 410, scope: !63, inlinedAt: !54)
!63 = distinct !DISubprogram(name: "*;", linkageName: "*", scope: !64, file: !64, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!64 = !DIFile(filename: "float.jl", directory: ".")
!65 = !DILocation(line: 971, scope: !66, inlinedAt: !54)
!66 = distinct !DISubprogram(name: "setindex!;", linkageName: "setindex!", scope: !9, file: !9, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!67 = !{!34}
!68 = !{!49, !69, !59, !32, !33, !35, !29}
!69 = distinct !{!69, !70, !"na_addr13"}
!70 = distinct !{!70, !"addr13"}
!71 = distinct !{!71, !72}
!72 = !{!"llvm.loop.isvectorized", i32 1}
!73 = !DILocation(line: 499, scope: !74, inlinedAt: !76)
!74 = distinct !DISubprogram(name: "==;", linkageName: "==", scope: !75, file: !75, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!75 = !DIFile(filename: "promotion.jl", directory: ".")
!76 = distinct !DILocation(line: 891, scope: !46, inlinedAt: !77)
!77 = distinct !DILocation(line: 16, scope: !18, inlinedAt: !19)
!78 = !DILocation(line: 891, scope: !46, inlinedAt: !77)
!79 = distinct !{!79, !80, !72}
!80 = !{!"llvm.loop.unroll.runtime.disable"}
!81 = !DILocation(line: 0, scope: !4)
!82 = distinct !DISubprogram(name: "f!", linkageName: "julia_f!_742", scope: null, file: !3, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !6)
!83 = !DILocation(line: 150, scope: !8, inlinedAt: !84)
!84 = distinct !DILocation(line: 98, scope: !11, inlinedAt: !85)
!85 = distinct !DILocation(line: 30, scope: !14, inlinedAt: !86)
!86 = distinct !DILocation(line: 77, scope: !11, inlinedAt: !87)
!87 = distinct !DILocation(line: 14, scope: !18, inlinedAt: !88)
!88 = distinct !DILocation(line: 49, scope: !20, inlinedAt: !89)
!89 = distinct !DILocation(line: 13, scope: !82, inlinedAt: !90)
!90 = distinct !DILocation(line: 0, scope: !82, inlinedAt: !91)
!91 = distinct !DILocation(line: 0, scope: !82)
!92 = !DILocation(line: 83, scope: !37, inlinedAt: !93)
!93 = distinct !DILocation(line: 369, scope: !40, inlinedAt: !94)
!94 = distinct !DILocation(line: 662, scope: !43, inlinedAt: !95)
!95 = distinct !DILocation(line: 887, scope: !46, inlinedAt: !87)
!96 = !DILocation(line: 14, scope: !18, inlinedAt: !88)
!97 = distinct !{}
!98 = distinct !{}
!99 = distinct !{}
!100 = !DILocation(line: 16, scope: !18, inlinedAt: !88)
!101 = !DILocation(line: 34, scope: !53, inlinedAt: !102)
!102 = distinct !DILocation(line: 15, scope: !18, inlinedAt: !88)
!103 = !DILocation(line: 410, scope: !63, inlinedAt: !102)
!104 = !DILocation(line: 971, scope: !66, inlinedAt: !102)
!105 = !{!106, !34}
!106 = distinct !{!106, !107, !"primal"}
!107 = distinct !{!107, !" diff: %"}
!108 = !{!49, !109, !110, !59, !32, !33, !35, !29}
!109 = distinct !{!109, !107, !"shadow_0"}
!110 = distinct !{!110, !111, !"na_addr13"}
!111 = distinct !{!111, !"addr13"}
!112 = distinct !{!112, !72}
!113 = distinct !{}
!114 = distinct !{}
!115 = !DILocation(line: 499, scope: !74, inlinedAt: !116)
!116 = distinct !DILocation(line: 891, scope: !46, inlinedAt: !117)
!117 = distinct !DILocation(line: 16, scope: !18, inlinedAt: !88)
!118 = distinct !{!118, !80, !72}
!119 = distinct !{}
!120 = distinct !{}
!121 = distinct !{}
!122 = !{!110, !59, !32, !33, !35, !29}
!123 = !{!109}
!124 = !{!49, !106, !110, !59, !32, !33, !35, !29}
!125 = !{!126}
!126 = distinct !{!126, !127, !"shadow_0"}
!127 = distinct !{!127, !" diff: %"}
!128 = !{!129, !32, !33, !35, !29}
!129 = distinct !{!129, !127, !"primal"}
!130 = !{!49, !129, !32, !33, !35, !29}
!131 = !{!132}
!132 = distinct !{!132, !133, !"shadow_0"}
!133 = distinct !{!133, !" diff: %"}
!134 = !{!135, !32, !33, !35, !29}
!135 = distinct !{!135, !133, !"primal"}
!136 = !{!49, !135, !32, !33, !35, !29}
Another weird thing is that even the vanilla (non-differentiated, let alone forward-pass) code is vectorized worse than the `@code_llvm` version, which also implies there may be a target injection/pipeline issue to investigate.
julia> @code_llvm f!(dA, A, B)
; @ REPL[9]:1 within `f!`
define nonnull {}* @"japi1_f!_2602"({}* %0, {}** noalias nocapture noundef readonly %1, i32 %2) #0 {
top:
%3 = alloca {}**, align 8
store volatile {}** %1, {}*** %3, align 8
%4 = load {}*, {}** %1, align 8
%5 = getelementptr inbounds {}*, {}** %1, i64 1
%6 = load {}*, {}** %5, align 8
%7 = getelementptr inbounds {}*, {}** %1, i64 2
%8 = load {}*, {}** %7, align 8
; @ REPL[9]:2 within `f!`
; ┌ @ experimental.jl:49 within `macro expansion` @ REPL[9]:3
; │┌ @ abstractarray.jl:77 within `axes` @ experimental.jl:30 @ abstractarray.jl:98
; ││┌ @ array.jl:150 within `size`
%9 = bitcast {}* %6 to {}**
%10 = getelementptr inbounds {}*, {}** %9, i64 4
%11 = bitcast {}** %10 to i64*
%12 = load i64, i64* %11, align 8
; │└└
; │┌ @ range.jl:887 within `iterate`
; ││┌ @ range.jl:662 within `isempty`
; │││┌ @ operators.jl:369 within `>`
; ││││┌ @ int.jl:83 within `<`
%.not.not = icmp eq i64 %12, 0
; │└└└└
br i1 %.not.not, label %L56, label %L14.preheader
L14.preheader: ; preds = %top
%13 = getelementptr inbounds {}*, {}** %9, i64 3
%14 = bitcast {}** %13 to i64*
%15 = load i64, i64* %14, align 8
%.not.not19 = icmp eq i64 %15, 0
%16 = bitcast {}* %6 to double**
%17 = load double*, double** %16, align 8
%18 = bitcast {}* %8 to {}**
%19 = getelementptr inbounds {}*, {}** %18, i64 3
%20 = bitcast {}** %19 to i64*
%21 = load i64, i64* %20, align 8
%22 = bitcast {}* %8 to double**
%23 = load double*, double** %22, align 8
%24 = bitcast {}* %4 to {}**
%25 = getelementptr inbounds {}*, {}** %24, i64 3
%26 = bitcast {}** %25 to i64*
%27 = load i64, i64* %26, align 8
%28 = bitcast {}* %4 to double**
%29 = load double*, double** %28, align 8
br i1 %.not.not19, label %L56, label %L28.preheader
L28.preheader: ; preds = %L45, %L14.preheader
%value_phi3 = phi i64 [ %77, %L45 ], [ 1, %L14.preheader ]
%30 = add nsw i64 %value_phi3, -1
%31 = mul i64 %15, %30
%32 = mul i64 %21, %30
%33 = mul i64 %27, %30
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
%min.iters.check = icmp ult i64 %15, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L28.preheader
%n.vec = and i64 %15, 9223372036854775792
%ind.end = or i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:4
; │┌ @ experimental.jl:34 within `getindex`
%34 = add i64 %index, %31
%35 = getelementptr inbounds double, double* %17, i64 %34
%36 = bitcast double* %35 to <4 x double>*
%wide.load = load <4 x double>, <4 x double>* %36, align 8
%37 = getelementptr inbounds double, double* %35, i64 4
%38 = bitcast double* %37 to <4 x double>*
%wide.load21 = load <4 x double>, <4 x double>* %38, align 8
%39 = getelementptr inbounds double, double* %35, i64 8
%40 = bitcast double* %39 to <4 x double>*
%wide.load22 = load <4 x double>, <4 x double>* %40, align 8
%41 = getelementptr inbounds double, double* %35, i64 12
%42 = bitcast double* %41 to <4 x double>*
%wide.load23 = load <4 x double>, <4 x double>* %42, align 8
%43 = add i64 %index, %32
%44 = getelementptr inbounds double, double* %23, i64 %43
%45 = bitcast double* %44 to <4 x double>*
%wide.load24 = load <4 x double>, <4 x double>* %45, align 8
%46 = getelementptr inbounds double, double* %44, i64 4
%47 = bitcast double* %46 to <4 x double>*
%wide.load25 = load <4 x double>, <4 x double>* %47, align 8
%48 = getelementptr inbounds double, double* %44, i64 8
%49 = bitcast double* %48 to <4 x double>*
%wide.load26 = load <4 x double>, <4 x double>* %49, align 8
%50 = getelementptr inbounds double, double* %44, i64 12
%51 = bitcast double* %50 to <4 x double>*
%wide.load27 = load <4 x double>, <4 x double>* %51, align 8
; │└
; │┌ @ float.jl:410 within `*`
%52 = fmul <4 x double> %wide.load, %wide.load24
%53 = fmul <4 x double> %wide.load21, %wide.load25
%54 = fmul <4 x double> %wide.load22, %wide.load26
%55 = fmul <4 x double> %wide.load23, %wide.load27
; │└
; │┌ @ array.jl:971 within `setindex!`
%56 = add i64 %index, %33
%57 = getelementptr inbounds double, double* %29, i64 %56
%58 = bitcast double* %57 to <4 x double>*
store <4 x double> %52, <4 x double>* %58, align 8
%59 = getelementptr inbounds double, double* %57, i64 4
%60 = bitcast double* %59 to <4 x double>*
store <4 x double> %53, <4 x double>* %60, align 8
%61 = getelementptr inbounds double, double* %57, i64 8
%62 = bitcast double* %61 to <4 x double>*
store <4 x double> %54, <4 x double>* %62, align 8
%63 = getelementptr inbounds double, double* %57, i64 12
%64 = bitcast double* %63 to <4 x double>*
store <4 x double> %55, <4 x double>* %64, align 8
%index.next = add nuw i64 %index, 16
%65 = icmp eq i64 %index.next, %n.vec
br i1 %65, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
%cmp.n = icmp eq i64 %15, %n.vec
br i1 %cmp.n, label %L45, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L28.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L28.preheader ]
br label %L28
L28: ; preds = %L28, %scalar.ph
%value_phi8 = phi i64 [ %76, %L28 ], [ %bc.resume.val, %scalar.ph ]
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:4
; │┌ @ experimental.jl:34 within `getindex`
%66 = add nsw i64 %value_phi8, -1
%67 = add i64 %66, %31
%68 = getelementptr inbounds double, double* %17, i64 %67
%69 = load double, double* %68, align 8
%70 = add i64 %66, %32
%71 = getelementptr inbounds double, double* %23, i64 %70
%72 = load double, double* %71, align 8
; │└
; │┌ @ float.jl:410 within `*`
%73 = fmul double %69, %72
; │└
; │┌ @ array.jl:971 within `setindex!`
%74 = add i64 %66, %33
%75 = getelementptr inbounds double, double* %29, i64 %74
store double %73, double* %75, align 8
; │└
; │ @ experimental.jl:49 within `macro expansion` @ REPL[9]:5
; │┌ @ range.jl:891 within `iterate`
; ││┌ @ promotion.jl:499 within `==`
%.not.not20 = icmp eq i64 %value_phi8, %15
; ││└
%76 = add nuw nsw i64 %value_phi8, 1
; │└
br i1 %.not.not20, label %L45, label %L28
L45: ; preds = %L28, %middle.block
; │┌ @ range.jl:891 within `iterate`
; ││┌ @ promotion.jl:499 within `==`
%.not = icmp eq i64 %value_phi3, %12
; ││└
%77 = add nuw nsw i64 %value_phi3, 1
; │└
br i1 %.not, label %L56, label %L28.preheader
L56: ; preds = %L45, %L14.preheader, %top
; └
; @ REPL[9]:7 within `f!`
ret {}* inttoptr (i64 140233368764424 to {}*)
}
It looks like the first reason why vectorization isn't happening is a lack of noalias info.
Concretely, the reverse pass looks like
load dout store dout
load A load B
dA += ... dB += ...
The current alias information assumes that:
- dout could alias A, B, dA, and dB.
- This is fixed by marking the output as noalias with respect to the inputs [right now this is one-directional: writes to the output are marked noalias against the inputs, but we also need the other direction].
- A could alias dB, and B could alias dA.
- This is fixed by marking A and B as noalias with respect to each other.
@wsmoses I tried using UnsafeArrays to work around the Enzyme/Julia double-pointered-arrays problem (so Enzyme should be able to infer aliasing correctly?):
using UnsafeArrays
# Apply `uview` to both the `val` and `dval` fields of a `Duplicated` and
# rewrap the results in a new `Duplicated`.
function UnsafeArrays.uview(A::Duplicated)
    val_view = uview(A.val)
    dval_view = uview(A.dval)
    return Duplicated(val_view, dval_view)
end
# Element-wise product kernel: stores A[i,j] * B[i,j] into dA[i,j] for every
# index pair of A, then returns `nothing`.
# Bounds checks are disabled, so dA and B must be indexable over A's axes.
function f!(dA, A, B)
    # Can't use Const with UnsafeArray, so there is no
    # `@aliasscope let A = Const(A), B = Const(B)` wrapper around the loop here.
    @inbounds for col in axes(A, 2)
        for row in axes(A, 1)
            dA[row, col] = A[row, col] * B[row, col]
        end
    end
    return nothing
end
# Run the primal kernel once on C, A and B inside an `@uviews` block.
@uviews C A B begin
f!(C, A, B)
end
# Benchmark the primal kernel inside the same kind of `@uviews` block.
@uviews C A B begin
@benchmark f!($C, $A, $B)
end
# Run reverse-mode autodiff of f! once on the Duplicated arguments inside `@uviews`.
@uviews dpl_C dpl_A dpl_B begin
Enzyme.autodiff(Reverse, f!, dpl_C, dpl_A, dpl_B)
end
# Benchmark the reverse-mode autodiff call inside `@uviews`.
@uviews dpl_C dpl_A dpl_B begin
# Everything should be single-pointer in here:
@benchmark Enzyme.autodiff(Reverse, f!, $dpl_C, $dpl_A, $dpl_B)
end
This doesn't seem to work, though: the speed of the primal computation is the same, but the reverse-mode differentiation becomes even slower (about 4 µs instead of 2 µs). Any idea why?