TensorCast.jl
Performance of nested reductions
What can be done to improve TensorCast's performance on the following nested reductions?
using TensorCast, BenchmarkTools
M = [i+j for i=1:4, j=0:4:12]
B = [M[i:i+1, j:j+1] for i in 1:2:size(M,1), j in 1:2:size(M,2)]
M2 = reduce(hcat, reduce.(vcat, eachcol(B)))
@cast M3[i⊗k,j⊗l] |= B[k,l][i,j] # \otimes<tab>;
M == M2 == M3 # true
@btime M2 = reduce(hcat, reduce.(vcat, eachcol($B))) # 392 ns (4 allocs: 512 bytes)
@btime @cast M3[i⊗k,j⊗l] |= $B[k,l][i,j] # 1.250 μs (15 allocs: 640 bytes)
Cheers.
In this case `lazy=false` is a bit faster:
julia> @btime M2 = reduce(hcat, reduce.(vcat, eachcol($B)));
min 470.454 ns, mean 522.226 ns (4 allocations, 512 bytes)
julia> @btime @cast M3[i⊗k,j⊗l] |= $B[k,l][i,j];
min 1.096 μs, mean 1.205 μs (15 allocations, 640 bytes)
julia> @btime @cast M3[i⊗k,j⊗l] := $B[k,l][i,j] lazy=false;
min 538.564 ns, mean 663.231 ns (12 allocations, 1.06 KiB)
But I'm not sure it always will be; I think it will allocate twice as much in general:
julia> @pretty @cast M3[i⊗k,j⊗l] |= B[k,l][i,j]
begin
@boundscheck ndims(B) == 2 || throw(ArgumentError("expected a 2-tensor B[k, l]"))
local (ax_i, ax_j, ax_k, ax_l) = (axes(first(B), 1), axes(first(B), 2), axes(B, 1), axes(B, 2))
local fish = transmute(lazystack(B), Val((1, 3, 2, 4))) # <--- two lazy operations
M3 = reshape(Base.identity.(fish), (star(ax_i, ax_k), star(ax_j, ax_l))) # <--- identity.(x) to collect
end
julia> @pretty @cast M3[i⊗k,j⊗l] := B[k,l][i,j] lazy=false
begin
@boundscheck ndims(B) == 2 || throw(ArgumentError("expected a 2-tensor B[k, l]"))
local (ax_i, ax_j, ax_k, ax_l) = (axes(first(B), 1), axes(first(B), 2), axes(B, 1), axes(B, 2))
local fish = transmutedims(eagerstack(B), (1, 3, 2, 4)) # <-- two eager operations
M3 = reshape(fish, (star(ax_i, ax_k), star(ax_j, ax_l)))
end