mapreduce for binary functions on SparseVector
It looks like mapreduce falls back to a generic implementation for SparseVector rather than taking advantage of the structure of the data type.
EDIT: It seems that writing a method for mapreduce that works in general is difficult. Maybe the best that can be asked for is tools for writing special cases, for example when op is + or max.
I do this:

```julia
using SparseArrays
n = 1000
density = 0.2
v1 = sprand(Float64, n, density);
v2 = sprand(Float64, n, density);

@which mapreduce((x, y) -> abs(x-y), +, v1, v2)
```
and get

```julia
mapreduce(f, op, A::Union{Base.AbstractBroadcasted, AbstractArray}...; kw...) in Base at reducedim.jl:324
```
And that method is

```julia
mapreduce(f, op, A::AbstractArrayOrBroadcasted...; kw...) =
    reduce(op, map(f, A...); kw...)
```
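For sparse inputs this means the fallback first materializes map(f, A...) as an intermediate array and only then reduces it, which is where the allocations in the benchmark below come from. Expanded for this call (a sketch of the same code path, not the exact Base internals):

```julia
# Roughly what the generic fallback does for the example above:
tmp = map((x, y) -> abs(x - y), v1, v2)  # allocates an intermediate vector
reduce(+, tmp)
```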
This code could be modified to implement mapreduce:
https://github.com/JuliaLang/julia/blob/b4dafb84fb89e8fe34501a1a51c151b201280360/stdlib/SparseArrays/src/sparsevector.jl#L1273-L1277
For example, the following shows a large performance gain.
EDIT: The following makes assumptions about f and op and will not work in general. But you could modify it, when op is something like +, to handle the case that f(0, 0) != 0; see the sketch at the end of this post. I can't see a fully general and efficient solution.
```julia
using SparseArrays: SparseVector, nonzeroinds, nonzeros, nnz

function mymapreduce(f::Function, op::Function, x::SparseVector, y::SparseVector)
    xnzind = nonzeroinds(x)
    ynzind = nonzeroinds(y)
    xnzval = nonzeros(x)
    ynzval = nonzeros(y)
    mx = nnz(x)
    my = nnz(y)
    return _binary_map_reduce(f, op, mx, my, xnzind, xnzval, ynzind, ynzval)
end

function _binary_map_reduce(f::Function,
                            op::Function, mx::Int, my::Int,
                            xnzind, xnzval::AbstractVector{Tx},
                            ynzind, ynzval::AbstractVector{Ty}) where {Tx,Ty}
    # One-sided maps for indices stored in only one of the two vectors.
    fx = x -> f(x, zero(Ty))
    fy = y -> f(zero(Tx), y)
    return _binary_map_reduce(f, fx, fy, op, mx, my,
                              xnzind, xnzval,
                              ynzind, ynzval)
end

function _binary_map_reduce(f::Function, fx::Function, fy::Function,
                            op::Function, mx::Int, my::Int,
                            xnzind, xnzval::AbstractVector{Tx},
                            ynzind, ynzval::AbstractVector{Ty}) where {Tx,Ty}
    ix = 1; iy = 1
    s = zero(f(zero(Tx), zero(Ty)))
    # Merge the two sorted index lists, reducing as we go.
    @inbounds while ix <= mx && iy <= my
        jx = xnzind[ix]
        jy = ynzind[iy]
        if jx == jy              # index stored in both vectors
            v = f(xnzval[ix], ynzval[iy])
            s = op(s, v)
            ix += 1; iy += 1
        elseif jx < jy           # index stored only in x
            v = fx(xnzval[ix])
            s = op(s, v)
            ix += 1
        else                     # index stored only in y
            v = fy(ynzval[iy])
            s = op(s, v)
            iy += 1
        end
    end
    # Remaining tail entries of x.
    @inbounds while ix <= mx
        v = fx(xnzval[ix])
        s = op(s, v)
        ix += 1
    end
    # Remaining tail entries of y.
    @inbounds while iy <= my
        v = fy(ynzval[iy])
        s = op(s, v)
        iy += 1
    end
    return s
end
```
Using the example above:
```julia
julia> @btime mapreduce((x, y) -> abs(x-y), +, $v1, $v2)
  10.756 μs (4 allocations: 6.50 KiB)
176.02710895372354

julia> @btime mymapreduce((x, y) -> abs(x-y), +, $v1, $v2)
  462.315 ns (0 allocations: 0 bytes)
176.02710895372354
```
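One way to sanity-check the result is against the dense computation, which is valid here because abs(0 - 0) == 0:

```julia
# Sanity check against the dense computation (valid since abs(0 - 0) == 0).
@assert mymapreduce((x, y) -> abs(x - y), +, v1, v2) ≈
        mapreduce((x, y) -> abs(x - y), +, Vector(v1), Vector(v2))
```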
(This was first raised in a Discourse post.)
This issue came up when I was looking into distance measures for sparse probability densities.
In fact, you can get more performance with another method:
```julia
function mymapreduce(f::Function, fx::Function, fy::Function, op::Function, x::SparseVector, y::SparseVector)
    xnzind = nonzeroinds(x)
    ynzind = nonzeroinds(y)
    xnzval = nonzeros(x)
    ynzval = nonzeros(y)
    mx = nnz(x)
    my = nnz(y)
    return _binary_map_reduce(f, fx, fy, op, mx, my, xnzind, xnzval, ynzind, ynzval)
end
```
I get
```julia
julia> @btime mymapreduce((x, y) -> (sqrt(x)-sqrt(y))^2, +, $v1, $v2)
  795.927 ns (0 allocations: 0 bytes)
170.15810903507113

julia> @btime mymapreduce((x, y) -> (sqrt(x)-sqrt(y))^2, abs, abs, +, $v1, $v2)
  463.736 ns (0 allocations: 0 bytes)
170.15810903507113
```
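(Passing abs as fx and fy is valid here because sprand produces nonnegative values, so (sqrt(x) - sqrt(0))^2 == x == abs(x); the extra speed presumably comes from skipping the sqrt calls on indices stored in only one of the two vectors.)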
~But, I think abusing the mapreduce interface this way is perhaps not the best approach.~ EDIT: More importantly, as mentioned above, this won't work in general. It assumes f(0, 0) == 0 and that reducing f(0, 0) with op has no effect on the result.
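For the op = + case specifically, one possible fix is to count the positions where both vectors are structurally zero and add their f(0, 0) contribution separately. Here is a minimal sketch building on mymapreduce above (mymapreduce_add and the union-counting approach are just an illustration, not anything from SparseArrays):

```julia
# Sketch: a +-specialized mapreduce that tolerates f(0, 0) != 0.
# Positions stored in neither vector each contribute f(0, 0) to the sum.
function mymapreduce_add(f::Function, x::SparseVector, y::SparseVector)
    n = length(x)
    n == length(y) || throw(DimensionMismatch())
    s = mymapreduce(f, +, x, y)  # sum of f over the union of stored indices
    nunion = length(union(nonzeroinds(x), nonzeroinds(y)))
    return s + (n - nunion) * f(zero(eltype(x)), zero(eltype(y)))
end
```

The union count could also be accumulated inside the merge loop to avoid the extra allocation, and for op = max the analogous correction would be max(s, f(0, 0)) whenever n > nunion.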