SnpArrays.jl
Improved Tiling for Matrix-Vector Products
For stochastic trace estimation, we need to evaluate terms like $\mathbf{G}^\top \mathbf{x}$ and $\mathbf{G}\mathbf{x}$ efficiently, where $\mathbf{G}$ is a SnpArray. I was looking into SnpLinAlg and noticed a lot of memory allocations, presumably due to the use of Julia's Base.@threads and pointer allocation from the tiling procedure. However, it turns out that multithreading only within a tile evaluation, rather than over several tiles, yields a substantial performance improvement.
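For context, the estimator itself only needs these two products: with Rademacher probes $\mathbf{x}_i$, $\mathrm{tr}(\mathbf{A}) \approx \frac{1}{n}\sum_{i=1}^n \mathbf{x}_i^\top \mathbf{A} \mathbf{x}_i$. Below is a purely illustrative sketch of estimating $\mathrm{tr}(\tilde{\mathbf{G}}\tilde{\mathbf{G}}^\top)$, where $\tilde{\mathbf{G}}$ is the centered and scaled genotype matrix, using the Gt_mul_x! kernel defined further down; the helper name, probe count, and RNG here are arbitrary choices of mine.
using Random, LinearAlgebra

function estimate_tr_GGt(G, μ, σinv; nprobe = 32, rng = MersenneTwister(0))
    m, d = size(G)
    x = Vector{Float64}(undef, m)
    y = Vector{Float64}(undef, d)
    acc = 0.0
    for _ in 1:nprobe
        rand!(rng, x, (-1.0, 1.0))   # Rademacher probe
        Gt_mul_x!(y, G, x, μ, σinv)  # y = G̃ᵀx
        acc += dot(y, y)             # xᵀ G̃ G̃ᵀ x = ‖G̃ᵀx‖²
    end
    return acc / nprobe
end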
First, the section defining the functions Gt_mul_x!, Gt_mul_x_tiled!, G_mul_x!, and G_mul_x_tiled!:
using StatsBase, Statistics, Plots, LinearAlgebra, BenchmarkTools, Random
using SnpArrays, DelimitedFiles, Glob, StaticArrays
using Plots, VectorizationBase, LoopVectorization, Polyester
function Gt_mul_x!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
K, rem = divrem(m, 4)
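# Each byte of G.data packs four genotypes, two bits each (PLINK .bed layout):
# 0b00 = homozygous A1, 0b01 = missing, 0b10 = heterozygous, 0b11 = homozygous A2.
# Under ADDITIVE_MODEL these decode to 0, missing, 1, and 2 copies of A2, so the K
# full bytes per column cover 4K samples and `rem` samples spill into a last partial byte.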
fill!(y, zero(T))
# Parallelize over the columns of G
@turbo thread=true for k in axes(G, 2)
yk = zero(T)
for j in 1:K
for i in 1:4
# index into the correct value in compressed SNP array
Gijk = (G.data[j, k] >> ((i - 1) << 1)) & 0x03
# index into the corresponding value in x
xval = x[(j - 1) << 2 + i]
# assume ADDITIVE_MODEL is being used
# yk += xval * ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μ[k])
# Branch-free version
yk += xval * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k])
end
end
y[k] = yk
end
if rem > 0
@turbo thread=true for k in axes(G, 2)
yk = zero(T)
for i in 1:rem
Gijk = (G.data[K + 1, k] >> ((i - 1) << 1)) & 0x03
xval = x[K << 2 + i]
# yk += xval * ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μ[k])
# Branch-free version
yk += xval * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k])
end
y[k] += yk
end
end
@turbo for j in axes(G, 2)
y[j] = y[j] * σinv[j]
end
return y
end
function Gt_mul_x_chunk!(
y::AbstractVector{T},
G::AbstractMatrix{UInt8},
x::AbstractVector{Tx},
μ::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
@turbo thread=true for k in axes(G, 2)
yk = zero(T)
for j in axes(G, 1)
for i in 1:4
# index into the correct value in compressed SNP array
Gijk = (G[j, k] >> ((i - 1) << 1)) & 0x03
# index into the corresponding value in x
xval = x[(j - 1) << 2 + i]
# assume ADDITIVE_MODEL is being used
# yk += xval * ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μ[k])
# Branch-free version
# * almost identical performance to ifelse
# * (Gijk >= 0x02) * (Gijk - 0x01) should compile to
# saturating arithmetic, which has a built-in LLVM function call
yk += xval * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k])
end
end
y[k] += yk
end
end
function Gt_mul_x_tiled!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T};
chunkwidth::Int = 1024
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
mpacked = size(G.data, 1)
packedchunkwidth = chunkwidth >> 2
# need K and rem to handle remainder rows
K, rem = divrem(m, 4)
# Slice up row iteration space
nrowchunks, rowchunkrem = divrem(m, chunkwidth)
# Slice up col iteration space
ncolchunks, colchunkrem = divrem(d, chunkwidth)
# Initialize output vector to zero,
# * could be modified to agree with 5-argument mul!
fill!(y, zero(T))
@inbounds for col in 1:ncolchunks
colstart = (col - 1) * chunkwidth
_y = view(y, (colstart + 1):(col * chunkwidth))
_μ = view(μ, (colstart + 1):(col * chunkwidth))
for l in 1:nrowchunks
rowstart = (l - 1) * chunkwidth
packedrowstart = (l - 1) * packedchunkwidth
_x = view(x, (rowstart + 1):(l * chunkwidth))
_G_data = view(G.data, (packedrowstart + 1):(packedrowstart + packedchunkwidth), (colstart + 1):(col * chunkwidth))
Gt_mul_x_chunk!(_y, _G_data, _x, _μ)
end
if rowchunkrem > 0
rowstart = nrowchunks * chunkwidth
packedrowstart = nrowchunks * packedchunkwidth
# If `m` is a multiple of 4, can just load the entire remainder view
if rem == 0
_x = view(x, (rowstart + 1):m)
_G_data = view(G.data, (packedrowstart + 1):mpacked, (colstart + 1):(col * chunkwidth))
Gt_mul_x_chunk!(_y, _G_data, _x, _μ)
else
# Taking a view excluding the remainer rows of y and G
_x = view(x, (rowstart + 1):(K << 2))
_G_data = view(G.data, (packedrowstart + 1):K, (colstart + 1):(col * chunkwidth))
Gt_mul_x_chunk!(_y, _G_data, _x, _μ)
# Handling remainder rows
@turbo thread=true for k in 1:chunkwidth
yk = zero(T)
for i in 1:rem
Gijk = (G.data[K + 1, colstart + k] >> ((i - 1) << 1)) & 0x03
xval = x[K << 2 + i]
yk += xval * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * _μ[k])
end
_y[k] += yk
end
end
end
end
if colchunkrem > 0
# TODO: Handle remainder columns
nothing
end
@turbo for j in axes(G, 2)
y[j] = y[j] * σinv[j]
end
return y
end
function G_mul_x!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
K, rem = divrem(m, 4)
fill!(y, zero(T))
# Parallelize over the columns of G
@turbo thread=true for k in axes(G, 2)
for j in 1:K
for i in 1:4
# index into the correct value in compressed SNP array
Gijk = (G.data[j, k] >> ((i - 1) << 1)) & 0x03
# assume ADDITIVE_MODEL is being used
# y[(j - 1) << 2 + i] += x[k] * ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μ[k]) * σinv[k]
y[(j - 1) << 2 + i] += x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k]) * σinv[k]
end
end
end
if rem > 0
@turbo thread=true for k in axes(G, 2)
for i in 1:rem
Gijk = (G.data[K + 1, k] >> ((i - 1) << 1)) & 0x03
# y[K << 2 + i] += x[k] * ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μ[k]) * σinv[k]
y[K << 2 + i] += x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k]) * σinv[k]
end
end
end
return y
end
function G_mul_x_chunk!(
y::AbstractVector{T},
G::AbstractMatrix{UInt8},
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
@turbo thread=true for k in axes(G, 2)
for j in axes(G, 1)
for i in 1:4
# index into the correct value in compressed SNP array
Gijk = (G[j, k] >> ((i - 1) << 1)) & 0x03
y[((j - 1) << 2) + i] += x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k]) * σinv[k]
end
end
end
end
function G_mul_x_tiled!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T};
chunkwidth::Int = 1024
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
mpacked = size(G.data, 1)
packedchunkwidth = chunkwidth >> 2
# need K and rem to handle remainder rows
K, rem = divrem(m, 4)
# Slice up row iteration space
nrowchunks, rowchunkrem = divrem(m, chunkwidth)
# Slice up col iteration space
ncolchunks, colchunkrem = divrem(d, chunkwidth)
# Initialize output vector to zero,
# * could be modified to agree with 5-argument mul!
fill!(y, zero(T))
# multithreading only on inner tile loop avoids pointer allocations using @turbo thread=true
# a more sophisticated way might use buffers and multi-level parallelism
@inbounds for col in 1:ncolchunks
colstart = (col - 1) * chunkwidth
_x = view(x, (colstart + 1):(col * chunkwidth))
_μ = view(μ, (colstart + 1):(col * chunkwidth))
_σinv = view(σinv, (colstart + 1):(col * chunkwidth))
for l in 1:nrowchunks
rowstart = (l - 1) * chunkwidth
packedrowstart = (l - 1) * packedchunkwidth
_y = view(y, (rowstart + 1):(l * chunkwidth))
_G_data = view(G.data, (packedrowstart + 1):(packedrowstart + packedchunkwidth), (colstart + 1):(col * chunkwidth))
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
end
if rowchunkrem > 0
rowstart = nrowchunks * chunkwidth
packedrowstart = nrowchunks * packedchunkwidth
# If `m` is a multiple of 4, can just load the entire remainder view
if rem == 0
_y = view(y, (rowstart + 1):m)
_G_data = view(G.data, (packedrowstart + 1):mpacked, (colstart + 1):(col * chunkwidth))
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
else
# TODO: pretty ugly, is there a better/less verbose way of handling remainders?
# Taking a view excluding the remainer rows of y and G
_y = view(y, (rowstart + 1):(K << 2))
_G_data = view(G.data, (packedrowstart + 1):K, (colstart + 1):(col * chunkwidth))
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
# Handling remainder rows
@turbo thread=true for k in 1:chunkwidth
for i in 1:rem
# index into the correct value in compressed SNP array
Gijk = (G.data[K + 1, colstart + k] >> ((i - 1) << 1)) & 0x03
y[K << 2 + i] += _x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * _μ[k]) * _σinv[k]
end
end
end
end
end
# Handling remainder columns when `d` is not a multiple of `chunkwidth`
@inbounds if colchunkrem > 0
colstart = ncolchunks * chunkwidth
_x = view(x, (colstart + 1):d)
_μ = view(μ, (colstart + 1):d)
_σinv = view(σinv, (colstart + 1):d)
for l in 1:nrowchunks
packedrowstart = (l - 1) * packedchunkwidth
_y = view(y, ((l - 1) * chunkwidth + 1):(l * chunkwidth))
_G_data = view(G.data, (packedrowstart + 1):(packedrowstart + packedchunkwidth), (colstart + 1):d)
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
end
if rowchunkrem > 0
packedrowstart = nrowchunks * packedchunkwidth
rowstart = nrowchunks * chunkwidth
if rem == 0
_y = view(y, (rowstart + 1):m)
_G_data = view(G.data, (packedrowstart + 1):mpacked, (colstart + 1):d)
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
else
_y = view(y, (rowstart + 1):(K << 2))
_G_data = view(G.data, (packedrowstart + 1):K, (colstart + 1):d)
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
# Handling remainder rows
@turbo thread=true for k in eachindex(_x)
for i in 1:rem
# index into the correct value in compressed SNP array
Gijk = (G.data[K + 1, colstart + k] >> ((i - 1) << 1)) & 0x03
y[K << 2 + i] += _x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * _μ[k]) * _σinv[k]
end
end
end
end
end
return y
end
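The branch-free decode is easy to get wrong, so here is a quick standalone check (a throwaway snippet, not part of the functions above) that it matches the commented-out ifelse form for all four 2-bit codes:
let T = Float64, μk = 0.37  # arbitrary column mean
    for Gijk in (0x00, 0x01, 0x02, 0x03)
        # reference: explicit ifelse; missing (0x01) contributes zero after centering
        branchy    = ifelse(isone(Gijk), zero(T), T((Gijk >= 0x02) * (Gijk - 0x01)) - μk)
        # branch-free version used in the kernels above
        branchfree = T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μk
        @assert branchy == branchfree
    end
end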
And setting up data for simulation:
EUR = SnpArray(SnpArrays.datadir("EUR_subset.bed"))
EUR_5 = [EUR; EUR; EUR; EUR; EUR]
EUR_5_5 = [EUR_5 EUR_5 EUR_5 EUR_5 EUR_5]
G = EUR_5_5;
n, q = size(G)
# Second row of `columncounts` indicates number of missing
G_counts = counts(G, dims = 1)
n_nonmissing = n .- G_counts[2, :]
# Allele Frequencies
ρ = (G_counts[3, :] + 2 * G_counts[4, :]) ./ (2 * n_nonmissing)
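# Under Hardy-Weinberg equilibrium the allele count in each column is Binomial(2, ρ),
# so its mean is 2ρ and its variance is 2ρ(1 - ρ), which give μ and σ below.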
# Expected Values
μ = 2 * ρ
# Standard Deviations
σ = sqrt.(2 .* (1 .- ρ) .* ρ)
σinv = inv.(σ)
# Truncating the columns to a power of 2
m = n # 1895
d = 1 << 18 # 262_144
G_ = SnpArray(undef, m, d);
copyto!(G_.data, view(G.data, :, 1:d));
ρ_ = ρ[1:d]
μ_ = μ[1:d]
σinv_ = σinv[1:d]
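Before timing anything, a quick agreement check against SnpLinAlg (my own sanity check; it assumes SnpLinAlg's internal centering and scaling match the μ_ and σinv_ computed above):
using LinearAlgebra
Gsla_check = SnpLinAlg{Float64}(G_; model=ADDITIVE_MODEL, center=true, scale=true, impute=true);
xtest = randn(MersenneTwister(2023), m);
ytest = Vector{Float64}(undef, d);
yref = similar(ytest);
Gt_mul_x!(ytest, G_, xtest, μ_, σinv_);
mul!(yref, transpose(Gsla_check), xtest);
# relative difference should be near machine precision if the conventions agree
norm(ytest - yref) / norm(yref)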
First, we can look at $\mathbf{G}^\top \mathbf{x}$,
rng = MersenneTwister(1234)
xvec = rand(rng, (-1.0, 1.0), m);
resq = Vector{Float64}(undef, d);
@benchmark Gt_mul_x!($resq, $G_, $xvec, $μ_, $σinv_) evals=1
BenchmarkTools.Trial: 120 samples with 1 evaluation.
Range (min … max): 38.016 ms … 117.112 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 39.044 ms ┊ GC (median): 0.00%
Time (mean ± σ): 41.954 ms ± 10.720 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
██▂
████▇▅▁▅▁▅▅▆▁▅▁▁▁▁▁▅▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▁▅ ▅
38 ms Histogram: log(frequency) by time 91.6 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
resq2 = similar(resq);
@benchmark Gt_mul_x_tiled!($resq2, $G_, $xvec, $μ_, $σinv_) evals=1
BenchmarkTools.Trial: 109 samples with 1 evaluation.
Range (min … max): 40.673 ms … 58.873 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 45.255 ms ┊ GC (median): 0.00%
Time (mean ± σ): 45.881 ms ± 4.235 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▂█
██▆▇▅▁▃▃▇▅▆██▇▆█▆▇▆█▇▆▃▇▃▅▃▃▃▁▁▃▃▆▁▃▃▁▃▁▁▁▁▃▃▃▃▁▃▁▁▁▁▁▃▃▁▁▃ ▃
40.7 ms Histogram: frequency by time 58.6 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
then $\mathbf{Gx}$,
xvecq = rand(rng, (-1.0, 1.0), d);
resn = Vector{Float64}(undef, m);
@benchmark G_mul_x!($resn, $G_, $xvecq, $μ_, $σinv_) evals=1
BenchmarkTools.Trial: 16 samples with 1 evaluation.
Range (min … max): 322.233 ms … 391.568 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 327.874 ms ┊ GC (median): 0.00%
Time (mean ± σ): 332.096 ms ± 16.457 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▃ █▃ ▃█
█▇██▁▁▇██▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇ ▁
322 ms Histogram: frequency by time 392 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
resn2 = similar(resn);
@benchmark G_mul_x_tiled!($resn2, $G_, $xvecq, $μ_, $σinv_; chunkwidth=1024) evals=1
BenchmarkTools.Trial: 85 samples with 1 evaluation.
Range (min … max): 54.705 ms … 72.373 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 58.710 ms ┊ GC (median): 0.00%
Time (mean ± σ): 59.256 ms ± 4.076 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ ▂▄ ▃
█▃██▇▅▁▃▃▁▃▅▁▅▅▅▃▅▃▆▆▁▅▃█▃▅▅▃▃▇▃▃▁▁▃▁▁▃▃▁▁▁▁▁▃▁▁▁▁▃▁▃▁▁▁▁▁▅ ▁
54.7 ms Histogram: frequency by time 70.2 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
And finally, looking at the same operations using SnpLinAlg
Gsla_ = SnpLinAlg{Float64}(G_; model=ADDITIVE_MODEL, center=true, scale=true, impute=true);
resq_sla = similar(resq);
@benchmark mul!($resq_sla, transpose($Gsla_), $xvec) evals=1
BenchmarkTools.Trial: 119 samples with 1 evaluation.
Range (min … max): 40.871 ms … 44.785 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 42.233 ms ┊ GC (median): 0.00%
Time (mean ± σ): 42.141 ms ± 589.852 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▅█▅
▃▁▁▃▃▁▃▄▇▄▄▄▄▄▅▄▅▇▅▄▁▅▅▅▆████▇▅▃▃▁▄▄▁▃▁▁▃▁▁▁▃▁▁▁▁▁▁▃▁▁▁▃▁▁▃▃ ▃
40.9 ms Histogram: frequency by time 44 ms <
Memory estimate: 103.50 KiB, allocs estimate: 1428.
resn_sla = similar(resn);
@benchmark mul!($resn_sla, $Gsla_, $xvecq) evals=1
BenchmarkTools.Trial: 16 samples with 1 evaluation.
Range (min … max): 321.255 ms … 325.775 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 321.770 ms ┊ GC (median): 0.00%
Time (mean ± σ): 322.199 ms ± 1.182 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█▁ ▁█▁▁▁▁▁ ▁ ▁ ▁ ▁ ▁
██▁███████▁█▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
321 ms Histogram: frequency by time 326 ms <
Memory estimate: 221.59 KiB, allocs estimate: 3268.
For $\mathbf{G}^\top \mathbf{x}$, tiling provides no improvement over just using @turbo thread=true, while SnpLinAlg carries substantial overhead from Base threads and tiled views. For $\mathbf{Gx}$, however, there is more than a 5x performance improvement from tiling sequentially and limiting multithreading to a single tile computation. Even better, by not parallelizing over tiles, we avoid all heap allocations.
I suspect we might see a similar improvement for $\mathbf{G}^\top \mathbf{X}_1$ and $\mathbf{G}\mathbf{X}_2$, where $\mathbf{X}_1$ and $\mathbf{X}_2$ are matrices, but the implementation might be tedious given the tile remainder chunks.
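As a stopgap, the vector kernels could simply be mapped over the columns of $\mathbf{X}_1$. A naive sketch of my own (the name Gt_mul_X_tiled! is hypothetical, and a real implementation would tile over all three dimensions):
# Naive matrix version: apply the transposed-matvec kernel to each column of X.
function Gt_mul_X_tiled!(
    Y::AbstractMatrix{T},
    G::SnpArray,
    X::AbstractMatrix{Tx},
    μ::AbstractVector{T},
    σinv::AbstractVector{T};
    chunkwidth::Int = 1024
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
    size(X, 2) == size(Y, 2) || throw(DimensionMismatch("X and Y need the same number of columns"))
    for j in axes(X, 2)
        Gt_mul_x_tiled!(view(Y, :, j), G, view(X, :, j), μ, σinv; chunkwidth=chunkwidth)
    end
    return Y
end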
I ran these simulations on my M1 Max MacBook; please let me know if the improvement is comparable on other hardware. It is possible that something like a 32-core Xeon server would show different behavior.
Thanks for the report. How many threads were used?
I wrote the matrix-vector multiplication three years ago, and there could be room for improvement. I will look into the GX part. Something that sounds counterintuitive to me is that if multiple threads work on the same tile, there could be some cache conflict, but that doesn't seem to be the case here. Let's check these parts in other environments.
When I read the code today, I see things I could have done differently. For example, recursive tiling strategies would have made code maintenance easier than the current fixed-size block approach (at the cost of slightly worse performance). The machinery based on Tullio.jl is in the OpenADMIXTURE package's loop.jl file.
Sorry for interjecting 2 questions here.
@BrendonChau, just want to confirm: are you saying that G_mul_x_tiled! is 5x faster than SnpLinAlg because SnpLinAlg has an extra parallel layer? I thought it was okay to have multiple layers of parallelization as long as it is not nested @threads.
Also, can you explain a bit more on this:
Even better, by not parallelizing over tiles, we avoid all heap allocations.
where did the heap allocations come from? I'm asking because previously I also observed lots of allocations but I couldn't figure out where they came from.
@kose-y I ran it with 8 threads on my local machine.
julia> versioninfo()
Julia Version 1.9.2
Commit e4ee485e90 (2023-07-05 09:39 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin22.4.0)
CPU: 10 × Apple M1 Max
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
Threads: 8 on 8 virtual cores
Environment:
JULIA_EDITOR = code
JULIA_NUM_THREADS = 8
@biona001 I believe the allocations come from Julia's own resource management when using Threads.@threads and from assignment into array views. Since we must ensure thread-safety, my understanding is that GC.@preserve happens to allocate for array views. Using @tturbo or @turbo thread=true is non-allocating for a single tile operation.
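To illustrate with something self-contained (a toy dense-matrix analogue of my own, not SnpArrays code), moving the threading from @turbo to Threads.@threads over views in an otherwise identical transposed-matvec kernel makes the task spawning and the captured views show up as allocations:
using BenchmarkTools, LoopVectorization

# Column-wise Aᵀx kernel: each output entry is independent, so both
# threading strategies are race-free and compute the same result.
function atx_turbo!(y::Vector{T}, A::Matrix{T}, x::Vector{T}) where {T}
    @turbo thread=true for j in axes(A, 2)
        yj = zero(T)
        for i in axes(A, 1)
            yj += A[i, j] * x[i]
        end
        y[j] = yj
    end
    return y
end

function atx_threads!(y::Vector{T}, A::Matrix{T}, x::Vector{T}) where {T}
    nt = Threads.nthreads()
    chunk = cld(size(A, 2), nt)
    Threads.@threads for c in 1:nt
        cols = ((c - 1) * chunk + 1):min(c * chunk, size(A, 2))
        _y = view(y, cols)
        _A = view(A, :, cols)
        @turbo for j in axes(_A, 2)
            yj = zero(T)
            for i in axes(_A, 1)
                yj += _A[i, j] * x[i]
            end
            _y[j] = yj
        end
    end
    return y
end

Atoy = rand(4096, 4096); xtoy = rand(4096); ytoy = zeros(4096);
@btime atx_turbo!($ytoy, $Atoy, $xtoy);    # 0 bytes expected
@btime atx_threads!($ytoy, $Atoy, $xtoy);  # task creation and captured views allocate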
I did some more investigating and it seems that my implementation for $\mathbf{Gx}$ is only faster for 'wide' SnpArrays. For 'square' or 'tall' SnpArrays I actually see a small performance regression relative to SnpLinAlg, which is still mysterious to me.
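One way to pin down where the crossover happens (a sketch I have not benchmarked as-is; SnpArrays.simulate! is from my fork, as used later in this thread, and the shapes are only illustrative) is to sweep the aspect ratio at a roughly constant number of genotypes:
# Sweep 'wide', 'square', and 'tall' shapes at m * d ≈ 2.7e8 genotypes and
# compare the tiled kernel against SnpLinAlg for Gx.
rng_shape = MersenneTwister(2024)
for (m_s, d_s) in ((4_096, 65_536), (16_384, 16_384), (65_536, 4_096))
    MAFs_s = 0.01 .+ 0.49 .* rand(rng_shape, d_s)
    G_s = SnpArrays.simulate!(rng_shape, m_s, d_s, MAFs_s)
    cnts = counts(G_s, dims = 1)
    ρ_s = (cnts[3, :] + 2 * cnts[4, :]) ./ (2 .* (m_s .- cnts[2, :]))
    μ_s = 2 * ρ_s
    σinv_s = inv.(sqrt.(2 .* ρ_s .* (1 .- ρ_s)))
    x_s = rand(rng_shape, (-1.0, 1.0), d_s)
    y_s = Vector{Float64}(undef, m_s)
    Gsla_s = SnpLinAlg{Float64}(G_s; model=ADDITIVE_MODEL, center=true, scale=true, impute=true)
    println("shape = ", (m_s, d_s))
    @btime G_mul_x_tiled!($y_s, $G_s, $x_s, $μ_s, $σinv_s) evals=1
    @btime mul!($y_s, $Gsla_s, $x_s) evals=1
end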
@BrendonChau While I haven't had a chance to look into it yet, that certainly makes sense to me. The previous example just wasn't "tall" enough; perhaps we can change the behavior in that particular case. Allocations from multithreading didn't really seem major there; how is it with large arrays?
I modified my functions so that remainder handling is less verbose. I also wrote a version of $\mathbf{Gx}$ that uses Threads.@threads to parallelize over the tiles in a column; this allocates more than multithreading within each tile, but it is faster for large SnpArrays, being consistently faster than SnpLinAlg. Also, for the chunk calculations, I use Base.Cartesian.@nexprs to directly unroll the innermost loop, which is about 20-30% faster than just letting @turbo handle it.
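For reference, @nexprs N i -> expr simply pastes N copies of expr with i replaced by the literals 1 through N (renaming identifiers such as G_i_jk to G_1_jk, and so on), so the generated code is the same as writing the four statements out by hand. A toy illustration (not the package kernel) on a single packed byte:
using Base.Cartesian

# Unpack four 2-bit genotypes from one byte with @nexprs ...
function unpack_nexprs(byte::UInt8)
    @nexprs 4 i -> g_i = (byte >> ((i - 1) * 2)) & 0x03
    return (g_1, g_2, g_3, g_4)
end

# ... and the hand-unrolled equivalent of what the macro generates.
unpack_manual(byte::UInt8) =
    (byte & 0x03, (byte >> 2) & 0x03, (byte >> 4) & 0x03, (byte >> 6) & 0x03)

@assert unpack_nexprs(0b11100100) == unpack_manual(0b11100100) == (0x00, 0x01, 0x02, 0x03)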
using StatsBase, Statistics, Plots, LinearAlgebra, BenchmarkTools, Random
using SnpArrays, DelimitedFiles, Glob
using Plots, LoopVectorization, Polyester
function G_mul_x_chunk!(
y::AbstractVector{T},
G::AbstractMatrix{UInt8},
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
@turbo thread=true for k in axes(G, 2)
for j in axes(G, 1)
# for i in 1:4
# # index into the correct value in compressed SNP array
# Gijk = (G[j, k] >> ((i - 1) << 1)) & 0x03
# y[((j - 1) << 2) + i] += x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k]) * σinv[k]
# end
# Macro directly unrolls the i ∈ 1:4 loop, ~20-30% faster
Base.Cartesian.@nexprs 4 i -> begin
G_i_jk = (G[j, k] >> ((i - 1) * 2)) & 0x03
y[((j - 1) << 2) + i] += x[k] * (T((G_i_jk >= 0x02) * (G_i_jk - 0x01)) - !isone(G_i_jk) * μ[k]) * σinv[k]
end
end
end
end
function G_mul_x_tiled!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T};
chunkwidth::Int = 1024
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
mpacked = size(G.data, 1)
packedchunkwidth = chunkwidth >> 2
# need K and rem to handle remainder rows
K, rem = divrem(m, 4)
# Slice up row iteration space
nrowchunks, rowchunkrem = divrem(m, chunkwidth)
# if remainder is nonzero, number of rowslices is nrowchunks + 1
rowchunkiters = rowchunkrem == 0 ? nrowchunks : nrowchunks + 1
# Slice up col iteration space
ncolchunks, colchunkrem = divrem(d, chunkwidth)
# if remainder is nonzero, number of colslices is ncolchunks + 1
colchunkiters = colchunkrem == 0 ? ncolchunks : ncolchunks + 1
# Initialize output vector to zero,
# * could be modified to agree with 5-argument mul!
fill!(y, zero(T))
# multithreading only on inner tile loop avoids pointer allocations using @turbo thread=true
# a more sophisticated way might use buffers and multi-level parallelism
@inbounds for col in 1:colchunkiters
colstart = (col - 1) * chunkwidth
colstop = col <= ncolchunks ? col * chunkwidth : d
_x = view(x, (colstart + 1):colstop)
_μ = view(μ, (colstart + 1):colstop)
_σinv = view(σinv, (colstart + 1):colstop)
for l in 1:rowchunkiters
rowstart = (l - 1) * chunkwidth
rowstop = l <= nrowchunks ? l * chunkwidth : min(m, K << 2)
packedrowstart = (l - 1) * packedchunkwidth
packedrowstop = l <= nrowchunks ? l * packedchunkwidth : min(mpacked, K)
_y = view(y, (rowstart + 1):rowstop)
_G_data = view(G.data, (packedrowstart + 1):packedrowstop, (colstart + 1):colstop)
G_mul_x_chunk!(_y, _G_data, _x, _μ, _σinv)
end
if rem > 0
width = col <= ncolchunks ? chunkwidth : colchunkrem
@turbo thread=true for k in 1:width
for i in 1:rem
# index into the correct value in compressed SNP array
Gijk = (G.data[K + 1, colstart + k] >> ((i - 1) << 1)) & 0x03
y[K << 2 + i] += _x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * _μ[k]) * _σinv[k]
end
end
end
end
return y
end
function G_mul_x_chunk_2!(
y::AbstractVector{T},
G::AbstractMatrix{UInt8},
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T}
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
@turbo thread=false for k in axes(G, 2)
for j in axes(G, 1)
# for i in 1:4
# # index into the correct value in compressed SNP array
# Gijk = (G[j, k] >> ((i - 1) << 1)) & 0x03
# y[((j - 1) << 2) + i] += x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * μ[k]) * σinv[k]
# end
# Macro directly unrolls the i ∈ 1:4 loop, ~20-30% faster
Base.Cartesian.@nexprs 4 i -> begin
G_i_jk = (G[j, k] >> ((i - 1) * 2)) & 0x03
y[((j - 1) << 2) + i] += x[k] * (T((G_i_jk >= 0x02) * (G_i_jk - 0x01)) - !isone(G_i_jk) * μ[k]) * σinv[k]
end
end
end
end
function G_mul_x_tiled_2!(
y::AbstractVector{T},
G::SnpArray,
x::AbstractVector{Tx},
μ::AbstractVector{T},
σinv::AbstractVector{T};
chunkwidth::Int = 1024
) where {T<:AbstractFloat, Tx<:Union{Signed, AbstractFloat}}
m, d = size(G)
mpacked = size(G.data, 1)
packedchunkwidth = chunkwidth >> 2
# need K and rem to handle remainder rows
K, rem = divrem(m, 4)
# Slice up row iteration space
nrowchunks, rowchunkrem = divrem(m, chunkwidth)
# if remainder is nonzero, number of rowslices is nrowchunks + 1
rowchunkiters = rowchunkrem == 0 ? nrowchunks : nrowchunks + 1
# Slice up col iteration space
ncolchunks, colchunkrem = divrem(d, chunkwidth)
# if remainder is nonzero, number of colslices is ncolchunks + 1
colchunkiters = colchunkrem == 0 ? ncolchunks : ncolchunks + 1
# Initialize output vector to zero,
# * could be modified to agree with 5-argument mul!
fill!(y, zero(T))
@inbounds for col in 1:colchunkiters
colstart = (col - 1) * chunkwidth
colstop = col <= ncolchunks ? col * chunkwidth : d
_x = view(x, (colstart + 1):colstop)
_μ = view(μ, (colstart + 1):colstop)
_σinv = view(σinv, (colstart + 1):colstop)
# multi-threading on inside loop should be safe since rowchunks
# assign into disjoint slices of y, dont need to bother with
# @sync and Threads.@spawn
Threads.@threads for l in 1:rowchunkiters
rowstart = (l - 1) * chunkwidth
rowstop = l <= nrowchunks ? l * chunkwidth : min(m, K << 2)
packedrowstart = (l - 1) * packedchunkwidth
packedrowstop = l <= nrowchunks ? l * packedchunkwidth : min(mpacked, K)
_y = view(y, (rowstart + 1):rowstop)
_G_data = view(G.data, (packedrowstart + 1):packedrowstop, (colstart + 1):colstop)
G_mul_x_chunk_2!(_y, _G_data, _x, _μ, _σinv)
if l > nrowchunks && rem > 0
width = col <= ncolchunks ? chunkwidth : colchunkrem
@turbo thread=false for k in 1:width
for i in 1:rem
# index into the correct value in compressed SNP array
Gijk = (G.data[K + 1, colstart + k] >> ((i - 1) << 1)) & 0x03
y[K << 2 + i] += _x[k] * (T((Gijk >= 0x02) * (Gijk - 0x01)) - !isone(Gijk) * _μ[k]) * _σinv[k]
end
end
end
end
end
return y
end
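Before the timings, a quick agreement check between the two tiling strategies on a deliberately awkward shape, with neither dimension a multiple of chunkwidth (a throwaway check of mine; it again relies on simulate! from my fork and does not exercise the rem > 0 path):
let rng = MersenneTwister(99), m = 2_000, d = 3_007, cw = 512
    MAFs = 0.01 .+ 0.49 .* rand(rng, d)
    Gsmall = SnpArrays.simulate!(rng, m, d, MAFs)
    cnts = counts(Gsmall, dims = 1)
    ρs = (cnts[3, :] + 2 * cnts[4, :]) ./ (2 .* (m .- cnts[2, :]))
    μs = 2 * ρs
    σinvs = inv.(sqrt.(2 .* ρs .* (1 .- ρs)))
    xs = rand(rng, (-1.0, 1.0), d)
    y1 = Vector{Float64}(undef, m)
    y2 = similar(y1)
    G_mul_x_tiled!(y1, Gsmall, xs, μs, σinvs; chunkwidth=cw)
    G_mul_x_tiled_2!(y2, Gsmall, xs, μs, σinvs; chunkwidth=cw)
    @assert isapprox(y1, y2; rtol = 1e-10)
end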
First the smaller simulation: G is 65_667 by 131_072.
rng = MersenneTwister(1234)
# SnpArrays.simulate! is from my fork
m = 1_024 * 64 + 128 + 3
q = 1_024 * 128
minfreq = 0.01
MAFs = minfreq .+ (0.5 - minfreq) .* rand(rng, q)
G = SnpArrays.simulate!(rng, m, q, MAFs)
# Second row of `columncounts` indicates number of missing
G_counts = counts(G, dims = 1)
m_nonmissing = m .- G_counts[2, :]
# Allele Frequencies
ρ = (G_counts[3, :] + 2 * G_counts[4, :]) ./ (2 * m_nonmissing)
# Expected Values
μ = 2 * ρ
# Standard Deviations
σ = sqrt.(2 .* (1 .- ρ) .* ρ)
σinv = inv.(σ)
# Testing Gx
xvecq = rand(rng, (-1.0, 1.0), q);
resm = Vector{Float64}(undef, m);
resm2 = similar(resm);
@benchmark G_mul_x_tiled!($resm2, $G, $xvecq, $μ, $σinv; chunkwidth=1024) evals=1
BenchmarkTools.Trial: 7 samples with 1 evaluation.
Range (min … max): 713.627 ms … 819.101 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 740.464 ms ┊ GC (median): 0.00%
Time (mean ± σ): 750.719 ms ± 36.066 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █ █ █ █ █
█▁▁▁▁▁▁█▁█▁▁▁▁▁█▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
714 ms Histogram: frequency by time 819 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
resm3 = similar(resm);
@benchmark G_mul_x_tiled_2!($resm3, $G, $xvecq, $μ, $σinv; chunkwidth=1024) evals=1
BenchmarkTools.Trial: 7 samples with 1 evaluation.
Range (min … max): 707.115 ms … 759.956 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 716.626 ms ┊ GC (median): 0.00%
Time (mean ± σ): 721.859 ms ± 17.641 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ ██ █ █ █
█▁▁▁█▁▁▁▁██▁▁█▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
707 ms Histogram: frequency by time 760 ms <
Memory estimate: 828.09 KiB, allocs estimate: 6403.
# Compare to SnpLinAlg
Gsla_ = SnpLinAlg{Float64}(G; model=ADDITIVE_MODEL, center=true, scale=true, impute=true);
resm_sla = similar(resm);
@benchmark mul!($resm_sla, $Gsla_, $xvecq) evals=1
BenchmarkTools.Trial: 7 samples with 1 evaluation.
Range (min … max): 747.357 ms … 765.554 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 750.175 ms ┊ GC (median): 0.00%
Time (mean ± σ): 752.828 ms ± 6.519 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █ █ █ █ █
█▁█▁▁▁█▁▁█▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
747 ms Histogram: frequency by time 766 ms <
Memory estimate: 1.53 MiB, allocs estimate: 24069.
Then, the larger simulation: G is 131_203 by 262_144.
rng = MersenneTwister(1234)
# SnpArrays.simulate! is from my fork
m = 1_024 * 128 + 128 + 3
q = 1_024 * 256
minfreq = 0.01
MAFs = minfreq .+ (0.5 - minfreq) .* rand(rng, q)
G = SnpArrays.simulate!(rng, m, q, MAFs)
# Second row of `columncounts` indicates number of missing
G_counts = counts(G, dims = 1)
m_nonmissing = m .- G_counts[2, :]
# Allele Frequencies
ρ = (G_counts[3, :] + 2 * G_counts[4, :]) ./ (2 * m_nonmissing)
# Expected Values
μ = 2 * ρ
# Standard Deviations
σ = sqrt.(2 .* (1 .- ρ) .* ρ)
σinv = inv.(σ)
# Testing Gx
xvecq = rand(rng, (-1.0, 1.0), q);
resm = Vector{Float64}(undef, m);
resm2 = similar(resm);
@benchmark G_mul_x_tiled!($resm2, $G, $xvecq, $μ, $σinv; chunkwidth=1024) evals=1
BenchmarkTools.Trial: 2 samples with 1 evaluation.
Range (min … max): 3.212 s … 3.321 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 3.267 s ┊ GC (median): 0.00%
Time (mean ± σ): 3.267 s ± 76.880 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
3.21 s Histogram: frequency by time 3.32 s <
Memory estimate: 0 bytes, allocs estimate: 0.
resm3 = similar(resm);
@benchmark G_mul_x_tiled_2!($resm3, $G, $xvecq, $μ, $σinv; chunkwidth=1024) evals=1
BenchmarkTools.Trial: 2 samples with 1 evaluation.
Range (min … max): 2.708 s … 2.753 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.730 s ┊ GC (median): 0.00%
Time (mean ± σ): 2.730 s ± 31.661 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
2.71 s Histogram: frequency by time 2.75 s <
Memory estimate: 1.62 MiB, allocs estimate: 12816.
# Compare to SnpLinAlg
Gsla_ = SnpLinAlg{Float64}(G; model=ADDITIVE_MODEL, center=true, scale=true, impute=true);
resm_sla = similar(resm);
@benchmark mul!($resm_sla, $Gsla_, $xvecq) evals=1
BenchmarkTools.Trial: 2 samples with 1 evaluation.
Range (min … max): 2.970 s … 2.979 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.974 s ┊ GC (median): 0.00%
Time (mean ± σ): 2.974 s ± 6.346 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
2.97 s Histogram: frequency by time 2.98 s <
Memory estimate: 5.83 MiB, allocs estimate: 93187.
In the larger simulation G_mul_x_tiled! is no longer performant, but G_mul_x_tiled_2! manages to keep up. Allocations are still relatively small for both G_mul_x_tiled_2! and SnpLinAlg. The performance gap is now closer to a 10% improvement, so not particularly impressive. For larger SnpArrays, the bottleneck ends up being memory latency, as we would expect. I could pick a better chunkwidth, like 128 or 192, to make sure all working memory can fit in the L1 cache for each core, but that would only be marginally beneficial for large arrays.
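For reference, the per-tile working set is easy to estimate (my own back-of-the-envelope, counting only the packed genotype block and the Float64 chunks of y, x, μ, and σinv that a square tile touches):
# Bytes touched by one chunkwidth × chunkwidth tile:
# (chunkwidth ÷ 4) × chunkwidth packed genotype bytes plus four Float64 chunks.
tile_bytes(cw) = (cw >> 2) * cw + 4 * cw * sizeof(Float64)

tile_bytes(1024) ÷ 1024  # 288 KiB, larger than a typical per-core L1
tile_bytes(192)  ÷ 1024  # 15 KiB
tile_bytes(128)  ÷ 1024  # 8 KiB, comfortably inside L1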
Which one do you think affects the performance more: parallelizing over the tiles in a column, or the loop unrolling via Base.Cartesian.@nexprs?
I'm fairly confident that parallelization has a much larger impact than the loop unrolling, at least for large SnpArrays.
I need some clarification, as there is already parallelization happening in the current package. So do you mean that limiting the parallelization to a single column has more impact than the 20~30% acceleration from the loop unrolling?