LoopVectorization.jl
LoopVectorization.jl copied to clipboard
Packed Array?
I'm working on a packed array type, where multiple values are packed in a single byte like BitArray
, but each element can be two or four bits. Each element is considered a UInt8
value.
const _msk8 = ~UInt8(0)
@inline _msk_end(l::Int) = _msk8 >>> _mod8(-l)
@inline _div(l, lgbits_per_entry) = l >> (3 - lgbits_per_entry)
@inline _div8(l) = l >> 3
@inline _mod(l, lgbits_per_entry) = l & _msk_end(3 - lgbits_per_entry)
@inline _mod8(l) = l & 0x07
@inline _blsr(x)= x & (x-1) #zeros the last set bit. Has native instruction on many archs. needed in multidimensional.jl
@inline num_chunks(n::Int, lgbits_per_entry::Int) = _div8((n << lgbits_per_entry) + 7)
mutable struct BitsArray{N, B} <: AbstractArray{UInt8, N}
chunks::Array{UInt8,N}
len::Int
dims::NTuple{N,Int}
end
# B: log2(bits per entry)
function BitsArray{N, B}(::UndefInitializer, dims::Vararg{Int,N}) where {N, B}
@assert B ∈ (0, 1, 2)
n = 1
i = 1
nc = 1
dim1 = 0
length(dims) >= 1 && (dim1 = num_chunks(dims[1], B))
for d in dims
d >= 0 || throw(ArgumentError("dimension size must be ≥ 0, got $d for dimension $i"))
n *= d
if i == 1
nc *= dim1
else
nc *= d
end
i += 1
end
length(dims) == 0 && (nc = 0)
chunks = Array{UInt8,N}(undef, dim1, dims[2:end]...)
#nc > 0 && (chunks[end] .= UInt8(0))
b = BitsArray{N, B}(chunks, n, dims)
N != 1 && (b.dims = dims)
return b
end
@inline Base.size(x::BitsArray) = x.dims
@inline Base.length(x::BitsArray) = x.len
Base.IndexStyle(::Type{<:BitsArray{N,B}}) where {N, B} = IndexCartesian()
@inline get_chunks_id(i::Int, lgbits_per_entry::Int) = _div(i-1, lgbits_per_entry) + 1, _mod(i-1, lgbits_per_entry)
# note: within-byte index is meant to be 0-based.
function Base.getindex(a::BitsArray{N,B}, I::Vararg{Int, N}) where {N,B}
@boundscheck checkbounds(a, I...)
@inbounds i1, i2 = get_chunks_id(I[1], B)
bits_per_array = 1 << B
u = _msk_end(bits_per_array) << (i2 * (bits_per_array))
@inbounds r = (a.chunks[i1, I[2:end]...] & u) >> (i2 * bits_per_array)
end
The first dimension is to be byte-aligned.
Is there a way for this to work with LoopVectorization? I tried implementing some of the ArrayInterface.jl
methods, but I must be missing something. Maybe it is more complicated with custom getindex function?
using ArrayInterface
ArrayInterface.defines_strides(::Type{<:BitsArray}) = true
ArrayInterface.contiguous_axis(::Type{<:BitsArray}) = ArrayInterface.One()
ArrayInterface.stride_rank(::Type{BitsArray{N,B}}) where {N,B} = ArrayInterface.nstatic(Val(N))
ArrayInterface.contiguous_batch_size(::Type{BitsArray{N,B}}) where {N,B} = ArrayInterface.Zero()
ArrayInterface.dense_dims(::Type{BitsArray{N,B}}) where {N,B} = ArrayInterface._all_dense(Val{N}())
ArrayInterface.parent_type(::Type{<:BitsArray{N, B}}) where {N, B} = Array{UInt8, N}
Base.parent(a::BitsArray) = a.chunks
I get
julia> ba = BitsArray{2,2}(undef, 16, 5);
julia> stridedpointer(ba)
ERROR: conversion to pointer not defined for BitsArray{2, 2}
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:33
[2] unsafe_convert(#unused#::Type{Ptr{UInt8}}, a::BitsArray{2, 2})
@ Base ./pointer.jl:67
[3] pointer(x::BitsArray{2, 2})
@ Base ./abstractarray.jl:1170
[4] memory_reference
@ ~/.julia/dev/LayoutPointers/src/stridedpointers.jl:16 [inlined]
[5] memory_reference
@ ~/.julia/dev/LayoutPointers/src/stridedpointers.jl:14 [inlined]
[6] stridedpointer(A::BitsArray{2, 2})
@ LayoutPointers ~/.julia/dev/LayoutPointers/src/stridedpointers.jl:67
[7] top-level scope
@ REPL[35]:1
However, naively converting to a pointer will result in wrong results (as it wouldn't be using your getindex
function).
So my suggestion would be to follow the approach taken here.
Thank you for your suggestion. I'm trying to follow it, but I might need some help. I will add a comment with what I'm trying soon.