DataStructures.jl icon indicating copy to clipboard operation
DataStructures.jl copied to clipboard

move stack to Vector

Open oscardssmith opened this issue 8 months ago • 7 comments

By my measurements, it's a bunch faster for this workload (this may be new as of Julia 1.11)

julia> function f3(d, n)
           for i in 1:n
               push!(d, i)
               pop!(d)
           end
       end
f3 (generic function with 1 method)

julia> @benchmark f3(d, 1000) evals=1 setup = (d=Vector{Int}())
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  789.000 ns …  10.042 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     797.000 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   823.878 ns ± 158.991 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅█▅▁▅▄           ▂  ▁▄▃                                     ▂ ▁
  ███████▆▆▄▁▁▁▁▁▁▇██▃███▇▇▆▇█▆▆▅▄▄▄▄▄▅▄▅▄▆▇▇▆▆▇▆▆▆▅▅▄▁▄▁▃▃▄▅██ █
  789 ns        Histogram: log(frequency) by time       1.07 μs <

 Memory estimate: 96 bytes, allocs estimate: 1.

julia> @benchmark f3(d, 1000) evals=1 setup = (d=Deque{Int}())
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  3.667 μs …  28.349 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.803 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.930 μs ± 489.972 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▁▆█▄▇▅▃▃▂▁▁▂▅▃ ▁  ▁▁       ▁                                ▂
  ████████████████▇▇██▇▇▆▇█▇███▇▇▆█▇▄▅▄▆▆▆▆▇▇▅▅▅▄▅▅▆▅▄▅▄▄▄▄▆▅ █
  3.67 μs      Histogram: log(frequency) by time      5.66 μs <

oscardssmith avatar Apr 30 '25 21:04 oscardssmith

tests fail

fingolfin avatar May 16 '25 16:05 fingolfin

Since Deque is implemented as an unrolling linked-list, it would be nice to also benchmark inserting all elements followed by removing them all:

function f4(d, n)
    for i in 1:n
        push!(d, i)
    end
    for i in 1:n
        pop!(d)
    end
end

lfenzo avatar Oct 04 '25 22:10 lfenzo

julia> @benchmark f4(d, 1000) evals=1 setup = (d=Vector{Int}())
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  762.000 ns …  1.745 ms  ┊ GC (min … max):  0.00% … 98.73%
 Time  (median):     962.000 ns              ┊ GC (median):     0.00%
 Time  (mean ± σ):     2.022 μs ± 23.323 μs  ┊ GC (mean ± σ):  37.07% ±  4.78%

  ▇█▇▅▅▅▅▅▅▅▄▄▃▃▃▂▂▁▁               ▁▁▂▂▁▂▁▁▁▁▁▁▁              ▂
  ██████████████████████▆▅▅▅▃▆▄▄▁▃▃▇███████████████▇▇▇▇▇▆▇▆▆▅▆ █
  762 ns        Histogram: log(frequency) by time      4.03 μs <

 Memory estimate: 21.80 KiB, allocs estimate: 7.

julia> @benchmark f4(d, 1000) evals=1 setup = (d=Deque{Int}())
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  3.076 μs … 163.643 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.677 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.668 μs ±   2.911 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

    ▃▇▆▅▄▃▂▁▁▁▁    ▁▇█▇▇▅▄▄▂▂▁▂▁                         ▁▁▁  ▃
  ▅▇████████████▇▆████████████████▇▆▆▆▆▅▆▅▅▅▅▄▄▅▁▃▄▄▅▁▄▇█████ █
  3.08 μs      Histogram: log(frequency) by time      4.92 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.

Looks like the Vector version is ~3x faster, but allocates more.

oscardssmith avatar Oct 04 '25 22:10 oscardssmith

I wonder if we shouldn't just deprecate Stack and Queue and tell people to use Deque or Vector depending on need.

oxinabox avatar Oct 06 '25 02:10 oxinabox

Vector isn't a great queue unfortunately (circular queues are generally better since you never have to move data around within the structure)

oscardssmith avatar Oct 06 '25 04:10 oscardssmith

Vector isn't a great queue unfortunately (circular queues are generally better since you never have to move data around within the structure)

Last time i branch marked it (which was pre-1.0) it was shockingly good as a queue

oxinabox avatar Oct 06 '25 07:10 oxinabox

My initial experiments seem to confirm. I threw together the following which should be a lot faster than Vector since it only copies data on resize, but I am not seeing that. I presumably have a bug somewhere.

mutable struct Queue{T}
    mem::Memory{T}
    start::Int
    len::Int
    function Queue{T}() where T
        new{T}(Memory{T}(undef, 2), 0, 0)
    end
end

pow2index(i, size) = 1 + (i & (size-1))

function Base.resize!(q::Queue{T}, newsize) where T
    (;start, len, mem) = q
    memlen = length(mem)
    newsize = nextpow(2, newsize)
    @assert newsize >= len
    newmem = Memory{T}(undef, newsize)
    
    for i in 1:len
        newmem[i] = mem[pow2index(start + i - 1, memlen)]
    end
    q.mem = newmem
    q.start = 0
    q
end

function Base.push!(q::Queue, v)
    (;start, len, mem) = q
    memlen = length(mem)
    if len == memlen
        memlen *= 2
        (;start, mem) = resize!(q, memlen)
    end
    mem[pow2index(start + len, memlen)] = v
    q.len += 1
    v
end

function Base.popfirst!(q::Queue)
    (;start, len, mem) = q
    @assert len != 0
    v = mem[pow2index(start, length(mem))]
    q.start += 1
    q.len -= 1
    v
end

function testqueue1(q, n1, n2)
    for i in 1:n1
        push!(q, i)
    end
    for _ in 1:n2
        v = popfirst!(q)
        push!(q, v)
    end
    popfirst!(q)
end

oscardssmith avatar Oct 07 '25 05:10 oscardssmith