Iterators.jl icon indicating copy to clipboard operation
Iterators.jl copied to clipboard

[WIP] mergesorted

Open dfdx opened this issue 8 years ago • 14 comments

Alternative version of #62. The main difference is that function safenext() and complete() are replaced with one another iterator PrefetchIterator which constantly keeps 1 element in cache, making it available for comparison without actually moving an iterator pointer (see this comment for details why it is needed at all). Compared with #62, this version:

  • is more type stable (assuming type stable iterator)
  • cannot handle empty iterators (since PrefetchIterator reads 1 element during construction)

dfdx avatar Jun 06 '16 19:06 dfdx

Hmm, I now have doubts regarding type stability of this solution as well. Although I never use nothing, Julia still fails to infer types:

julia> it = PrefetchIterator(1:10)
Iterators.PrefetchIterator(1:10)

julia> s = start(it)
Iterators.PrefetchState(1,1,2,1)

julia> @code_warntype next(it, s)
Variables:
  it::Iterators.PrefetchIterator
  state::Iterators.PrefetchState
  new_hd::Any
  new_st::Any
  #s14::Any

Body:
  begin  # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 742:
      NewvarNode(:new_hd)
      NewvarNode(:new_st)
      NewvarNode(symbol("#s14"))
      unless (Iterators.done)((top(getfield))(it::Iterators.PrefetchIterator,:inner)::Any,(top(getfield))(state::Iterators.PrefetchState,:st)::Any)::Any goto 0 # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 744:
      return (top(tuple))((top(getfield))(state::Iterators.PrefetchState,:hd)::Any,$(Expr(:new, :((top(getfield))(Iterators,:PrefetchState)::Type{Iterators.PrefetchState}), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :((top(getfield))(state::Iterators.PrefetchState,:st)), :((top(getfield))(state::Iterators.PrefetchState,:st)))))::Tuple{Any,Iterators.PrefetchState}
      goto 1
      0:  # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 746:
      GenSym(0) = (Iterators.next)((top(getfield))(it::Iterators.PrefetchIterator,:inner)::Any,(top(getfield))(state::Iterators.PrefetchState,:st)::Any)::Any
      #s14 = (top(start))(GenSym(0))::Any
      GenSym(1) = (top(indexed_next))(GenSym(0),1,#s14)::Any
      new_hd = (top(getfield))(GenSym(1),1)::Any
      #s14 = (top(getfield))(GenSym(1),2)::Any
      GenSym(2) = (top(indexed_next))(GenSym(0),2,#s14)::Any
      new_st = (top(getfield))(GenSym(2),1)::Any
      #s14 = (top(getfield))(GenSym(2),2)::Any # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 747:
      return (top(tuple))((top(getfield))(state::Iterators.PrefetchState,:hd)::Any,$(Expr(:new, :((top(getfield))(Iterators,:PrefetchState)::Type{Iterators.PrefetchState}), :(new_hd::_), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :(new_st::_), :((top(getfield))(state::Iterators.PrefetchState,:st)))))::Tuple{Any,Iterators.PrefetchState}
      1: 
  end::Tuple{Any,Iterators.PrefetchState}

Note that new_hd and new_st have type Any and return type is still Tuple{Any,Iterators.PrefetchState}. This is understandable since we don't put any restrictions on a type of input iterator.

Do we have a chance to fix it? If not, does it make sense to introduce nothing back to be able to handle empty iterators?

dfdx avatar Jun 07 '16 07:06 dfdx

I like the idea of a prefetching iterator. I was going to call it PeekIterator, since it lets you peek at the next element without consuming it. I think this is the crucial element that simplifies implementing mergesorted.

You should be able to make it type-stable if you know the type of the state and value of the two inner iterators. You know these types if you call start and next -- but of course, you cannot always call next, and you need to know the type in that case as well.

This might work:

immutable PeekIterator{C}
    c::C   # wrapped collection (inner iterator)
end
immutable PeekIteratorState{S,V}
    s::S   # inner iterator state
    valid::Bool   # whether v is valid
    v::V   # cached value
    PeekIteratorState(s, v) = new(s, true, v)
    PeekIteratorState(s) = new(s, false)   # v remains uninitialized
end

function start{C}(p::PeekIterator{C})
    s = start(p.c)
    d = done(p.c, s)
    if !d
        v, s = next(p.c, s)
    end
    # Due to Julia's weird scoping rules, v remains known to the compiler, but might not be defined
    # Note: if the "if" block above contains a "return" statement, this doesn't work any more
    S = typeof(s)
    V = typeof(v)
    if !d
        PeekIteratorState{S,V}(s, v)
    else
        PeekIteratorState{S,V}(s)
    end
end
function done{C,S,V}(p::PeekIterator{C}, s::PeekIteratorState{S,V})
    !s.valid   # this implies done(p.c, s.s)
end
function next{C,S,V}(p::PeekIterator{C}, s::PeekIteratorState{S,V})
    @assert s.valid
    if done(p.c, s.s)
        # inner iterator is done; return one more value
        s.v, PeekIteratorState{S,V}(s.s)
    else
        v, s = next(p.c, s.s)
        s.v, PeekIteratorState{S,V}(s, v)
    end
end

eschnett avatar Jun 08 '16 02:06 eschnett

Here you are:

import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size
immutable Peek{I}
    it::I
end

peek(itr) = Peek(itr)

eltype{I}(::Type{Peek{I}}) = eltype(I)
iteratorsize{I}(::Type{Peek{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{Peek{I}}) = iteratoreltype(I)
length(f::Peek) = length(f.it)
size(f::Peek) = size(f.it)

function start(f::Peek)
    s = start(f.it)
    d = done(f.it, s)
    # this is a simple way to make this function type stable
    d && throw(ArgumentError("argument to Peek must contain at least one iterator"))
    el, s = next(f.it, s)
    return s, el, done(f.it, s)
end

function next(f::Peek, state)
    s, val, last = state
    last && return val, (s, val, false)
    el, s = next(f.it, s)
    return val, (s, el, done(f.it, s) )
end

@inline function done(f::Peek, state)
    s, el, last = state
    return done(f.it, s) && !last
end

peek{I}(f::Peek{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])

mschauer avatar Jun 08 '16 07:06 mschauer

As topping, for iterators which have the next element in their state already (so nothing is really to be done), define e.g.

start{T}(r::Peek{UnitRange{T}}) = start(r.it)
next{T}(r::Peek{UnitRange{T}}, i) = next(r.it, i)
done{T}(r::Peek{UnitRange{T}}, i) = done(r.it, i)
peek{T}(r::Peek{UnitRange{T}}, i) = ...

mschauer avatar Jun 08 '16 08:06 mschauer

function start(f::Peek)
    s = start(f.it)
    d = done(f.it, s)
    # this is a simple way to make this function type stable
    d && throw(ArgumentError("argument to Peek must contain at least one iterator"))
    el, s = next(f.it, s)
    return s, el, done(f.it, s)
end

I think we should avoid throwing exceptions in start(). Here are 2 use cases:

  1. In this PR mergesorted() can't handle empty iterator because Prefetch/PeekIterator needs at least one element. But that's not an inherent issue for mergesorted, just an implementation detail. Logically, we should support mergesorted([], []), but it would be very hard to do if we can't even call start() on internal iterator.
  2. In another project I use UniqueIterator (think "distinct sorted") that also relies on PeekIterator. And again I believe it's unfair to make collect(UniqueIterator([])) to throw an exception.

I thought about using Nullable for storing head element of iterator. This way we could do both - avoid nothing and don't throw exceptions. Does it sound reasonable?

dfdx avatar Jun 08 '16 09:06 dfdx

Here's a version that is both - type stable and allows wrapping empty iterators (I renamed Peek into PeekIter because it conflicted with the name of a function peek used to get the head of that iterator, but I don't insist on the name):

import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size


immutable PeekIter{I}
    it::I
end

peekiter(itr) = PeekIter(itr)

eltype{I}(::Type{PeekIter{I}}) = eltype(I)
iteratorsize{I}(::Type{PeekIter{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{PeekIter{I}}) = iteratoreltype(I)
length(f::PeekIter) = length(f.it)
size(f::PeekIter) = size(f.it)

function start{I}(f::PeekIter{I})
    s = start(f.it)
    E = eltype(I)
    if done(f.it, s)
        val = Nullable{E}()
    else
        el, s = next(f.it, s)
        val = Nullable{E}(el)
    end
    return s, val, done(f.it, s)
end

function next(f::PeekIter, state)
    s, val, last = state
    last && return get(val), (s, val, true)
    el, s = next(f.it, s)
    return get(val), (s, Nullable(el), done(f.it, s))
end

@inline function done(f::PeekIter, state)
    s, el, last = state
    return done(f.it, s) && last
end

peek{I}(f::PeekIter{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])

And some test for demonstration:

@test [x for x in peekiter(1:10)] == collect(1:10)
@test [x for x in peekiter([])] == collect([])

it = peekiter(1:10)
s = start(it)
@test get(peek(it, s)) == 1

it = peekiter([])
s = start(it)
@test isnull(peek(it, s))

dfdx avatar Jun 08 '16 22:06 dfdx

@dfdx You can change the implementation of done from returning return done(f.it, s) && last to simply return last.

This PeekIter should be an addition to Base by itself; it's clearly useful on its own.

eschnett avatar Jun 09 '16 00:06 eschnett

I think !isnull && done == last so you could get rid of last

mschauer avatar Jun 09 '16 04:06 mschauer

PS: last && return val, (s, val, false) and return done(f.it, s) && !last were not typos, your changes to done and next break the iterator:

julia> for i in peekiter(1:10);print(i);end
123456789

mschauer avatar Jun 09 '16 10:06 mschauer

@dfdx It kind of "worked" because you provided the right "length" and julia's comprehension does not call "done" in that particular case

mschauer avatar Jun 09 '16 12:06 mschauer

PS: last && return val, (s, val, false) and return done(f.it, s) && !last were not typos, your changes to done and next break the iterator:

Oops, my tests wrongly convinced me that the transformation was correct. As a useful output (mostly for myself): don't use [x for x in ...] syntax to test iterators since it doesn't use start(). next(), done() to populate the array.

@dfdx It kind of "worked" because you provided the right "length" and julia's comprehension does not call "done" in that particular case

Yep, realized it just after I'd posted the comment :)

I think !isnull && done == last so you could get rid of last

Not sure I got you right, but the following code removes last and seems to work correctly (I'll write more tests later).

import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size

immutable PeekIter{I}
    it::I
end

peekiter(itr) = PeekIter(itr)

eltype{I}(::Type{PeekIter{I}}) = eltype(I)
iteratorsize{I}(::Type{PeekIter{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{PeekIter{I}}) = iteratoreltype(I)
length(f::PeekIter) = length(f.it)
size(f::PeekIter) = size(f.it)

function start{I}(f::PeekIter{I})
    s = start(f.it)
    if done(f.it, s)
        val = Nullable{eltype(I)}()
    else
        el, s = next(f.it, s)
        val = Nullable{eltype(I)}(el)
    end
    return s, val
end

function next(f::PeekIter, state)
    s, val = state
    # done() should prevent condition `isnull(val) && done(state)`
    !isnull(val) && done(f.it, s) && return get(val), (s, Nullable{typeof(val)}())
    el, s = next(f.it, s)
    return get(val), (s, Nullable(el), done(f.it, s))
end

@inline function done(f::PeekIter, state)
    s, val = state
    return done(f.it, s) && isnull(val)
end

peek{I}(f::PeekIter{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])

dfdx avatar Jun 09 '16 13:06 dfdx

That's what I meant. This iterator is actually quite neat.

mschauer avatar Jun 09 '16 13:06 mschauer

I've made a separate PR for PeekIter. At the same time, I feel like I can improve mergesorted to accept any number of iterators to merge, so please don't accept with PR yet.

dfdx avatar Jun 09 '16 23:06 dfdx

At the same time, I feel like I can improve mergesorted to accept any number of iterators to merge

Done (though tests will fail for now since this PR relies on #65 which is not merged yet).

dfdx avatar Jun 11 '16 23:06 dfdx