Iterators.jl
Iterators.jl copied to clipboard
[WIP] mergesorted
Alternative version of #62. The main difference is that function safenext()
and complete()
are replaced with one another iterator PrefetchIterator
which constantly keeps 1 element in cache, making it available for comparison without actually moving an iterator pointer (see this comment for details why it is needed at all). Compared with #62, this version:
- is more type stable (assuming type stable iterator)
- cannot handle empty iterators (since
PrefetchIterator
reads 1 element during construction)
Hmm, I now have doubts regarding type stability of this solution as well. Although I never use nothing
, Julia still fails to infer types:
julia> it = PrefetchIterator(1:10)
Iterators.PrefetchIterator(1:10)
julia> s = start(it)
Iterators.PrefetchState(1,1,2,1)
julia> @code_warntype next(it, s)
Variables:
it::Iterators.PrefetchIterator
state::Iterators.PrefetchState
new_hd::Any
new_st::Any
#s14::Any
Body:
begin # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 742:
NewvarNode(:new_hd)
NewvarNode(:new_st)
NewvarNode(symbol("#s14"))
unless (Iterators.done)((top(getfield))(it::Iterators.PrefetchIterator,:inner)::Any,(top(getfield))(state::Iterators.PrefetchState,:st)::Any)::Any goto 0 # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 744:
return (top(tuple))((top(getfield))(state::Iterators.PrefetchState,:hd)::Any,$(Expr(:new, :((top(getfield))(Iterators,:PrefetchState)::Type{Iterators.PrefetchState}), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :((top(getfield))(state::Iterators.PrefetchState,:st)), :((top(getfield))(state::Iterators.PrefetchState,:st)))))::Tuple{Any,Iterators.PrefetchState}
goto 1
0: # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 746:
GenSym(0) = (Iterators.next)((top(getfield))(it::Iterators.PrefetchIterator,:inner)::Any,(top(getfield))(state::Iterators.PrefetchState,:st)::Any)::Any
#s14 = (top(start))(GenSym(0))::Any
GenSym(1) = (top(indexed_next))(GenSym(0),1,#s14)::Any
new_hd = (top(getfield))(GenSym(1),1)::Any
#s14 = (top(getfield))(GenSym(1),2)::Any
GenSym(2) = (top(indexed_next))(GenSym(0),2,#s14)::Any
new_st = (top(getfield))(GenSym(2),1)::Any
#s14 = (top(getfield))(GenSym(2),2)::Any # /home/username/.julia/v0.4/Iterators/src/Iterators.jl, line 747:
return (top(tuple))((top(getfield))(state::Iterators.PrefetchState,:hd)::Any,$(Expr(:new, :((top(getfield))(Iterators,:PrefetchState)::Type{Iterators.PrefetchState}), :(new_hd::_), :((top(getfield))(state::Iterators.PrefetchState,:hd)), :(new_st::_), :((top(getfield))(state::Iterators.PrefetchState,:st)))))::Tuple{Any,Iterators.PrefetchState}
1:
end::Tuple{Any,Iterators.PrefetchState}
Note that new_hd
and new_st
have type Any
and return type is still Tuple{Any,Iterators.PrefetchState}
. This is understandable since we don't put any restrictions on a type of input iterator.
Do we have a chance to fix it? If not, does it make sense to introduce nothing
back to be able to handle empty iterators?
I like the idea of a prefetching iterator. I was going to call it PeekIterator
, since it lets you peek at the next element without consuming it. I think this is the crucial element that simplifies implementing mergesorted
.
You should be able to make it type-stable if you know the type of the state and value of the two inner iterators. You know these types if you call start
and next
-- but of course, you cannot always call next
, and you need to know the type in that case as well.
This might work:
immutable PeekIterator{C}
c::C # wrapped collection (inner iterator)
end
immutable PeekIteratorState{S,V}
s::S # inner iterator state
valid::Bool # whether v is valid
v::V # cached value
PeekIteratorState(s, v) = new(s, true, v)
PeekIteratorState(s) = new(s, false) # v remains uninitialized
end
function start{C}(p::PeekIterator{C})
s = start(p.c)
d = done(p.c, s)
if !d
v, s = next(p.c, s)
end
# Due to Julia's weird scoping rules, v remains known to the compiler, but might not be defined
# Note: if the "if" block above contains a "return" statement, this doesn't work any more
S = typeof(s)
V = typeof(v)
if !d
PeekIteratorState{S,V}(s, v)
else
PeekIteratorState{S,V}(s)
end
end
function done{C,S,V}(p::PeekIterator{C}, s::PeekIteratorState{S,V})
!s.valid # this implies done(p.c, s.s)
end
function next{C,S,V}(p::PeekIterator{C}, s::PeekIteratorState{S,V})
@assert s.valid
if done(p.c, s.s)
# inner iterator is done; return one more value
s.v, PeekIteratorState{S,V}(s.s)
else
v, s = next(p.c, s.s)
s.v, PeekIteratorState{S,V}(s, v)
end
end
Here you are:
import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size
immutable Peek{I}
it::I
end
peek(itr) = Peek(itr)
eltype{I}(::Type{Peek{I}}) = eltype(I)
iteratorsize{I}(::Type{Peek{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{Peek{I}}) = iteratoreltype(I)
length(f::Peek) = length(f.it)
size(f::Peek) = size(f.it)
function start(f::Peek)
s = start(f.it)
d = done(f.it, s)
# this is a simple way to make this function type stable
d && throw(ArgumentError("argument to Peek must contain at least one iterator"))
el, s = next(f.it, s)
return s, el, done(f.it, s)
end
function next(f::Peek, state)
s, val, last = state
last && return val, (s, val, false)
el, s = next(f.it, s)
return val, (s, el, done(f.it, s) )
end
@inline function done(f::Peek, state)
s, el, last = state
return done(f.it, s) && !last
end
peek{I}(f::Peek{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])
As topping, for iterators which have the next element in their state already (so nothing is really to be done), define e.g.
start{T}(r::Peek{UnitRange{T}}) = start(r.it)
next{T}(r::Peek{UnitRange{T}}, i) = next(r.it, i)
done{T}(r::Peek{UnitRange{T}}, i) = done(r.it, i)
peek{T}(r::Peek{UnitRange{T}}, i) = ...
function start(f::Peek)
s = start(f.it)
d = done(f.it, s)
# this is a simple way to make this function type stable
d && throw(ArgumentError("argument to Peek must contain at least one iterator"))
el, s = next(f.it, s)
return s, el, done(f.it, s)
end
I think we should avoid throwing exceptions in start()
. Here are 2 use cases:
- In this PR
mergesorted()
can't handle empty iterator becausePrefetch
/PeekIterator
needs at least one element. But that's not an inherent issue formergesorted
, just an implementation detail. Logically, we should supportmergesorted([], [])
, but it would be very hard to do if we can't even callstart()
on internal iterator. - In another project I use
UniqueIterator
(think "distinct sorted") that also relies onPeekIterator
. And again I believe it's unfair to makecollect(UniqueIterator([]))
to throw an exception.
I thought about using Nullable
for storing head element of iterator. This way we could do both - avoid nothing
and don't throw exceptions. Does it sound reasonable?
Here's a version that is both - type stable and allows wrapping empty iterators (I renamed Peek
into PeekIter
because it conflicted with the name of a function peek
used to get the head of that iterator, but I don't insist on the name):
import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size
immutable PeekIter{I}
it::I
end
peekiter(itr) = PeekIter(itr)
eltype{I}(::Type{PeekIter{I}}) = eltype(I)
iteratorsize{I}(::Type{PeekIter{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{PeekIter{I}}) = iteratoreltype(I)
length(f::PeekIter) = length(f.it)
size(f::PeekIter) = size(f.it)
function start{I}(f::PeekIter{I})
s = start(f.it)
E = eltype(I)
if done(f.it, s)
val = Nullable{E}()
else
el, s = next(f.it, s)
val = Nullable{E}(el)
end
return s, val, done(f.it, s)
end
function next(f::PeekIter, state)
s, val, last = state
last && return get(val), (s, val, true)
el, s = next(f.it, s)
return get(val), (s, Nullable(el), done(f.it, s))
end
@inline function done(f::PeekIter, state)
s, el, last = state
return done(f.it, s) && last
end
peek{I}(f::PeekIter{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])
And some test for demonstration:
@test [x for x in peekiter(1:10)] == collect(1:10)
@test [x for x in peekiter([])] == collect([])
it = peekiter(1:10)
s = start(it)
@test get(peek(it, s)) == 1
it = peekiter([])
s = start(it)
@test isnull(peek(it, s))
@dfdx You can change the implementation of done
from returning return done(f.it, s) && last
to simply return last
.
This PeekIter
should be an addition to Base
by itself; it's clearly useful on its own.
I think !isnull && done == last
so you could get rid of last
PS: last && return val, (s, val, false)
and return done(f.it, s) && !last
were not typos, your changes to done and next break the iterator:
julia> for i in peekiter(1:10);print(i);end
123456789
@dfdx It kind of "worked" because you provided the right "length" and julia's comprehension does not call "done" in that particular case
PS: last && return val, (s, val, false) and return done(f.it, s) && !last were not typos, your changes to done and next break the iterator:
Oops, my tests wrongly convinced me that the transformation was correct. As a useful output (mostly for myself): don't use [x for x in ...]
syntax to test iterators since it doesn't use start(). next(), done()
to populate the array.
@dfdx It kind of "worked" because you provided the right "length" and julia's comprehension does not call "done" in that particular case
Yep, realized it just after I'd posted the comment :)
I think !isnull && done == last so you could get rid of last
Not sure I got you right, but the following code removes last
and seems to work correctly (I'll write more tests later).
import Base: eltype, iteratorsize, iteratoreltype, start, next, done, SizeUnknown, length, size
immutable PeekIter{I}
it::I
end
peekiter(itr) = PeekIter(itr)
eltype{I}(::Type{PeekIter{I}}) = eltype(I)
iteratorsize{I}(::Type{PeekIter{I}}) = iteratorsize(I)
iteratoreltype{I}(::Type{PeekIter{I}}) = iteratoreltype(I)
length(f::PeekIter) = length(f.it)
size(f::PeekIter) = size(f.it)
function start{I}(f::PeekIter{I})
s = start(f.it)
if done(f.it, s)
val = Nullable{eltype(I)}()
else
el, s = next(f.it, s)
val = Nullable{eltype(I)}(el)
end
return s, val
end
function next(f::PeekIter, state)
s, val = state
# done() should prevent condition `isnull(val) && done(state)`
!isnull(val) && done(f.it, s) && return get(val), (s, Nullable{typeof(val)}())
el, s = next(f.it, s)
return get(val), (s, Nullable(el), done(f.it, s))
end
@inline function done(f::PeekIter, state)
s, val = state
return done(f.it, s) && isnull(val)
end
peek{I}(f::PeekIter{I}, state) = state[3] ? Nullable{eltype(I)}() : Nullable{eltype(I)}(state[2])
That's what I meant. This iterator is actually quite neat.
I've made a separate PR for PeekIter
. At the same time, I feel like I can improve mergesorted
to accept any number of iterators to merge, so please don't accept with PR yet.
At the same time, I feel like I can improve mergesorted to accept any number of iterators to merge
Done (though tests will fail for now since this PR relies on #65 which is not merged yet).