SplitApplyCombine.jl
SplitApplyCombine.jl copied to clipboard
comparison as new parameter of the group function
I would like to ask whether a version of the group function with an additional parameter, such as the comparison argument of the innerjoin function, has ever been considered. This flexibility could be very useful on many occasions. Just to illustrate the idea (certainly not because I have the audacity to suggest how the feature should be developed), I submit a naive version of the function and some application examples. The name, following the pattern of the functions groupsum, groupprod, etc., is grouplocal, because groups are made "locally" and not globally.
"""
    grouplocal(groups, valori, comparison)

Split `valori` into contiguous runs, using the keys in `groups` to decide where one run
ends and the next begins.

The first element always opens run 1. Every later key `grp` is compared with the key `st`
that *opened* the current run by calling `comparison(st, grp)`: when the call returns
`true` the corresponding value is appended to the current run, otherwise a new run is
started and `grp` becomes the new reference key.

# Arguments
- `groups`: iterable of keys, traversed in lockstep with `valori`.
- `valori`: iterable of values to be grouped.
- `comparison`: two-argument function returning a `Bool`.

# Returns
A `Dictionary{Tuple{Int,Int},Vector{eltype(valori)}}` whose keys are `(g, r)`, where `g` is
the sequential run number and `r` is the position of the first element of that run.
Traversal stops at the end of the shorter of `groups` and `valori`; if either is empty the
result is an empty dictionary.
"""
function grouplocal(groups, valori, comparison)
    I = Tuple{Int,Int}
    T = eltype(valori)
    out = Dictionary{I,Vector{T}}()
    # Nothing to group: return the empty dictionary instead of failing on `first`.
    (isempty(groups) || isempty(valori)) && return out

    g = 1
    st = first(groups)
    # Keep a handle on the current run so each push avoids a dictionary lookup.
    current = get!(Vector{T}, out, (g, 1))
    push!(current, first(valori))
    # `drop` skips the element already consumed above without copying the inputs.
    for (r, (grp, value)) in Iterators.drop(enumerate(zip(groups, valori)), 1)
        if comparison(st, grp)
            push!(current, value)
        else
            g += 1
            st = grp
            current = get!(Vector{T}, out, (g, r))
            push!(current, value)
        end
    end
    return out
end
leading group UPPERCASE
# Example: every uppercase character opens a new group.
seq = ['A', 'b', 'c', 'D', 'e', 'f']
# `x` is the key that opened the current group, `y` the candidate key:
# the candidate stays in the group while exactly one of the two is lowercase.
uc(x, y) = xor(islowercase(x), islowercase(y))
grouplocal(seq, seq, uc)
julia> grouplocal(seq,seq,uc)
2-element Dictionary{Tuple{Int64, Int64}, Vector{Char}}
(1, 1) │ ['A', 'b', 'c']
(2, 4) │ ['D', 'e', 'f']
alternate sequences of odd even numbers
# Example: split the sequence into alternating runs of even and odd numbers.
seq1 = [2, 31, 3, 43, 2, 32, 3, 45, 5, 3, 6, 8, 54, 7, 8, 6]
# Two keys belong to the same run when they have the same parity.
p(x, y) = iseven(x) == iseven(y)
grouplocal(seq1, seq1, p)
julia> grouplocal(seq1,seq1,p)
7-element Dictionary{Tuple{Int64, Int64}, Vector{Int64}}
(1, 1) │ [2]
(2, 2) │ [31, 3, 43]
(3, 5) │ [2, 32]
(4, 7) │ [3, 45, 5, 3]
(5, 11) │ [6, 8, 54]
(6, 14) │ [7]
(7, 15) │ [8, 6]
difference between contiguous values greater than 2 as separation threshold
# Example: start a new group whenever the gap to the previous value exceeds 2.
seq2 = [2, 3, 5, 8, 9, 11, 12, 15, 16, 17, 22]
# Only the candidate key `y` (the gap to the previous value) matters here;
# the reference key `x` is ignored.
d(x, y) = 2 >= y
# Gaps between consecutive values, padded with a leading 0 for the first element.
ds = vcat(0, diff(seq2))
grouplocal(ds, seq2, d)
julia> grouplocal(ds,seq2,d)
4-element Dictionary{Tuple{Int64, Int64}, Vector{Int64}}
(1, 1) │ [2, 3, 5]
(2, 4) │ [8, 9, 11, 12]
(3, 8) │ [15, 16, 17]
(4, 11) │ [22]
leading group has the same first 4 characters
# Example: an entry sharing its first four characters with the group leader opens a new group.
nv = ["name1", "val11", "val12", "val13", "name2", "val21", "val22"]
# `x` is the key that opened the current group; `y` stays in that group while its
# four-character prefix differs from the leader's.
# NOTE(review): `cmp` is also the name of a function in Base; consider renaming to avoid
# the clash.
cmp(x, y) = !(x[1:4] == y[1:4])
grouplocal(nv, nv, cmp)
julia> grouplocal(nv,nv,cmp)
2-element Dictionary{Tuple{Int64, Int64}, Vector{String}}
(1, 1) │ ["name1", "val11", "val12", "val13"]
(2, 5) │ ["name2", "val21", "val22"]
"""
    grouplocalview1(groups, valori, comparison)

Split `valori` into contiguous runs and return them as a vector of sub-collections.

The first key opens the first run. Every later key `grp` is compared with the key `st`
that *opened* the current run by calling `comparison(st, grp)`: when the call returns
`false` the current run is closed and a new one starts at that position, with `grp` as
the new reference key.

# Arguments
- `groups`: indexable collection of keys (indexed from 1 up to `length(groups)`).
- `valori`: collection indexed with the resulting `UnitRange`s.
- `comparison`: two-argument function returning a `Bool`.

# Returns
`getindex.(Ref(valori), ranges)`, i.e. one slice of `valori` per run, in order. An empty
`groups` yields an empty result.
"""
function grouplocalview1(groups, valori, comparison)
    out = UnitRange{Int64}[]
    if !isempty(groups)
        n = length(groups)
        l = 1                       # start position of the run currently being built
        st = groups[1]              # key that opened the current run
        for i in 2:n
            grp = groups[i]
            if !comparison(st, grp)
                # Close the run that ends just before `i` and open a new one at `i`.
                push!(out, l:i-1)
                l = i
                st = grp
            end
        end
        push!(out, l:n)             # the last run always extends to the end
    end
    # `Ref` shields `valori` from broadcasting without allocating a wrapper array.
    return getindex.(Ref(valori), out)
end
So the "local" semantic here is to break up "runs"?
I think there is definitely a space for this. There is an operation that generally goes by the name `partition`, which generally behaves like `group` but the groups are assumed to be contiguous. It should be possible to somehow pass in the "previous group key" or something like that into a comparison function.
It's worth considering what the semantics of the keys are for dictionaries and other iterable data structures (the `r` part).
Another idea - you could just return an array of (sub?) arrays, for example,