Deedle
Deedle copied to clipboard
Slow aggregations
We should have a way to do faster groupby + aggregation. Here is an example:
First, some ceremony to construct the frame:
type System.Random with
member this.GetValues(minValue, maxValue) =
Seq.initInfinite (fun _ -> this.Next(minValue, maxValue))
member this.GetWord(n) =
Seq.initInfinite (fun _ -> System.String [|for _ in 1..n -> this.Next(26) + 97 |> char|])
let r = System.Random()
let grp = r.GetWord(10) |> Seq.take 10 |> Seq.toArray
let idx = seq { for i in 1 .. 100000 -> grp.[i % 10] }
let nums = r.GetValues(1,1000) |> Seq.take 100000
let s1 = Seq.zip [1..100000] nums |> series
let s2 = Seq.zip [1..100000] idx |> series
let f : Frame<int,string> = frame []
f?A <- s1
f?B <- s2
Ok, now this is the operation that takes way too long:
f |> Frame.aggregateRowsBy ["B"] ["A"] Stats.sum ;;
I couldn't help but tinker a bit - this yields a 15x speed improvement, but doesn't handle multiple columns in either the aggregate-by or group-by parameters - but it does return the group as the new index, which is nice. I'd love to know if it can be done even faster!
member frame.AggregateColumnBy(groupBy:_, aggBy:_, aggFunc:Func<_,_>) =
let dct = Dictionary<_, SeriesBuilder<_,_>>()
let gby = frame.GetColumn(groupBy)
let col = frame.GetColumn(aggBy)
let mutable i = 0
for i in [0 .. frame.RowCount-1] do
let k = gby.GetAt(i)
let v = col.GetKeyAt(i), col.GetAt(i)
match dct.TryGetValue(k) with
| true, l -> l.Add(v)
| _ -> dct.[k] <- SeriesBuilder<'TRowKey,_>()
dct.[k].Add(v)
let result = series [ for k in dct.Keys -> (k, aggFunc.Invoke(dct.[k].Series)) ]
series [(aggBy, result)] |> FrameUtils.fromColumns frame.IndexBuilder frame.VectorBuilder
And, this gets us down another 3x... I wonder if any of this can be generalized without losing speed.
member frame.FoldColumnBy(groupBy:_, aggBy:_, init:_, foldl:Func<_,_,_>) =
let dct = Dictionary<_, _>()
let gby = frame.GetColumn(groupBy)
let col = frame.GetColumn(aggBy)
let mutable i = 0
for i in [0 .. frame.RowCount-1] do
let k = gby.GetAt(i)
let v = col.GetAt(i)
match dct.TryGetValue(k) with
| true, o -> dct.[k] <- foldl.Invoke(o, v)
| _ -> dct.[k] <- foldl.Invoke(init, v)
let result = series [ for k in dct.Keys -> (k, dct.[k]) ]
series [(aggBy, result)] |> FrameUtils.fromColumns frame.IndexBuilder frame.VectorBuilder