Impute.jl
Impute.jl copied to clipboard
ERROR: UndefVarError: T not defined
Code:
using CSV, DataFrames, Impute
"""
    logfiles(dir="data"; minsize=2000)

Return the names (not full paths) of the log files found directly in `dir`.

An entry is kept when its name contains `log_`, it is a regular file, and its
size in bytes is strictly greater than `minsize`. The names are returned in the
order produced by `readdir`.

# Arguments
- `dir`: directory to scan; defaults to `"data"`.

# Keywords
- `minsize`: size threshold in bytes; files of this size or smaller are treated
  as empty and skipped. Defaults to `2000`.
"""
function logfiles(dir::AbstractString="data"; minsize::Integer=2000)
    return filter(readdir(dir)) do name
        path = joinpath(dir, name)
        # `isfile` keeps sub-directories out of the result: a directory can
        # report a size above the threshold but cannot be parsed as a log.
        occursin(r"log_", name) && isfile(path) && filesize(path) > minsize
    end
end
"""
    read_log(logfiles)

Read every CSV file named in `logfiles` from the `data` directory, merge them
into a single `DataFrame`, and return the result of `Impute.interp` on it.

Between two consecutive files, if the first `TIME` value of the new file is
more than 60 later than the last `TIME` value seen so far, rows containing only
a timestamp (all other columns `missing`) are appended at 60-unit steps before
the new file is joined in.

# Arguments
- `logfiles`: iterable of file names relative to the `data` directory.

# Returns
The merged table after interpolation of the missing values.

# Throws
- `ArgumentError` if `logfiles` is empty.
"""
function read_log(logfiles)
    df = nothing
    t_end = 0
    for logfile in logfiles
        df_new = CSV.read(joinpath("data", logfile), DataFrame)
        if isnothing(df)
            df = df_new
        else
            t_start = first(df_new.TIME)
            if (t_start - t_end) > 60
                # NOTE(review): when the gap is an exact multiple of 60 the last
                # filler row gets the same TIME as the first row of `df_new`;
                # confirm whether `n` should be one smaller.
                n = div(t_start - t_end + 30, 60)
                println("Missing: ", n, " minutes")
                allowmissing!(df)
                # One `missing` per column other than the timestamp. The filler
                # row puts the timestamp first, so this assumes TIME is the
                # first column -- TODO confirm.
                v = fill(missing, size(df, 2) - 1)
                for i in 1:n
                    push!(df, vcat([i * 60 + t_end], v))
                end
            end
            df = outerjoin(
                df,
                df_new;
                matchmissing=:equal,
                on=intersect(names(df), names(df_new)),
            )
        end
        t_end = last(df.TIME)
    end
    # Fail with a clear message instead of passing `nothing` to Impute.
    isnothing(df) && throw(ArgumentError("read_log: no log files were given"))
    # Return the interpolated table; a bare `return` would hand back `nothing`.
    return Impute.interp(df)
end
# Run the pipeline: list the log files and pass that list to `read_log`.
df = logfiles() |> read_log
Error message:
ERROR: UndefVarError: T not defined
Stacktrace:
[1] _impute!(data::Vector{Any}, imp::Impute.Interpolate)
@ Impute ~/.julia/packages/Impute/AB7zS/src/imputors/interp.jl:55
[2] impute!(data::Vector{Any}, imp::Impute.Interpolate; dims::Function, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Impute ~/.julia/packages/Impute/AB7zS/src/imputors.jl:127
[3] impute!(data::Vector{Any}, imp::Impute.Interpolate)
@ Impute ~/.julia/packages/Impute/AB7zS/src/imputors.jl:127
[4] impute!(table::DataFrame, imp::Impute.Interpolate; cols::Nothing)
@ Impute ~/.julia/packages/Impute/AB7zS/src/imputors.jl:232
[5] impute!
@ ~/.julia/packages/Impute/AB7zS/src/imputors.jl:224 [inlined]
[6] #impute#8
@ ~/.julia/packages/Impute/AB7zS/src/imputors.jl:80 [inlined]
[7] impute
@ ~/.julia/packages/Impute/AB7zS/src/imputors.jl:80 [inlined]
[8] interp(data::DataFrame; kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Impute ~/.julia/packages/Impute/AB7zS/src/functional.jl:76
[9] interp(data::DataFrame)
@ Impute ~/.julia/packages/Impute/AB7zS/src/functional.jl:75
[10] read_log(logfiles::Vector{String})
@ Main ~/repos/trader/src/logging.jl:45
[11] top-level scope
@ REPL[2]:1
Installed packages:
pkg> st
Status `~/repos/trader/Project.toml`
[336ed68f] CSV v0.9.11
[8f4d0f93] Conda v1.5.2
[a93c6f00] DataFrames v1.2.2
[587475ba] Flux v0.12.8
[53d20848] FluxArchitectures v0.1.3
[38e38edf] GLM v1.5.1
[cd3eb016] HTTP v0.9.17
[f7bf1975] Impute v0.6.7
[682c06a0] JSON v0.21.2
[0f8b85d8] JSON3 v1.9.2
[9b87118b] PackageCompiler v2.0.2
[d96e819e] Parameters v0.12.3
[58dd65bb] Plotly v0.4.1
[f0f68f2c] PlotlyJS v0.18.8
[91a5bcdd] Plots v1.24.2
[438e738f] PyCall v1.92.5
[d330b81b] PyPlot v2.10.0
[f269a46b] TimeZones v1.6.2
[de0858da] Printf
[10745b16] Statistics
Julia version 1.6.3 on Ubuntu Linux 18.04.
Any idea?
The data to reproduce the bug can be downloaded here: https://gist.github.com/ufechner7/812ad4770ca7b2bea83429161b02b11a
One more finding: The error does NOT happen when I launch Julia with the command `julia --project`. It DOES happen when I launch it with the command `julia -t auto --project`.
Any idea what is going on here?
The bug is in https://github.com/invenia/Impute.jl/blob/d08c5069f71507e4fee5e49e90f97c9527586a69/src/imputors/interp.jl#L55, which assumes that data has an eltype of Union{T,Missing}, and not just Missing. The latter case will cause T to be undefined, and throw the observed error.
Nice, thanks for all the debugging details!
Hmm, I'm not sure if that's exactly a bug. If you try to interpolate among only missing values then I'd expect that to fail. I suppose we could throw a more explicit error vs. falling back to the UndefVarError. What I find interesting is that this only happens with `-t auto`, which makes me think there's something weird going on with the multithreaded CSV file parsing. I'll try reproducing your example to see how the input types differ in those two cases.
I'm afraid I was unable to reproduce your example on my local mac (1.6.1) or in a julia:1.6.4-buster docker container, using a range of 6-12 threads. Would you mind calling @show Tables.schema(df) right before calling Impute.interp(df) in your example? That should produce output that looks like:
Tables.schema(df) = Tables.Schema:
:TIME Union{Missing, Int64}
Symbol("1INCH-EUR") Union{Missing, Float64}
Symbol("AAVE-EUR") Union{Missing, Float64}
Symbol("ADA-EUR") Union{Missing, Float64}
...
In case I missed something, this is the environment I'm using:
https://gist.github.com/rofinn/c4dbe2a293a1a07463c895c09cdca7cd