DataScienceTutorials.jl
Training a bunch of models and comparing them
This post: https://discourse.julialang.org/t/custom-xgboost-loss-function-w-zygote-julia-computing-blog-post/35811/13?u=tlienart by @azev77
TOC: 1. Load packages / 2. Load models / 3. Clean data / 4. Train & score.
- Load packages
using MLJ, RDatasets, DataFrames, TableView
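If any of these are missing from the active environment, a minimal setup sketch (registered package names):
using Pkg
Pkg.add(["MLJ", "RDatasets", "DataFrames", "TableView"])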
- Load models
# Load the code behind each model in `model_list` so the model
# types are available when constructing machines later.
function load_m(model_list)
    for model in model_list
        load(model.name, pkg=model.package_name, verbosity=0)
    end
end
# Load all models ONCE!
models() |> load_m
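Note: `load` was later deprecated in favor of the `@load` macro, which returns the model type; a per-model sketch, assuming a recent MLJ:
# load one model type explicitly and instantiate it:
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
tree = Tree()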
- Clean data
# Clean data
# One-hot encode every column of `d`: one 0/1 indicator column
# per (column, unique value) pair.
function one_hot_encode(d::DataFrame)
    encoded = DataFrame()
    for col in names(d), val in unique(d[!, col])
        lab = string(col) * "_" * string(val)
        encoded[!, Symbol(lab)] = ifelse.(d[!, col] .== val, 1, 0)
    end
    return encoded
end
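A quick sanity check of the encoder on a toy frame (hypothetical data):
d = DataFrame(color = ["red", "blue", "red"])
one_hot_encode(d)
# 3×2 DataFrame with 0/1 columns :color_red and :color_blue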
# AZ: one-hot encode the non-numeric columns (strings, categoricals,
# counts) and coerce everything else to Continuous.
function AZ(X)
    sch = schema(X)
    numeric = [Int, Float16, Float32, Float64]
    vs = Symbol[]
    for (name, type) in zip(sch.names, sch.types)
        if type ∉ numeric            # non-numeric => one-hot encode
            push!(vs, name)
        end
    end
    Xd    = DataFrame(X)
    X_ohe = one_hot_encode(Xd[:, vs])
    Xd    = hcat(X_ohe, select(Xd, Not(vs)))
    Xd    = coerce(Xd, autotype(Xd, :discrete_to_continuous))
    return Xd
end
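After cleaning, every column should carry the Continuous scitype; a quick check, assuming `X` is a feature table such as the Boston data loaded below:
Xc = AZ(X)
schema(Xc).scitypes   # expect all Continuous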
- Train & Score.
# Train & score.
# NOTE: with target engineering we must transform predictions back
# (via `invtrans`) before computing the score.
# Fit the model described by metadata entry `m` on the `train` rows,
# predict on the `test` rows with `pr`, and score with `meas`.
# Returns [name, package, score, seconds]; `invtrans` undoes any
# target transformation before scoring.
function train_m(m, X, y, train, test, pr, meas; invtrans=identity)
    t1 = time_ns()
    pkg  = m.package_name
    name = m.name
    println(name)
    # a few models need non-default hyperparameters to be competitive
    if name == "XGBoostRegressor"
        mdl = eval(Meta.parse("$(name)(num_round=500)"))
    elseif name == "LGBMRegressor"
        mdl = eval(Meta.parse("$(name)(num_iterations=1_000, min_data_in_leaf=10)"))
    elseif name == "EvoTreeRegressor"
        mdl = eval(Meta.parse("$(name)(nrounds=1500)"))
    else
        mdl = eval(Meta.parse("$(name)()"))
    end
    mach = machine(mdl, X, y)
    fit!(mach, rows=train, verbosity=0)
    ŷ = invtrans.(pr(mach, rows=test))
    y = invtrans.(y)
    # abs.() so rmsl is defined even with negative predictions (AMES)
    s = meas == rmsl ? meas(abs.(ŷ), abs.(y[test])) : meas(ŷ, y[test])
    t2 = time_ns()
    return [name, pkg, round(s, sigdigits=5), round((t2 - t1)/1.0e9, sigdigits=5)]
end
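As an example of the `invtrans` hook, a log-target sketch (hypothetical call; `X`, `y`, `train`, `test` as in the Boston section below):
m1 = models(matching(X, y))[1]          # any regressor's metadata entry
train_m(m1, X, log.(y), train, test, predict, rms; invtrans = exp)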
#
" some models crash certain datasets"
@inline function f(X, y, train, test, pr, meas; pr_type = [:deterministic, :probabilistic])
X = AZ(X)
dropm = ["EpsilonSVR"]; #Crashes AMES #"ARDRegressor"
drop_pkg = ["NaiveBayes"]; #, "ScikitLearn"
m_match = models(matching(X, y),
x -> x.prediction_type ∈ pr_type,
x -> x.package_name ∉ drop_pkg,
x -> x.name ∉ dropm);
sc = Array{Any}(undef, size(m_match, 1), 4)
@inbounds for (i,m) in enumerate(m_match)
sc[i,:] .= try
train_m(m, X, y, train, test, pr, meas)
catch er
println(er)
m.name, m.package_name, 10_000,10_000
end
end
df= DataFrame(Model = sc[sortperm(sc[:,3]), 1],
Pkg = sc[sortperm(sc[:,3]), 2],
SCORE = sc[sortperm(sc[:,3]), 3],
Time = sc[sortperm(sc[:,3]), 4])
#showtable(df)
#
return df
end
################################################################################
#Boston 50 models
################################################################################
X, y = @load_boston;
train, test = partition(eachindex(y), .7, rng=333);
df = f(X, y, train, test, predict, rmsp, pr_type = [:deterministic])
TableView.showtable(df)
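The same comparison on the (reduced) Ames data, which the rmsl branch in train_m was written for (a sketch, assuming MLJ's @load_reduced_ames toy dataset):
X, y = @load_reduced_ames;
train, test = partition(eachindex(y), .7, rng=333);
df = f(X, y, train, test, predict, rmsl, pr_type = [:deterministic])
TableView.showtable(df)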
Thanks a lot!
I'm guessing that the Breast Cancer tutorial, which compares multiple models programmatically, was the ultimate outcome of this discussion, so I'm closing this issue.
Feel free to re-open if I am mistaken.