
MinMaxScaler (and more)

egolep opened this issue 3 years ago · 6 comments

It would be very nice to have more transformers than Standardizer, OneHotEncoder and BoxCox (and their univariate versions).

I even tried to implement a MinMaxScaler using Standardizer as an example, but I keep getting:

```
[ Info: Training Machine{MinMaxScaler,…} @194.
┌ Error: Problem fitting the machine Machine{MinMaxScaler,…} @194.
└ @ MLJBase ~/.julia/packages/MLJBase/AkJde/src/machines.jl:484
[ Info: Running type checks...
[ Info: Type checks okay.
ERROR: MethodError: no method matching fit(::MinMaxScaler, ::Int64, ::DataFrame)
Closest candidates are:
  fit(::MLJBase.Stack{modelnames, inp_scitype, tg_scitype} where {modelnames, inp_scitype, tg_scitype}, ::Int64, ::Any, ::Any) at /home/egolep/.julia/packages/MLJBase/AkJde/src/composition/models/stacking.jl:277
  fit(::Union{MLJIteration.DeterministicIteratedModel{M}, MLJIteration.ProbabilisticIteratedModel{M}} where M, ::Any, ::Any...) at /home/egolep/.julia/packages/MLJIteration/Twn0E/src/core.jl:51
  fit(::Union{MLJTuning.DeterministicTunedModel{T, M}, MLJTuning.ProbabilisticTunedModel{T, M}}, ::Integer, ::Any...) where {T, M} at /home/egolep/.julia/packages/MLJTuning/QFcuQ/src/tuned_models.jl:592
  ...
Stacktrace:
 [1] fit_only!(mach::Machine{MinMaxScaler, true}; rows::Vector{Int64}, verbosity::Int64, force::Bool)
   @ MLJBase ~/.julia/packages/MLJBase/AkJde/src/machines.jl:482
 [2] #fit!#98
   @ ~/.julia/packages/MLJBase/AkJde/src/machines.jl:549 [inlined]
 [3] top-level scope
   @ REPL[120]:1
```
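For context, the call producing this error looks roughly like the following (reconstructed from the stacktrace; the data frame `X` here is hypothetical):

```julia
using MLJ, DataFrames

X = DataFrame(x1 = rand(10), x2 = rand(10))   # hypothetical data
mach = machine(MinMaxScaler(), X)
fit!(mach)   # ERROR: MethodError: no method matching fit(::MinMaxScaler, ::Int64, ::DataFrame)
```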

Here is my implementation (both the univariate version and the multivariate one):

```julia
import MLJModelInterface.inverse_transform

mutable struct UnivariateMinMaxScaler <: Unsupervised end

function fit(transformer::UnivariateMinMaxScaler, verbosity::Int, v::AbstractVector{T}) where T<:Real
    min, max = minimum(v), maximum(v)
    fitresult = (min, max)
    cache = nothing
    report = NamedTuple()
    return fitresult, cache, report
end

# scale a single value into [0, 1]
function transform(transformer::UnivariateMinMaxScaler, fitresult, x::Real)
    min, max = fitresult
    x_std = (x - min) / (max - min)
    return x_std
end

transform(transformer::UnivariateMinMaxScaler, fitresult, v) =
    [transform(transformer, fitresult, x) for x in v]

# map a scaled value back onto the original range
function inverse_transform(transformer::UnivariateMinMaxScaler, fitresult, y::Real)
    min, max = fitresult
    return y * (max - min) + min
end

inverse_transform(transformer::UnivariateMinMaxScaler, fitresult, w) =
    [inverse_transform(transformer, fitresult, y) for y in w]

mutable struct MinMaxScaler <: Unsupervised
    features::Vector{Symbol}
end

MinMaxScaler(; features=Symbol[]) = MinMaxScaler(features)
```

```julia
function fit(transformer::MinMaxScaler, verbosity::Int, X::Any)

    _schema = schema(X)
    all_features = _schema.names
    types = _schema.scitypes   # scientific types of the columns

    # determine indices of all_features to be transformed
    if isempty(transformer.features)
        cols_to_fit = filter!(eachindex(all_features) |> collect) do j
            types[j] <: Continuous
        end
    else
        cols_to_fit = filter!(eachindex(all_features) |> collect) do j
            all_features[j] in transformer.features && types[j] <: Continuous
        end
    end

    fitresult_given_feature = Dict{Symbol,Tuple{Float64,Float64}}()

    # fit each feature
    verbosity < 2 || @info "Features scaled: "
    for j in cols_to_fit
        col_fitresult, cache, report =
            fit(UnivariateMinMaxScaler(), verbosity - 1, selectcols(X, j))
        fitresult_given_feature[all_features[j]] = col_fitresult
        verbosity < 2 ||
            @info "  :$(all_features[j])    min=$(col_fitresult[1])  max=$(col_fitresult[2])"
    end

    fitresult = fitresult_given_feature
    cache = nothing
    report = (features_fit=keys(fitresult_given_feature),)

    return fitresult, cache, report
end

MLJ.fitted_params(::MinMaxScaler, fitresult) = (min_and_max_given_feature=fitresult,)

function transform(transformer::MinMaxScaler, fitresult, X)
    features_to_be_transformed = keys(fitresult)
    all_features = schema(X).names

    issubset(Set(features_to_be_transformed), Set(all_features)) ||
        error("Attempting to transform data with incompatible feature labels.")

    col_transformer = UnivariateMinMaxScaler()

    cols = map(all_features) do ftr
        if ftr in features_to_be_transformed
            transform(col_transformer, fitresult[ftr], selectcols(X, ftr))
        else
            selectcols(X, ftr)
        end
    end

    named_cols = NamedTuple{all_features}(tuple(cols...))

    return MLJBase.table(named_cols, prototype=X)
end
```

I get the same error with both the univariate and the multivariate version.

egolep · Jul 12 '21

@egolep Thanks for this.

Despite your definition

```julia
function fit(transformer::MinMaxScaler, verbosity::Int, X::Any)
```

you are getting

```
ERROR: MethodError: no method matching fit(::MinMaxScaler, ::Int64, ::DataFrame)
```

Maybe this is a dumb question, but did you import MLJModelInterface.fit to extend it (or MLJBase.fit)? Perhaps you have only defined Main.fit which MLJ will not recognise.
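For example, a minimal sketch of the fix, assuming the definitions above (either form makes the new methods extend the generic function instead of creating a fresh `fit` in `Main`):

```julia
# Option 1: import the generic functions before adding methods to them:
import MLJBase: fit, transform, inverse_transform

# Option 2: qualify the name at definition time:
function MLJBase.fit(transformer::UnivariateMinMaxScaler, verbosity::Int,
                     v::AbstractVector{<:Real})
    fitresult = (minimum(v), maximum(v))
    return fitresult, nothing, NamedTuple()
end
```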

ablaom · Jul 13 '21

BTW, you may want to focus on just the univariate case, in view of https://github.com/JuliaAI/MLJModels.jl/issues/288 .

ablaom · Jul 13 '21

Hi @ablaom, moving from MLJModelInterface to MLJBase did the trick. I'm starting to think it may have been a version-related problem inside the virtual environment I was using for this little experiment.

Now the univariate case does work and I'm going to call it a day, following your suggestion.
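For reference, a minimal usage sketch of the working univariate scaler (variable names are illustrative):

```julia
v = rand(100)
mach = machine(UnivariateMinMaxScaler(), v)
fit!(mach)
w = transform(mach, v)            # values rescaled to [0, 1]
v ≈ inverse_transform(mach, w)    # true: original data recovered
```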

Many thanks for your reply!

egolep · Jul 13 '21

I lied. Now MinMaxScaler() works too, since leaving it unfinished triggered my OCD.

Thanks again for your replies. If I implement more models of this kind, would it be worth creating a pull request? Or are these transformers deliberately kept to a small number?

egolep · Jul 13 '21

No, a PR to MLJModels would be most welcome. You'll need to add a test...
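In case it helps, a hedged sketch of the kind of test such a PR might include (values and names are illustrative, not taken from the MLJModels test suite):

```julia
using Test, MLJBase

@testset "UnivariateMinMaxScaler" begin
    v = [1.0, 2.0, 3.0, 5.0]
    t = UnivariateMinMaxScaler()
    fitresult, _, _ = MLJBase.fit(t, 0, v)
    w = MLJBase.transform(t, fitresult, v)
    @test minimum(w) == 0.0 && maximum(w) == 1.0
    @test MLJBase.inverse_transform(t, fitresult, w) ≈ v
end
```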

ablaom · Jul 13 '21

@egolep Are you still interested in making a PR to MLJModels.jl? Let me know if I can help you make that happen.

ablaom · Aug 02 '21