From 7d24ac2e5aad6d08b177a87a0f2eefa81661c7a5 Mon Sep 17 00:00:00 2001
From: OkonSamuel
Date: Wed, 16 Feb 2022 05:04:40 +0100
Subject: [PATCH 1/2] add more parameters to control fitting, and add data checks

---
 src/MLJGLMInterface.jl | 192 ++++++++++++++++++++++++++++++++---------
 test/runtests.jl       |  83 +++++++++++++++---
 2 files changed, 223 insertions(+), 52 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index b3a1189..dc2555d 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -60,6 +60,10 @@ end
     fit_intercept::Bool = true
     link::GLM.Link01 = GLM.LogitLink()
     offsetcol::Union{Symbol, Nothing} = nothing
+    maxiter::Integer = 30
+    atol::Real = 1e-6
+    rtol::Real = 1e-6
+    minstepfac::Real = 0.001
     report_keys::KEYS_TYPE = DEFAULT_KEYS::(isnothing(_) || issubset(_, VALID_KEYS))
 end
 
@@ -68,6 +72,10 @@ end
     distribution::Distribution = Poisson()
     link::GLM.Link = GLM.LogLink()
     offsetcol::Union{Symbol, Nothing} = nothing
+    maxiter::Integer = 30
+    atol::Real = 1e-6
+    rtol::Real = 1e-6
+    minstepfac::Real = 0.001
     report_keys::KEYS_TYPE = DEFAULT_KEYS::(isnothing(_) || issubset(_, VALID_KEYS))
 end
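The four new hyperparameters are GLM.jl's IRLS convergence controls — an iteration cap, absolute and relative convergence tolerances, and a minimum step-fraction for the line search — which the fit methods later in this patch forward to `GLM.glm`. A minimal sketch of how a user might override the defaults shown above (the values are illustrative only, not recommendations):

```julia
using MLJGLMInterface
using Distributions: Poisson

# Tighter tolerances and a larger iteration budget than the defaults
# (maxiter = 30, atol = rtol = 1e-6, minstepfac = 0.001).
clf = LinearBinaryClassifier(maxiter = 100, atol = 1e-8, rtol = 1e-8)
reg = LinearCountRegressor(distribution = Poisson(), maxiter = 50, minstepfac = 1e-4)
```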
@@ -87,17 +95,19 @@
 Augment the matrix `X` with a column of ones if the intercept is to be
 fitted (`b=true`), return `X` otherwise.
 """
 function augment_X(X::Matrix, b::Bool)::Matrix
-    b && return hcat(X, ones(eltype(X), size(X, 1), 1))
+    b && return hcat(X, ones(float(Int), size(X, 1), 1))
     return X
 end
 
 _to_vector(v::Vector) = v
 _to_vector(v) = collect(v)
+_to_array(v::AbstractArray) = v
+_to_array(v) = collect(v)
 
 """
     split_X_offset(X, offsetcol::Nothing)
 
-When no offset is specied, return X and an empty vector.
+When no offset is specied, return X, an empty vector.
 """
 split_X_offset(X, offsetcol::Nothing) = (X, Float64[])
 
@@ -115,6 +125,92 @@ function split_X_offset(X, offsetcol::Symbol)
     return newX, _to_vector(offset)
 end
 
+# If `estimates_dispersion_param` returns `false` then the dispersion
+# parameter isn't estimated from data but known apriori to be `1`.
+estimates_dispersion_param(::LinearRegressor) = true
+estimates_dispersion_param(::LinearBinaryClassifier) = false
+
+function estimates_dispersion_param(model::LinearCountRegressor)
+    return GLM.dispersion_parameter(model.distribution)
+end
+
+function _throw_sample_size_error(model, est_dispersion_param)
+    requires_info = _requires_info(model, est_dispersion_param)
+
+    if isnothing(model.offsetcol)
+        offset_info = " `offsetcol == nothing`"
+    else
+        offset_info = " `offsetcol !== nothing`"
+    end
+
+    modelname = nameof(typeof(model))
+    if model isa LinearCountRegressor
+        distribution_info = "and `distribution = $(nameof(typeof(model.distribution)))()`"
+    else
+        distribution_info = "\b"
+    end
+
+    ""
+    throw(
+        ArgumentError(
+            " `$(modelname)` with `fit_intercept = $(model.fit_intercept)`,"*
+            "$(offset_info) $(distribution_info) requires $(requires_info)"
+        )
+    )
+    return nothing
+end
+
+# `_requires_info` returns one of the following strings
+# "`n_samples >= n_features`", "`n_samples > n_features`"
+# "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
+# "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+function _requires_info(model, est_dispersion_param)
+    inequality = est_dispersion_param ? ">" : ">="
+    int_num = model.fit_intercept - !isnothing(model.offsetcol)
+
+    if iszero(int_num)
+        int_num_string = "\b"
+    elseif int_num < 0
+        int_num_string = "- $(abs(int_num))"
+    else
+        int_num_string = "+ $(int_num)"
+    end
+
+    return "`n_samples $(inequality) n_features $(int_num_string)`."
+end
+
+function check_sample_size(model, n, p)
+    if estimates_dispersion_param(model)
+        n <= p + model.fit_intercept && _throw_sample_size_error(model, true)
+    else
+        n < p + model.fit_intercept && _throw_sample_size_error(model, false)
+    end
+    return nothing
+end
+
+function _matrix_and_features(model, Xcols, handle_intercept=false)
+    col_names = Tables.columnnames(Xcols)
+    n, p = Tables.rowcount(Xcols), length(col_names)
+    augment = handle_intercept && model.fit_intercept
+
+    if !handle_intercept # i.e This only runs during `fit`
+        check_sample_size(model, n, p)
+    end
+
+    if p == 0
+        Xmatrix = Matrix{float(Int)}(undef, n, p)
+    else
+        Xmatrix = Tables.matrix(Xcols)
+    end
+
+    Xmatrix = augment_X(Xmatrix, augment)
+
+    return Xmatrix, col_names
+end
+
+_to_columns(t::Tables.AbstractColumns) = t
+_to_columns(t) = Tables.Columns(t)
+
 """
     prepare_inputs(model, X; handle_intercept=false)
 
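To make the encoded rule concrete: a hypothetical session that trips `check_sample_size`, with sizes mirroring the new tests added later in this patch. A `LinearRegressor` estimates a dispersion parameter, so with an intercept it requires `n_samples > n_features + 1`.

```julia
using MLJBase, MLJGLMInterface

# n = 3 samples, p = 4 features, intercept fitted: since 3 <= 4 + 1,
# `fit` is expected to raise the new ArgumentError; at least 6 rows
# would be needed here.
X = MLJBase.table(rand(3, 4))
y = rand(3)
# fit(LinearRegressor(), 1, X, y)  # ArgumentError: ... requires `n_samples > n_features + 1`.
```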
@@ -122,12 +218,29 @@ Handle `model.offsetcol` and `model.fit_intercept` if `handle_intercept=true`.
 `handle_intercept` is disabled for fitting since the StatsModels.@formula handles the intercept.
 """
 function prepare_inputs(model, X; handle_intercept=false)
-    Xminoffset, offset = split_X_offset(X, model.offsetcol)
-    Xmatrix = MMI.matrix(Xminoffset)
-    if handle_intercept
-        Xmatrix = augment_X(Xmatrix, model.fit_intercept)
+    Xcols = _to_columns(X)
+    table_features = Tables.columnnames(Xcols)
+    p = length(table_features)
+    p >= 1 || throw(
+        ArgumentError("`X` must contain at least one feature column.")
+    )
+    if !isnothing(model.offsetcol)
+        model.offsetcol in table_features || throw(
+            ArgumentError("offset column `$(model.offsetcol)` not found in table `X`")
+        )
+        if p < 2 && !model.fit_intercept
+            throw(
+                ArgumentError(
+                    "At least 2 feature columns are required for learning with"*
+                    " `offsetcol !== nothing` and `fit_intercept == false`."
+                )
+            )
+        end
     end
-    return Xmatrix, offset
+    Xminoffset, offset = split_X_offset(Xcols, model.offsetcol)
+    Xminoffset_cols = _to_columns(Xminoffset)
+    Xmatrix, features = _matrix_and_features(model, Xminoffset_cols, handle_intercept)
+    return Xmatrix, offset, _to_array(features)
 end
 
 """
@@ -170,7 +283,6 @@ function glm_report(glm_model, features, reportkeys)
     return NamedTuple{Tuple(keys(report_dict))}(values(report_dict))
 end
 
-
 """
     glm_formula(model, features) -> FormulaTerm
 
@@ -191,31 +303,10 @@ end
 Return data which is ready to be passed to `fit(form, data, ...)`.
 """
 function glm_data(model, Xmatrix, y, features)
-    header = collect(features)
-    data = Tables.table([Xmatrix y]; header=[header; :y])
+    data = Tables.table([Xmatrix y]; header=[features...; :y])
     return data
 end
 
-_to_array(v::AbstractArray) = v
-_to_array(v) = collect(v)
-
-"""
-    glm_features(model, X)
-
-Returns an iterable features object, to be used in the construction of
-glm formula and glm data header.
-"""
-function glm_features(model, X)
-    if Tables.columnaccess(X)
-        table_features = _to_array(keys(Tables.columns(X)))
-    else
-        first_row = iterate(Tables.rows(X), 1)[1]
-        table_features = first_row === nothing ? Symbol[] : _to_array(keys(first_row))
-    end
-    filter!(!=(model.offsetcol), table_features)
-    return table_features
-end
-
 """
     check_weights(w, y)
 
@@ -259,8 +350,7 @@ params(fr::FitResult) = fr.params
 
 function MMI.fit(model::LinearRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     y_ = isempty(offset) ? y : y .- offset
     wts = check_weights(w, y_)
     data = glm_data(model, Xmatrix, y_, features)
@@ -278,12 +368,20 @@ end
 
 function MMI.fit(model::LinearCountRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     data = glm_data(model, Xmatrix, y, features)
     wts = check_weights(w, y)
     form = glm_formula(model, features)
-    fitted_glm = GLM.glm(form, data, model.distribution, model.link; offset, wts).model
+    fitted_glm_frame = GLM.glm(
+        form, data, model.distribution, model.link;
+        offset,
+        model.maxiter,
+        model.atol,
+        model.rtol,
+        model.minstepfac,
+        wts
+    )
+    fitted_glm = fitted_glm_frame.model
     fitresult = FitResult(
         GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
     )
@@ -299,11 +397,19 @@ function MMI.fit(model::LinearBinaryClassifier, verbosity::Int, X, y, w=nothing)
     decode = y[1]
     y_plain = MMI.int(y) .- 1 # 0, 1 of type Int
     wts = check_weights(w, y_plain)
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     data = glm_data(model, Xmatrix, y_plain, features)
     form = glm_formula(model, features)
-    fitted_glm = GLM.glm(form, data, Bernoulli(), model.link; offset, wts).model
+    fitted_glm_frame = GLM.glm(
+        form, data, Bernoulli(), model.link;
+        offset,
+        model.maxiter,
+        model.atol,
+        model.rtol,
+        model.minstepfac,
+        wts
+    )
+    fitted_glm = fitted_glm_frame.model
     fitresult = FitResult(
         GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
     )
@@ -342,9 +448,17 @@ glm_link(::LinearRegressor) = GLM.IdentityLink()
 
 # more efficient than MLJBase fallback
 function MMI.predict_mean(model::GLM_MODELS, fitresult, Xnew)
-    Xmatrix, offset = prepare_inputs(model, Xnew; handle_intercept=true)
+    Xmatrix, offset, _ = prepare_inputs(model, Xnew; handle_intercept=true)
     result = glm_fitresult(model, fitresult) # ::FitResult
     coef = coefs(result)
+    p = size(Xmatrix, 2)
+    if p != length(coef)
+        throw(
+            DimensionMismatch(
+                "The number of features in training and prediction datasets must be equal"
+            )
+        )
+    end
     link = glm_link(model)
     return glm_predict(link, coef, Xmatrix, model.offsetcol, offset)
 end
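One syntactic detail in the calls above: `GLM.glm(...; offset, model.maxiter, ...)` relies on Julia's keyword-argument shorthand (Julia ≥ 1.5), where `f(; x.a)` means `f(; a = x.a)`, so each hyperparameter is forwarded to `GLM.glm` under its own name. A self-contained sketch with toy names (not part of the package):

```julia
struct Controls
    maxiter::Int
end

# `solve` simply echoes the keyword it receives.
solve(; maxiter = 10) = maxiter

ctrl = Controls(30)
solve(; ctrl.maxiter)  # same as solve(; maxiter = ctrl.maxiter); returns 30
```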
diff --git a/test/runtests.jl b/test/runtests.jl
index 41f6969..dab0394 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,7 +7,7 @@ using MLJGLMInterface
 using GLM: coeftable
 import GLM
-import Distributions
+using Distributions: Normal, Poisson, Uniform
 import StableRNGs
 using Tables
 
@@ -51,7 +51,7 @@ expit(X) = 1 ./ (1 .+ exp.(-X))
     # test `predict` object
     p_distr = predict(atom_ols, fitresult, selectrows(X, test))
     dispersion = MLJGLMInterface.dispersion(fitresult)
-    @test p_distr[1] == Distributions.Normal(p[1], dispersion)
+    @test p_distr[1] == Normal(p[1], dispersion)
 
     # test metadata
     model = atom_ols
@@ -102,6 +102,10 @@ end
     @test mean(cross_entropy(yhatw1, y)) < 0.25
     @test yhatw1 ≈ yhat1
 
+    # check predict on `Xnew` with wrong dims
+    Xnew = MLJBase.table(Tables.matrix(X)[:, 1:3], names=Tables.columnnames(X)[1:3])
+    @test_throws DimensionMismatch predict(lr, fitresult, Xnew)
+
     fitted_params(pr, fitresult)
 
     # Test metadata
@@ -113,7 +117,15 @@ end
     @test is_supervised(model)
     @test package_license(model) == "MIT"
     @test prediction_type(model) == :probabilistic
-    @test hyperparameters(model) == (:fit_intercept, :link, :offsetcol, :report_keys)
+    hyper_params = hyperparameters(model)
+    @test hyper_params[1] == :fit_intercept
+    @test hyper_params[2] == :link
+    @test hyper_params[3] == :offsetcol
+    @test hyper_params[4] == :maxiter
+    @test hyper_params[5] == :atol
+    @test hyper_params[6] == :rtol
+    @test hyper_params[7] == :minstepfac
+    @test hyper_params[8] == :report_keys
 end
 
 ###
@@ -125,7 +137,7 @@ end
     X = randn(rng, 500, 5)
     θ = randn(rng, 5)
     y = map(exp.(X*θ)) do mu
-        rand(rng, Distributions.Poisson(mu))
+        rand(rng, Poisson(mu))
     end
 
     w = ones(eltype(y), length(y))
@@ -143,6 +155,12 @@ end
     @test norm(θ̂w .- θ)/norm(θ) ≤ 0.03
     @test θ̂w ≈ θ̂
 
+    # check predict on `Xnew` with wrong dims
+    Xnew = MLJBase.table(
+        Tables.matrix(XTable)[:, 1:3], names=Tables.columnnames(XTable)[1:3]
+    )
+    @test_throws DimensionMismatch predict(lcr, fitresult, Xnew)
+
     # Test metadata
     model = lcr
     @test name(model) == "LinearCountRegressor"
@@ -157,32 +175,71 @@ end
     @test hyper_params[2] == :distribution
     @test hyper_params[3] == :link
     @test hyper_params[4] == :offsetcol
-    @test hyper_params[5] == :report_keys
-
+    @test hyper_params[5] == :maxiter
+    @test hyper_params[6] == :atol
+    @test hyper_params[7] == :rtol
+    @test hyper_params[8] == :minstepfac
 end
 
 modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor]
 @testset "Test prepare_inputs" begin
+    @testset "check sample size" for fit_intercept in [true, false]
+        lin_reg = LinearRegressor(;fit_intercept)
+        X_lin_reg = MLJBase.table(rand(3, 3 + fit_intercept))
+        log_reg = LinearBinaryClassifier(;fit_intercept)
+        X_log_reg = MLJBase.table(rand(2, 3 + fit_intercept))
+        lcr = LinearCountRegressor(;distribution=Poisson(), fit_intercept)
+        X_lcr = X_log_reg
+        for (m, X) in [(lin_reg, X_lin_reg), (log_reg, X_log_reg), (lcr, X_lcr)]
+            @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X)
+        end
+
+    end
+
     @testset "intercept/offsetcol" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(fit_intercept=true, offsetcol=:x2)
-        Xmatrix, offset = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-
+        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
+        Xmatrix, offset, features = r
         @test offset == [4, 5, 6]
         @test Xmatrix== [1 1; 2 1; 3 1]
+        @test features == [:x1]
+
+        m1 = mt(fit_intercept=true, offsetcol=:x3)
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m1, X)
     end
 
     @testset "no intercept/no offsetcol" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(fit_intercept=false)
-        Xmatrix, offset = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-
+        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
+        Xmatrix, offset, features = r
         @test offset == []
         @test Xmatrix == [1 4; 2 5; 3 6]
+        @test features == [:x1, :x2]
+
+        X1 = NamedTuple()
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X1)
+    end
+
+    @testset "offsetcol but no intercept" for mt in modeltypes
+        X = (x1=[1,2,3], x2=[4,5,6])
+        m = mt(offsetcol=:x1, fit_intercept=false)
+        Xmatrix, offset, features = MLJGLMInterface.prepare_inputs(m, X)
+
+        @test offset == [1, 2, 3]
+        @test Xmatrix == permutedims([4 5 6])
+        @test features == [:x2]
+
+        # throw error for tables with just one column.
+        # Since for `offsetcol !== nothing` and `fit_intercept == false`
+        # the table must have at least two columns.
+        X1 = (x1=[1,2,3],)
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X1)
    end
 end
 
@@ -210,7 +267,7 @@ end
     N = 1000
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
-    y = rand(rng, Distributions.Uniform(0,1), N) .< expit(2*X.x1 + X.x2 - X.x3)
+    y = rand(rng, Uniform(0,1), N) .< expit(2*X.x1 + X.x2 - X.x3)
     y = categorical(y)
 
     lr = LinearBinaryClassifier(fit_intercept=false, offsetcol=:x2)
@@ -224,7 +281,7 @@ end
     N = 1000
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
-    y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Distributions.Normal(0,1), N)
+    y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Normal(0,1), N)
 
     lr = LinearRegressor(fit_intercept=false, offsetcol=:x2)
     fitresult, _, report = fit(lr, 1, X, y)
@@ -237,7 +294,7 @@ end
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
     y = map(exp.(2*X.x1 + X.x2 - X.x3)) do mu
-        rand(rng, Distributions.Poisson(mu))
+        rand(rng, Poisson(mu))
     end
 
     lcr = LinearCountRegressor(fit_intercept=false, offsetcol=:x2)
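Putting the pieces together, a hedged end-to-end sketch of the new functionality — an offset column combined with one of the new IRLS controls — modeled on the count-regression tests above (random data, illustrative values):

```julia
using MLJBase, MLJGLMInterface
using Distributions: Poisson
import StableRNGs

rng = StableRNGs.StableRNG(0)
X = MLJBase.table(rand(rng, 100, 3))
y = map(exp.(2*X.x1 + X.x2 - X.x3)) do mu
    rand(rng, Poisson(mu))
end

# :x2 enters the linear predictor with a fixed coefficient of 1 (the offset);
# the remaining columns are fitted, with a larger IRLS iteration budget.
lcr = LinearCountRegressor(fit_intercept=false, offsetcol=:x2, maxiter=50)
fitresult, _, report = fit(lcr, 1, X, y)
```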
From 1d55c5083015be2611eb6dbbd66cadf301beb569 Mon Sep 17 00:00:00 2001
From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com>
Date: Thu, 17 Feb 2022 10:13:10 +0100
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Rik Huijzer
---
 src/MLJGLMInterface.jl | 15 +++++++++------
 test/runtests.jl       |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index dc2555d..46cb2a3 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -107,7 +107,7 @@ _to_array(v) = collect(v)
 """
     split_X_offset(X, offsetcol::Nothing)
 
-When no offset is specied, return X, an empty vector.
+When no offset is specified, return `X` and an empty vector.
 """
 split_X_offset(X, offsetcol::Nothing) = (X, Float64[])
 
@@ -150,7 +150,6 @@ function _throw_sample_size_error(model, est_dispersion_param)
         distribution_info = "\b"
     end
 
-    ""
     throw(
         ArgumentError(
             " `$(modelname)` with `fit_intercept = $(model.fit_intercept)`,"*
@@ -160,10 +159,14 @@
     return nothing
 end
 
-# `_requires_info` returns one of the following strings
-# "`n_samples >= n_features`", "`n_samples > n_features`"
-# "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
-# "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+"""
+    _requires_info(model, est_dispersion_param)
+
+Returns one of the following strings
+- "`n_samples >= n_features`", "`n_samples > n_features`"
+- "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
+- "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+"""
 function _requires_info(model, est_dispersion_param)
     inequality = est_dispersion_param ? ">" : ">="
     int_num = model.fit_intercept - !isnothing(model.offsetcol)
">" : ">=" int_num = model.fit_intercept - !isnothing(model.offsetcol) diff --git a/test/runtests.jl b/test/runtests.jl index dab0394..71fbcac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -184,11 +184,11 @@ end modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor] @testset "Test prepare_inputs" begin @testset "check sample size" for fit_intercept in [true, false] - lin_reg = LinearRegressor(;fit_intercept) + lin_reg = LinearRegressor(; fit_intercept) X_lin_reg = MLJBase.table(rand(3, 3 + fit_intercept)) - log_reg = LinearBinaryClassifier(;fit_intercept) + log_reg = LinearBinaryClassifier(; fit_intercept) X_log_reg = MLJBase.table(rand(2, 3 + fit_intercept)) - lcr = LinearCountRegressor(;distribution=Poisson(), fit_intercept) + lcr = LinearCountRegressor(; distribution=Poisson(), fit_intercept) X_lcr = X_log_reg for (m, X) in [(lin_reg, X_lin_reg), (log_reg, X_log_reg), (lcr, X_lcr)] @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X)