From 7d24ac2e5aad6d08b177a87a0f2eefa81661c7a5 Mon Sep 17 00:00:00 2001
From: OkonSamuel
Date: Wed, 16 Feb 2022 05:04:40 +0100
Subject: [PATCH 1/2] add more parameters to control fitting, and add data checks

---
 src/MLJGLMInterface.jl | 192 ++++++++++++++++++++++++++++++++---------
 test/runtests.jl       |  83 +++++++++++++++---
 2 files changed, 223 insertions(+), 52 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index b3a1189..dc2555d 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -60,6 +60,10 @@ end
     fit_intercept::Bool = true
     link::GLM.Link01 = GLM.LogitLink()
     offsetcol::Union{Symbol, Nothing} = nothing
+    maxiter::Integer = 30
+    atol::Real = 1e-6
+    rtol::Real = 1e-6
+    minstepfac::Real = 0.001
     report_keys::KEYS_TYPE = DEFAULT_KEYS::(isnothing(_) || issubset(_, VALID_KEYS))
 end
 
@@ -68,6 +72,10 @@ end
     distribution::Distribution = Poisson()
     link::GLM.Link = GLM.LogLink()
     offsetcol::Union{Symbol, Nothing} = nothing
+    maxiter::Integer = 30
+    atol::Real = 1e-6
+    rtol::Real = 1e-6
+    minstepfac::Real = 0.001
     report_keys::KEYS_TYPE = DEFAULT_KEYS::(isnothing(_) || issubset(_, VALID_KEYS))
 end
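The four new hyperparameters are GLM.jl's IRLS convergence controls — an iteration cap, absolute and relative convergence tolerances, and a minimum step-fraction for the line search — which the fit methods later in this patch forward to `GLM.glm`. A minimal sketch of how a user might override the defaults shown above (the values are illustrative only, not recommendations):

```julia
using MLJGLMInterface
using Distributions: Poisson

# Tighter tolerances and a larger iteration budget than the defaults
# (maxiter = 30, atol = rtol = 1e-6, minstepfac = 0.001).
clf = LinearBinaryClassifier(maxiter = 100, atol = 1e-8, rtol = 1e-8)
reg = LinearCountRegressor(distribution = Poisson(), maxiter = 50, minstepfac = 1e-4)
```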
@@ -87,17 +95,19 @@
 Augment the matrix `X` with a column of ones if the intercept is to be
 fitted (`b=true`), return `X` otherwise.
 """
 function augment_X(X::Matrix, b::Bool)::Matrix
-    b && return hcat(X, ones(eltype(X), size(X, 1), 1))
+    b && return hcat(X, ones(float(Int), size(X, 1), 1))
     return X
 end
 
 _to_vector(v::Vector) = v
 _to_vector(v) = collect(v)
+_to_array(v::AbstractArray) = v
+_to_array(v) = collect(v)
 
 """
     split_X_offset(X, offsetcol::Nothing)
 
-When no offset is specied, return X and an empty vector.
+When no offset is specied, return X, an empty vector.
 """
 split_X_offset(X, offsetcol::Nothing) = (X, Float64[])
 
@@ -115,6 +125,92 @@ function split_X_offset(X, offsetcol::Symbol)
     return newX, _to_vector(offset)
 end
 
+# If `estimates_dispersion_param` returns `false` then the dispersion
+# parameter isn't estimated from data but known apriori to be `1`.
+estimates_dispersion_param(::LinearRegressor) = true
+estimates_dispersion_param(::LinearBinaryClassifier) = false
+
+function estimates_dispersion_param(model::LinearCountRegressor)
+    return GLM.dispersion_parameter(model.distribution)
+end
+
+function _throw_sample_size_error(model, est_dispersion_param)
+    requires_info = _requires_info(model, est_dispersion_param)
+
+    if isnothing(model.offsetcol)
+        offset_info = " `offsetcol == nothing`"
+    else
+        offset_info = " `offsetcol !== nothing`"
+    end
+
+    modelname = nameof(typeof(model))
+    if model isa LinearCountRegressor
+        distribution_info = "and `distribution = $(nameof(typeof(model.distribution)))()`"
+    else
+        distribution_info = "\b"
+    end
+
+    ""
+    throw(
+        ArgumentError(
+            " `$(modelname)` with `fit_intercept = $(model.fit_intercept)`,"*
+            "$(offset_info) $(distribution_info) requires $(requires_info)"
+        )
+    )
+    return nothing
+end
+
+# `_requires_info` returns one of the following strings
+# "`n_samples >= n_features`", "`n_samples > n_features`"
+# "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
+# "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+function _requires_info(model, est_dispersion_param)
+    inequality = est_dispersion_param ? ">" : ">="
+    int_num = model.fit_intercept - !isnothing(model.offsetcol)
+
+    if iszero(int_num)
+        int_num_string = "\b"
+    elseif int_num < 0
+        int_num_string = "- $(abs(int_num))"
+    else
+        int_num_string = "+ $(int_num)"
+    end
+
+    return "`n_samples $(inequality) n_features $(int_num_string)`."
+end
+
+function check_sample_size(model, n, p)
+    if estimates_dispersion_param(model)
+        n <= p + model.fit_intercept && _throw_sample_size_error(model, true)
+    else
+        n < p + model.fit_intercept && _throw_sample_size_error(model, false)
+    end
+    return nothing
+end
+
+function _matrix_and_features(model, Xcols, handle_intercept=false)
+    col_names = Tables.columnnames(Xcols)
+    n, p = Tables.rowcount(Xcols), length(col_names)
+    augment = handle_intercept && model.fit_intercept
+
+    if !handle_intercept # i.e This only runs during `fit`
+        check_sample_size(model, n, p)
+    end
+
+    if p == 0
+        Xmatrix = Matrix{float(Int)}(undef, n, p)
+    else
+        Xmatrix = Tables.matrix(Xcols)
+    end
+
+    Xmatrix = augment_X(Xmatrix, augment)
+
+    return Xmatrix, col_names
+end
+
+_to_columns(t::Tables.AbstractColumns) = t
+_to_columns(t) = Tables.Columns(t)
+
 """
     prepare_inputs(model, X; handle_intercept=false)
 
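To make the encoded rule concrete: a hypothetical session that trips `check_sample_size`, with sizes mirroring the new tests added later in this patch. A `LinearRegressor` estimates a dispersion parameter, so with an intercept it requires `n_samples > n_features + 1`.

```julia
using MLJBase, MLJGLMInterface

# n = 3 samples, p = 4 features, intercept fitted: since 3 <= 4 + 1,
# `fit` is expected to raise the new ArgumentError; at least 6 rows
# would be needed here.
X = MLJBase.table(rand(3, 4))
y = rand(3)
# fit(LinearRegressor(), 1, X, y)  # ArgumentError: ... requires `n_samples > n_features + 1`.
```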
@@ -122,12 +218,29 @@ Handle `model.offsetcol` and `model.fit_intercept` if `handle_intercept=true`.
 `handle_intercept` is disabled for fitting since the StatsModels.@formula handles the intercept.
 """
 function prepare_inputs(model, X; handle_intercept=false)
-    Xminoffset, offset = split_X_offset(X, model.offsetcol)
-    Xmatrix = MMI.matrix(Xminoffset)
-    if handle_intercept
-        Xmatrix = augment_X(Xmatrix, model.fit_intercept)
+    Xcols = _to_columns(X)
+    table_features = Tables.columnnames(Xcols)
+    p = length(table_features)
+    p >= 1 || throw(
+        ArgumentError("`X` must contain at least one feature column.")
+    )
+    if !isnothing(model.offsetcol)
+        model.offsetcol in table_features || throw(
+            ArgumentError("offset column `$(model.offsetcol)` not found in table `X`")
+        )
+        if p < 2 && !model.fit_intercept
+            throw(
+                ArgumentError(
+                    "At least 2 feature columns are required for learning with"*
+                    " `offsetcol !== nothing` and `fit_intercept == false`."
+                )
+            )
+        end
     end
-    return Xmatrix, offset
+    Xminoffset, offset = split_X_offset(Xcols, model.offsetcol)
+    Xminoffset_cols = _to_columns(Xminoffset)
+    Xmatrix, features = _matrix_and_features(model, Xminoffset_cols, handle_intercept)
+    return Xmatrix, offset, _to_array(features)
 end
 
 """
@@ -170,7 +283,6 @@ function glm_report(glm_model, features, reportkeys)
     return NamedTuple{Tuple(keys(report_dict))}(values(report_dict))
 end
 
-
 """
     glm_formula(model, features) -> FormulaTerm
 
@@ -191,31 +303,10 @@ end
 Return data which is ready to be passed to `fit(form, data, ...)`.
 """
 function glm_data(model, Xmatrix, y, features)
-    header = collect(features)
-    data = Tables.table([Xmatrix y]; header=[header; :y])
+    data = Tables.table([Xmatrix y]; header=[features...; :y])
     return data
 end
 
-_to_array(v::AbstractArray) = v
-_to_array(v) = collect(v)
-
-"""
-    glm_features(model, X)
-
-Returns an iterable features object, to be used in the construction of
-glm formula and glm data header.
-"""
-function glm_features(model, X)
-    if Tables.columnaccess(X)
-        table_features = _to_array(keys(Tables.columns(X)))
-    else
-        first_row = iterate(Tables.rows(X), 1)[1]
-        table_features = first_row === nothing ? Symbol[] : _to_array(keys(first_row))
-    end
-    filter!(!=(model.offsetcol), table_features)
-    return table_features
-end
-
 """
     check_weights(w, y)
 
@@ -259,8 +350,7 @@ params(fr::FitResult) = fr.params
 
 function MMI.fit(model::LinearRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     y_ = isempty(offset) ? y : y .- offset
     wts = check_weights(w, y_)
     data = glm_data(model, Xmatrix, y_, features)
@@ -278,12 +368,20 @@ end
 
 function MMI.fit(model::LinearCountRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     data = glm_data(model, Xmatrix, y, features)
     wts = check_weights(w, y)
     form = glm_formula(model, features)
-    fitted_glm = GLM.glm(form, data, model.distribution, model.link; offset, wts).model
+    fitted_glm_frame = GLM.glm(
+        form, data, model.distribution, model.link;
+        offset,
+        model.maxiter,
+        model.atol,
+        model.rtol,
+        model.minstepfac,
+        wts
+    )
+    fitted_glm = fitted_glm_frame.model
     fitresult = FitResult(
         GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
     )
@@ -299,11 +397,19 @@ function MMI.fit(model::LinearBinaryClassifier, verbosity::Int, X, y, w=nothing)
     decode = y[1]
     y_plain = MMI.int(y) .- 1 # 0, 1 of type Int
     wts = check_weights(w, y_plain)
-    Xmatrix, offset = prepare_inputs(model, X)
-    features = glm_features(model, X)
+    Xmatrix, offset, features = prepare_inputs(model, X)
     data = glm_data(model, Xmatrix, y_plain, features)
     form = glm_formula(model, features)
-    fitted_glm = GLM.glm(form, data, Bernoulli(), model.link; offset, wts).model
+    fitted_glm_frame = GLM.glm(
+        form, data, Bernoulli(), model.link;
+        offset,
+        model.maxiter,
+        model.atol,
+        model.rtol,
+        model.minstepfac,
+        wts
+    )
+    fitted_glm = fitted_glm_frame.model
     fitresult = FitResult(
         GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
     )
@@ -342,9 +448,17 @@ glm_link(::LinearRegressor) = GLM.IdentityLink()
 
 # more efficient than MLJBase fallback
 function MMI.predict_mean(model::GLM_MODELS, fitresult, Xnew)
-    Xmatrix, offset = prepare_inputs(model, Xnew; handle_intercept=true)
+    Xmatrix, offset, _ = prepare_inputs(model, Xnew; handle_intercept=true)
     result = glm_fitresult(model, fitresult) # ::FitResult
     coef = coefs(result)
+    p = size(Xmatrix, 2)
+    if p != length(coef)
+        throw(
+            DimensionMismatch(
+                "The number of features in training and prediction datasets must be equal"
+            )
+        )
+    end
     link = glm_link(model)
     return glm_predict(link, coef, Xmatrix, model.offsetcol, offset)
 end
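One syntactic detail in the calls above: `GLM.glm(...; offset, model.maxiter, ...)` relies on Julia's keyword-argument shorthand (Julia ≥ 1.5), where `f(; x.a)` means `f(; a = x.a)`, so each hyperparameter is forwarded to `GLM.glm` under its own name. A self-contained sketch with toy names (not part of the package):

```julia
struct Controls
    maxiter::Int
end

# `solve` simply echoes the keyword it receives.
solve(; maxiter = 10) = maxiter

ctrl = Controls(30)
solve(; ctrl.maxiter)  # same as solve(; maxiter = ctrl.maxiter); returns 30
```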
diff --git a/test/runtests.jl b/test/runtests.jl
index 41f6969..dab0394 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,7 +7,7 @@ using MLJGLMInterface
 using GLM: coeftable
 import GLM
-import Distributions
+using Distributions: Normal, Poisson, Uniform
 import StableRNGs
 using Tables
 
@@ -51,7 +51,7 @@ expit(X) = 1 ./ (1 .+ exp.(-X))
     # test `predict` object
     p_distr = predict(atom_ols, fitresult, selectrows(X, test))
     dispersion = MLJGLMInterface.dispersion(fitresult)
-    @test p_distr[1] == Distributions.Normal(p[1], dispersion)
+    @test p_distr[1] == Normal(p[1], dispersion)
 
     # test metadata
     model = atom_ols
@@ -102,6 +102,10 @@ end
     @test mean(cross_entropy(yhatw1, y)) < 0.25
     @test yhatw1 ≈ yhat1
 
+    # check predict on `Xnew` with wrong dims
+    Xnew = MLJBase.table(Tables.matrix(X)[:, 1:3], names=Tables.columnnames(X)[1:3])
+    @test_throws DimensionMismatch predict(lr, fitresult, Xnew)
+
     fitted_params(pr, fitresult)
 
     # Test metadata
@@ -113,7 +117,15 @@ end
     @test is_supervised(model)
     @test package_license(model) == "MIT"
     @test prediction_type(model) == :probabilistic
-    @test hyperparameters(model) == (:fit_intercept, :link, :offsetcol, :report_keys)
+    hyper_params = hyperparameters(model)
+    @test hyper_params[1] == :fit_intercept
+    @test hyper_params[2] == :link
+    @test hyper_params[3] == :offsetcol
+    @test hyper_params[4] == :maxiter
+    @test hyper_params[5] == :atol
+    @test hyper_params[6] == :rtol
+    @test hyper_params[7] == :minstepfac
+    @test hyper_params[8] == :report_keys
 end
 
 ###
@@ -125,7 +137,7 @@ end
     X = randn(rng, 500, 5)
     θ = randn(rng, 5)
     y = map(exp.(X*θ)) do mu
-        rand(rng, Distributions.Poisson(mu))
+        rand(rng, Poisson(mu))
     end
 
     w = ones(eltype(y), length(y))
@@ -143,6 +155,12 @@ end
     @test norm(θ̂w .- θ)/norm(θ) ≤ 0.03
     @test θ̂w ≈ θ̂
 
+    # check predict on `Xnew` with wrong dims
+    Xnew = MLJBase.table(
+        Tables.matrix(XTable)[:, 1:3], names=Tables.columnnames(XTable)[1:3]
+    )
+    @test_throws DimensionMismatch predict(lcr, fitresult, Xnew)
+
     # Test metadata
     model = lcr
     @test name(model) == "LinearCountRegressor"
@@ -157,32 +175,71 @@ end
     @test hyper_params[2] == :distribution
     @test hyper_params[3] == :link
     @test hyper_params[4] == :offsetcol
-    @test hyper_params[5] == :report_keys
-
+    @test hyper_params[5] == :maxiter
+    @test hyper_params[6] == :atol
+    @test hyper_params[7] == :rtol
+    @test hyper_params[8] == :minstepfac
 end
 
 modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor]
 @testset "Test prepare_inputs" begin
+    @testset "check sample size" for fit_intercept in [true, false]
+        lin_reg = LinearRegressor(;fit_intercept)
+        X_lin_reg = MLJBase.table(rand(3, 3 + fit_intercept))
+        log_reg = LinearBinaryClassifier(;fit_intercept)
+        X_log_reg = MLJBase.table(rand(2, 3 + fit_intercept))
+        lcr = LinearCountRegressor(;distribution=Poisson(), fit_intercept)
+        X_lcr = X_log_reg
+        for (m, X) in [(lin_reg, X_lin_reg), (log_reg, X_log_reg), (lcr, X_lcr)]
+            @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X)
+        end
+
+    end
+
     @testset "intercept/offsetcol" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(fit_intercept=true, offsetcol=:x2)
-        Xmatrix, offset = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-
+        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
+        Xmatrix, offset, features = r
         @test offset == [4, 5, 6]
         @test Xmatrix== [1 1; 2 1; 3 1]
+        @test features == [:x1]
+
+        m1 = mt(fit_intercept=true, offsetcol=:x3)
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m1, X)
     end
 
     @testset "no intercept/no offsetcol" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(fit_intercept=false)
-        Xmatrix, offset = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-
+        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
+        Xmatrix, offset, features = r
         @test offset == []
         @test Xmatrix == [1 4; 2 5; 3 6]
+        @test features == [:x1, :x2]
+
+        X1 = NamedTuple()
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X1)
+    end
+
+    @testset "offsetcol but no intercept" for mt in modeltypes
+        X = (x1=[1,2,3], x2=[4,5,6])
+        m = mt(offsetcol=:x1, fit_intercept=false)
+        Xmatrix, offset, features = MLJGLMInterface.prepare_inputs(m, X)
+
+        @test offset == [1, 2, 3]
+        @test Xmatrix == permutedims([4 5 6])
+        @test features == [:x2]
+
+        # throw error for tables with just one column.
+        # Since for `offsetcol !== nothing` and `fit_intercept == false`
+        # the table must have at least two columns.
+        X1 = (x1=[1,2,3],)
+        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X1)
    end
 end
 
@@ -210,7 +267,7 @@ end
     N = 1000
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
-    y = rand(rng, Distributions.Uniform(0,1), N) .< expit(2*X.x1 + X.x2 - X.x3)
+    y = rand(rng, Uniform(0,1), N) .< expit(2*X.x1 + X.x2 - X.x3)
     y = categorical(y)
 
     lr = LinearBinaryClassifier(fit_intercept=false, offsetcol=:x2)
@@ -224,7 +281,7 @@ end
     N = 1000
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
-    y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Distributions.Normal(0,1), N)
+    y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Normal(0,1), N)
 
     lr = LinearRegressor(fit_intercept=false, offsetcol=:x2)
     fitresult, _, report = fit(lr, 1, X, y)
@@ -237,7 +294,7 @@ end
     rng = StableRNGs.StableRNG(0)
     X = MLJBase.table(rand(rng, N, 3))
     y = map(exp.(2*X.x1 + X.x2 - X.x3)) do mu
-        rand(rng, Distributions.Poisson(mu))
+        rand(rng, Poisson(mu))
     end
 
     lcr = LinearCountRegressor(fit_intercept=false, offsetcol=:x2)
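Putting the pieces together, a hedged end-to-end sketch of the new functionality — an offset column combined with one of the new IRLS controls — modeled on the count-regression tests above (random data, illustrative values):

```julia
using MLJBase, MLJGLMInterface
using Distributions: Poisson
import StableRNGs

rng = StableRNGs.StableRNG(0)
X = MLJBase.table(rand(rng, 100, 3))
y = map(exp.(2*X.x1 + X.x2 - X.x3)) do mu
    rand(rng, Poisson(mu))
end

# :x2 enters the linear predictor with a fixed coefficient of 1 (the offset);
# the remaining columns are fitted, with a larger IRLS iteration budget.
lcr = LinearCountRegressor(fit_intercept=false, offsetcol=:x2, maxiter=50)
fitresult, _, report = fit(lcr, 1, X, y)
```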
From 1d55c5083015be2611eb6dbbd66cadf301beb569 Mon Sep 17 00:00:00 2001
From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com>
Date: Thu, 17 Feb 2022 10:13:10 +0100
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Rik Huijzer
---
 src/MLJGLMInterface.jl | 15 +++++++++------
 test/runtests.jl       |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index dc2555d..46cb2a3 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -107,7 +107,7 @@ _to_array(v) = collect(v)
 """
     split_X_offset(X, offsetcol::Nothing)
 
-When no offset is specied, return X, an empty vector.
+When no offset is specified, return `X` and an empty vector.
 """
 split_X_offset(X, offsetcol::Nothing) = (X, Float64[])
 
@@ -150,7 +150,6 @@ function _throw_sample_size_error(model, est_dispersion_param)
         distribution_info = "\b"
     end
 
-    ""
     throw(
         ArgumentError(
             " `$(modelname)` with `fit_intercept = $(model.fit_intercept)`,"*
@@ -160,10 +159,14 @@
     return nothing
 end
 
-# `_requires_info` returns one of the following strings
-# "`n_samples >= n_features`", "`n_samples > n_features`"
-# "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
-# "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+"""
+    _requires_info(model, est_dispersion_param)
+
+Returns one of the following strings
+- "`n_samples >= n_features`", "`n_samples > n_features`"
+- "`n_samples >= n_features - 1`", "`n_samples > n_features - 1`"
+- "`n_samples >= n_features + 1`", "`n_samples > n_features + 1`"
+"""
 function _requires_info(model, est_dispersion_param)
     inequality = est_dispersion_param ? ">" : ">="
     int_num = model.fit_intercept - !isnothing(model.offsetcol)
">" : ">=" int_num = model.fit_intercept - !isnothing(model.offsetcol) diff --git a/test/runtests.jl b/test/runtests.jl index dab0394..71fbcac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -184,11 +184,11 @@ end modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor] @testset "Test prepare_inputs" begin @testset "check sample size" for fit_intercept in [true, false] - lin_reg = LinearRegressor(;fit_intercept) + lin_reg = LinearRegressor(; fit_intercept) X_lin_reg = MLJBase.table(rand(3, 3 + fit_intercept)) - log_reg = LinearBinaryClassifier(;fit_intercept) + log_reg = LinearBinaryClassifier(; fit_intercept) X_log_reg = MLJBase.table(rand(2, 3 + fit_intercept)) - lcr = LinearCountRegressor(;distribution=Poisson(), fit_intercept) + lcr = LinearCountRegressor(; distribution=Poisson(), fit_intercept) X_lcr = X_log_reg for (m, X) in [(lin_reg, X_lin_reg), (log_reg, X_log_reg), (lcr, X_lcr)] @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m, X)