From a75b20bd3cd56c1b1150d69e0fc517d03972c761 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 1 Sep 2023 21:28:16 -0500 Subject: [PATCH 01/24] termnames --- Project.toml | 2 +- src/StatsModels.jl | 1 + src/contrasts.jl | 12 ++++++------ src/statsmodel.jl | 41 ++++++++++++++++++++++++++++++++++++++++- test/statsmodel.jl | 9 +++++++++ 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 1bf9eb39..7376d625 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StatsModels" uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d" -version = "0.7.2" +version = "0.7.3" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 0e119901..79c2a93f 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -36,6 +36,7 @@ export coefnames, setcontrasts!, formula, + termnames, AbstractTerm, ConstantTerm, diff --git a/src/contrasts.jl b/src/contrasts.jl index f4f03af8..fa4ae517 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -87,7 +87,7 @@ mutable struct MyCoding <: AbstractContrasts end contrasts_matrix(C::MyCoding, baseind, n) = ... -termnames(C::MyCoding, levels, baseind) = ... +_termnames(C::MyCoding, levels, baseind) = ... ``` # References @@ -198,7 +198,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst "$c_levels.")) end - tnames = termnames(contrasts, c_levels, baseind) + tnames = _termnames(contrasts, c_levels, baseind) mat = contrasts_matrix(contrasts, baseind, n) @@ -224,7 +224,7 @@ function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) return c end -function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) +function _termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] levels[not_base] end @@ -233,7 +233,7 @@ Base.getindex(contrasts::ContrastsMatrix, rowinds, colinds) = getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) # Making a contrast type T only requires that there be a method for -# contrasts_matrix(T, baseind, n) and optionally termnames(T, levels, baseind) +# contrasts_matrix(T, baseind, n) and optionally _termnames(T, levels, baseind) # The rest is boilerplate. for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] @eval begin @@ -462,7 +462,7 @@ function contrasts_matrix(C::SeqDiffCoding, _, n) end # TODO: consider customizing term names: -# termnames(C::SeqDiffCoding, levels::AbstractVector, baseind::Integer) = +# _termnames(C::SeqDiffCoding, levels::AbstractVector, baseind::Integer) = # ["$(levels[i])-$(levels[i-1])" for i in 2:length(levels)] """ @@ -591,7 +591,7 @@ function contrasts_matrix(C::HypothesisCoding, baseind, n) C.contrasts end -termnames(C::HypothesisCoding, levels::AbstractVector, baseind::Int) = +_termnames(C::HypothesisCoding, levels::AbstractVector, baseind::Int) = something(C.labels, levels[1:length(levels) .!= baseind]) DataAPI.levels(c::HypothesisCoding) = c.levels diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 0bb67c7c..d4fa65d4 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -106,12 +106,51 @@ function formula end formula(m::TableStatisticalModel) = m.mf.f formula(m::TableRegressionModel) = m.mf.f +""" + termnames(model::StatisticalModel) + termnames(term::AbstractTerm) + +Return the names associated with terms associated with a model. + +For models with only continuous predictors, this is the same as +`(responsename(model), coefnames(model))`. + +For models with categorical predictors, the returned names reflect +the categorical predictor and not the coefficients resulting from +the choice of contrast coding. + +```jldoctest + julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) + termnames( @formula(y ~ 1 + log(x) * log(y) + (1+x|g))) + ("y", ["1", "log(x)", "log(y)", "log(x) & log(y)", "(1 + x) | g"]) +``` +""" +termnames(model::StatisticalModel) = termnames(formula(model)) + +""" + termnames(term::AbstractTerm) + +Return the name(s) of column(s) generated by a term. Return value is either a +`String` or an iterable of `String`s. +""" +termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) +termnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing +termnames(t::ContinuousTerm) = string(t.sym) +termnames(t::CategoricalTerm) = string(t.sym) +termnames(t::Term) = string(t.sym) +termnames(t::ConstantTerm) = string(t.n) +termnames(t::FunctionTerm) = string(t.exorig) +termnames(ts::TupleTerm) = reduce(vcat, termnames.(ts)) +termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms) +termnames(t::InteractionTerm) = + kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...) + @doc """ fit(Mod::Type{<:StatisticalModel}, f::FormulaTerm, data, args...; contrasts::Dict{Symbol}, kwargs...) Convert tabular data into a numeric response vector and predictor matrix using -the formula `f`, and then `fit` the specified model type, wrapping the result in +the formula `f`, and then `fit` the specified model type, wrapping Stthe result in a [`TableRegressionModel`](@ref) or [`TableStatisticalModel`](@ref) (as appropriate). diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 7e81b6f9..639109af 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -161,6 +161,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) ## test copying of names from Terms to CoefTable ct = coeftable(m) @test ct.rownms == ["(Intercept)", "x1", "x2", "x1 & x2"] + @test termnames(m) == ("y", ["(Intercept)", "x1", "x2", "x1 & x2"]) ## show with coeftable defined io = IOBuffer() @@ -171,6 +172,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] + @test termnames(m2) == ("y", ["(Intercept)", "x1p"]) ## predict w/ new data missing levels @test predict(m2, d[2:4, :]) == predict(m2)[2:4] @@ -233,6 +235,13 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyModTwo, f, d) # make sure show() still works when there is no coeftable method show(io, m2) + + # one final termnames check + # note that `1` is still a ConstantTerm and not yet InterceptTerm + # because apply_schema hasn't been called + @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == + ["1", "log(x)", "y", "log(x) & y", "(1 + x) | g"] + end @testset "lrtest" begin From dd1f431adcb2e44eacf8eb6f22d193b935d445f3 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 1 Sep 2023 22:01:09 -0500 Subject: [PATCH 02/24] fix --- src/statsmodel.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index d4fa65d4..59278ed5 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -120,9 +120,8 @@ the categorical predictor and not the coefficients resulting from the choice of contrast coding. ```jldoctest - julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) - termnames( @formula(y ~ 1 + log(x) * log(y) + (1+x|g))) - ("y", ["1", "log(x)", "log(y)", "log(x) & log(y)", "(1 + x) | g"]) +julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) +("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) ``` """ termnames(model::StatisticalModel) = termnames(formula(model)) From 6d727f48a6036769bf5ea0ea2e358c18ad384d76 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 2 Sep 2023 18:46:35 +0000 Subject: [PATCH 03/24] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/statsmodel.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 59278ed5..b0ec4ebd 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -108,15 +108,14 @@ formula(m::TableRegressionModel) = m.mf.f """ termnames(model::StatisticalModel) - termnames(term::AbstractTerm) -Return the names associated with terms associated with a model. +Return the names of terms used in the formula of `model`. -For models with only continuous predictors, this is the same as +For regression models with only continuous predictors, this is the same as `(responsename(model), coefnames(model))`. For models with categorical predictors, the returned names reflect -the categorical predictor and not the coefficients resulting from +the variable name and not the coefficients resulting from the choice of contrast coding. ```jldoctest @@ -139,7 +138,7 @@ termnames(t::CategoricalTerm) = string(t.sym) termnames(t::Term) = string(t.sym) termnames(t::ConstantTerm) = string(t.n) termnames(t::FunctionTerm) = string(t.exorig) -termnames(ts::TupleTerm) = reduce(vcat, termnames.(ts)) +termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts) termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms) termnames(t::InteractionTerm) = kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...) @@ -149,7 +148,7 @@ termnames(t::InteractionTerm) = contrasts::Dict{Symbol}, kwargs...) Convert tabular data into a numeric response vector and predictor matrix using -the formula `f`, and then `fit` the specified model type, wrapping Stthe result in +the formula `f`, and then `fit` the specified model type, wrapping the result in a [`TableRegressionModel`](@ref) or [`TableStatisticalModel`](@ref) (as appropriate). From 3fef31190b6a09ed46ae493d9ff884fef6181ae6 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 2 Sep 2023 13:57:12 -0500 Subject: [PATCH 04/24] more docs --- src/statsmodel.jl | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index b0ec4ebd..1c09ca5b 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -111,19 +111,41 @@ formula(m::TableRegressionModel) = m.mf.f Return the names of terms used in the formula of `model`. -For regression models with only continuous predictors, this is the same as +This is a convenience method for `termnames(formula(model))`. + +For `RegressionModel`s with only continuous predictors, this is the same as `(responsename(model), coefnames(model))`. For models with categorical predictors, the returned names reflect the variable name and not the coefficients resulting from the choice of contrast coding. +""" +termnames(model::StatisticalModel) = termnames(formula(model)) +""" + termnames(t::FormulaTerm) + +Return a two-tuple of `termnames` applied to the left and +right hand sides of the formula. + +Note that until `apply_schema` has been called, literal `1` in formulae +is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the +returned term names. ```jldoctest julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) ("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) ``` + +Similarly, formulae with an implicit intercept will not have a `"1"` +in their term names, because the implicit intercept does not exist until +`apply_schema` is called (and may not exist for certain model contexts). + +```jldoctest +julia> termnames(@formula(y ~ x * y + (1+x|g))) +("y", ["x", "y", "x & y", "(1 + x) | g"]) +``` """ -termnames(model::StatisticalModel) = termnames(formula(model)) +termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) """ termnames(term::AbstractTerm) @@ -131,7 +153,6 @@ termnames(model::StatisticalModel) = termnames(formula(model)) Return the name(s) of column(s) generated by a term. Return value is either a `String` or an iterable of `String`s. """ -termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) termnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing termnames(t::ContinuousTerm) = string(t.sym) termnames(t::CategoricalTerm) = string(t.sym) From 922235ffc0a4b861456bc90bb14f054e4781c80b Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 2 Sep 2023 14:19:32 -0500 Subject: [PATCH 05/24] more docs more tests --- src/statsmodel.jl | 11 +++++++++-- test/statsmodel.jl | 20 ++++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 1c09ca5b..322bc972 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -150,8 +150,10 @@ termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) """ termnames(term::AbstractTerm) -Return the name(s) of column(s) generated by a term. Return value is either a -`String` or an iterable of `String`s. +Return the name(s) of column(s) generated by a term. + +Return value is either a `String`, an iterable of `String`s or nothing if there +no associated name (e.g. `termnames(InterceptTerm{false}())`). """ termnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing termnames(t::ContinuousTerm) = string(t.sym) @@ -160,6 +162,11 @@ termnames(t::Term) = string(t.sym) termnames(t::ConstantTerm) = string(t.n) termnames(t::FunctionTerm) = string(t.exorig) termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts) +# these have some surprising behavior: +# termnames(::InteractionTerm) always returns a vector +# termnames(MatrixTerm(term(:a))) returns a scalar +# termnames(MatrixTerm((term(a:), term(:b)))) returns a vector +# but this is the same behavior as coefnames termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms) termnames(t::InteractionTerm) = kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...) diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 639109af..c54183bc 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -234,14 +234,30 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyModTwo, f, d) # make sure show() still works when there is no coeftable method - show(io, m2) + show(io, m2) +end +@testset "termnames" begin # one final termnames check # note that `1` is still a ConstantTerm and not yet InterceptTerm # because apply_schema hasn't been called @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == ["1", "log(x)", "y", "log(x) & y", "(1 + x) | g"] - + @test termnames(ConstantTerm(1)) == "1" + @test termnames(Term(:x)) == "x" + @test termnames(InterceptTerm{true}()) == "(Intercept)" + @test termnames(InterceptTerm{false}()) === nothing + @test termnames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" + cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) + @test termnames(CategoricalTerm(:x, cm)) =="x" + @test termnames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" + # these next few seem a little weird but they're consistent with the + # definition of coefnames + @test termnames(InteractionTerm(term.((:a, :b, :c)))) == ["a & b & c"] + @test termnames(MatrixTerm(term(:a))) == "a" + @test termnames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] + @test termnames((term(:a), term(:b))) == ["a", "b"] + @test termnames((term(:a),)) == "a" end @testset "lrtest" begin From f517329947c0f34677cf2e883eece69064d838df Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 3 Sep 2023 20:32:18 +0000 Subject: [PATCH 06/24] Update src/statsmodel.jl Co-authored-by: Milan Bouchet-Valat --- src/statsmodel.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 322bc972..d43cde6e 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -121,6 +121,7 @@ the variable name and not the coefficients resulting from the choice of contrast coding. """ termnames(model::StatisticalModel) = termnames(formula(model)) + """ termnames(t::FormulaTerm) From cf0aa5978ea98557937fcda6ebe9d20c518aeb9e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 16:35:09 +0200 Subject: [PATCH 07/24] use StatsAPI directly, deprecate old termnames --- Project.toml | 2 ++ docs/src/internals.md | 8 +++--- src/StatsModels.jl | 4 ++- src/contrasts.jl | 42 +++++++++++++++++-------------- src/modelframe.jl | 24 +++++++++--------- src/statsmodel.jl | 48 +++++++++++++++++------------------ src/temporal_terms.jl | 2 +- src/terms.jl | 18 +++++++------- test/runtests.jl | 1 + test/statsmodel.jl | 58 +++++++++++++++++++++---------------------- 10 files changed, 108 insertions(+), 99 deletions(-) diff --git a/Project.toml b/Project.toml index 7376d625..30742b98 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -21,6 +22,7 @@ DataAPI = "1.1" DataFrames = "1" DataStructures = "0.17, 0.18" ShiftedArrays = "1, 2" +StatsAPI = "1" StatsBase = "0.33.5, 0.34" StatsFuns = "0.9, 1.0" Tables = "0.2, 1" diff --git a/docs/src/internals.md b/docs/src/internals.md index a68f9ba3..8ec003d4 100644 --- a/docs/src/internals.md +++ b/docs/src/internals.md @@ -80,7 +80,7 @@ FormulaTerm{Term, Term} ``` !!! note - + As always, you can introspect which method is called with ```julia @@ -444,7 +444,7 @@ StatsModels.termvars(p::PolyTerm) = StatsModels.termvars(p.term) # number of columns in the matrix this term produces StatsModels.width(p::PolyTerm) = p.deg -StatsBase.coefnames(p::PolyTerm) = coefnames(p.term) .* "^" .* string.(1:p.deg) +StatsAPI.coefnames(p::PolyTerm) = coefnames(p.term) .* "^" .* string.(1:p.deg) # output @@ -558,9 +558,9 @@ PolyTerm{Term, ConstantTerm{Int64}} ``` !!! note - + The functions like `poly` should be exported by the package that provides - the special syntax for two reasons. First, it makes run-time term + the special syntax for two reasons. First, it makes run-time term construction more convenient. Second, because of how the `@formula` macro generates code, the function that represents special syntax must be available in the namespace where `@formula` is _called_. This is because diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 79c2a93f..1992b2c9 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -1,6 +1,7 @@ module StatsModels using Tables +using StatsAPI using StatsBase using ShiftedArrays using ShiftedArrays: lag, lead @@ -8,6 +9,7 @@ using DataStructures using DataAPI using DataAPI: levels using Printf: @sprintf +using StatsAPI: coefnames, fit, predict, predict! using StatsFuns: chisqccdf using SparseArrays @@ -32,7 +34,7 @@ export HelmertCoding, SeqDiffCoding, HypothesisCoding, - + coefnames, setcontrasts!, formula, diff --git a/src/contrasts.jl b/src/contrasts.jl index fa4ae517..38d208fc 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -53,7 +53,7 @@ C(levels = ::Vector{Any}, base = ::Any) # specify levels and base mean of the lower levels * [`SeqDiffCoding`](@ref) - Code for differences between sequential levels of the variable. -* [`HypothesisCoding`](@ref) - Manually specify contrasts via a hypothesis +* [`HypothesisCoding`](@ref) - Manually specify contrasts via a hypothesis matrix, which gives the weighting for the average response for each level * [`StatsModels.ContrastsCoding`](@ref) - Manually specify contrasts matrix, which is directly copied into the model matrix. @@ -79,7 +79,7 @@ The easiest way to specify custom contrasts is with `HypothesisCoding` or contrast coding system, you can subtype `AbstractContrasts`. This requires a constructor, a `contrasts_matrix` method for constructing the actual contrasts matrix that maps from levels to `ModelMatrix` column values, and (optionally) a -`termnames` method: +`coefnames` method: ```julia mutable struct MyCoding <: AbstractContrasts @@ -87,7 +87,7 @@ mutable struct MyCoding <: AbstractContrasts end contrasts_matrix(C::MyCoding, baseind, n) = ... -_termnames(C::MyCoding, levels, baseind) = ... +coefnames(C::MyCoding, levels, baseind) = ... ``` # References @@ -103,7 +103,7 @@ abstract type AbstractContrasts end # Contrasts + Levels (usually from data) = ContrastsMatrix struct ContrastsMatrix{C <: AbstractContrasts, M <: AbstractMatrix, T, U} matrix::M - termnames::Vector{U} + termnames::Vector{U} # XXX this is somewhat of a misnomer, this should be coefnames... levels::Vector{T} contrasts::C invindex::Dict{T,Int} @@ -166,7 +166,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst # 3. contrast levels missing from data: would have empty columns, generate a # rank-deficient model matrix. c_levels = something(DataAPI.levels(contrasts), levels) - + mismatched_levels = symdiff(c_levels, levels) if !isempty(mismatched_levels) throw(ArgumentError("contrasts levels not found in data or vice-versa: " * @@ -198,7 +198,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst "$c_levels.")) end - tnames = _termnames(contrasts, c_levels, baseind) + tnames = coefnames(contrasts, c_levels, baseind) mat = contrasts_matrix(contrasts, baseind, n) @@ -224,7 +224,11 @@ function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) return c end -function _termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) +@deprecate(termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), + coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), + false) + +function StatsAPI.coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] levels[not_base] end @@ -233,7 +237,7 @@ Base.getindex(contrasts::ContrastsMatrix, rowinds, colinds) = getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) # Making a contrast type T only requires that there be a method for -# contrasts_matrix(T, baseind, n) and optionally _termnames(T, levels, baseind) +# contrasts_matrix(T, baseind, n) and optionally coefnames(T, levels, baseind) # The rest is boilerplate. for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] @eval begin @@ -254,7 +258,7 @@ DataAPI.levels(c::AbstractContrasts) = nothing FullDummyCoding() Full-rank dummy coding generates one indicator (1 or 0) column for each level, -**including** the base level. This is sometimes known as +**including** the base level. This is sometimes known as [one-hot encoding](https://en.wikipedia.org/wiki/One-hot). Not exported but included here for the sake of completeness. @@ -331,7 +335,7 @@ column is generated with 1 where `variable .== x` and -1 where `variable .== bas of 0. If `levels` are omitted or `nothing`, they are determined from the data -by calling the `levels` function when constructing `ContrastsMatrix`. +by calling the `levels` function when constructing `ContrastsMatrix`. If `base` is omitted or `nothing`, the first level is used as the base. When all levels are equally frequent, effects coding generates model matrix @@ -373,7 +377,7 @@ Helmert coding codes each level as the difference from the average of the lower levels. If `levels` are omitted or `nothing`, they are determined from the data -by calling the `levels` function when constructing `Contrastsmatrix`. +by calling the `levels` function when constructing `Contrastsmatrix`. If `base` is omitted or `nothing`, the first level is used as the base. For each non-base level, Helmert coding generates a columns with -1 for each of n levels below, n for that level, and 0 above. @@ -462,7 +466,7 @@ function contrasts_matrix(C::SeqDiffCoding, _, n) end # TODO: consider customizing term names: -# _termnames(C::SeqDiffCoding, levels::AbstractVector, baseind::Integer) = +# StatsAPI.coefnames(C::SeqDiffCoding, levels::AbstractVector, baseind::Integer) = # ["$(levels[i])-$(levels[i-1])" for i in 2:length(levels)] """ @@ -591,7 +595,7 @@ function contrasts_matrix(C::HypothesisCoding, baseind, n) C.contrasts end -_termnames(C::HypothesisCoding, levels::AbstractVector, baseind::Int) = +StatsAPI.coefnames(C::HypothesisCoding, levels::AbstractVector, baseind::Int) = something(C.labels, levels[1:length(levels) .!= baseind]) DataAPI.levels(c::HypothesisCoding) = c.levels @@ -602,8 +606,8 @@ DataAPI.levels(c::HypothesisCoding) = c.levels Coding by manual specification of contrasts matrix. For k levels, the contrasts must be a k by k-1 Matrix. The contrasts in this matrix will be copied directly -into the model matrix; if you want to specify your contrasts as hypotheses (i.e., -weights assigned to each level's cell mean), you should use +into the model matrix; if you want to specify your contrasts as hypotheses (i.e., +weights assigned to each level's cell mean), you should use [`HypothesisCoding`](@ref) instead. """ mutable struct ContrastsCoding{T<:AbstractMatrix} <: AbstractContrasts @@ -687,9 +691,9 @@ julia> StatsModels.hypothesis_matrix(cmat) -1 0 0 1 ``` -For non-centered contrasts like `DummyCoding`, without including the intercept -the hypothesis matrix is incorrect. So while `intercept=true` is the default for -non-centered contrasts, you can see the (wrong) hypothesis matrix when ignoring +For non-centered contrasts like `DummyCoding`, without including the intercept +the hypothesis matrix is incorrect. So while `intercept=true` is the default for +non-centered contrasts, you can see the (wrong) hypothesis matrix when ignoring it by forcing `intercept=false`: ```jldoctest hypmat @@ -710,7 +714,7 @@ julia> StatsModels.hypothesis_matrix(cmat, tolerance=0) # ugly 1.0 -2.23753e-16 6.91749e-18 -1.31485e-16 -1.0 1.0 -2.42066e-16 9.93754e-17 -1.0 4.94472e-17 1.0 9.93754e-17 - -1.0 1.04958e-16 -1.31044e-16 1.0 + -1.0 1.04958e-16 -1.31044e-16 1.0 ``` Finally, the hypothesis matrix for a constructed `ContrastsMatrix` (as stored by diff --git a/src/modelframe.jl b/src/modelframe.jl index 13c59f16..c9fed27f 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -23,7 +23,7 @@ ModelFrame(f::FormulaTerm, data; model::Type{M} = StatisticalModel, contrasts::D * `f::FormulaTerm`: Formula whose left hand side is the *response* and right hand side are the *predictors*. * `schema::Any`: The schema that was applied to generate `f`. -* `data::D`: The data table being modeled. The only restriction is that `data` +* `data::D`: The data table being modeled. The only restriction is that `data` is a table (`Tables.istable(data) == true`) * `model::Type{M}`: The type of the model that will be fit from this model frame. @@ -52,7 +52,7 @@ end _missing_omit(x::AbstractVector{T}) where T = copyto!(similar(x, nonmissingtype(T)), x) _missing_omit(x::AbstractVector, rows) = _missing_omit(view(x, rows)) - + function missing_omit(d::T) where T<:ColumnTable nonmissings = trues(length(first(d))) for col in d @@ -72,7 +72,7 @@ missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable = function ModelFrame(f::FormulaTerm, data::ColumnTable; model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M - + msg = checknamesexist( f, data ) if msg != "" throw(ArgumentError(msg)) @@ -82,14 +82,14 @@ function ModelFrame(f::FormulaTerm, data::ColumnTable; sch = schema(f, data, contrasts) f = apply_schema(f, sch, M) - + ModelFrame(f, sch, data, model) end ModelFrame(f::FormulaTerm, data; model=StatisticalModel, contrasts=Dict{Symbol,Any}()) = ModelFrame(f, columntable(data); model=model, contrasts=contrasts) -StatsBase.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...) +StatsAPI.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...) """ modelmatrix(t::AbstractTerm, data; hints=Dict(), mod=StatisticalModel) @@ -106,14 +106,14 @@ calling [`modelcols`](@ref) if necessary. The optional `hints` and `mod` keyword arguments are passed to [`apply_schema`](@ref). !!! note - + `modelmatrix` is provided as a convenience for interactive use. For modeling packages that wish to support a formula-based interface, it is recommended to use the [`schema`](@ref) -- [`apply_schema`](@ref) -- [`modelcols`](@ref) pipeline directly """ -function StatsBase.modelmatrix(t::Union{AbstractTerm, TupleTerm}, data; +function StatsAPI.modelmatrix(t::Union{AbstractTerm, TupleTerm}, data; hints=Dict{Symbol,Any}(), mod::Type{M}=StatisticalModel) where M Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) @@ -134,14 +134,14 @@ before calling [`modelcols`](@ref) if necessary. The optional `hints` and `mod` keyword arguments are passed to [`apply_schema`](@ref). !!! note - + `response` is provided as a convenience for interactive use. For modeling packages that wish to support a formula-based interface, it is recommended to use the [`schema`](@ref) -- [`apply_schema`](@ref) -- [`modelcols`](@ref) pipeline directly """ -function StatsBase.response(f::FormulaTerm, data; +function StatsAPI.response(f::FormulaTerm, data; hints=Dict{Symbol,Any}(), mod::Type{M}=StatisticalModel) where M Tables.istable(data) || @@ -151,10 +151,10 @@ function StatsBase.response(f::FormulaTerm, data; end -StatsBase.modelmatrix(mf::ModelFrame; data=mf.data) = modelcols(mf.f.rhs, data) -StatsBase.response(mf::ModelFrame; data=mf.data) = modelcols(mf.f.lhs, data) +StatsAPI.modelmatrix(mf::ModelFrame; data=mf.data) = modelcols(mf.f.rhs, data) +StatsAPI.response(mf::ModelFrame; data=mf.data) = modelcols(mf.f.lhs, data) -StatsBase.coefnames(mf::ModelFrame) = vectorize(coefnames(mf.f.rhs)) +StatsAPI.coefnames(mf::ModelFrame) = vectorize(coefnames(mf.f.rhs)) """ setcontrasts!(mf::ModelFrame; kwargs...) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index d43cde6e..6c759a13 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -75,7 +75,7 @@ end for (modeltype, dfmodeltype) in ((:StatisticalModel, TableStatisticalModel), (:RegressionModel, TableRegressionModel)) @eval begin - function StatsBase.fit(::Type{T}, f::FormulaTerm, data, args...; + function StatsAPI.fit(::Type{T}, f::FormulaTerm, data, args...; contrasts::Dict{Symbol,<:Any} = Dict{Symbol,Any}(), kwargs...) where T<:$modeltype @@ -127,9 +127,9 @@ termnames(model::StatisticalModel) = termnames(formula(model)) Return a two-tuple of `termnames` applied to the left and right hand sides of the formula. - + Note that until `apply_schema` has been called, literal `1` in formulae -is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the +is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the returned term names. ```jldoctest @@ -138,7 +138,7 @@ julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) ``` Similarly, formulae with an implicit intercept will not have a `"1"` -in their term names, because the implicit intercept does not exist until +in their term names, because the implicit intercept does not exist until `apply_schema` is called (and may not exist for certain model contexts). ```jldoctest @@ -151,7 +151,7 @@ termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) """ termnames(term::AbstractTerm) -Return the name(s) of column(s) generated by a term. +Return the name(s) of column(s) generated by a term. Return value is either a `String`, an iterable of `String`s or nothing if there no associated name (e.g. `termnames(InterceptTerm{false}())`). @@ -182,7 +182,7 @@ a [`TableRegressionModel`](@ref) or [`TableStatisticalModel`](@ref) (as appropriate). This is intended as a backstop for modeling packages that implement model types -that are subtypes of `StatsBase.StatisticalModel` but do not explicitly support +that are subtypes of `StatsAPI.StatisticalModel` but do not explicitly support the full StatsModels terms-based interface. Currently this works by creating a [`ModelFrame`](@ref) from the formula and data, and then converting this to a [`ModelMatrix`](@ref), but this is an internal implementation detail which may @@ -191,24 +191,24 @@ change in the near future. # Delegate functions from StatsBase that use our new types const TableModels = Union{TableStatisticalModel, TableRegressionModel} -@delegate TableModels.model [StatsBase.coef, StatsBase.confint, - StatsBase.deviance, StatsBase.nulldeviance, - StatsBase.loglikelihood, StatsBase.nullloglikelihood, - StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs, - StatsBase.stderror, StatsBase.vcov, StatsBase.fitted] -@delegate TableRegressionModel.model [StatsBase.modelmatrix, - StatsBase.residuals, StatsBase.response, - StatsBase.predict, StatsBase.predict!, - StatsBase.cooksdistance] -StatsBase.predict(m::TableRegressionModel, new_x::AbstractMatrix; kwargs...) = +@delegate TableModels.model [StatsAPI.coef, StatsAPI.confint, + StatsAPI.deviance, StatsAPI.nulldeviance, + StatsAPI.loglikelihood, StatsAPI.nullloglikelihood, + StatsAPI.dof, StatsAPI.dof_residual, StatsAPI.nobs, + StatsAPI.stderror, StatsAPI.vcov, StatsAPI.fitted] +@delegate TableRegressionModel.model [StatsAPI.modelmatrix, + StatsAPI.residuals, StatsAPI.response, + StatsAPI.predict, StatsAPI.predict!, + StatsAPI.cooksdistance] +StatsAPI.predict(m::TableRegressionModel, new_x::AbstractMatrix; kwargs...) = predict(m.model, new_x; kwargs...) # Need to define these manually because of ambiguity using @delegate -StatsBase.r2(mm::TableRegressionModel) = r2(mm.model) -StatsBase.adjr2(mm::TableRegressionModel) = adjr2(mm.model) -StatsBase.r2(mm::TableRegressionModel, variant::Symbol) = r2(mm.model, variant) -StatsBase.adjr2(mm::TableRegressionModel, variant::Symbol) = adjr2(mm.model, variant) -StatsBase.loglikelihood(mm::TableModels, c::Colon) = loglikelihood(mm.model, c) +StatsAPI.r2(mm::TableRegressionModel) = r2(mm.model) +StatsAPI.adjr2(mm::TableRegressionModel) = adjr2(mm.model) +StatsAPI.r2(mm::TableRegressionModel, variant::Symbol) = r2(mm.model, variant) +StatsAPI.adjr2(mm::TableRegressionModel, variant::Symbol) = adjr2(mm.model, variant) +StatsAPI.loglikelihood(mm::TableModels, c::Colon) = loglikelihood(mm.model, c) isnested(m1::TableModels, m2::TableModels; kwargs...) = isnested(m1.model, m2.model; kwargs...) @@ -235,7 +235,7 @@ function _return_predictions(T, yp::NamedTuple, nonmissings, len) end # Predict function that takes data table as predictor instead of matrix -function StatsBase.predict(mm::TableRegressionModel, data; kwargs...) +function StatsAPI.predict(mm::TableRegressionModel, data; kwargs...) Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) @@ -247,10 +247,10 @@ function StatsBase.predict(mm::TableRegressionModel, data; kwargs...) _return_predictions(Tables.materializer(data), y_pred, nonmissings, length(nonmissings)) end -StatsBase.coefnames(model::TableModels) = coefnames(model.mf) +StatsAPI.coefnames(model::TableModels) = coefnames(model.mf) # coeftable implementation -function StatsBase.coeftable(model::TableModels; kwargs...) +function StatsAPI.coeftable(model::TableModels; kwargs...) ct = coeftable(model.model, kwargs...) cfnames = coefnames(model.mf) if length(ct.rownms) == length(cfnames) diff --git a/src/temporal_terms.jl b/src/temporal_terms.jl index 6c08e0b6..3a8f25a3 100644 --- a/src/temporal_terms.jl +++ b/src/temporal_terms.jl @@ -64,7 +64,7 @@ function Base.show(io::IO, ll::LeadLagTerm{<:Any, F}) where F opname = string(nameof(F.instance)) print(io, "$opname($(ll.term), $(ll.nsteps))") end -function StatsBase.coefnames(ll::LeadLagTerm{<:Any, F}) where F +function StatsAPI.coefnames(ll::LeadLagTerm{<:Any, F}) where F opname = string(nameof(F.instance)) coefnames(ll.term) .* "_$opname$(ll.nsteps)" end diff --git a/src/terms.jl b/src/terms.jl index f9a89c17..c34de73f 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -504,7 +504,7 @@ lazy_modelcols(ft::FunctionTerm, d::NamedTuple) = Base.Broadcast.broadcasted(ft.f, lazy_modelcols.(ft.args, Ref(d))...) lazy_modelcols(x, d) = modelcols(x, d) - + modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) @@ -570,15 +570,15 @@ vectorize(x) = [x] Return the name(s) of column(s) generated by a term. Return value is either a `String` or an iterable of `String`s. """ -StatsBase.coefnames(t::FormulaTerm) = (coefnames(t.lhs), coefnames(t.rhs)) -StatsBase.coefnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : [] -StatsBase.coefnames(t::ContinuousTerm) = string(t.sym) -StatsBase.coefnames(t::CategoricalTerm) = +StatsAPI.coefnames(t::FormulaTerm) = (coefnames(t.lhs), coefnames(t.rhs)) +StatsAPI.coefnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : [] +StatsAPI.coefnames(t::ContinuousTerm) = string(t.sym) +StatsAPI.coefnames(t::CategoricalTerm) = ["$(t.sym): $name" for name in t.contrasts.termnames] -StatsBase.coefnames(t::FunctionTerm) = string(t.exorig) -StatsBase.coefnames(ts::TupleTerm) = reduce(vcat, coefnames.(ts)) -StatsBase.coefnames(t::MatrixTerm) = mapreduce(coefnames, vcat, t.terms) -StatsBase.coefnames(t::InteractionTerm) = +StatsAPI.coefnames(t::FunctionTerm) = string(t.exorig) +StatsAPI.coefnames(ts::TupleTerm) = reduce(vcat, coefnames.(ts)) +StatsAPI.coefnames(t::MatrixTerm) = mapreduce(coefnames, vcat, t.terms) +StatsAPI.coefnames(t::InteractionTerm) = kron_insideout((args...) -> join(args, " & "), vectorize.(coefnames.(t.terms))...) ################################################################################ diff --git a/test/runtests.jl b/test/runtests.jl index 66829a8a..36f30fe2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using SparseArrays using StatsModels using DataFrames using CategoricalArrays +using StatsAPI using StatsBase using StatsModels: ContrastsMatrix diff --git a/test/statsmodel.jl b/test/statsmodel.jl index c54183bc..4e8a2e3d 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -8,17 +8,17 @@ struct DummyMod <: RegressionModel end ## dumb fit method: just copy the x and y input over -StatsBase.fit(::Type{DummyMod}, x::Matrix, y::Vector) = +StatsAPI.fit(::Type{DummyMod}, x::Matrix, y::Vector) = DummyMod(collect(1:size(x, 2)), x, y) -StatsBase.response(mod::DummyMod) = mod.y +StatsAPI.response(mod::DummyMod) = mod.y ## dumb coeftable: just prints the "beta" values -StatsBase.coeftable(mod::DummyMod) = +StatsAPI.coeftable(mod::DummyMod) = CoefTable(reshape(mod.beta, (size(mod.beta,1), 1)), ["'beta' value"], ["" for n in 1:size(mod.x,2)], 0) # dumb predict: return values predicted by "beta" and dummy confidence bounds -function StatsBase.predict(mod::DummyMod; +function StatsAPI.predict(mod::DummyMod; interval::Union{Nothing,Symbol}=nothing) pred = mod.x * mod.beta if interval === nothing @@ -29,7 +29,7 @@ function StatsBase.predict(mod::DummyMod; throw(ArgumentError("value not allowed for interval")) end end -function StatsBase.predict(mod::DummyMod, newX::Matrix; +function StatsAPI.predict(mod::DummyMod, newX::Matrix; interval::Union{Nothing,Symbol}=nothing) pred = newX * mod.beta if interval === nothing @@ -40,15 +40,15 @@ function StatsBase.predict(mod::DummyMod, newX::Matrix; throw(ArgumentError("value not allowed for interval")) end end -StatsBase.dof(mod::DummyMod) = length(mod.beta) -StatsBase.dof_residual(mod::DummyMod) = length(mod.y) - length(mod.beta) -StatsBase.nobs(mod::DummyMod) = length(mod.y) -StatsBase.deviance(mod::DummyMod) = sum((response(mod) .- predict(mod)).^2) +StatsAPI.dof(mod::DummyMod) = length(mod.beta) +StatsAPI.dof_residual(mod::DummyMod) = length(mod.y) - length(mod.beta) +StatsAPI.nobs(mod::DummyMod) = length(mod.y) +StatsAPI.deviance(mod::DummyMod) = sum((response(mod) .- predict(mod)).^2) # Incorrect but simple definition StatsModels.isnested(mod1::DummyMod, mod2::DummyMod; atol::Real=0.0) = dof(mod1) <= dof(mod2) -StatsBase.loglikelihood(mod::DummyMod) = -sum((response(mod) .- predict(mod)).^2) -StatsBase.loglikelihood(mod::DummyMod, ::Colon) = -(response(mod) .- predict(mod)).^2 +StatsAPI.loglikelihood(mod::DummyMod) = -sum((response(mod) .- predict(mod)).^2) +StatsAPI.loglikelihood(mod::DummyMod, ::Colon) = -(response(mod) .- predict(mod)).^2 # A dummy RegressionModel type that does not support intercept struct DummyModNoIntercept <: RegressionModel @@ -60,17 +60,17 @@ end StatsModels.drop_intercept(::Type{DummyModNoIntercept}) = true ## dumb fit method: just copy the x and y input over -StatsBase.fit(::Type{DummyModNoIntercept}, x::Matrix, y::Vector) = +StatsAPI.fit(::Type{DummyModNoIntercept}, x::Matrix, y::Vector) = DummyModNoIntercept(collect(1:size(x, 2)), x, y) -StatsBase.response(mod::DummyModNoIntercept) = mod.y +StatsAPI.response(mod::DummyModNoIntercept) = mod.y ## dumb coeftable: just prints the "beta" values -StatsBase.coeftable(mod::DummyModNoIntercept) = +StatsAPI.coeftable(mod::DummyModNoIntercept) = CoefTable(reshape(mod.beta, (size(mod.beta,1), 1)), ["'beta' value"], ["" for n in 1:size(mod.x,2)], 0) # dumb predict: return values predicted by "beta" and dummy confidence bounds -function StatsBase.predict(mod::DummyModNoIntercept; +function StatsAPI.predict(mod::DummyModNoIntercept; interval::Union{Nothing,Symbol}=nothing) pred = mod.x * mod.beta if interval === nothing @@ -81,7 +81,7 @@ function StatsBase.predict(mod::DummyModNoIntercept; throw(ArgumentError("value not allowed for interval")) end end -function StatsBase.predict(mod::DummyModNoIntercept, newX::Matrix; +function StatsAPI.predict(mod::DummyModNoIntercept, newX::Matrix; interval::Union{Nothing,Symbol}=nothing) pred = newX * mod.beta if interval === nothing @@ -92,20 +92,20 @@ function StatsBase.predict(mod::DummyModNoIntercept, newX::Matrix; throw(ArgumentError("value not allowed for interval")) end end -StatsBase.dof(mod::DummyModNoIntercept) = length(mod.beta) -StatsBase.dof_residual(mod::DummyModNoIntercept) = length(mod.y) - length(mod.beta) -StatsBase.nobs(mod::DummyModNoIntercept) = length(mod.y) -StatsBase.deviance(mod::DummyModNoIntercept) = sum((response(mod) .- predict(mod)).^2) +StatsAPI.dof(mod::DummyModNoIntercept) = length(mod.beta) +StatsAPI.dof_residual(mod::DummyModNoIntercept) = length(mod.y) - length(mod.beta) +StatsAPI.nobs(mod::DummyModNoIntercept) = length(mod.y) +StatsAPI.deviance(mod::DummyModNoIntercept) = sum((response(mod) .- predict(mod)).^2) # isnested not implemented to test fallback -StatsBase.loglikelihood(mod::DummyModNoIntercept) = -sum((response(mod) .- predict(mod)).^2) -StatsBase.loglikelihood(mod::DummyModNoIntercept, ::Colon) = -(response(mod) .- predict(mod)).^2 +StatsAPI.loglikelihood(mod::DummyModNoIntercept) = -sum((response(mod) .- predict(mod)).^2) +StatsAPI.loglikelihood(mod::DummyModNoIntercept, ::Colon) = -(response(mod) .- predict(mod)).^2 ## Another dummy model type to test fall-through show method struct DummyModTwo <: RegressionModel msg::String end -StatsBase.fit(::Type{DummyModTwo}, ::Matrix, ::Vector) = DummyModTwo("hello!") +StatsAPI.fit(::Type{DummyModTwo}, ::Matrix, ::Vector) = DummyModTwo("hello!") Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) @testset "stat model types" begin @@ -234,14 +234,14 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyModTwo, f, d) # make sure show() still works when there is no coeftable method - show(io, m2) + show(io, m2) end @testset "termnames" begin # one final termnames check # note that `1` is still a ConstantTerm and not yet InterceptTerm # because apply_schema hasn't been called - @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == + @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == ["1", "log(x)", "y", "log(x) & y", "(1 + x) | g"] @test termnames(ConstantTerm(1)) == "1" @test termnames(Term(:x)) == "x" @@ -251,7 +251,7 @@ end cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) @test termnames(CategoricalTerm(:x, cm)) =="x" @test termnames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" - # these next few seem a little weird but they're consistent with the + # these next few seem a little weird but they're consistent with the # definition of coefnames @test termnames(InteractionTerm(term.((:a, :b, :c)))) == ["a & b & c"] @test termnames(MatrixTerm(term(:a))) == "a" @@ -287,7 +287,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 1 -14.0000 14.0000 + [1] 1 -14.0000 14.0000 [2] 2 1 -3.2600 3.2600 21.4800 <1e-05 ──────────────────────────────────────────────────────""" @@ -332,7 +332,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 0 -30.0000 30.0000 + [1] 0 -30.0000 30.0000 [2] 1 1 -10.8600 10.8600 38.2800 <1e-09 ──────────────────────────────────────────────────────""" else @@ -341,7 +341,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 0 -30.0000 30.0000 + [1] 0 -30.0000 30.0000 [2] 1 1 -10.8600 10.8600 38.2800 <1e-9 ──────────────────────────────────────────────────────""" end From 1b8d89d9270947d7bb0d99117e0710a3aef1fe2b Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 16:49:33 +0200 Subject: [PATCH 08/24] consistency --- src/statsmodel.jl | 26 ++++++++++++++------------ test/statsmodel.jl | 8 +++----- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 6c759a13..882fea93 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -111,7 +111,8 @@ formula(m::TableRegressionModel) = m.mf.f Return the names of terms used in the formula of `model`. -This is a convenience method for `termnames(formula(model))`. +This is a convenience method for `termnames(formula(model))`, which returns a +two-tuple of `termnames` applied to the left and right hand sides of the formula. For `RegressionModel`s with only continuous predictors, this is the same as `(responsename(model), coefnames(model))`. @@ -128,7 +129,8 @@ termnames(model::StatisticalModel) = termnames(formula(model)) Return a two-tuple of `termnames` applied to the left and right hand sides of the formula. -Note that until `apply_schema` has been called, literal `1` in formulae +!!! note + Until `apply_schema` has been called, literal `1` in formulae is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the returned term names. @@ -151,7 +153,7 @@ termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) """ termnames(term::AbstractTerm) -Return the name(s) of column(s) generated by a term. +Return the name of a term. Return value is either a `String`, an iterable of `String`s or nothing if there no associated name (e.g. `termnames(InterceptTerm{false}())`). @@ -162,15 +164,15 @@ termnames(t::CategoricalTerm) = string(t.sym) termnames(t::Term) = string(t.sym) termnames(t::ConstantTerm) = string(t.n) termnames(t::FunctionTerm) = string(t.exorig) -termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts) -# these have some surprising behavior: -# termnames(::InteractionTerm) always returns a vector -# termnames(MatrixTerm(term(:a))) returns a scalar -# termnames(MatrixTerm((term(a:), term(:b)))) returns a vector -# but this is the same behavior as coefnames -termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms) -termnames(t::InteractionTerm) = - kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...) +# termnames(TupleTerm)) alwyas returns a vector, even if it's just one element, e.g., +# termnames((term(:a),)) +termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=[]) +# termnames(MatrixTerm)) alwyas returns a vector, even if it's just one element, e.g., +# termnames(MatrixTerm(term(:a))) +termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=[]) +function termnames(t::InteractionTerm) + only(kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...)) +end @doc """ fit(Mod::Type{<:StatisticalModel}, f::FormulaTerm, data, args...; diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 4e8a2e3d..4f86d75c 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -251,13 +251,11 @@ end cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) @test termnames(CategoricalTerm(:x, cm)) =="x" @test termnames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" - # these next few seem a little weird but they're consistent with the - # definition of coefnames - @test termnames(InteractionTerm(term.((:a, :b, :c)))) == ["a & b & c"] - @test termnames(MatrixTerm(term(:a))) == "a" + @test termnames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" + @test termnames(MatrixTerm(term(:a))) == ["a"] @test termnames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] @test termnames((term(:a), term(:b))) == ["a", "b"] - @test termnames((term(:a),)) == "a" + @test termnames((term(:a),)) == ["a"] end @testset "lrtest" begin From 987fce6a4603553c2e64d10301e07d9edf26a120 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 17:13:59 +0200 Subject: [PATCH 09/24] argh --- src/StatsModels.jl | 2 +- test/runtests.jl | 1 + test/statsmodel.jl | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 1992b2c9..0f33c21d 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -9,7 +9,7 @@ using DataStructures using DataAPI using DataAPI: levels using Printf: @sprintf -using StatsAPI: coefnames, fit, predict, predict! +using StatsAPI: coefnames, fit, predict, dof using StatsFuns: chisqccdf using SparseArrays diff --git a/test/runtests.jl b/test/runtests.jl index 36f30fe2..255b06c3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,6 +9,7 @@ using CategoricalArrays using StatsAPI using StatsBase +using StatsAPI: dof using StatsModels: ContrastsMatrix my_tests = ["ambiguity.jl", diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 4f86d75c..0f31cf47 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -285,7 +285,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 1 -14.0000 14.0000 + [1] 1 -14.0000 14.0000 [2] 2 1 -3.2600 3.2600 21.4800 <1e-05 ──────────────────────────────────────────────────────""" @@ -330,7 +330,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 0 -30.0000 30.0000 + [1] 0 -30.0000 30.0000 [2] 1 1 -10.8600 10.8600 38.2800 <1e-09 ──────────────────────────────────────────────────────""" else @@ -339,7 +339,7 @@ end ────────────────────────────────────────────────────── DOF ΔDOF LogLik Deviance Chisq p(>Chisq) ────────────────────────────────────────────────────── - [1] 0 -30.0000 30.0000 + [1] 0 -30.0000 30.0000 [2] 1 1 -10.8600 10.8600 38.2800 <1e-9 ──────────────────────────────────────────────────────""" end From 767d6e067483c8802616585166447dcac7a73403 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 17:15:28 +0200 Subject: [PATCH 10/24] StatsAPI --- docs/Project.toml | 1 + docs/src/internals.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/Project.toml b/docs/Project.toml index 8827ab23..872d814e 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,6 +3,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" diff --git a/docs/src/internals.md b/docs/src/internals.md index 8ec003d4..b5035569 100644 --- a/docs/src/internals.md +++ b/docs/src/internals.md @@ -395,7 +395,7 @@ possible to use an existing function, the best practice is to define a new function to make dispatch less ambiguous. ```jldoctest 1 -using StatsBase +using StatsAPI # syntax: best practice to define a _new_ function poly(x, n) = x^n From 44dc3c3d4eff30fbc8a0a7aaadd71fd377a25ae2 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 17:23:12 +0200 Subject: [PATCH 11/24] init with String[] --- src/statsmodel.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 882fea93..b7960cca 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -166,10 +166,10 @@ termnames(t::ConstantTerm) = string(t.n) termnames(t::FunctionTerm) = string(t.exorig) # termnames(TupleTerm)) alwyas returns a vector, even if it's just one element, e.g., # termnames((term(:a),)) -termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=[]) +termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=String[]) # termnames(MatrixTerm)) alwyas returns a vector, even if it's just one element, e.g., # termnames(MatrixTerm(term(:a))) -termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=[]) +termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=String[]) function termnames(t::InteractionTerm) only(kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...)) end From e71ba2209d2b17870aa2826c96e06fbcfe3c0936 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 10:28:01 -0500 Subject: [PATCH 12/24] okidoki Co-authored-by: Milan Bouchet-Valat --- src/statsmodel.jl | 11 +++++------ test/statsmodel.jl | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index b7960cca..cb03f951 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -76,8 +76,8 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, TableStatisticalModel), (:RegressionModel, TableRegressionModel)) @eval begin function StatsAPI.fit(::Type{T}, f::FormulaTerm, data, args...; - contrasts::Dict{Symbol,<:Any} = Dict{Symbol,Any}(), - kwargs...) where T<:$modeltype + contrasts::Dict{Symbol,<:Any} = Dict{Symbol,Any}(), + kwargs...) where T<:$modeltype Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) cols = columntable(data) @@ -164,15 +164,14 @@ termnames(t::CategoricalTerm) = string(t.sym) termnames(t::Term) = string(t.sym) termnames(t::ConstantTerm) = string(t.n) termnames(t::FunctionTerm) = string(t.exorig) -# termnames(TupleTerm)) alwyas returns a vector, even if it's just one element, e.g., +# termnames(TupleTerm)) always returns a vector, even if it's just one element, e.g., # termnames((term(:a),)) termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=String[]) -# termnames(MatrixTerm)) alwyas returns a vector, even if it's just one element, e.g., +# termnames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., # termnames(MatrixTerm(term(:a))) termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=String[]) -function termnames(t::InteractionTerm) +termnames(t::InteractionTerm) = only(kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...)) -end @doc """ fit(Mod::Type{<:StatisticalModel}, f::FormulaTerm, data, args...; diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 0f31cf47..7445b2ee 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -19,7 +19,7 @@ StatsAPI.coeftable(mod::DummyMod) = 0) # dumb predict: return values predicted by "beta" and dummy confidence bounds function StatsAPI.predict(mod::DummyMod; - interval::Union{Nothing,Symbol}=nothing) + interval::Union{Nothing,Symbol}=nothing) pred = mod.x * mod.beta if interval === nothing return pred @@ -30,7 +30,7 @@ function StatsAPI.predict(mod::DummyMod; end end function StatsAPI.predict(mod::DummyMod, newX::Matrix; - interval::Union{Nothing,Symbol}=nothing) + interval::Union{Nothing,Symbol}=nothing) pred = newX * mod.beta if interval === nothing return pred From fe426f06c2fa1b21e84b176a8acac8893cbd3a3c Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 17:53:22 +0200 Subject: [PATCH 13/24] deprecations --- src/StatsModels.jl | 1 + src/contrasts.jl | 9 +++------ src/deprecated.jl | 12 ++++++++++++ src/terms.jl | 4 ++-- test/contrasts.jl | 6 +++++- 5 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 src/deprecated.jl diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 0f33c21d..6bb16069 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -84,5 +84,6 @@ include("formula.jl") include("modelframe.jl") include("statsmodel.jl") include("lrtest.jl") +include("deprecated.jl") end # module StatsModels diff --git a/src/contrasts.jl b/src/contrasts.jl index 38d208fc..ce1479e0 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -103,7 +103,7 @@ abstract type AbstractContrasts end # Contrasts + Levels (usually from data) = ContrastsMatrix struct ContrastsMatrix{C <: AbstractContrasts, M <: AbstractMatrix, T, U} matrix::M - termnames::Vector{U} # XXX this is somewhat of a misnomer, this should be coefnames... + coefnames::Vector{U} levels::Vector{T} contrasts::C invindex::Dict{T,Int} @@ -122,11 +122,11 @@ end # will behave identically in creating modelmatrix columns Base.:(==)(a::ContrastsMatrix{C}, b::ContrastsMatrix{C}) where {C<:AbstractContrasts} = a.matrix == b.matrix && - a.termnames == b.termnames && + a.coefnames == b.coefnames && a.levels == b.levels Base.hash(a::ContrastsMatrix{C}, h::UInt) where {C} = - hash(C, hash(a.matrix, hash(a.termnames, hash(a.levels, h)))) + hash(C, hash(a.matrix, hash(a.coefnames, hash(a.levels, h)))) """ An instantiation of a contrast coding system for particular levels @@ -224,9 +224,6 @@ function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) return c end -@deprecate(termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), - coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), - false) function StatsAPI.coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] diff --git a/src/deprecated.jl b/src/deprecated.jl new file mode 100644 index 00000000..8d6cd8b0 --- /dev/null +++ b/src/deprecated.jl @@ -0,0 +1,12 @@ +@deprecate(termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), + coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer), + false) + +function Base.getproperty(cm::ContrastsMatrix, x::Symbol) + if x === :termnames + Base.depwarn("The `termnames` field has been renamed `coefnames`.", :ContrastsMatrix) + x = :coefnames + end + + return getfield(cm, x) +end diff --git a/src/terms.jl b/src/terms.jl index c34de73f..245da800 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -226,7 +226,7 @@ width(::CategoricalTerm{C,T,N}) where {C,T,N} = N # constructor that computes the width based on the contrasts matrix CategoricalTerm(sym::Symbol, contrasts::ContrastsMatrix{C,T}) where {C,T} = - CategoricalTerm{C,T,length(contrasts.termnames)}(sym, contrasts) + CategoricalTerm{C,T,length(contrasts.coefnames)}(sym, contrasts) """ MatrixTerm{Ts} <: AbstractTerm @@ -574,7 +574,7 @@ StatsAPI.coefnames(t::FormulaTerm) = (coefnames(t.lhs), coefnames(t.rhs)) StatsAPI.coefnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : [] StatsAPI.coefnames(t::ContinuousTerm) = string(t.sym) StatsAPI.coefnames(t::CategoricalTerm) = - ["$(t.sym): $name" for name in t.contrasts.termnames] + ["$(t.sym): $name" for name in t.contrasts.coefnames] StatsAPI.coefnames(t::FunctionTerm) = string(t.exorig) StatsAPI.coefnames(ts::TupleTerm) = reduce(vcat, coefnames.(ts)) StatsAPI.coefnames(t::MatrixTerm) = mapreduce(coefnames, vcat, t.terms) diff --git a/test/contrasts.jl b/test/contrasts.jl index 7758c187..c80a5f46 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -1,5 +1,9 @@ @testset "contrasts" begin + cm = StatsModels.ContrastsMatrix(DummyCoding(), ["a", "b"]) + @test_logs (:warn, "The `termnames` field has been renamed `coefnames`.") cm.termnames + @test cm.termnames == cm.coefnames + d = DataFrame(y = rand(6), x = [:b, :a, :c, :a, :a, :b]) @@ -378,7 +382,7 @@ cmat = StatsModels.ContrastsMatrix(contrasts, 'a':'d') spcmat = StatsModels.ContrastsMatrix(sparse(cmat.matrix), - cmat.termnames, + cmat.coefnames, cmat.levels, cmat.contrasts) From 4ce8532d73ab2265c325900b8944f000dd42b366 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 17:53:46 +0200 Subject: [PATCH 14/24] whitespace --- src/statsmodel.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index cb03f951..3498ad68 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -131,8 +131,8 @@ right hand sides of the formula. !!! note Until `apply_schema` has been called, literal `1` in formulae -is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the -returned term names. + is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the + returned term names. ```jldoctest julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) From 750dcd29d321fdf80d6fcf92f7d44a48e5157727 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 11:20:41 -0500 Subject: [PATCH 15/24] Update src/contrasts.jl Co-authored-by: Milan Bouchet-Valat --- src/contrasts.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/contrasts.jl b/src/contrasts.jl index ce1479e0..e4d1c3a0 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -224,7 +224,6 @@ function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) return c end - function StatsAPI.coefnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] levels[not_base] From 5c28fcf34c1cb3bd1d1b079bc80a9c61b988d7d4 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 11:55:30 -0500 Subject: [PATCH 16/24] Update src/statsmodel.jl Co-authored-by: Dave Kleinschmidt --- src/statsmodel.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 3498ad68..4d5accec 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -115,7 +115,7 @@ This is a convenience method for `termnames(formula(model))`, which returns a two-tuple of `termnames` applied to the left and right hand sides of the formula. For `RegressionModel`s with only continuous predictors, this is the same as -`(responsename(model), coefnames(model))`. +`(responsename(model), coefnames(model))` and `coefnames(formula(model))`. For models with categorical predictors, the returned names reflect the variable name and not the coefficients resulting from From 73c312df2dabad4b37b2b9045158a43c81dbc28e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 19:12:46 +0200 Subject: [PATCH 17/24] variablenames and xref --- src/StatsModels.jl | 2 +- src/contrasts.jl | 6 +++--- src/statsmodel.jl | 52 ++++++++++++++++++++++++---------------------- src/terms.jl | 2 ++ test/statsmodel.jl | 34 +++++++++++++++--------------- 5 files changed, 50 insertions(+), 46 deletions(-) diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 6bb16069..d6713bd0 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -38,7 +38,7 @@ export coefnames, setcontrasts!, formula, - termnames, + variablenames, AbstractTerm, ConstantTerm, diff --git a/src/contrasts.jl b/src/contrasts.jl index ce1479e0..a76e8de0 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -108,16 +108,16 @@ struct ContrastsMatrix{C <: AbstractContrasts, M <: AbstractMatrix, T, U} contrasts::C invindex::Dict{T,Int} function ContrastsMatrix(matrix::M, - termnames::Vector{U}, + coefnames::Vector{U}, levels::Vector{T}, contrasts::C) where {U, T, C <: AbstractContrasts, M <: AbstractMatrix} allunique(levels) || throw(ArgumentError("levels must be all unique, got $(levels)")) invindex = Dict{T,Int}(x=>i for (i,x) in enumerate(levels)) - new{C,M,T,U}(matrix, termnames, levels, contrasts, invindex) + new{C,M,T,U}(matrix, coefnames, levels, contrasts, invindex) end end -# only check equality of matrix, termnames, and levels, and that the type is the +# only check equality of matrix, coefnames, and levels, and that the type is the # same for the contrasts (values are irrelevant). This ensures that the two # will behave identically in creating modelmatrix columns Base.:(==)(a::ContrastsMatrix{C}, b::ContrastsMatrix{C}) where {C<:AbstractContrasts} = diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 3498ad68..6e832211 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -107,12 +107,12 @@ formula(m::TableStatisticalModel) = m.mf.f formula(m::TableRegressionModel) = m.mf.f """ - termnames(model::StatisticalModel) + variablenames(model::StatisticalModel) Return the names of terms used in the formula of `model`. -This is a convenience method for `termnames(formula(model))`, which returns a -two-tuple of `termnames` applied to the left and right hand sides of the formula. +This is a convenience method for `variablenames(formula(model))`, which returns a +two-tuple of `variablenames` applied to the left and right hand sides of the formula. For `RegressionModel`s with only continuous predictors, this is the same as `(responsename(model), coefnames(model))`. @@ -120,13 +120,15 @@ For `RegressionModel`s with only continuous predictors, this is the same as For models with categorical predictors, the returned names reflect the variable name and not the coefficients resulting from the choice of contrast coding. + +See also [`coefnames`](@ref). """ -termnames(model::StatisticalModel) = termnames(formula(model)) +variablenames(model::StatisticalModel) = variablenamesames(formula(model)) """ - termnames(t::FormulaTerm) + variablenames(t::FormulaTerm) -Return a two-tuple of `termnames` applied to the left and +Return a two-tuple of `variablenames` applied to the left and right hand sides of the formula. !!! note @@ -135,7 +137,7 @@ right hand sides of the formula. returned term names. ```jldoctest -julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) +julia> variablenames(@formula(y ~ 1 + x * y + (1+x|g))) ("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) ``` @@ -144,34 +146,34 @@ in their term names, because the implicit intercept does not exist until `apply_schema` is called (and may not exist for certain model contexts). ```jldoctest -julia> termnames(@formula(y ~ x * y + (1+x|g))) +julia> variablenames(@formula(y ~ x * y + (1+x|g))) ("y", ["x", "y", "x & y", "(1 + x) | g"]) ``` """ -termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) +variablenames(t::FormulaTerm) = (variablenames(t.lhs), variablenames(t.rhs)) """ - termnames(term::AbstractTerm) + variablenames(term::AbstractTerm) Return the name of a term. Return value is either a `String`, an iterable of `String`s or nothing if there -no associated name (e.g. `termnames(InterceptTerm{false}())`). +no associated name (e.g. `variablenames(InterceptTerm{false}())`). """ -termnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing -termnames(t::ContinuousTerm) = string(t.sym) -termnames(t::CategoricalTerm) = string(t.sym) -termnames(t::Term) = string(t.sym) -termnames(t::ConstantTerm) = string(t.n) -termnames(t::FunctionTerm) = string(t.exorig) -# termnames(TupleTerm)) always returns a vector, even if it's just one element, e.g., -# termnames((term(:a),)) -termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=String[]) -# termnames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., -# termnames(MatrixTerm(term(:a))) -termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=String[]) -termnames(t::InteractionTerm) = - only(kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...)) +variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing +variablenames(t::ContinuousTerm) = string(t.sym) +variablenames(t::CategoricalTerm) = string(t.sym) +variablenames(t::Term) = string(t.sym) +variablenames(t::ConstantTerm) = string(t.n) +variablenames(t::FunctionTerm) = string(t.exorig) +# variablenames(TupleTerm)) always returns a vector, even if it's just one element, e.g., +# variablenames((term(:a),)) +variablenames(ts::TupleTerm) = mapreduce(variablenames, vcat, ts; init=String[]) +# variablenames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., +# variablenames(MatrixTerm(term(:a))) +variablenames(t::MatrixTerm) = mapreduce(variablenames, vcat, t.terms; init=String[]) +variablenames(t::InteractionTerm) = + only(kron_insideout((args...) -> join(args, " & "), vectorize.(variablenames.(t.terms))...)) @doc """ fit(Mod::Type{<:StatisticalModel}, f::FormulaTerm, data, args...; diff --git a/src/terms.jl b/src/terms.jl index 245da800..02a4805b 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -569,6 +569,8 @@ vectorize(x) = [x] Return the name(s) of column(s) generated by a term. Return value is either a `String` or an iterable of `String`s. + +See also [`variablenames`](@ref). """ StatsAPI.coefnames(t::FormulaTerm) = (coefnames(t.lhs), coefnames(t.rhs)) StatsAPI.coefnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : [] diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 7445b2ee..b3617fbd 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -161,7 +161,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) ## test copying of names from Terms to CoefTable ct = coeftable(m) @test ct.rownms == ["(Intercept)", "x1", "x2", "x1 & x2"] - @test termnames(m) == ("y", ["(Intercept)", "x1", "x2", "x1 & x2"]) + @test variablenames(m) == ("y", ["(Intercept)", "x1", "x2", "x1 & x2"]) ## show with coeftable defined io = IOBuffer() @@ -172,7 +172,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] - @test termnames(m2) == ("y", ["(Intercept)", "x1p"]) + @test variablenames(m2) == ("y", ["(Intercept)", "x1p"]) ## predict w/ new data missing levels @test predict(m2, d[2:4, :]) == predict(m2)[2:4] @@ -237,25 +237,25 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) show(io, m2) end -@testset "termnames" begin - # one final termnames check +@testset "variablenames" begin + # one final variablenames check # note that `1` is still a ConstantTerm and not yet InterceptTerm # because apply_schema hasn't been called - @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == + @test variablenames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == ["1", "log(x)", "y", "log(x) & y", "(1 + x) | g"] - @test termnames(ConstantTerm(1)) == "1" - @test termnames(Term(:x)) == "x" - @test termnames(InterceptTerm{true}()) == "(Intercept)" - @test termnames(InterceptTerm{false}()) === nothing - @test termnames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" + @test variablenames(ConstantTerm(1)) == "1" + @test variablenames(Term(:x)) == "x" + @test variablenames(InterceptTerm{true}()) == "(Intercept)" + @test variablenames(InterceptTerm{false}()) === nothing + @test variablenames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) - @test termnames(CategoricalTerm(:x, cm)) =="x" - @test termnames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" - @test termnames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" - @test termnames(MatrixTerm(term(:a))) == ["a"] - @test termnames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] - @test termnames((term(:a), term(:b))) == ["a", "b"] - @test termnames((term(:a),)) == ["a"] + @test variablenames(CategoricalTerm(:x, cm)) =="x" + @test variablenames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" + @test variablenames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" + @test variablenames(MatrixTerm(term(:a))) == ["a"] + @test variablenames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] + @test variablenames((term(:a), term(:b))) == ["a", "b"] + @test variablenames((term(:a),)) == ["a"] end @testset "lrtest" begin From e48290601d3066ad91a1fb920f8c635844effac0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 19:15:52 +0200 Subject: [PATCH 18/24] fixes --- src/statsmodel.jl | 4 ++-- test/statsmodel.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 84abecc8..12c12886 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -123,7 +123,7 @@ the choice of contrast coding. See also [`coefnames`](@ref). """ -variablenames(model::StatisticalModel) = variablenamesames(formula(model)) +variablenames(model::StatisticalModel) = variablenames(formula(model)) """ variablenames(t::FormulaTerm) @@ -160,7 +160,7 @@ Return the name of a term. Return value is either a `String`, an iterable of `String`s or nothing if there no associated name (e.g. `variablenames(InterceptTerm{false}())`). """ -variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : nothing +variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] variablenames(t::ContinuousTerm) = string(t.sym) variablenames(t::CategoricalTerm) = string(t.sym) variablenames(t::Term) = string(t.sym) diff --git a/test/statsmodel.jl b/test/statsmodel.jl index b3617fbd..0c35a21e 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -246,10 +246,10 @@ end @test variablenames(ConstantTerm(1)) == "1" @test variablenames(Term(:x)) == "x" @test variablenames(InterceptTerm{true}()) == "(Intercept)" - @test variablenames(InterceptTerm{false}()) === nothing + @test variablenames(InterceptTerm{false}()) == String[] @test variablenames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) - @test variablenames(CategoricalTerm(:x, cm)) =="x" + @test variablenames(CategoricalTerm(:x, cm)) == "x" @test variablenames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" @test variablenames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" @test variablenames(MatrixTerm(term(:a))) == ["a"] From 561d377485b06287d08111f9ca62d9156c568d9a Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 19:20:28 +0200 Subject: [PATCH 19/24] move to a different file --- src/statsmodel.jl | 69 ----------------------------------------------- src/terms.jl | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 12c12886..cd814b04 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -106,75 +106,6 @@ function formula end formula(m::TableStatisticalModel) = m.mf.f formula(m::TableRegressionModel) = m.mf.f -""" - variablenames(model::StatisticalModel) - -Return the names of terms used in the formula of `model`. - -This is a convenience method for `variablenames(formula(model))`, which returns a -two-tuple of `variablenames` applied to the left and right hand sides of the formula. - -For `RegressionModel`s with only continuous predictors, this is the same as -`(responsename(model), coefnames(model))` and `coefnames(formula(model))`. - -For models with categorical predictors, the returned names reflect -the variable name and not the coefficients resulting from -the choice of contrast coding. - -See also [`coefnames`](@ref). -""" -variablenames(model::StatisticalModel) = variablenames(formula(model)) - -""" - variablenames(t::FormulaTerm) - -Return a two-tuple of `variablenames` applied to the left and -right hand sides of the formula. - -!!! note - Until `apply_schema` has been called, literal `1` in formulae - is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the - returned term names. - -```jldoctest -julia> variablenames(@formula(y ~ 1 + x * y + (1+x|g))) -("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) -``` - -Similarly, formulae with an implicit intercept will not have a `"1"` -in their term names, because the implicit intercept does not exist until -`apply_schema` is called (and may not exist for certain model contexts). - -```jldoctest -julia> variablenames(@formula(y ~ x * y + (1+x|g))) -("y", ["x", "y", "x & y", "(1 + x) | g"]) -``` -""" -variablenames(t::FormulaTerm) = (variablenames(t.lhs), variablenames(t.rhs)) - -""" - variablenames(term::AbstractTerm) - -Return the name of a term. - -Return value is either a `String`, an iterable of `String`s or nothing if there -no associated name (e.g. `variablenames(InterceptTerm{false}())`). -""" -variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] -variablenames(t::ContinuousTerm) = string(t.sym) -variablenames(t::CategoricalTerm) = string(t.sym) -variablenames(t::Term) = string(t.sym) -variablenames(t::ConstantTerm) = string(t.n) -variablenames(t::FunctionTerm) = string(t.exorig) -# variablenames(TupleTerm)) always returns a vector, even if it's just one element, e.g., -# variablenames((term(:a),)) -variablenames(ts::TupleTerm) = mapreduce(variablenames, vcat, ts; init=String[]) -# variablenames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., -# variablenames(MatrixTerm(term(:a))) -variablenames(t::MatrixTerm) = mapreduce(variablenames, vcat, t.terms; init=String[]) -variablenames(t::InteractionTerm) = - only(kron_insideout((args...) -> join(args, " & "), vectorize.(variablenames.(t.terms))...)) - @doc """ fit(Mod::Type{<:StatisticalModel}, f::FormulaTerm, data, args...; contrasts::Dict{Symbol}, kwargs...) diff --git a/src/terms.jl b/src/terms.jl index 02a4805b..be19ff33 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -583,6 +583,75 @@ StatsAPI.coefnames(t::MatrixTerm) = mapreduce(coefnames, vcat, t.terms) StatsAPI.coefnames(t::InteractionTerm) = kron_insideout((args...) -> join(args, " & "), vectorize.(coefnames.(t.terms))...) +""" + variablenames(model::StatisticalModel) + +Return the names of terms used in the formula of `model`. + +This is a convenience method for `variablenames(formula(model))`, which returns a +two-tuple of `variablenames` applied to the left and right hand sides of the formula. + +For `RegressionModel`s with only continuous predictors, this is the same as +`(responsename(model), coefnames(model))` and `coefnames(formula(model))`. + +For models with categorical predictors, the returned names reflect +the variable name and not the coefficients resulting from +the choice of contrast coding. + +See also [`coefnames`](@ref). +""" +variablenames(model::StatisticalModel) = variablenames(formula(model)) + +""" + variablenames(t::FormulaTerm) + +Return a two-tuple of `variablenames` applied to the left and +right hand sides of the formula. + +!!! note + Until `apply_schema` has been called, literal `1` in formulae + is interpreted as `ConstantTerm(1)` and will thus appear as `"1"` in the + returned term names. + +```jldoctest +julia> variablenames(@formula(y ~ 1 + x * y + (1+x|g))) +("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) +``` + +Similarly, formulae with an implicit intercept will not have a `"1"` +in their term names, because the implicit intercept does not exist until +`apply_schema` is called (and may not exist for certain model contexts). + +```jldoctest +julia> variablenames(@formula(y ~ x * y + (1+x|g))) +("y", ["x", "y", "x & y", "(1 + x) | g"]) +``` +""" +variablenames(t::FormulaTerm) = (variablenames(t.lhs), variablenames(t.rhs)) + +""" + variablenames(term::AbstractTerm) + +Return the name of the statistical variable associated with a term. + +Return value is either a `String`, an iterable of `String`s or nothing if there +no associated name (e.g. `variablenames(InterceptTerm{false}())`). +""" +variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] +variablenames(t::ContinuousTerm) = string(t.sym) +variablenames(t::CategoricalTerm) = string(t.sym) +variablenames(t::Term) = string(t.sym) +variablenames(t::ConstantTerm) = string(t.n) +variablenames(t::FunctionTerm) = string(t.exorig) +# variablenames(TupleTerm)) always returns a vector, even if it's just one element, e.g., +# variablenames((term(:a),)) +variablenames(ts::TupleTerm) = mapreduce(variablenames, vcat, ts; init=String[]) +# variablenames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., +# variablenames(MatrixTerm(term(:a))) +variablenames(t::MatrixTerm) = mapreduce(variablenames, vcat, t.terms; init=String[]) +variablenames(t::InteractionTerm) = + only(kron_insideout((args...) -> join(args, " & "), vectorize.(variablenames.(t.terms))...)) + ################################################################################ # old Terms features: From e744aa2eda940b52a843944a7dc8761e3a5ccee4 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 5 Sep 2023 19:22:05 +0200 Subject: [PATCH 20/24] update doc string --- src/terms.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/terms.jl b/src/terms.jl index be19ff33..80d5f1fd 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -619,7 +619,7 @@ julia> variablenames(@formula(y ~ 1 + x * y + (1+x|g))) ``` Similarly, formulae with an implicit intercept will not have a `"1"` -in their term names, because the implicit intercept does not exist until +in their variable names, because the implicit intercept does not exist until `apply_schema` is called (and may not exist for certain model contexts). ```jldoctest @@ -634,8 +634,8 @@ variablenames(t::FormulaTerm) = (variablenames(t.lhs), variablenames(t.rhs)) Return the name of the statistical variable associated with a term. -Return value is either a `String`, an iterable of `String`s or nothing if there -no associated name (e.g. `variablenames(InterceptTerm{false}())`). +Return value is either a `String`, an iterable of `String`s or the empty vector +if there is no associated variable (e.g. `variablenames(InterceptTerm{false}())`). """ variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] variablenames(t::ContinuousTerm) = string(t.sym) From 28e3b6aee7dfff5b81e185b921cf9634e15de3a4 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 6 Sep 2023 07:48:11 +0000 Subject: [PATCH 21/24] sure Co-authored-by: Alex Arslan --- src/deprecated.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 8d6cd8b0..c4939de3 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -3,10 +3,11 @@ false) function Base.getproperty(cm::ContrastsMatrix, x::Symbol) - if x === :termnames - Base.depwarn("The `termnames` field has been renamed `coefnames`.", :ContrastsMatrix) - x = :coefnames + if x === :termnames + Base.depwarn("the `termnames` field of `ConstrastsMatrix` is deprecated; use `coefnames(cm)` instead.", + :ContrastsMatrix) + return coefnames(cm) + else + return getfield(cm, x) end - - return getfield(cm, x) end From 80511d489f3167a748af18a83fafe6cfaff6c438 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 6 Sep 2023 22:55:56 +0200 Subject: [PATCH 22/24] s/variablenames/termnames/ --- src/StatsModels.jl | 2 +- src/terms.jl | 54 +++++++++++++++++++++++----------------------- test/statsmodel.jl | 34 ++++++++++++++--------------- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/StatsModels.jl b/src/StatsModels.jl index d6713bd0..6bb16069 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -38,7 +38,7 @@ export coefnames, setcontrasts!, formula, - variablenames, + termnames, AbstractTerm, ConstantTerm, diff --git a/src/terms.jl b/src/terms.jl index 80d5f1fd..188120b1 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -570,7 +570,7 @@ vectorize(x) = [x] Return the name(s) of column(s) generated by a term. Return value is either a `String` or an iterable of `String`s. -See also [`variablenames`](@ref). +See also [`termnames`](@ref). """ StatsAPI.coefnames(t::FormulaTerm) = (coefnames(t.lhs), coefnames(t.rhs)) StatsAPI.coefnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : [] @@ -584,12 +584,12 @@ StatsAPI.coefnames(t::InteractionTerm) = kron_insideout((args...) -> join(args, " & "), vectorize.(coefnames.(t.terms))...) """ - variablenames(model::StatisticalModel) + termnames(model::StatisticalModel) Return the names of terms used in the formula of `model`. -This is a convenience method for `variablenames(formula(model))`, which returns a -two-tuple of `variablenames` applied to the left and right hand sides of the formula. +This is a convenience method for `termnames(formula(model))`, which returns a +two-tuple of `termnames` applied to the left and right hand sides of the formula. For `RegressionModel`s with only continuous predictors, this is the same as `(responsename(model), coefnames(model))` and `coefnames(formula(model))`. @@ -600,12 +600,12 @@ the choice of contrast coding. See also [`coefnames`](@ref). """ -variablenames(model::StatisticalModel) = variablenames(formula(model)) +termnames(model::StatisticalModel) = termnames(formula(model)) """ - variablenames(t::FormulaTerm) + termnames(t::FormulaTerm) -Return a two-tuple of `variablenames` applied to the left and +Return a two-tuple of `termnames` applied to the left and right hand sides of the formula. !!! note @@ -614,7 +614,7 @@ right hand sides of the formula. returned term names. ```jldoctest -julia> variablenames(@formula(y ~ 1 + x * y + (1+x|g))) +julia> termnames(@formula(y ~ 1 + x * y + (1+x|g))) ("y", ["1", "x", "y", "x & y", "(1 + x) | g"]) ``` @@ -623,34 +623,34 @@ in their variable names, because the implicit intercept does not exist until `apply_schema` is called (and may not exist for certain model contexts). ```jldoctest -julia> variablenames(@formula(y ~ x * y + (1+x|g))) +julia> termnames(@formula(y ~ x * y + (1+x|g))) ("y", ["x", "y", "x & y", "(1 + x) | g"]) ``` """ -variablenames(t::FormulaTerm) = (variablenames(t.lhs), variablenames(t.rhs)) +termnames(t::FormulaTerm) = (termnames(t.lhs), termnames(t.rhs)) """ - variablenames(term::AbstractTerm) + termnames(term::AbstractTerm) Return the name of the statistical variable associated with a term. Return value is either a `String`, an iterable of `String`s or the empty vector -if there is no associated variable (e.g. `variablenames(InterceptTerm{false}())`). -""" -variablenames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] -variablenames(t::ContinuousTerm) = string(t.sym) -variablenames(t::CategoricalTerm) = string(t.sym) -variablenames(t::Term) = string(t.sym) -variablenames(t::ConstantTerm) = string(t.n) -variablenames(t::FunctionTerm) = string(t.exorig) -# variablenames(TupleTerm)) always returns a vector, even if it's just one element, e.g., -# variablenames((term(:a),)) -variablenames(ts::TupleTerm) = mapreduce(variablenames, vcat, ts; init=String[]) -# variablenames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., -# variablenames(MatrixTerm(term(:a))) -variablenames(t::MatrixTerm) = mapreduce(variablenames, vcat, t.terms; init=String[]) -variablenames(t::InteractionTerm) = - only(kron_insideout((args...) -> join(args, " & "), vectorize.(variablenames.(t.terms))...)) +if there is no associated variable (e.g. `termnames(InterceptTerm{false}())`). +""" +termnames(::InterceptTerm{H}) where {H} = H ? "(Intercept)" : String[] +termnames(t::ContinuousTerm) = string(t.sym) +termnames(t::CategoricalTerm) = string(t.sym) +termnames(t::Term) = string(t.sym) +termnames(t::ConstantTerm) = string(t.n) +termnames(t::FunctionTerm) = string(t.exorig) +# termnames(TupleTerm)) always returns a vector, even if it's just one element, e.g., +# termnames((term(:a),)) +termnames(ts::TupleTerm) = mapreduce(termnames, vcat, ts; init=String[]) +# termnames(MatrixTerm)) always returns a vector, even if it's just one element, e.g., +# termnames(MatrixTerm(term(:a))) +termnames(t::MatrixTerm) = mapreduce(termnames, vcat, t.terms; init=String[]) +termnames(t::InteractionTerm) = + only(kron_insideout((args...) -> join(args, " & "), vectorize.(termnames.(t.terms))...)) ################################################################################ # old Terms features: diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 0c35a21e..a60b3d4d 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -161,7 +161,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) ## test copying of names from Terms to CoefTable ct = coeftable(m) @test ct.rownms == ["(Intercept)", "x1", "x2", "x1 & x2"] - @test variablenames(m) == ("y", ["(Intercept)", "x1", "x2", "x1 & x2"]) + @test termnames(m) == ("y", ["(Intercept)", "x1", "x2", "x1 & x2"]) ## show with coeftable defined io = IOBuffer() @@ -172,7 +172,7 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] - @test variablenames(m2) == ("y", ["(Intercept)", "x1p"]) + @test termnames(m2) == ("y", ["(Intercept)", "x1p"]) ## predict w/ new data missing levels @test predict(m2, d[2:4, :]) == predict(m2)[2:4] @@ -237,25 +237,25 @@ Base.show(io::IO, m::DummyModTwo) = println(io, m.msg) show(io, m2) end -@testset "variablenames" begin - # one final variablenames check +@testset "termnames" begin + # one final termnames check # note that `1` is still a ConstantTerm and not yet InterceptTerm # because apply_schema hasn't been called - @test variablenames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == + @test termnames(@formula(y ~ 1 + log(x) * y + (1+x|g)))[2] == ["1", "log(x)", "y", "log(x) & y", "(1 + x) | g"] - @test variablenames(ConstantTerm(1)) == "1" - @test variablenames(Term(:x)) == "x" - @test variablenames(InterceptTerm{true}()) == "(Intercept)" - @test variablenames(InterceptTerm{false}()) == String[] - @test variablenames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" + @test termnames(ConstantTerm(1)) == "1" + @test termnames(Term(:x)) == "x" + @test termnames(InterceptTerm{true}()) == "(Intercept)" + @test termnames(InterceptTerm{false}()) == String[] + @test termnames(ContinuousTerm(:x, 1, 0, 0, 0)) == "x" cm = StatsModels.ContrastsMatrix([1 0; 0 1], ["b", "c"], ["a", "b", "c"], DummyCoding()) - @test variablenames(CategoricalTerm(:x, cm)) == "x" - @test variablenames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" - @test variablenames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" - @test variablenames(MatrixTerm(term(:a))) == ["a"] - @test variablenames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] - @test variablenames((term(:a), term(:b))) == ["a", "b"] - @test variablenames((term(:a),)) == ["a"] + @test termnames(CategoricalTerm(:x, cm)) == "x" + @test termnames(FunctionTerm(log, [Term(:x)], :(log(x)))) == "log(x)" + @test termnames(InteractionTerm(term.((:a, :b, :c)))) == "a & b & c" + @test termnames(MatrixTerm(term(:a))) == ["a"] + @test termnames(MatrixTerm((term(:a), term(:b)))) == ["a", "b"] + @test termnames((term(:a), term(:b))) == ["a", "b"] + @test termnames((term(:a),)) == ["a"] end @testset "lrtest" begin From 7f4129316489da2ba2d14d8b3774d54009819be3 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 6 Sep 2023 23:05:44 +0200 Subject: [PATCH 23/24] @ararslan making trouble --- src/contrasts.jl | 2 ++ src/deprecated.jl | 2 +- test/contrasts.jl | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/contrasts.jl b/src/contrasts.jl index a628b15c..52e83d1d 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -117,6 +117,8 @@ struct ContrastsMatrix{C <: AbstractContrasts, M <: AbstractMatrix, T, U} end end +StatsAPI.coefnames(cm::ContrastsMatrix) = cm.coefnames + # only check equality of matrix, coefnames, and levels, and that the type is the # same for the contrasts (values are irrelevant). This ensures that the two # will behave identically in creating modelmatrix columns diff --git a/src/deprecated.jl b/src/deprecated.jl index c4939de3..ff54be4d 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -4,7 +4,7 @@ function Base.getproperty(cm::ContrastsMatrix, x::Symbol) if x === :termnames - Base.depwarn("the `termnames` field of `ConstrastsMatrix` is deprecated; use `coefnames(cm)` instead.", + Base.depwarn("The `termnames` field of `ConstrastsMatrix` is deprecated; use `coefnames(cm)` instead.", :ContrastsMatrix) return coefnames(cm) else diff --git a/test/contrasts.jl b/test/contrasts.jl index c80a5f46..63e82cb3 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -1,7 +1,9 @@ @testset "contrasts" begin cm = StatsModels.ContrastsMatrix(DummyCoding(), ["a", "b"]) - @test_logs (:warn, "The `termnames` field has been renamed `coefnames`.") cm.termnames + @test_logs((:warn, + "The `termnames` field of `ConstrastsMatrix` is deprecated; use `coefnames(cm)` instead."), + cm.termnames) @test cm.termnames == cm.coefnames d = DataFrame(y = rand(6), From 0dccffb29274377e4bf0cc9d4d54a8a57ddebbe9 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 6 Sep 2023 23:27:03 +0200 Subject: [PATCH 24/24] include termnames in manual --- docs/src/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.md b/docs/src/api.md index 98c726be..76c20ffe 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -15,6 +15,7 @@ end term coefnames modelcols +termnames ``` ### Higher-order terms