diff --git a/Manifest.toml b/Manifest.toml
index 10e2633c..f4b6295e 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -27,28 +27,11 @@ git-tree-sha1 = "23d7324164c89638c18f6d7f90d972fa9c4fa9fb"
 uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 version = "0.7.7"

-[[ColorTypes]]
-deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
-uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.8.1"
-
 [[Compat]]
-deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
-git-tree-sha1 = "06be57f11a029927e10d050a6c5496a8695a5437"
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
+git-tree-sha1 = "a4839bd26e3e7f4869a4cf6c31f9f93f47aac7c5"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "3.3.0"
-
-[[ComputationalResources]]
-deps = ["Test"]
-git-tree-sha1 = "89e7e7ed20af73d9f78877d2b8d1194e7b6ff13d"
-uuid = "ed09eef8-17a6-5b46-8889-db040fac31e3"
-version = "0.3.0"
-
-[[Crayons]]
-git-tree-sha1 = "cb7a62895da739fe5bb43f1a26d4292baf4b3dc0"
-uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
-version = "4.0.1"
+version = "3.5.0"

 [[DataAPI]]
 git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
@@ -80,9 +63,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

 [[Distributions]]
 deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"]
-git-tree-sha1 = "e063d0b5d27180b98edacd2b1cb90ecfbc171385"
+git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d"
 uuid = "31c24e10-a181-5473-b8eb-7969acd0382f"
-version = "0.21.12"
+version = "0.22.4"

 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
@@ -90,17 +73,6 @@ git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
 version = "0.8.4"

-[[FixedPointNumbers]]
-git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa"
-uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.7.1"
-
-[[Formatting]]
-deps = ["Printf"]
-git-tree-sha1 = "a0c901c29c0e7c763342751c0a94211d56c0de5c"
-uuid = "59287772-0a20-5a39-b81b-1366585eb4c0"
-version = "0.4.1"
-
 [[Future]]
 deps = ["Random"]
 uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
@@ -109,12 +81,6 @@ uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

-[[InvertedIndices]]
-deps = ["Test"]
-git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc"
-uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
-version = "1.0.0"
-
 [[IteratorInterfaceExtensions]]
 git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
 uuid = "82899510-4779-5014-852e-03e436cf321d"
 version = "1.0.0"

 [[JSON]]
 deps = ["Dates", "Mmap", "Parsers", "Unicode"]
 git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
 uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 version = "0.21.0"

-[[LearnBase]]
-deps = ["LinearAlgebra", "SparseArrays", "StatsBase", "Test"]
-git-tree-sha1 = "c4b5da6d68517f46f70ed5157b28336b56cd2ff3"
-uuid = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
-version = "0.2.2"
-
 [[LibGit2]]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
@@ -145,17 +105,11 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

-[[LossFunctions]]
-deps = ["InteractiveUtils", "LearnBase", "Markdown", "Random", "RecipesBase", "SparseArrays", "Statistics", "StatsBase", "Test"]
-git-tree-sha1 = "08d87fec43e7d335811dfae5b55dbfc5690e915b"
-uuid = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
-version = "0.5.1"
-
-[[MLJBase]]
-deps = ["CategoricalArrays", "ComputationalResources", "DelimitedFiles", "Distributed", "Distributions", "InteractiveUtils", "InvertedIndices", "LinearAlgebra", "LossFunctions", "Missings", "OrderedCollections", "Parameters", "PrettyTables", "ProgressMeter", "Random", "ScientificTypes", "Statistics", "StatsBase", "Tables"]
-git-tree-sha1 = "450fa34dcb0005d0799ffcf9cca5f40aa6d83059"
-uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
-version = "0.10.1"
+[[MLJModelInterface]]
+deps = ["ScientificTypes"]
+git-tree-sha1 = "269deeabed43d68656c80fa57a83fb53ad202728"
+uuid = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+version = "0.1.5"

 [[Markdown]]
 deps = ["Base64"]
@@ -194,12 +148,6 @@ git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194"
 uuid = "90014a1f-27ba-587c-ab20-58faa44d9150"
 version = "0.9.11"

-[[Parameters]]
-deps = ["OrderedCollections"]
-git-tree-sha1 = "b62b2558efb1eef1fa44e4be5ff58a515c287e38"
-uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
-version = "0.12.0"
-
 [[Parsers]]
 deps = ["Dates", "Test"]
 git-tree-sha1 = "d112c19ccca00924d5d3a38b11ae2b4b268dda39"
 version = "0.3.11"

@@ -210,22 +158,10 @@ version = "0.3.11"
 deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

-[[PrettyTables]]
-deps = ["Crayons", "Formatting", "Parameters", "Reexport", "Tables"]
-git-tree-sha1 = "2268242f037e0290e87d55c02060320c1d0d6b03"
-uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-version = "0.6.0"
-
 [[Printf]]
 deps = ["Unicode"]
 uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

-[[ProgressMeter]]
-deps = ["Distributed", "Printf"]
-git-tree-sha1 = "ea1f4fa0ff5e8b771bf130d87af5b7ef400760bd"
-uuid = "92933f4c-e287-5a05-a399-4b506db050ca"
-version = "1.2.0"
-
 [[QuadGK]]
 deps = ["DataStructures", "LinearAlgebra"]
 git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4"

@@ -240,11 +176,6 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 deps = ["Serialization"]
 uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

-[[RecipesBase]]
-git-tree-sha1 = "b4ed4a7f988ea2340017916f7c9e5d7560b52cae"
-uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
-version = "0.8.0"
-
 [[Reexport]]
 deps = ["Pkg"]
 git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"

@@ -261,10 +192,9 @@ version = "0.6.0"
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"

 [[ScientificTypes]]
-deps = ["CategoricalArrays", "ColorTypes", "PrettyTables", "Tables"]
-git-tree-sha1 = "20fa7448b38ea42eb40da1d66c83cf67d626964a"
+git-tree-sha1 = "9c232034bbee8c53173cdce83787bf8968b09d31"
 uuid = "321657f4-b219-11e9-178b-2701a2544e81"
-version = "0.5.1"
+version = "0.7.1"

 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

@@ -288,9 +218,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

 [[SpecialFunctions]]
 deps = ["OpenSpecFun_jll"]
-git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
+git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.9.0"
+version = "0.10.0"

 [[StaticArrays]]
 deps = ["LinearAlgebra", "Random", "Statistics"]
@@ -304,15 +234,15 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

 [[StatsBase]]
 deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
-git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950"
+git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.32.0"
+version = "0.32.1"

 [[StatsFuns]]
 deps = ["Rmath", "SpecialFunctions"]
-git-tree-sha1 = "79982835d2ff3970685cb704500909c94189bde9"
+git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a"
 uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
-version = "0.9.3"
+version = "0.9.4"

 [[SuiteSparse]]
 deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
diff --git a/Project.toml b/Project.toml
index 8e9fd1b5..280f6bf3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,12 +1,12 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb "]
-version = "0.4.2"
+version = "0.4.3"

 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
-MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -18,9 +18,15 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 [compat]
 CategoricalArrays = "0.7"
 Distributions = "0.21, 0.22"
-MLJBase = "0.10"
 SortingAlgorithms = "0.3"
 StaticArrays = "0.12"
 StatsBase = "0.32"
 Tables = "0.2"
 julia = "1"
+
+[extras]
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "MLJBase"]
diff --git a/experiments/MLJ.jl b/experiments/MLJ.jl
index 5b4a7058..766b5679 100644
--- a/experiments/MLJ.jl
+++ b/experiments/MLJ.jl
@@ -184,7 +184,6 @@ tree.model.nrounds += 10
 pred = predict(tree, selectrows(X,train))
 pred_mean = predict_mean(tree, selectrows(X,train))
 pred_mode = predict_mode(tree, selectrows(X,train))
-pred_median = predict_median(tree, selectrows(X,train))

 ##################################################
 ### Gaussian - Larger data
@@ -231,8 +230,7 @@ tree.model.nrounds += 10
 pred = predict(tree, selectrows(X,train))
 pred_mean = predict_mean(tree, selectrows(X,train))
 pred_mode = predict_mode(tree, selectrows(X,train))
-pred_median = predict_median(tree, selectrows(X,train))
-mean(abs.(pred_train - selectrows(Y,train)))
+mean(abs.(pred_mean - selectrows(Y,train)))

 q_20 = quantile.(pred, 0.20)
 q_20 = quantile.(pred, 0.80)
diff --git a/src/EvoTrees.jl b/src/EvoTrees.jl
index 218d9781..2cb44aee 100644
--- a/src/EvoTrees.jl
+++ b/src/EvoTrees.jl
@@ -1,19 +1,18 @@
 module EvoTrees

-export init_evotree, grow_evotree!, grow_tree, predict, fit_evotree,
+export init_evotree, grow_evotree!, grow_tree, fit_evotree, predict,
     EvoTreeRegressor, EvoTreeCount, EvoTreeClassifier, EvoTreeGaussian,
     EvoTreeRModels, importance

 using Statistics
 using Base.Threads: @threads
 using StatsBase: sample, quantile
-import StatsBase: predict
 using Random: seed!
 using StaticArrays
 using Distributions
 using CategoricalArrays
-import MLJBase
-# import MLJ
+import MLJModelInterface: predict
+import MLJModelInterface

 include("models.jl")
 include("structs.jl")
diff --git a/src/MLJ.jl b/src/MLJ.jl
index e791d814..825b669c 100644
--- a/src/MLJ.jl
+++ b/src/MLJ.jl
@@ -1,6 +1,6 @@
-function MLJBase.fit(model::EvoTypes, verbosity::Int, X, y)
-    Xmatrix = MLJBase.matrix(X)
+function MLJModelInterface.fit(model::EvoTypes, verbosity::Int, X, y)
+    Xmatrix = MLJModelInterface.matrix(X)
     fitresult, cache = init_evotree(model, Xmatrix, y, verbosity = verbosity)
     grow_evotree!(fitresult, cache, verbosity = verbosity)
     report = nothing
@@ -21,13 +21,12 @@ function okay_to_continue(new, old)
     new.metric == old.metric
 end

-function MLJBase.update(model::EvoTypes, verbosity,
-    fitresult, cache, X, y)
+function MLJModelInterface.update(model::EvoTypes, verbosity::Integer, fitresult, cache, X, y)

     if okay_to_continue(model, cache.params)
         grow_evotree!(fitresult, cache, verbosity = verbosity)
     else
-        Xmatrix = MLJBase.matrix(X)
+        Xmatrix = MLJModelInterface.matrix(X)
         fitresult, cache = init_evotree(model, Xmatrix, y, verbosity = verbosity)
         grow_evotree!(fitresult, cache, verbosity = verbosity)
     end
@@ -35,60 +34,88 @@
     return fitresult, cache, report
 end

-function predict(model::EvoTypes, fitresult, Xnew)
-    Xnew = MLJBase.matrix(Xnew)
+function predict(model::EvoTreeRegressor, fitresult, Xnew)
+    Xnew = MLJModelInterface.matrix(Xnew)
     pred = predict(fitresult, Xnew)
     return pred
 end

 function predict(model::EvoTreeClassifier, fitresult, Xnew)
-    Xnew = MLJBase.matrix(Xnew)
+    Xnew = MLJModelInterface.matrix(Xnew)
     pred = predict(fitresult, Xnew)
-    return [MLJBase.UnivariateFinite(fitresult.levels, pred[i,:]) for i in 1:size(pred,1)]
+    return [MLJModelInterface.UnivariateFinite(fitresult.levels, pred[i,:]) for i in 1:size(pred,1)]
 end

 function predict(model::EvoTreeCount, fitresult, Xnew)
-    Xnew = MLJBase.matrix(Xnew)
+    Xnew = MLJModelInterface.matrix(Xnew)
     λ = predict(fitresult, Xnew)
     return [Distributions.Poisson(λᵢ) for λᵢ ∈ λ]
 end

 function predict(model::EvoTreeGaussian, fitresult, Xnew)
-    Xnew = MLJBase.matrix(Xnew)
+    Xnew = MLJModelInterface.matrix(Xnew)
     pred = predict(fitresult, Xnew)
     return [Distributions.Normal(pred[i]...) for i in 1:size(pred,1)]
 end

-# MLJBase.predict_mean(model::Union{EvoTreeRegressor, EvoTreeCount, EvoTreeGaussian}, fitresult, Xnew) =
-#     mean.(MLJBase.predict(model, fitresult, Xnew))
-#
-# MLJBase.predict_mode(model::Union{EvoTreeRegressor, EvoTreeClassifier,EvoTreeCount, EvoTreeGaussian}, fitresult, Xnew) =
-#     mode.(MLJBase.predict(model, fitresult, Xnew))
-#
-# MLJBase.predict_median(model::Union{EvoTreeRegressor, EvoTreeCount, EvoTreeGaussian}, fitresult, Xnew) =
-#     median.(MLJBase.predict(model, fitresult, Xnew))
+# Metadata
+const EvoTreeRegressor_desc = "Regression models with various underlying methods: least square, quantile, logistic."
+const EvoTreeClassifier_desc = "Multi-classification with softmax and cross-entropy loss."
+const EvoTreeCount_desc = "Poisson regression fitting λ with max likelihood."
+const EvoTreeGaussian_desc = "Gaussian maximum likelihood of μ and σ²."
+
+MLJModelInterface.metadata_pkg.((EvoTreeRegressor, EvoTreeClassifier, EvoTreeCount, EvoTreeGaussian),
+    name="EvoTrees",
+    uuid="f6006082-12f8-11e9-0c9c-0d5d367ab1e5",
+    url="https://github.com/Evovest/EvoTrees.jl",
+    julia=true,
+    is_wrapper=false)
+
+MLJModelInterface.metadata_model(EvoTreeRegressor,
+    input=MLJModelInterface.Table(MLJModelInterface.Continuous),
+    target=AbstractVector{<:MLJModelInterface.Continuous},
+    weights=false,
+    descr=EvoTreeRegressor_desc)
+
+MLJModelInterface.metadata_model(EvoTreeClassifier,
+    input=MLJModelInterface.Table(MLJModelInterface.Continuous),
+    target=AbstractVector{<:MLJModelInterface.Finite},
+    weights=false,
+    descr=EvoTreeClassifier_desc)
+
+MLJModelInterface.metadata_model(EvoTreeCount,
+    input=MLJModelInterface.Table(MLJModelInterface.Continuous),
+    target=AbstractVector{<:MLJModelInterface.Count},
+    weights=false,
+    descr=EvoTreeCount_desc)
+
+MLJModelInterface.metadata_model(EvoTreeGaussian,
+    input=MLJModelInterface.Table(MLJModelInterface.Continuous),
+    target=AbstractVector{<:MLJModelInterface.Continuous},
+    weights=false,
+    descr=EvoTreeGaussian_desc)

 # shared metadata
-MLJBase.package_name(::Type{<:EvoTypes}) = "EvoTrees"
-MLJBase.package_uuid(::Type{<:EvoTypes}) = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
-MLJBase.package_url(::Type{<:EvoTypes}) = "https://github.com/Evovest/EvoTrees.jl"
-MLJBase.is_pure_julia(::Type{<:EvoTypes}) = true
-
-MLJBase.load_path(::Type{<:EvoTreeRegressor}) = "EvoTrees.EvoTreeRegressor"
-MLJBase.input_scitype(::Type{<:EvoTreeRegressor}) = MLJBase.Table(MLJBase.Continuous)
-MLJBase.target_scitype(::Type{<:EvoTreeRegressor}) = AbstractVector{<:MLJBase.Continuous}
-
-MLJBase.load_path(::Type{<:EvoTreeCount}) = "EvoTrees.EvoTreeCount"
-MLJBase.input_scitype(::Type{<:EvoTreeCount}) = MLJBase.Table(MLJBase.Continuous)
-MLJBase.target_scitype(::Type{<:EvoTreeCount}) = AbstractVector{<:MLJBase.Count}
-
-MLJBase.load_path(::Type{<:EvoTreeClassifier}) = "EvoTrees.EvoTreeClassifier"
-MLJBase.input_scitype(::Type{<:EvoTreeClassifier}) = MLJBase.Table(MLJBase.Continuous)
-MLJBase.target_scitype(::Type{<:EvoTreeClassifier}) = AbstractVector{<:MLJBase.Finite}
-
-MLJBase.load_path(::Type{<:EvoTreeGaussian}) = "EvoTrees.EvoTreeGaussian"
-MLJBase.input_scitype(::Type{<:EvoTreeGaussian}) = MLJBase.Table(MLJBase.Continuous)
-MLJBase.target_scitype(::Type{<:EvoTreeGaussian}) = AbstractVector{<:MLJBase.Continuous}
+# MLJModelInterface.package_name(::Type{<:EvoTypes}) = "EvoTrees"
+# MLJModelInterface.package_uuid(::Type{<:EvoTypes}) = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
+# MLJModelInterface.package_url(::Type{<:EvoTypes}) = "https://github.com/Evovest/EvoTrees.jl"
+# MLJModelInterface.is_pure_julia(::Type{<:EvoTypes}) = true
+#
+# MLJModelInterface.load_path(::Type{<:EvoTreeRegressor}) = "EvoTrees.EvoTreeRegressor"
+# MLJModelInterface.input_scitype(::Type{<:EvoTreeRegressor}) = MLJModelInterface.Table(MLJModelInterface.Continuous)
+# MLJModelInterface.target_scitype(::Type{<:EvoTreeRegressor}) = AbstractVector{<:MLJModelInterface.Continuous}
+#
+# MLJModelInterface.load_path(::Type{<:EvoTreeCount}) = "EvoTrees.EvoTreeCount"
+# MLJModelInterface.input_scitype(::Type{<:EvoTreeCount}) = MLJModelInterface.Table(MLJModelInterface.Continuous)
+# MLJModelInterface.target_scitype(::Type{<:EvoTreeCount}) = AbstractVector{<:MLJModelInterface.Count}
+#
+# MLJModelInterface.load_path(::Type{<:EvoTreeClassifier}) = "EvoTrees.EvoTreeClassifier"
+# MLJModelInterface.input_scitype(::Type{<:EvoTreeClassifier}) = MLJModelInterface.Table(MLJModelInterface.Continuous)
+# MLJModelInterface.target_scitype(::Type{<:EvoTreeClassifier}) = AbstractVector{<:MLJModelInterface.Finite}
+#
+# MLJModelInterface.load_path(::Type{<:EvoTreeGaussian}) = "EvoTrees.EvoTreeGaussian"
+# MLJModelInterface.input_scitype(::Type{<:EvoTreeGaussian}) = MLJModelInterface.Table(MLJModelInterface.Continuous)
+# MLJModelInterface.target_scitype(::Type{<:EvoTreeGaussian}) = AbstractVector{<:MLJModelInterface.Continuous}

 # function MLJ.clean!(model::EvoTreeRegressor)
 #     warning = ""
diff --git a/src/models.jl b/src/models.jl
index 2456f32c..ea28196f 100644
--- a/src/models.jl
+++ b/src/models.jl
@@ -12,7 +12,7 @@ struct Quantile <: QuantileRegression end
 struct Softmax <: MultiClassRegression end
 struct Gaussian <: GaussianRegression end

-mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Deterministic
+mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJModelInterface.Deterministic
     loss::U
     nrounds::S
     λ::T
@@ -55,7 +55,7 @@ function EvoTreeRegressor(;
 end

-mutable struct EvoTreeCount{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Probabilistic
+mutable struct EvoTreeCount{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJModelInterface.Probabilistic
     loss::U
     nrounds::S
     λ::T
@@ -93,7 +93,7 @@ function EvoTreeCount(;
 end

-mutable struct EvoTreeClassifier{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Probabilistic
+mutable struct EvoTreeClassifier{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJModelInterface.Probabilistic
     loss::U
     nrounds::S
     λ::T
@@ -131,7 +131,7 @@ function EvoTreeClassifier(;
 end

-mutable struct EvoTreeGaussian{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Probabilistic
+mutable struct EvoTreeGaussian{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJModelInterface.Probabilistic
     loss::U
     nrounds::S
     λ::T
diff --git a/src/trees.jl b/src/trees.jl
deleted file mode 100644
index 9e8e111f..00000000
--- a/src/trees.jl
+++ /dev/null
@@ -1,455 +0,0 @@
-# initialize train_nodes
-function grow_tree(δ, δ², 𝑤,
-    hist_δ, hist_δ², hist_𝑤,
-    params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian},
-    train_nodes::Vector{TrainNode{L,T,S}},
-    splits::Vector{SplitInfo{L,T,Int}},
-    edges, X_bin) where {R<:Real, T<:AbstractFloat, S<:Int, L}
-
-    active_id = ones(Int, 1)
-    leaf_count = one(Int)
-    tree_depth = one(Int)
-    tree = Tree(Vector{TreeNode{params.K, T, Int, Bool}}())
-
-    # grow while there are remaining active nodes
-    while size(active_id, 1) > 0 && tree_depth <= params.max_depth
-        next_active_id = ones(Int, 0)
-        # grow nodes
-        for id in active_id
-            node = train_nodes[id]
-            if tree_depth == params.max_depth || node.∑𝑤[1] <= params.min_weight
-                push!(tree.nodes, TreeNode(pred_leaf(params.loss, node, params, δ²)))
-            else
-                @threads for feat in node.𝑗
-                    splits[feat].gain = node.gain
-                    find_split_static!(hist_δ[feat], hist_δ²[feat], hist_𝑤[feat], view(X_bin,:,feat), δ, δ², 𝑤, node.∑δ, node.∑δ², node.∑𝑤, params, splits[feat], edges[feat], node.𝑖)
-                    # update_hist!(hist_δ, hist_δ², hist_𝑤, X_bin, δ, δ², 𝑤, set, feat)
-                    # find_split!(hist_δ[feat], hist_δ²[feat], hist_𝑤[feat], node.∑δ, node.∑δ², node.∑𝑤, params, splits[feat], edges[feat], feat)
-                end
-                # assign best split
-                best = get_max_gain(splits)
-                # grow node if best split improve gain
-                if best.gain > node.gain + params.γ
-                    left, right = update_set(node.𝑖, best.𝑖, view(X_bin,:,best.feat))
-                    train_nodes[leaf_count + 1] = TrainNode(node.depth + 1, best.∑δL, best.∑δ²L, best.∑𝑤L, best.gainL, left, node.𝑗)
-                    train_nodes[leaf_count + 2] = TrainNode(node.depth + 1, best.∑δR, best.∑δ²R, best.∑𝑤R, best.gainR, right, node.𝑗)
-                    # push split Node
-                    push!(tree.nodes, TreeNode(leaf_count + 1, leaf_count + 2, best.feat, best.cond, params.K))
-                    push!(next_active_id, leaf_count + 1)
-                    push!(next_active_id, leaf_count + 2)
-                    leaf_count += 2
-                else
-                    push!(tree.nodes, TreeNode(pred_leaf(params.loss, node, params, δ²)))
-                end # end of single node split search
-            end
-        end # end of loop over active ids for a given depth
-        active_id = next_active_id
-        tree_depth += 1
-    end # end of tree growth
-    return tree
-end
-
-# extract the gain value from the vector of best splits and return the split info associated with best split
-function get_max_gain(splits::Vector{SplitInfo{L,T,S}}) where {L,T,S}
-    gains = (x -> x.gain).(splits)
-    feat = findmax(gains)[2]
-    best = splits[feat]
-    return best
-end
-
-# grow_gbtree
-function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractVector{S}, params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian};
-    X_eval::AbstractArray{R, 2} = Array{R, 2}(undef, (0,0)), Y_eval::AbstractVector{S} = Vector{S}(undef, 0),
-    early_stopping_rounds=Int(1e5), print_every_n=100, verbosity=1) where {R<:Real, S<:Real}
-
-    seed!(params.seed)
-
-    μ = ones(params.K)
-    μ .*= mean(Y)
-    if typeof(params.loss) == Logistic
-        μ .= logit.(μ)
-    elseif typeof(params.loss) == Poisson
-        μ .= log.(μ)
-    elseif typeof(params.loss) == Softmax
-        μ .*= 0.0
-    elseif typeof(params.loss) == Gaussian
-        μ = SVector{2}([mean(Y), log(var(Y))])
-    end
-
-    # initialize preds
-    pred = zeros(SVector{params.K,Float64}, size(X,1))
-    for i in eachindex(pred)
-        pred[i] += μ
-    end
-
-    # eval init
-    if size(Y_eval, 1) > 0
-        # pred_eval = ones(size(Y_eval, 1), params.K) .* μ'
-        pred_eval = zeros(SVector{params.K,Float64}, size(X_eval,1))
-        for i in eachindex(pred_eval)
-            pred_eval[i] += μ
-        end
-    end
-
-    # bias = Tree([TreeNode(SVector{1, Float64}(μ))])
-    bias = Tree([TreeNode(SVector{params.K,Float64}(μ))])
-    gbtree = GBTree([bias], params, Metric())
-
-    X_size = size(X)
-    𝑖_ = collect(1:X_size[1])
-    𝑗_ = collect(1:X_size[2])
-
-    # initialize gradients and weights
-    δ = zeros(SVector{params.K, Float64}, X_size[1])
-    δ² = zeros(SVector{params.K, Float64}, X_size[1])
-    𝑤 = zeros(SVector{1, Float64}, X_size[1])
-    for i in eachindex(𝑤)
-        𝑤[i] += ones(1)
-    end
-
-    edges = get_edges(X, params.nbins)
-    X_bin = binarize(X, edges)
-
-    # initialize train nodes
-    train_nodes = Vector{TrainNode{params.K, Float64, Int64}}(undef, 2^params.max_depth-1)
-    for node in 1:2^params.max_depth-1
-        train_nodes[node] = TrainNode(0, SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{1, Float64}(fill(-Inf, 1)), -Inf, [0], [0])
-    end
-
-    # initializde node splits info and tracks - colsample size (𝑗)
-    splits = Vector{SplitInfo{params.K, Float64, Int64}}(undef, X_size[2])
-    hist_δ = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_δ² = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_𝑤 = Vector{Vector{SVector{1, Float64}}}(undef, X_size[2])
-    for feat in 𝑗_
-        splits[feat] = SplitInfo{params.K, Float64, Int}(-Inf, SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), -Inf, -Inf, 0, feat, 0.0)
-        hist_δ[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_δ²[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_𝑤[feat] = zeros(SVector{1, Float64}, length(edges[feat]))
-    end
-
-    # initialize metric
-    if params.metric != :none
-        metric_track = Metric()
-        metric_best = Metric()
-        iter_since_best = 0
-    end
-
-    # loop over nrounds
-    for i in 1:params.nrounds
-        # select random rows and cols
-        𝑖 = 𝑖_[sample(𝑖_, ceil(Int, params.rowsample * X_size[1]), replace=false, ordered=true)]
-        𝑗 = 𝑗_[sample(𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)]
-
-        # reset gain to -Inf
-        for feat in 𝑗_
-            splits[feat].gain = -Inf
-        end
-
-        # get gradients
-        update_grads!(params.loss, params.α, pred, Y, δ, δ², 𝑤)
-        ∑δ, ∑δ², ∑𝑤 = sum(δ[𝑖]), sum(δ²[𝑖]), sum(𝑤[𝑖])
-        gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)
-
-        # assign a root and grow tree
-        train_nodes[1] = TrainNode(1, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗)
-        tree = grow_tree(δ, δ², 𝑤, hist_δ, hist_δ², hist_𝑤, params, train_nodes, splits, edges, X_bin)
-        # push new tree to model
-        push!(gbtree.trees, tree)
-        # update predictions
-        predict!(pred, tree, X)
-        # eval predictions
-        if size(Y_eval, 1) > 0
-            predict!(pred_eval, tree, X_eval)
-        end
-
-        # callback function
-        if params.metric != :none
-            if size(Y_eval, 1) > 0
-                metric_track.metric .= eval_metric(Val{params.metric}(), pred_eval, Y_eval, params.α)
-            else
-                metric_track.metric .= eval_metric(Val{params.metric}(), pred, Y, params.α)
-            end
-
-            if metric_track.metric < metric_best.metric
-                metric_best.metric .= metric_track.metric
-                metric_best.iter .= i
-            else
-                iter_since_best += 1
-            end
-
-            if mod(i, print_every_n) == 0 && verbosity > 0
-                display(string("iter:", i, ", eval: ", metric_track.metric))
-            end
-            iter_since_best >= early_stopping_rounds ? break : nothing
-        end # end of callback
-
-    end #end of nrounds
-
-    if params.metric != :none
-        gbtree.metric.iter .= metric_best.iter
-        gbtree.metric.metric .= metric_best.metric
-    end
-    return gbtree
-end
-
-# grow_gbtree - continue training
-function grow_gbtree!(model::GBTree, X::AbstractArray{R, 2}, Y::AbstractVector{S};
-    X_eval::AbstractArray{R, 2} = Array{R, 2}(undef, (0,0)), Y_eval::AbstractVector{S} = Vector{S}(undef, 0),
-    early_stopping_rounds=Int(1e5), print_every_n=100, verbosity=1) where {R<:Real, S<:Real}
-
-    params = model.params
-    seed!(params.seed)
-
-    # initialize predictions - efficiency to be improved
-    pred = zeros(SVector{params.K,Float64}, size(X,1))
-    pred_ = predict(model, X)
-    for i in eachindex(pred)
-        pred[i] = SVector{params.K,Float64}(pred_[i])
-    end
-    # eval init
-    if size(Y_eval, 1) > 0
-        pred_eval = zeros(SVector{params.K,Float64}, size(X_eval,1))
-        pred_eval_ = predict(model, X_eval)
-        for i in eachindex(pred_eval)
-            pred_eval[i] = SVector{params.K,Float64}(pred_eval_[i])
-        end
-    end
-
-    X_size = size(X)
-    𝑖_ = collect(1:X_size[1])
-    𝑗_ = collect(1:X_size[2])
-
-    # initialize gradients and weights
-    δ, δ² = zeros(SVector{params.K, Float64}, X_size[1]), zeros(SVector{params.K, Float64}, X_size[1])
-    𝑤 = zeros(SVector{1, Float64}, X_size[1]) .+ 1
-
-    edges = get_edges(X, params.nbins)
-    X_bin = binarize(X, edges)
-
-    # initialize train nodes
-    train_nodes = Vector{TrainNode{params.K, Float64, Int64}}(undef, 2^params.max_depth-1)
-    for node in 1:2^params.max_depth-1
-        train_nodes[node] = TrainNode(0, SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{1, Float64}(fill(-Inf, 1)), -Inf, [0], [0])
-    end
-
-    # initializde node splits info and tracks - colsample size (𝑗)
-    splits = Vector{SplitInfo{params.K, Float64, Int64}}(undef, X_size[2])
-    hist_δ = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_δ² = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_𝑤 = Vector{Vector{SVector{1, Float64}}}(undef, X_size[2])
-    for feat in 𝑗_
-        splits[feat] = SplitInfo{params.K, Float64, Int}(-Inf, SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), -Inf, -Inf, 0, feat, 0.0)
-        hist_δ[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_δ²[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_𝑤[feat] = zeros(SVector{1, Float64}, length(edges[feat]))
-    end
-
-    # initialize metric
-    if params.metric != :none
-        metric_track = model.metric
-        metric_best = model.metric
-        iter_since_best = 0
-    end
-
-    # loop over nrounds
-    for i in 1:params.nrounds
-        # select random rows and cols
-        𝑖 = 𝑖_[sample(𝑖_, ceil(Int, params.rowsample * X_size[1]), replace=false, ordered=true)]
-        𝑗 = 𝑗_[sample(𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)]
-
-        # reset gain to -Inf
-        for feat in 𝑗_
-            splits[feat].gain = -Inf
-        end
-
-        # get gradients
-        update_grads!(params.loss, params.α, pred, Y, δ, δ², 𝑤)
-        ∑δ, ∑δ², ∑𝑤 = sum(δ[𝑖]), sum(δ²[𝑖]), sum(𝑤[𝑖])
-        gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)
-
-        # assign a root and grow tree
-        train_nodes[1] = TrainNode(1, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗)
-        tree = grow_tree(δ, δ², 𝑤, hist_δ, hist_δ², hist_𝑤, params, train_nodes, splits, edges, X_bin)
-
-        # update push tree to model
-        push!(model.trees, tree)
-
-        # get update predictions
-        predict!(pred, tree, X)
-        # eval predictions
-        if size(Y_eval, 1) > 0
-            predict!(pred_eval, tree, X_eval)
-        end
-
-        # callback function
-        if params.metric != :none
-
-            if size(Y_eval, 1) > 0
-                metric_track.metric .= eval_metric(Val{params.metric}(), pred_eval, Y_eval, params.α)
-            else
-                metric_track.metric .= eval_metric(Val{params.metric}(), pred, Y, params.α)
-            end
-
-            if metric_track.metric < metric_best.metric
-                metric_best.metric .= metric_track.metric
-                metric_best.iter .= i
-            else
-                iter_since_best += 1
-            end
-
-            if mod(i, print_every_n) == 0 && verbosity > 0
-                display(string("iter:", i, ", eval: ", metric_track.metric))
-            end
-            iter_since_best >= early_stopping_rounds ? break : nothing
-        end
-    end #end of nrounds
-
-    if params.metric != :none
-        model.metric.iter .= metric_best.iter
-        model.metric.metric .= metric_best.metric
-    end
-    return model
-end
-
-
-
-
-# grow_gbtree
-function grow_gbtree_MLJ(X::AbstractMatrix{R}, Y::AbstractVector{S}, params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian}; verbosity=1) where {R<:Real, S<:Real}
-
-    seed!(params.seed)
-
-    μ = ones(params.K)
-    μ .*= mean(Y)
-    if typeof(params.loss) == Logistic
-        μ .= logit.(μ)
-    elseif typeof(params.loss) == Poisson
-        μ .= log.(μ)
-    elseif typeof(params.loss) == Softmax
-        μ .*= 0.0
-    elseif typeof(params.loss) == Gaussian
-        μ = SVector{2}([mean(Y), log(var(Y))])
-    end
-
-    # initialize preds
-    pred = zeros(SVector{params.K,Float64}, size(X,1))
-    for i in eachindex(pred)
-        pred[i] += μ
-    end
-
-    # bias = Tree([TreeNode(SVector{1, Float64}(μ))])
-    bias = Tree([TreeNode(SVector{params.K,Float64}(μ))])
-    gbtree = GBTree([bias], params, Metric())
-
-    X_size = size(X)
-    𝑖_ = collect(1:X_size[1])
-    𝑗_ = collect(1:X_size[2])
-
-    # initialize gradients and weights
-    δ, δ² = zeros(SVector{params.K, Float64}, X_size[1]), zeros(SVector{params.K, Float64}, X_size[1])
-    𝑤 = zeros(SVector{1, Float64}, X_size[1]) .+ 1
-
-    edges = get_edges(X, params.nbins)
-    X_bin = binarize(X, edges)
-
-    # initialize train nodes
-    train_nodes = Vector{TrainNode{params.K, Float64, Int64}}(undef, 2^params.max_depth-1)
-    for node in 1:2^params.max_depth-1
-        train_nodes[node] = TrainNode(0, SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{params.K, Float64}(fill(-Inf, params.K)), SVector{1, Float64}(fill(-Inf, 1)), -Inf, [0], [0])
-    end
-
-    # initializde node splits info and tracks - colsample size (𝑗)
-    splits = Vector{SplitInfo{params.K, Float64, Int64}}(undef, X_size[2])
-    hist_δ = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_δ² = Vector{Vector{SVector{params.K, Float64}}}(undef, X_size[2])
-    hist_𝑤 = Vector{Vector{SVector{1, Float64}}}(undef, X_size[2])
-    for feat in 𝑗_
-        splits[feat] = SplitInfo{params.K, Float64, Int}(-Inf, SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), SVector{params.K, Float64}(zeros(params.K)), SVector{params.K, Float64}(zeros(params.K)), SVector{1, Float64}(zeros(1)), -Inf, -Inf, 0, feat, 0.0)
-        hist_δ[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_δ²[feat] = zeros(SVector{params.K, Float64}, length(edges[feat]))
-        hist_𝑤[feat] = zeros(SVector{1, Float64}, length(edges[feat]))
-    end
-
-    # loop over nrounds
-    for i in 1:params.nrounds
-        # select random rows and cols
-        𝑖 = 𝑖_[sample(𝑖_, ceil(Int, params.rowsample * X_size[1]), replace=false, ordered=true)]
-        𝑗 = 𝑗_[sample(𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)]
-
-        # reset gain to -Inf
-        for feat in 𝑗_
-            splits[feat].gain = -Inf
-        end
-
-        # get gradients
-        update_grads!(params.loss, params.α, pred, Y, δ, δ², 𝑤)
-        ∑δ, ∑δ², ∑𝑤 = sum(δ[𝑖]), sum(δ²[𝑖]), sum(𝑤[𝑖])
-        gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)
-
-        # assign a root and grow tree
-        train_nodes[1] = TrainNode(1, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗)
-        tree = grow_tree(δ, δ², 𝑤, hist_δ, hist_δ², hist_𝑤, params, train_nodes, splits, edges, X_bin)
-        # push new tree to model
-        push!(gbtree.trees, tree)
-
-        # get update predictions
-        predict!(pred, tree, X)
-
-    end #end of nrounds
-
-    cache = (params=deepcopy(params), X=X, Y=Y, pred=pred, 𝑖_=𝑖_, 𝑗_=𝑗_, δ=δ, δ²=δ², 𝑤=𝑤, edges=edges, X_bin=X_bin,
-        train_nodes=train_nodes, splits=splits, hist_δ=hist_δ, hist_δ²=hist_δ², hist_𝑤=hist_𝑤)
-    # cache = (deepcopy(params), X, Y, pred, 𝑖_, 𝑗_, δ, δ², 𝑤, edges, X_bin, train_nodes, splits, hist_δ, hist_δ², hist_𝑤)
-    return gbtree, cache
-end
-
-# continue training for MLJ - continue training from same dataset - all preprocessed elements passed as cache
-function grow_gbtree_MLJ!(model::GBTree, cache; verbosity=1)
-
-    params = model.params
-
-    # initialize predictions
-    # cache_params, X, Y, pred, 𝑖_, 𝑗_, δ, δ², 𝑤, edges, X_bin, train_nodes, splits, hist_δ, hist_δ², hist_𝑤 = cache
-    train_nodes = cache.train_nodes
-    splits = cache.splits
-
-    X_size = size(cache.X_bin)
-    δnrounds = params.nrounds - cache.params.nrounds
-    # println("MLJ! δnrounds: ", δnrounds)
-
-    # loop over nrounds
-    for i in 1:δnrounds
-
-        # select random rows and cols
-        𝑖 = cache.𝑖_[sample(cache.𝑖_, ceil(Int, params.rowsample * X_size[1]), replace=false, ordered=true)]
-        𝑗 = cache.𝑗_[sample(cache.𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)]
-
-        # reset gain to -Inf
-        for feat in cache.𝑗_
-            splits[feat].gain = -Inf
-        end
-
-        # get gradients
-        update_grads!(params.loss, params.α, cache.pred, cache.Y, cache.δ, cache.δ², cache.𝑤)
-        ∑δ, ∑δ², ∑𝑤 = sum(cache.δ[𝑖]), sum(cache.δ²[𝑖]), sum(cache.𝑤[𝑖])
-        gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)
-
-        # assign a root and grow tree
-        train_nodes[1] = TrainNode(1, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗)
-        tree = grow_tree(cache.δ, cache.δ², cache.𝑤, cache.hist_δ, cache.hist_δ², cache.hist_𝑤, params, train_nodes, splits, cache.edges, cache.X_bin)
-
-        # update push tree to model
-        push!(model.trees, tree)
-
-        # get update predictions
-        predict!(cache.pred, tree, cache.X)
-
-    end #end of nrounds
-
-    cache.params.nrounds = params.nrounds
-    # cache = (deepcopy(params), X, Y, pred, 𝑖_, 𝑗_, δ, δ², 𝑤, edges, X_bin, train_nodes, splits, hist_δ, hist_δ², hist_𝑤)
-
-    # return model, cache
-    return model
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index ab572d6a..60f0d0c4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,6 +3,7 @@ using StatsBase: sample
 using Test
 using EvoTrees
 using EvoTrees: sigmoid, logit
+# using MLJBase

 # prepare a dataset
 features = rand(10_000) .* 5
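
As a usage reference for the interface changes above: a minimal sketch of driving the migrated models through MLJ machines, assuming the test setup declared in Project.toml, where MLJBase is available as a test-only dependency under [extras]. The data and the names X, y, and mach are illustrative, not taken from the patch.

using EvoTrees, MLJBase

# Hypothetical table of continuous features and a continuous target,
# matching the input/target scitypes declared via metadata_model above.
X = MLJBase.table(rand(1_000, 5))
y = rand(1_000)

model = EvoTreeRegressor(nrounds = 10)

# machine/fit! route into the MLJModelInterface.fit method defined in src/MLJ.jl.
mach = machine(model, X, y)
fit!(mach)

# Raising nrounds triggers MLJModelInterface.update; okay_to_continue then
# resumes boosting from the cached state instead of refitting from scratch.
model.nrounds += 10
fit!(mach)

# Qualified call: both EvoTrees and MLJBase export `predict`, so the
# unqualified name would be ambiguous in this session.
pred = MLJBase.predict(mach, X)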