new default params: nrounds=100, nbins=64, max_depth=6
jeremiedb committed Aug 12, 2023
1 parent 3baf19e commit a70d925
Showing 4 changed files with 162 additions and 18 deletions.
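In effect, a bare constructor now builds a larger model out of the box. A minimal sketch of the new defaults (field access assumed to mirror the keyword names in models.jl below):

using EvoTrees

config = EvoTreeRegressor()
config.nrounds   # 100 (was 10)
config.max_depth # 6 (was 5)
config.nbins     # 64 (was 32)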
Project.toml (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.15.3"
version = "0.16.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
experiments/shuffling.jl (144 changes: 144 additions & 0 deletions)
@@ -0,0 +1,144 @@
using DataFrames
using Distributions
using EvoTrees
using LinearAlgebra
using GLM
using Random

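# covariance design: three blocks of three near-duplicate features
# (off-diagonal covariance 1 - δ within a block, 0 across blocks),
# plus a target whose covariance with the blocks scales by 0.8, 1.0, and 1.2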
δ = 1.0e-6
b = fill(1.0 - δ, 3, 3) + δ * I
z = zeros(3, 3)
y = fill(0.5, 3)
dist = MvNormal([
b z z 0.8*y
z b z y
z z b 1.2*y
0.8*y' y' 1.2*y' 1.0])
Random.seed!(1)
mat = rand(dist, 10_000);
df = DataFrame(transpose(mat), [string.("x", 1:9); "y"]);
target_name = "y"

#################################
# Tables API
#################################
config = EvoTreeRegressor(seed=123)
m1 = fit_evotree(config,
df;
target_name="y",
verbosity=0);
EvoTrees.importance(m1)

config = EvoTreeRegressor(seed=124)
m2 = fit_evotree(config,
df;
target_name="y",
verbosity=0);
EvoTrees.importance(m2)

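# sketch (not part of the committed script): with the same data and the same
# seed, two fits are expected to agree - a baseline determinism check
config = EvoTreeRegressor(seed=123)
ma = fit_evotree(config, df; target_name="y", verbosity=0);
mb = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(ma) == EvoTrees.importance(mb) # expected: true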
# permuted tables don't return the same result - numerical rounding error?
df2 = df[!, 10:-1:1]
config = EvoTreeRegressor()
m3 = fit_evotree(config,
df2;
target_name="y",
verbosity=0);
EvoTrees.importance(m3)

# manual check on col permutations
config = EvoTreeRegressor(max_depth=4)
m1, cache1 = EvoTrees.init(config, df; target_name);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

df2 = df[!, 10:-1:1];
config = EvoTreeRegressor(max_depth=4)
m2, cache2 = EvoTrees.init(config, df2; target_name);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

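# binned features and edges should match up to the column reversal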
all(cache1.x_bin .== cache2.x_bin[:, 9:-1:1])
all(cache1.edges .== cache2.edges[9:-1:1])
m1.trees[2]
m2.trees[2]

m1.trees[2].feat
m2.trees[2].feat

Int.(m1.trees[2].cond_bin)
Int.(m2.trees[2].cond_bin)


config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0)
m3 = fit_evotree(config,
df;
target_name="y",
verbosity=0);
EvoTrees.importance(m3)

#################################
# Tables API - colsample=0.5
#################################
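# with colsample=0.5 and no fixed seed, the two fits below are expected to differ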
config = EvoTreeRegressor(colsample=0.5)
m1 = fit_evotree(config,
df;
target_name="y",
verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config,
df;
target_name="y",
verbosity=0);
EvoTrees.importance(m2)

#################################
# Matrix API
#################################
x_train = Matrix(mat[1:9, :]')
y_train = mat[10, :]

config = EvoTreeRegressor()
m1 = fit_evotree(config;
x_train,
y_train,
verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config;
x_train,
y_train,
verbosity=0);
EvoTrees.importance(m2)

# linear-model baseline with GLM (already imported above); note that lm on a
# raw matrix fits without an intercept unless a column of ones is appended
x_train = Matrix(mat[1:9, :]')
y_train = mat[10, :]
lm(x_train, y_train)

#################################
# Matrix debug API
#################################
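# two independent init/grow cycles on the same matrix data; importances should
# match if the full pipeline is deterministic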
x_train = Matrix(mat[1:9, :]')
y_train = mat[10, :]

config = EvoTreeRegressor()
m1, cache1 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

m2, cache2 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

using MLJ
using EvoTrees
using MLJLinearModels
X, y = make_regression()
model = Stack(
metalearner=LinearRegressor(),
resampling=CV(nfolds=2),
tree=EvoTreeRegressor()
)
mach = machine(model, X, y)
fit!(mach)
src/fit-utils.jl (4 changes: 2 additions & 2 deletions)
@@ -6,7 +6,7 @@ Get the breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
nobs = min(size(X, 1), 1000 * nbins)
- idx = rand(rng, 1:size(X, 1), nobs)
+ idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
edges = Vector{Vector{T}}(undef, nfeats)
featbins = Vector{UInt8}(undef, nfeats)
@@ -25,7 +25,7 @@ end
function get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
_nobs = length(Tables.getcolumn(df, 1))
nobs = min(_nobs, 1000 * nbins)
- idx = rand(rng, 1:_nobs, nobs)
+ idx = sample(rng, 1:_nobs, nobs, replace=false, ordered=true)
edges = Vector{Any}([Vector{eltype(Tables.getcolumn(df, col))}() for col in fnames])
nfeats = length(fnames)
featbins = Vector{UInt8}(undef, nfeats)
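The substantive change here swaps row sampling with replacement for an ordered, duplicate-free subsample when computing bin edges. A standalone sketch of the difference (assuming `sample` is StatsBase's, as the unqualified call in the source suggests):

using Random, StatsBase

rng = Random.MersenneTwister(1)
# old: with replacement, unordered - duplicate row indices possible
idx_old = rand(rng, 1:10_000, 1_000)
# new: without replacement, ordered - unique, sorted row indices
idx_new = sample(rng, 1:10_000, 1_000, replace=false, ordered=true)
@assert allunique(idx_new) && issorted(idx_new)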
src/models.jl (30 changes: 15 additions & 15 deletions)
@@ -61,15 +61,15 @@ function EvoTreeRegressor(; kwargs...)
# defaults arguments
args = Dict{Symbol,Any}(
:loss => :mse,
- :nrounds => 10,
+ :nrounds => 100,
:lambda => 0.0,
:gamma => 0.0, # min gain to split
:eta => 0.1, # learning rate
- :max_depth => 5,
+ :max_depth => 6,
:min_weight => 1.0, # minimal weight, different from xgboost (but same for linear)
:rowsample => 1.0,
:colsample => 1.0,
- :nbins => 32,
+ :nbins => 64,
:alpha => 0.5,
:monotone_constraints => Dict{Int,Int}(),
:tree_type => "binary",
@@ -151,15 +151,15 @@ function EvoTreeCount(; kwargs...)

# defaults arguments
args = Dict{Symbol,Any}(
- :nrounds => 10,
+ :nrounds => 100,
:lambda => 0.0,
:gamma => 0.0, # min gain to split
:eta => 0.1, # learning rate
- :max_depth => 5,
+ :max_depth => 6,
:min_weight => 1.0, # minimal weight, different from xgboost (but same for linear)
:rowsample => 1.0,
:colsample => 1.0,
- :nbins => 32,
+ :nbins => 64,
:alpha => 0.5,
:monotone_constraints => Dict{Int,Int}(),
:tree_type => "binary",
@@ -217,15 +217,15 @@ function EvoTreeClassifier(; kwargs...)

# defaults arguments
args = Dict{Symbol,Any}(
- :nrounds => 10,
+ :nrounds => 100,
:lambda => 0.0,
:gamma => 0.0, # min gain to split
:eta => 0.1, # learning rate
- :max_depth => 5,
+ :max_depth => 6,
:min_weight => 1.0, # minimal weight, different from xgboost (but same for linear)
:rowsample => 1.0,
:colsample => 1.0,
- :nbins => 32,
+ :nbins => 64,
:alpha => 0.5,
:tree_type => "binary",
:rng => 123,
@@ -283,15 +283,15 @@ function EvoTreeMLE(; kwargs...)
# defaults arguments
args = Dict{Symbol,Any}(
:loss => :gaussian_mle,
- :nrounds => 10,
+ :nrounds => 100,
:lambda => 0.0,
:gamma => 0.0, # min gain to split
:eta => 0.1, # learning rate
- :max_depth => 5,
+ :max_depth => 6,
:min_weight => 8.0, # minimal weight, different from xgboost (but same for linear)
:rowsample => 1.0,
:colsample => 1.0,
- :nbins => 32,
+ :nbins => 64,
:alpha => 0.5,
:monotone_constraints => Dict{Int,Int}(),
:tree_type => "binary",
@@ -366,15 +366,15 @@ function EvoTreeGaussian(; kwargs...)

# defaults arguments
args = Dict{Symbol,Any}(
- :nrounds => 10,
+ :nrounds => 100,
:lambda => 0.0,
:gamma => 0.0, # min gain to split
:eta => 0.1, # learning rate
- :max_depth => 5,
+ :max_depth => 6,
:min_weight => 8.0, # minimal weight, different from xgboost (but same for linear)
:rowsample => 1.0,
:colsample => 1.0,
- :nbins => 32,
+ :nbins => 64,
:alpha => 0.5,
:monotone_constraints => Dict{Int,Int}(),
:tree_type => "binary",
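All five constructors move to the same new defaults; explicit keyword arguments still override them, so the previous behaviour stays one call away (a sketch):

# reproduce the pre-0.16 defaults explicitly
config = EvoTreeGaussian(nrounds=10, max_depth=5, nbins=32)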
