Skip to content

Commit

Permalink
Merge pull request #48 from Evovest/float32
Browse files Browse the repository at this point in the history
Float32
  • Loading branch information
jeremiedb authored Apr 19, 2020
2 parents 4cd2e9e + 0292bd1 commit 4235aa2
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 80 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.4.6"
version = "0.4.7"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -14,10 +14,10 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
CategoricalArrays = "0.7"
Distributions = "0.22"
Distributions = "0.22, 0.23"
MLJModelInterface = "0.1, 0.2"
StaticArrays = "0.12"
StatsBase = "0.32"
StatsBase = "0.32, 0.33"
julia = "1"

[extras]
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ julia> Pkg.add("EvoTrees")

[Benchmark](https://github.com/Evovest/EvoTrees.jl/blob/master/blog/benchmarks.jl) for 100 iterations on randomly generated data:

| Dimensions / Algo | XGBoost Exact | XGBoost Hist | EvoTrees | |
|-------------------|:-------------:|:------------:|:--------:|---|
| 10K x 100 | 1.18s | 2.15s | 0.52s | |
| 100K x 100 | 9.39s | 4.25s | 2.02s | |
| 1M X 100 | 146.5s | 20.2s | 22.5 | |
| Dimensions / Algo | XGBoost Exact | XGBoost Hist | EvoTrees |
|-------------------|:-------------:|:------------:|:--------:|
| 10K x 100 | 1.18s | 2.15s | 0.52s |
| 100K x 100 | 9.39s | 4.25s | 2.02s |
| 1M x 100 | 146.5s | 20.2s | 21.5s |


## Parameters
Expand Down
8 changes: 3 additions & 5 deletions blog/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@ using EvoTrees
using BenchmarkTools

# prepare a dataset
X = rand(Int(2.e6), 100)
Y = rand(size(X, 1))

#######################
# EvoTrees
#######################
using EvoTrees

X = rand(Int(1.e5), 100)
Y = rand(size(X, 1))

config = EvoTreeRegressor(
loss=:linear, metric=:none,
Expand All @@ -38,5 +36,5 @@ param = ["max_depth" => 5,
"max_bin" => 32]
metrics = ["rmse"]

@time model_xgb = xgboost(X, num_round, label = Y, param = param, metrics=metrics, silent=1);
@time model_xgb = xgboost(X, num_round, label = Y, param = param, silent=1);
@time pred = XGBoost.predict(model_xgb, X)
27 changes: 14 additions & 13 deletions experiments/benchmarks.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
using Statistics
using StatsBase: sample
using XGBoost
# using XGBoost
using Revise
using EvoTrees
using BenchmarkTools

# prepare a dataset
features = rand(Int(1.25e5), 100)
features = rand(Int(2.25e6), 100)
# features = rand(100, 10)
X = features
Y = rand(size(X, 1))
Expand Down Expand Up @@ -40,15 +41,16 @@ metrics = ["rmse"]

# train model
config = EvoTreeRegressor(
loss=:linear, metric=:none,
nrounds=100,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)
loss=:linear, metric=:none,
nrounds=100, α = 0.5,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)

# for 100k: 410.477 ms (44032 allocations: 182.68 MiB)
# for 1.25e6: 6.964114 seconds (6.05 M allocations: 2.350 GiB, 2.82% gc time)
# for 1.25e6 no eval: 6.200 s (44330 allocations: 2.19 GiB)
# for 1.25e5 init_evotree: 2.009 s 0.322925 seconds (2.53 k allocations: 167.345 MiB)
# for 1.25e5 no eval iter 100: 2.009 s (628514 allocations: 720.62 MiB)
# for 1.25e6 no eval iter 10: 6.200 s (44330 allocations: 2.19 GiB)
# for 1.25e6 no eval iter 100: 19.481940 seconds (635.33 k allocations: 6.679 GiB, 3.11% gc time)
# for 1.25e6 mse with eval data: 6.321 s (45077 allocations: 2.19 GiB)
@time model, cache = init_evotree(config, X_train, Y_train);
@time grow_evotree!(model, cache);
Expand Down Expand Up @@ -104,16 +106,15 @@ metrics = ["logloss"]
@time bst = xgboost(train_X, num_round, label = train_Y, eta = 0.1, max_depth = 3, metrics = metrics, silent=0, objective = "binary:logistic")
features_xgb = XGBoost.importance(bst)

X_train = Float64.(train_X)
Y_train = Float64.(train_Y)
params1 = EvoTreeRegressor(
loss=:logistic, metric=:logloss,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 4, min_weight = 1.0,
rowsample=1.0, colsample=1.0, nbins=250)

@time model = fit_evotree(params1, X_train, Y_train, print_every_n=50);
@time model = fit_evotree(params1, train_X, train_Y, print_every_n=20);
@time model = fit_evotree(params1, X_train, Y_train, X_eval=test_X, Y_eval=test_Y, print_every_n=20);
@time pred_train = EvoTrees.predict(model, X_train)
features_evo = importance(model, 1:size(X_train,2))
sort(collect(values(features_evo)))
32 changes: 16 additions & 16 deletions experiments/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,46 +79,46 @@ mean(abs.(pred_train .- Y_train))
params1 = EvoTreeRegressor(
loss=:logistic, metric=:logloss,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)
λ = 0.0f0, γ=0.0f0, η=0.1f0,
max_depth = 6, min_weight = 1.0f0,
rowsample=0.5f0, colsample=0.5f0, α=0.5f0, nbins=32)
@time model = fit_evotree(params1, X_train, Y_train);
@time model = fit_evotree(params1, X_train, Y_train, X_eval = X_eval, Y_eval = Y_eval, print_every_n=10)
@time pred_train = predict(model, X_train)

# Quantile
params1 = EvoTreeRegressor(
loss=:quantile, metric=:quantile, α=0.80,
loss=:quantile, metric=:quantile, α=0.80f0,
nrounds=100,
λ = 0.1, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)
λ = 0.1f0, γ=0.0f0, η=0.1f0,
max_depth = 6, min_weight = 1.0f0,
rowsample=0.5f0, colsample=0.5f0, nbins=32)
@time model = fit_evotree(params1, X_train, Y_train);
@time model = fit_evotree(params1, X_train, Y_train, X_eval = X_eval, Y_eval = Y_eval, print_every_n=10)
@time pred_train = predict(model, X_train)

# gaussian
params1 = EvoTreeGaussian(
loss=:gaussian, metric=:gaussian,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 6, min_weight = 10.0,
rowsample=0.5, colsample=0.5, nbins=32)
nrounds=100, α=0.5f0,
λ = 0.0f0, γ=0.0f0, η=0.1f0,
max_depth = 6, min_weight = 10.0f0,
rowsample=0.5f0, colsample=0.5f0, nbins=32)
@time model = fit_evotree(params1, X_train, Y_train);
@time model = fit_evotree(params1, X_train, Y_train, X_eval = X_eval, Y_eval = Y_eval, print_every_n=10)
@time pred_train = predict(model, X_train)

# softmax
params1 = EvoTreeClassifier(
loss=:softmax, metric=:mlogloss,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 6, min_weight = 10.0,
rowsample=0.5, colsample=0.5, nbins=32)
nrounds=100, α=0.5f0,
λ=0.0f0, γ=0.0f0, η=0.1f0,
max_depth = 6, min_weight = 10.0f0,
rowsample=0.5f0, colsample=0.5f0, nbins=32)

Y_train_int = UInt32.(round.(Y_train*2) .+ 1)
Y_eval_int = UInt32.(round.(Y_eval*2) .+ 1)
Y_train_int = Int.(Y_train_int)
@time model = fit_evotree(params1, X_train, Y_train_int);
@time model = fit_evotree(params1, X_train, Y_train_int, print_every_n=10);
@time model = fit_evotree(params1, X_train, Y_train_int, X_eval = X_eval, Y_eval = Y_eval_int, print_every_n=10)
@time pred_train = predict(model, X_train)
2 changes: 1 addition & 1 deletion src/find_split.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Get the breaking points
#############################################
function get_edges(X::Matrix{T}, nbins=250) where {T}
edges = Vector{Vector{T}}(undef, size(X,2))
edges = Vector{Vector{Float32}}(undef, size(X,2))
@threads for i in 1:size(X, 2)
edges[i] = quantile(view(X, :,i), (1:nbins)/nbins)
if length(edges[i]) == 0
Expand Down
51 changes: 27 additions & 24 deletions src/fit.jl
Original file line number Diff line number Diff line change
@@ -1,53 +1,55 @@
# initialise evotree
function init_evotree(params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian},
X::AbstractMatrix{R}, Y::AbstractVector{S}; verbosity=1) where {R<:Real, S}
function init_evotree(params::EvoTypes,
X::AbstractMatrix{R}, Y::AbstractVector{S}; verbosity=1) where {R,S}

seed!(params.seed)

K = 1
levels = ""
if typeof(params.loss) == Logistic
Y = Float32.(Y)
μ = fill(logit(mean(Y)), 1)
elseif typeof(params.loss) == Poisson
Y = Float64.(Y)
Y = Float32.(Y)
μ = fill(log(mean(Y)), 1)
elseif typeof(params.loss) == Softmax
if typeof(Y) <: AbstractCategoricalVector
levels = CategoricalArray(CategoricalArrays.levels(Y))
K = length(levels)
μ = zeros(K)
μ = zeros(Float32, K)
Y = MLJModelInterface.int.(Y)
else
levels = CategoricalArray(sort(unique(Y)))
K = length(levels)
μ = zeros(K)
μ = zeros(Float32, K)
Y = UInt32.(Y)
end
elseif typeof(params.loss) == Gaussian
K = 2
Y = Float32.(Y)
μ = SVector{2}([mean(Y), log(var(Y))])
else
Y = Float32.(Y)
μ = fill(mean(Y), 1)
end

# initialize preds
pred = zeros(SVector{K,Float64}, size(X,1))
pred = zeros(SVector{K,Float32}, size(X,1))
for i in eachindex(pred)
pred[i] += μ
end

# bias = Tree([TreeNode(SVector{1, Float64}(μ))])
bias = Tree([TreeNode(SVector{K,Float64}(μ))])
bias = Tree([TreeNode(SVector{K,Float32}(μ))])
evotree = GBTree([bias], params, Metric(), K, levels)

X_size = size(X)
𝑖_ = collect(1:X_size[1])
𝑗_ = collect(1:X_size[2])

# initialize gradients and weights
δ, δ² = zeros(SVector{evotree.K, Float64}, X_size[1]), zeros(SVector{evotree.K, Float64}, X_size[1])
𝑤 = zeros(SVector{1, Float64}, X_size[1])
𝑤_ini = SVector{1, Float64}(1)
δ, δ² = zeros(SVector{evotree.K, Float32}, X_size[1]), zeros(SVector{evotree.K, Float32}, X_size[1])
𝑤 = zeros(SVector{1, Float32}, X_size[1])
𝑤_ini = SVector{1, Float32}(1)
for i in 1:length(𝑤)
𝑤[i] += 𝑤_ini
end
Expand All @@ -58,24 +60,24 @@ function init_evotree(params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassif


# initialize histograms
hist_δ = Vector{Matrix{SVector{evotree.K, Float64}}}(undef, 2^params.max_depth-1)
hist_δ² = Vector{Matrix{SVector{evotree.K, Float64}}}(undef, 2^params.max_depth-1)
hist_𝑤 = Vector{Matrix{SVector{1, Float64}}}(undef, 2^params.max_depth-1)
hist_δ = Vector{Matrix{SVector{evotree.K, Float32}}}(undef, 2^params.max_depth-1)
hist_δ² = Vector{Matrix{SVector{evotree.K, Float32}}}(undef, 2^params.max_depth-1)
hist_𝑤 = Vector{Matrix{SVector{1, Float32}}}(undef, 2^params.max_depth-1)

# initialize train nodes
train_nodes = Vector{TrainNode{evotree.K, Float64, Int64}}(undef, 2^params.max_depth-1)
train_nodes = Vector{TrainNode{evotree.K, Float32, Int64}}(undef, 2^params.max_depth-1)

for node in 1:2^params.max_depth-1
train_nodes[node] = TrainNode(0, 0, SVector{evotree.K, Float64}(fill(-Inf, evotree.K)), SVector{evotree.K, Float64}(fill(-Inf, evotree.K)), SVector{1, Float64}(fill(-Inf, 1)), -Inf, [0], [0])
train_nodes[node] = TrainNode(0, 0, SVector{evotree.K, Float32}(fill(Float32(-Inf), evotree.K)), SVector{evotree.K, Float32}(fill(Float32(-Inf), evotree.K)), SVector{1, Float32}(fill(Float32(-Inf), 1)), Float32(-Inf), [0], [0])

hist_δ[node] = zeros(SVector{evotree.K, Float64}, params.nbins, X_size[2])
hist_δ²[node] = zeros(SVector{evotree.K, Float64}, params.nbins, X_size[2])
hist_𝑤[node] = zeros(SVector{1, Float64}, params.nbins, X_size[2])
hist_δ[node] = zeros(SVector{evotree.K, Float32}, params.nbins, X_size[2])
hist_δ²[node] = zeros(SVector{evotree.K, Float32}, params.nbins, X_size[2])
hist_𝑤[node] = zeros(SVector{1, Float32}, params.nbins, X_size[2])
end

splits = Vector{SplitInfo{evotree.K, Float64, Int64}}(undef, X_size[2])
splits = Vector{SplitInfo{evotree.K, Float32, Int64}}(undef, X_size[2])
for feat in 𝑗_
splits[feat] = SplitInfo{evotree.K, Float64, Int}(-Inf, SVector{evotree.K, Float64}(zeros(evotree.K)), SVector{evotree.K, Float64}(zeros(evotree.K)), SVector{1, Float64}(zeros(1)), SVector{evotree.K, Float64}(zeros(evotree.K)), SVector{evotree.K, Float64}(zeros(evotree.K)), SVector{1, Float64}(zeros(1)), -Inf, -Inf, 0, feat, 0.0)
splits[feat] = SplitInfo{evotree.K, Float32, Int}(Float32(-Inf), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{1, Float32}(zeros(1)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{1, Float32}(zeros(1)), Float32(-Inf), Float32(-Inf), 0, feat, 0.0)
end

cache = (params=deepcopy(params),
Expand Down Expand Up @@ -108,7 +110,7 @@ function grow_evotree!(evotree::GBTree, cache; verbosity=1)
𝑗 = cache.𝑗_[sample(cache.𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)]
# reset gain to -Inf
for feat in cache.𝑗_
splits[feat].gain = -Inf
splits[feat].gain = Float32(-Inf)
end

# build a new tree
Expand All @@ -132,7 +134,7 @@ end
# grow a single tree
function grow_tree(δ, δ², 𝑤,
hist_δ, hist_δ², hist_𝑤,
params::Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian},
params::EvoTypes,
train_nodes::Vector{TrainNode{L,T,S}},
splits::Vector{SplitInfo{L,T,Int}},
edges, X_bin) where {R<:Real, T<:AbstractFloat, S<:Int, L}
Expand Down Expand Up @@ -216,6 +218,7 @@ function fit_evotree(params, X_train, Y_train;

if params.metric != :none && X_eval !== nothing
pred_eval = predict(model.trees[1], X_eval, model.K)
Y_eval = convert.(eltype(cache.Y), Y_eval)
end

while model.params.nrounds < nrounds_max && iter_since_best < early_stopping_rounds
Expand All @@ -227,7 +230,7 @@ function fit_evotree(params, X_train, Y_train;
predict!(pred_eval, model.trees[model.params.nrounds+1], X_eval)
metric_track.metric = eval_metric(Val{params.metric}(), pred_eval, Y_eval, params.α)
else
metric_track.metric = eval_metric(Val{params.metric}(), cache.pred, Y_train, params.α)
metric_track.metric = eval_metric(Val{params.metric}(), cache.pred, cache.Y, params.α)
end
if metric_track.metric < metric_best.metric
metric_best.metric = metric_track.metric
Expand Down
16 changes: 8 additions & 8 deletions src/models.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ function EvoTreeRegressor(;
elseif loss == :quantile model_type = Quantile()
end

model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeRegressor(model_type, nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)

return model
end
Expand Down Expand Up @@ -87,7 +87,7 @@ function EvoTreeCount(;
seed=444)

if loss == :poisson model_type = Poisson() end
model = EvoTreeCount(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeCount(Poisson(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)

return model
end
Expand Down Expand Up @@ -125,7 +125,7 @@ function EvoTreeClassifier(;
seed=444)

if loss == :softmax model_type = Softmax() end
model = EvoTreeClassifier(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeClassifier(Softmax(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)

return model
end
Expand Down Expand Up @@ -163,7 +163,7 @@ function EvoTreeGaussian(;
seed=444)

if loss == :gaussian model_type = Gaussian() end
model = EvoTreeGaussian(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeGaussian(Gaussian(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)

return model
end
Expand Down Expand Up @@ -191,13 +191,13 @@ function EvoTreeRModels(
elseif loss == :quantile model_type = Quantile()
elseif loss == :L1 model_type = L1()
end
model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeRegressor(model_type, nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)
elseif loss == :poisson
model = EvoTreeCount(Poisson(), nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeCount(Poisson(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)
elseif loss == :softmax
model = EvoTreeClassifier(Softmax(), nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeClassifier(Softmax(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)
elseif loss == :gaussian
model = EvoTreeGaussian(Gaussian(), nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric, seed)
model = EvoTreeGaussian(Gaussian(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed)
else
throw("invalid loss")
end
Expand Down
Loading

0 comments on commit 4235aa2

Please sign in to comment.