diff --git a/Project.toml b/Project.toml index cde30189..a78cda25 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = ["jeremiedb "] -version = "0.15.1" +version = "0.15.2" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/README.md b/README.md index 785776bf..fcbe6a98 100644 --- a/README.md +++ b/README.md @@ -46,27 +46,27 @@ Code to reproduce is availabe in [`benchmarks/regressor.jl`](https://github.com/ - Julia: v1.9.1. - Algorithms - XGBoost: v2.3.0 (Using the `hist` algorithm). - - EvoTrees: v0.15.0. + - EvoTrees: v0.15.2. ### Training: | Dimensions / Algo | XGBoost CPU | EvoTrees CPU | XGBoost GPU | EvoTrees GPU | |---------------------|:-----------:|:------------:|:-----------:|:------------:| -| 100K x 100 | 2.33s | 1.09s | 0.90s | 2.72s | -| 500K x 100 | 10.7s | 2.96s | 1.84s | 3.65s | -| 1M x 100 | 20.9s | 6.48s | 3.10s | 4.45s | -| 5M x 100 | 108s | 35.8s | 12.9s | 12.7s | -| 10M x 100 | 216s | 71.6s | 25.5s | 23.0s | +| 100K x 100 | 2.34s | 1.01s | 0.90s | 2.61s | +| 500K x 100 | 10.7s | 3.95s | 1.84s | 3.41s | +| 1M x 100 | 21.1s | 6.57s | 3.10s | 4.47s | +| 5M x 100 | 108s | 36.1s | 12.9s | 12.5s | +| 10M x 100 | 218s | 72.6s | 25.5s | 23.0s | ### Inference: | Dimensions / Algo | XGBoost CPU | EvoTrees CPU | XGBoost GPU | EvoTrees GPU | |---------------------|:------------:|:------------:|:-----------:|:------------:| -| 100K x 100 | 0.151s | 0.053s | NA | 0.036s | -| 500K x 100 | 0.628s | 0.276s | NA | 0.169s | -| 1M x 100 | 1.26s | 0.558s | NA | 0.334s | +| 100K x 100 | 0.151s | 0.058s | NA | 0.045s | +| 500K x 100 | 0.647s | 0.248s | NA | 0.172s | +| 1M x 100 | 1.26s | 0.573s | NA | 0.327s | | 5M x 100 | 6.04s | 2.87s | NA | 1.66s | -| 10M x 100 | 12.4s | 5.71s | NA | 3.31s | +| 10M x 100 | 12.4s | 5.71s | NA | 3.40s | ## MLJ Integration diff --git a/benchmarks/regressor.jl b/benchmarks/regressor.jl index 278462bd..b78b38af 100644 --- a/benchmarks/regressor.jl +++ b/benchmarks/regressor.jl @@ -8,12 +8,17 @@ using BenchmarkTools using Random: seed! import CUDA +### v.0.15.1 +# desktop | 1e6 | depth 11 | cpu: 37.2s +# desktop | 10e6 | depth 11 | cpu + +### perf depth # desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s # desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s nobs = Int(1e6) num_feat = Int(100) nrounds = 200 -max_depth = 11 +max_depth = 6 tree_type = "binary" T = Float64 nthread = Base.Threads.nthreads() @@ -120,14 +125,11 @@ device = "cpu" # @time m_evo = fit_evotree(params_evo; x_train, y_train, device, verbosity, print_every_n=100); @info "train - eval" @time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_eval=y_train, metric=metric_evo, device, verbosity, print_every_n=100); -# using Plots -# plot(m_evo, 2) - @time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_eval=y_train, metric=metric_evo, device, verbosity, print_every_n=100); @info "predict" @time pred_evo = m_evo(x_train); @time pred_evo = m_evo(x_train); -@btime m_evo($x_train); +# @btime m_evo($x_train); @info "EvoTrees GPU" device = "gpu" @@ -142,4 +144,4 @@ CUDA.@time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_e @info "predict" CUDA.@time pred_evo = m_evo(x_train; device); CUDA.@time pred_evo = m_evo(x_train; device); -@btime m_evo($x_train; device); +# @btime m_evo($x_train; device); diff --git a/docs/src/assets/regression-sinus-binary.png b/docs/src/assets/regression-sinus-binary.png index b442732c..c81dc366 100644 Binary files a/docs/src/assets/regression-sinus-binary.png and b/docs/src/assets/regression-sinus-binary.png differ diff --git a/experiments/readme_plots_cpu.jl b/experiments/readme_plots_cpu.jl index e431f3d9..7c15e433 100644 --- a/experiments/readme_plots_cpu.jl +++ b/experiments/readme_plots_cpu.jl @@ -90,7 +90,7 @@ params1 = EvoTreeRegressor(; nbins=64, lambda=0.1, gamma=0.1, - eta=0.05, + eta=0.1, max_depth=6, min_weight=1.0, rowsample=0.5, @@ -132,7 +132,7 @@ params1 = EvoTreeRegressor(; nbins=64, lambda=0.1, gamma=0.1, - eta=0.05, + eta=0.1, max_depth=6, min_weight=1.0, rowsample=0.5, @@ -288,7 +288,7 @@ params1 = EvoTreeRegressor(; loss=:tweedie, nrounds=500, nbins=64, - lambda=0.5, + lambda=0.1, gamma=0.1, eta=0.1, max_depth=6, @@ -359,7 +359,7 @@ params1 = EvoTreeRegressor(; nbins=64, lambda=0.1, gamma=0.0, - eta=0.05, + eta=0.1, max_depth=6, min_weight=1.0, rowsample=0.5, @@ -389,7 +389,7 @@ params1 = EvoTreeRegressor(; nbins=64, lambda=0.1, gamma=0.0, - eta=0.05, + eta=0.1, max_depth=6, min_weight=1.0, rowsample=0.5, @@ -408,7 +408,7 @@ params1 = EvoTreeRegressor(; nbins=64, lambda=0.1, gamma=0.0, - eta=0.05, + eta=0.1, max_depth=6, min_weight=1.0, rowsample=0.5, @@ -466,10 +466,10 @@ params1 = EvoTreeMLE(; nbins=64, lambda=0.1, gamma=0.1, - eta=0.05, + eta=0.1, max_depth=6, - min_weight=10.0, - rowsample=1.0, + min_weight=10, + rowsample=0.5, colsample=1.0, rng=123, tree_type, @@ -549,12 +549,12 @@ params1 = EvoTrees.EvoTreeMLE(; loss=:logistic, nrounds=500, nbins=64, - lambda=1.0, + lambda=0.1, gamma=0.1, - eta=0.03, + eta=0.1, max_depth=6, - min_weight=1.0, - rowsample=1.0, + min_weight=10, + rowsample=0.5, colsample=1.0, tree_type, rng=123, diff --git a/experiments/readme_plots_gpu.jl b/experiments/readme_plots_gpu.jl index 7fdeb244..33b89c92 100644 --- a/experiments/readme_plots_gpu.jl +++ b/experiments/readme_plots_gpu.jl @@ -249,7 +249,7 @@ params1 = EvoTreeGaussian(; gamma=0.1, eta=0.1, max_depth=6, - min_weight=20, + min_weight=10, rowsample=0.5, colsample=1.0, rng=123, diff --git a/figures/regression-sinus-binary-gpu.png b/figures/regression-sinus-binary-gpu.png index e50b6ef9..1be5632a 100644 Binary files a/figures/regression-sinus-binary-gpu.png and b/figures/regression-sinus-binary-gpu.png differ diff --git a/figures/regression-sinus-binary.png b/figures/regression-sinus-binary.png index b442732c..c81dc366 100644 Binary files a/figures/regression-sinus-binary.png and b/figures/regression-sinus-binary.png differ diff --git a/figures/regression-sinus-oblivious-gpu.png b/figures/regression-sinus-oblivious-gpu.png index 0a67e8b4..44644279 100644 Binary files a/figures/regression-sinus-oblivious-gpu.png and b/figures/regression-sinus-oblivious-gpu.png differ diff --git a/figures/regression-sinus-oblivious.png b/figures/regression-sinus-oblivious.png index 635a35e2..358d6749 100644 Binary files a/figures/regression-sinus-oblivious.png and b/figures/regression-sinus-oblivious.png differ diff --git a/figures/regression-sinus2-binary.png b/figures/regression-sinus2-binary.png index 888638f2..964652e9 100644 Binary files a/figures/regression-sinus2-binary.png and b/figures/regression-sinus2-binary.png differ diff --git a/figures/regression-sinus2-oblivious.png b/figures/regression-sinus2-oblivious.png index b6c32f1c..2a1eb75f 100644 Binary files a/figures/regression-sinus2-oblivious.png and b/figures/regression-sinus2-oblivious.png differ diff --git a/src/fit-utils.jl b/src/fit-utils.jl index 97d456dc..f49163a4 100644 --- a/src/fit-utils.jl +++ b/src/fit-utils.jl @@ -172,58 +172,32 @@ function split_set_threads!( lefts = zeros(Int, nblocks) rights = zeros(Int, nblocks) - if nblocks == 1 - lefts[1], rights[1] = split_set_chunk!( - left, - right, - is, - 1, - nblocks, - x_bin, - feat, - cond_bin, - feattype, - offset, - chunk_size, - ) - else - @threads :static for bid = 1:nblocks - lefts[bid], rights[bid] = split_set_chunk!( - left, - right, - is, - bid, - nblocks, - x_bin, - feat, - cond_bin, - feattype, - offset, - chunk_size, - ) + @sync begin + for bid = 1:nblocks + @spawn begin + lefts[bid], rights[bid] = split_set_chunk!( + left, + right, + is, + bid, + nblocks, + x_bin, + feat, + cond_bin, + feattype, + offset, + chunk_size, + ) + end end end sum_lefts = sum(lefts) cumsum_lefts = cumsum(lefts) cumsum_rights = cumsum(rights) - if nblocks == 1 - split_views_kernel!( - out, - left, - right, - 1, - offset, - chunk_size, - lefts, - rights, - sum_lefts, - cumsum_lefts, - cumsum_rights, - ) - else - @threads :static for bid = 1:nblocks - split_views_kernel!( + @sync begin + for bid = 1:nblocks + @spawn split_views_kernel!( out, left, right, @@ -258,16 +232,6 @@ function update_hist!( is::AbstractVector, js::AbstractVector, ) where {L<:GradientRegression} - # if length(is) < 1_000 - # for j in js - # @inbounds @simd for i in is - # bin = x_bin[i, j] - # hist[j][1, bin] += ∇[1, i] - # hist[j][2, bin] += ∇[2, i] - # hist[j][3, bin] += ∇[3, i] - # end - # end - # else @threads :static for j in js @inbounds @simd for i in is bin = x_bin[i, j] @@ -276,7 +240,6 @@ function update_hist!( hist[j][3, bin] += ∇[3, i] end end - # end return nothing end diff --git a/src/fit.jl b/src/fit.jl index 78271b9d..f6d2aac0 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -61,17 +61,19 @@ function grow_tree!( end end - # reset - n_next = [1] - n_current = copy(n_next) + # initialize + n_current = [1] depth = 1 # initialize summary stats nodes[1].∑ .= dropdims(sum(Float64, view(∇, :, nodes[1].is), dims=2), dims=2) nodes[1].gain = get_gain(params, nodes[1].∑) + # grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth offset = 0 # identifies breakpoint for each node set within a depth + n_next = Int[] + if depth < params.max_depth for n_id in eachindex(n_current) n = n_current[n_id] @@ -89,18 +91,15 @@ function grow_tree!( update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js) end end - end - - @threads :static for n ∈ sort(n_current) - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + @threads :static for n ∈ sort(n_current) + update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + end end for n ∈ sort(n_current) if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) else - # update_gains!(nodes[n], js, params, feattypes, monotone_constraints) best = findmax(findmax.(nodes[n].gains)) best_gain = best[1][1] best_bin = best[1][2] @@ -110,12 +109,8 @@ function grow_tree!( tree.cond_bin[n] = best_bin tree.feat[n] = best_feat tree.cond_float[n] = edges[tree.feat[n]][tree.cond_bin[n]] - end - tree.split[n] = tree.cond_bin[n] != 0 - if !tree.split[n] - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) - else + tree.split[n] = best_bin != 0 + _left, _right = split_set_threads!( out, left, @@ -127,12 +122,14 @@ function grow_tree!( feattypes[best_feat], offset, ) + offset += length(nodes[n].is) nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + if length(_right) >= length(_left) push!(n_next, n << 1) push!(n_next, n << 1 + 1) @@ -140,7 +137,8 @@ function grow_tree!( push!(n_next, n << 1 + 1) push!(n_next, n << 1) end - popfirst!(n_next) + else + pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -177,17 +175,18 @@ function grow_otree!( end end - # reset - n_next = [1] - n_current = copy(n_next) + # initialize + n_current = [1] depth = 1 # initialize summary stats nodes[1].∑ .= dropdims(sum(Float64, view(∇, :, nodes[1].is), dims=2), dims=2) nodes[1].gain = get_gain(params, nodes[1].∑) + # grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth offset = 0 # identifies breakpoint for each node set within a depth + n_next = Int[] min_weight_flag = false for n in n_current @@ -197,7 +196,6 @@ function grow_otree!( for n in n_current # @info "length(nodes[n].is)" length(nodes[n].is) depth n pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) end else # update histograms @@ -217,6 +215,9 @@ function grow_otree!( update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js) end end + @threads :static for n ∈ n_current + update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + end # initialize gains for node 1 in which all gains of a given depth will be accumulated if depth > 1 @@ -227,7 +228,6 @@ function grow_otree!( gain = 0 # update gains based on the aggregation of all nodes of a given depth. One gains matrix per depth (vs one per node in binary trees). for n ∈ sort(n_current) - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) if n > 1 # accumulate gains in node 1 for j in js nodes[1].gains[j] .+= nodes[n].gains[j] @@ -281,12 +281,10 @@ function grow_otree!( push!(n_next, n << 1 + 1) push!(n_next, n << 1) end - popfirst!(n_next) end else for n in n_current pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) end end end diff --git a/src/gpu/fit.jl b/src/gpu/fit.jl index 30d9bd00..b6751599 100644 --- a/src/gpu/fit.jl +++ b/src/gpu/fit.jl @@ -61,16 +61,18 @@ function grow_tree!( end # initialize - n_next = [1] - n_current = copy(n_next) + n_current = [1] depth = 1 # initialize summary stats nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2))) nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version? - # grow while there are remaining active nodes - TO DO histogram substraction hits issue on GPU + + # grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth offset = 0 # identifies breakpoint for each node set within a depth + n_next = Int[] + if depth < params.max_depth for n_id in eachindex(n_current) n = n_current[n_id] @@ -88,34 +90,26 @@ function grow_tree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end + @threads :static for n ∈ sort(n_current) + update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + end end - # grow while there are remaining active nodes for n ∈ sort(n_current) if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) else - # @info "gain & max" - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) best = findmax(findmax.(nodes[n].gains)) best_gain = best[1][1] best_bin = best[1][2] best_feat = best[2] - # if best_gain > nodes[n].gain + params.gamma && best_gain > nodes[n].gains[best_feat][end] + params.gamma if best_gain > nodes[n].gain + params.gamma tree.gain[n] = best_gain - nodes[n].gain tree.cond_bin[n] = best_bin tree.feat[n] = best_feat tree.cond_float[n] = edges[tree.feat[n]][tree.cond_bin[n]] - end - tree.split[n] = tree.cond_bin[n] != 0 - if !tree.split[n] - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) - else - # @info "split" best_bin typeof(nodes[n].is) length(nodes[n].is) - # @info "split typeof" typeof(out) typeof(left) typeof(nodes[n].is) typeof(x_bin) + tree.split[n] = best_bin != 0 + _left, _right = split_set_threads_gpu!( out, left, @@ -127,12 +121,14 @@ function grow_tree!( feattypes[best_feat], offset, ) + offset += length(nodes[n].is) nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + if length(_right) >= length(_left) push!(n_next, n << 1) push!(n_next, n << 1 + 1) @@ -140,8 +136,8 @@ function grow_tree!( push!(n_next, n << 1 + 1) push!(n_next, n << 1) end - # @info "split post" length(_left) length(_right) - popfirst!(n_next) + else + pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -182,8 +178,7 @@ function grow_otree!( end # initialize - n_next = [1] - n_current = copy(n_next) + n_current = [1] depth = 1 # initialize summary stats @@ -193,6 +188,7 @@ function grow_otree!( # grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth offset = 0 # identifies breakpoint for each node set within a depth + n_next = Int[] min_weight_flag = false for n in n_current @@ -202,7 +198,6 @@ function grow_otree!( for n in n_current # @info "length(nodes[n].is)" length(nodes[n].is) depth n pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) end else # update histograms @@ -222,6 +217,9 @@ function grow_otree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end + @threads :static for n ∈ n_current + update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + end # initialize gains for node 1 in which all gains of a given depth will be accumulated if depth > 1 @@ -232,7 +230,6 @@ function grow_otree!( gain = 0 # update gains based on the aggregation of all nodes of a given depth. One gains matrix per depth (vs one per node in binary trees). for n ∈ sort(n_current) - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) if n > 1 # accumulate gains in node 1 for j in js nodes[1].gains[j] .+= nodes[n].gains[j] @@ -286,12 +283,10 @@ function grow_otree!( push!(n_next, n << 1 + 1) push!(n_next, n << 1) end - popfirst!(n_next) end else for n in n_current pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) - popfirst!(n_next) end end end