diff --git a/Project.toml b/Project.toml
index ba4801ba..56933c60 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb "]
-version = "0.4.8"
+version = "0.4.9"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
diff --git a/README.md b/README.md
index af4d34ed..9ac8644a 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,53 @@ julia> Pkg.add("EvoTrees")
 - α: float \[0,1\], set the quantile or bias in L1 default=0.5
 - metric: {:mse, :rmse, :mae, :logloss, :quantile}, default=:none
 
-## Getting started
+
+## MLJ Integration
+
+See the [official project page](https://github.com/alan-turing-institute/MLJ.jl) for more info.
+
+```julia
+using StatsBase: sample
+using EvoTrees
+using EvoTrees: sigmoid, logit
+using MLJBase
+
+features = rand(10_000) .* 5 .- 2
+X = reshape(features, (size(features)[1], 1))
+Y = sin.(features) .* 0.5 .+ 0.5
+Y = logit(Y) + randn(size(Y))
+Y = sigmoid(Y)
+y = Y
+X = MLJBase.table(X)
+
+# @load EvoTreeRegressor
+# linear regression
+tree_model = EvoTreeRegressor(loss=:linear, max_depth=5, η=0.05, nrounds=10)
+
+# set machine
+mach = machine(tree_model, X, y)
+
+# partition data
+train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
+
+# fit data
+fit!(mach, rows=train, verbosity=1)
+
+# continue training
+mach.model.nrounds += 10
+fit!(mach, rows=train, verbosity=1)
+
+# predict on train data
+pred_train = predict(mach, selectrows(X,train))
+mean(abs.(pred_train - selectrows(Y,train)))
+
+# predict on test data
+pred_test = predict(mach, selectrows(X,test))
+mean(abs.(pred_test - selectrows(Y,test)))
+```
+
+
+## Getting started using the internal API
 
 Minimal example to fit a noisy sine wave.
 
@@ -167,7 +213,7 @@ pred_train_q80 = predict(model, X_train)
 
 ## Gaussian Max Likelihood
 
-![](gaussian_likelihood.png)
+![](gaussian_sinus.png)
 
 ```julia
 params1 = EvoTreeGaussian(
diff --git a/experiments/benchmarks.jl b/experiments/benchmarks.jl
index 913b0d6c..ca00ff01 100644
--- a/experiments/benchmarks.jl
+++ b/experiments/benchmarks.jl
@@ -6,7 +6,7 @@ using EvoTrees
 using BenchmarkTools
 
 # prepare a dataset
-features = rand(Int(2.25e6), 100)
+features = rand(Int(1.25e6), 100)
 # features = rand(100, 10)
 X = features
 Y = rand(size(X, 1))
diff --git a/experiments/gaussian.jl b/experiments/gaussian.jl
index 9305907b..f05e55bb 100644
--- a/experiments/gaussian.jl
+++ b/experiments/gaussian.jl
@@ -1,8 +1,9 @@
+using Plots
 using Statistics
 using StatsBase: sample
-using EvoTrees
 using Distributions
-using Plots
+using Revise
+using EvoTrees
 
 features = rand(Int(1.25e4), 5) # prepare a dataset
@@ -14,6 +15,8 @@ Y[(X[:,1] .>= 0.4) .& (X[:,1] .< 0.6)] .*= 5
 Y[(X[:,1] .>= 0.9)] .*= 5
 𝑖 = collect(1:size(X,1))
 
+Y .*= 0.01
+
 # train-eval split
 𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
 train_size = 0.8
@@ -26,17 +29,17 @@ Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]
 
 # train model
 params1 = EvoTreeGaussian(
     loss=:gaussian, metric=:gaussian,
-    nrounds=400,
-    λ = 0.5, γ=1.0, η=0.05,
-    max_depth = 4, min_weight = 200.0,
-    rowsample=0.9, colsample=0.99, nbins=255)
+    nrounds=40,
+    λ = 0.0, γ=0.0, η=0.05,
+    max_depth = 5, min_weight = 50.0,
+    rowsample=0.5, colsample=1.0, nbins=200)
 
 @time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10);
 # @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10);
 @time pred_train = EvoTrees.predict(model, X_train)
 @time pred_train_gauss = EvoTrees.predict(params1, model, X_train)
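Review note on the gaussian.jl changes that continue below: as of this release the Gaussian loss keeps log(σ) rather than log(σ²) in the second prediction column, which is why the `sqrt` wrappers around `pred_train[i,2]` are dropped in the following hunk and σ can be passed straight to `Distributions.Normal` (this assumes, as the code relies on, that `predict` maps the internal log(σ) back to the natural scale). A minimal sketch of why the q10/q90 bands stay valid — normal quantiles are affine in σ, so `Normal` must receive σ itself, not σ²:

```julia
# Sketch only: quantile(Normal(μ, σ), p) = μ + σ * Φ⁻¹(p)
using Distributions

μ, σ = 0.3, 1.7
quantile(Normal(μ, σ), 0.9) ≈ μ + σ * quantile(Normal(0, 1), 0.9)  # true
```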
-pred_gauss = [Distributions.Normal(pred_train[i,1], sqrt(pred_train[i,2])) for i in 1:size(pred_train,1)]
+pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)]
 pred_q90 = quantile.(pred_gauss, 0.9)
 pred_q10 = quantile.(pred_gauss, 0.1)
 
@@ -46,7 +49,40 @@ mean(Y_train .< pred_q10)
 x_perm = sortperm(X_train[:,1])
 plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
 plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu")
-plot!(X_train[:,1][x_perm], sqrt.(pred_train[x_perm, 2]), color = "blue", linewidth = 1.5, label = "sigma")
+plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "blue", linewidth = 1.5, label = "sigma")
 plot!(X_train[:,1][x_perm], pred_q10[x_perm, 1], color = "red", linewidth = 1.5, label = "q10")
 plot!(X_train[:,1][x_perm], pred_q90[x_perm, 1], color = "green", linewidth = 1.5, label = "q90")
-# savefig("regression_gaussian.png")
+savefig("regression_gaussian_v1.png")
+
+
+# compare with Zygote
+using Zygote
+
+pred = [0.0, log(1.0)]
+target = 0.1
+
+δ1 = (target - pred[1]) / max(1e-8, exp(2*pred[2]))
+δ2 = (1 - (pred[1] - target)^2 / max(1e-8, exp(2*pred[2])))
+
+δ²1 = 1 / max(1e-8, exp(2*pred[2]))
+δ²2 = 2 / max(1e-8, exp(2*pred[2])) * (pred[1] - target)^2
+
+
+lpdf(x,μ,σ) = -log(σ) - log(2π)/2 - 1/2*((x-μ)/σ)^2
+lpdf(0, pred[1], pred[2])
+
+lpdf2(x,μ,lσ) = -log(exp(lσ)) - log(2π)/2 - 1/2*((x-μ)/exp(lσ))^2
+lpdf2(0, pred[1], pred[2])
+
+
+n1 = Normal(0, 1)
+Distributions.logpdf(n1, 0)
+
+# gradient(lpdf, target, pred[1], pred[2])[2:end]
+gradient(lpdf2, target, pred[1], pred[2])[2:end]
+Zygote.hessian(p -> lpdf2(target, p[1], p[2]), pred) # hessian takes a single-argument function
+
+gradient_lpdf(x,pred) = gradient(lpdf2, x, pred[1], pred[2])[3]
+hessian_lpdf(x,pred) = gradient(gradient_lpdf, x, pred)[2][2] # ∂²lpdf2/∂lσ² via nested AD
+gradient_lpdf(target, pred)
+hessian_lpdf(target, pred)
diff --git a/experiments/parametric_type.jl b/experiments/parametric_type.jl
new file mode 100644
index 00000000..07f17250
--- /dev/null
+++ b/experiments/parametric_type.jl
@@ -0,0 +1,102 @@
+using Statistics
+using StatsBase: sample
+# using XGBoost
+using Revise
+using EvoTrees
+using BenchmarkTools
+
+# prepare a dataset
+features = rand(Int(2.25e6), 100)
+# features = rand(100, 10)
+X = features
+Y = rand(size(X, 1))
+𝑖 = collect(1:size(X,1))
+
+# train-eval split
+𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
+train_size = 0.8
+𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))]
+𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end]
+
+X_train, X_eval = X[𝑖_train, :], X[𝑖_eval, :]
+Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]
+
+config = EvoTrees.EvoTreeRegressor3(T=Float32,
+    loss=:linear, metric=:none,
+    nrounds=100, α = 0.5,
+    λ = 0.0, γ=0.0, η=0.05,
+    max_depth = 6, min_weight = 1.0,
+    rowsample=0.5, colsample=0.5, nbins=32)
+
+
+# for 1.25e5 init_evotree: 2.009 s 0.322925 seconds (2.53 k allocations: 167.345 MiB)
+# for 1.25e5 no eval iter 100: 2.009 s (628514 allocations: 720.62 MiB)
+# for 1.25e6 no eval iter 10: 6.200 s (44330 allocations: 2.19 GiB)
+# for 1.25e6 no eval iter 100: 19.481940 seconds (635.33 k allocations: 6.679 GiB, 3.11% gc time)
+# for 1.25e6 mse with eval data: 6.321 s (45077 allocations: 2.19 GiB)
+@time model, cache = init_evotree(config, X_train, Y_train);
+@time grow_evotree!(model, cache);
+@time model = fit_evotree(config, X_train, Y_train);
+@btime model = fit_evotree(config, X_train, Y_train);
+@time pred_train = EvoTrees.predict(model, X_train)
+
+@time model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);
+@btime model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);
+
+@time model = fit_evotree(config, X_train, Y_train, early_stopping_rounds=10);
+@time model = fit_evotree(config, X_train, Y_train, print_every_n=2);
+
+# @time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 5);
+# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval);
+@time pred_train = predict(model, X_train)
+
+
+#############################
+# agaricus
+#############################
+function readlibsvm(fname::String, shape)
+    dmx = zeros(Float32, shape)
+    label = Float32[]
+    fi = open(fname, "r")
+    cnt = 1
+    for line in eachline(fi)
+        line = split(line, " ")
+        push!(label, parse(Float32, line[1]))
+        line = line[2:end]
+        for itm in line
+            itm = split(itm, ":")
+            dmx[cnt, parse(Int, itm[1]) + 1] = parse(Int, itm[2])
+        end
+        cnt += 1
+    end
+    close(fi)
+    return (dmx, label)
+end
+
+# we use an auxiliary function to read the LIBSVM format into a Julia matrix
+train_X, train_Y = readlibsvm("data/agaricus.txt.train", (6513, 126))
+test_X, test_Y = readlibsvm("data/agaricus.txt.test", (1611, 126))
+
+#-------------Basic Training using XGBoost-----------------
+# note: xgboost naturally handles sparse input
+# use a sparse matrix when your features are sparse (e.g. when using one-hot encoded vectors)
+# model parameters can be set as parameters for the ```xgboost``` function, or use a Vector{String} / Dict()
+num_round = 100
+# you can directly pass Julia's matrix or sparse matrix as data,
+# by calling xgboost(data, num_round, label=label, training-parameters)
+metrics = ["logloss"]
+@time bst = xgboost(train_X, num_round, label = train_Y, eta = 0.1, max_depth = 3, metrics = metrics, silent=0, objective = "binary:logistic")
+features_xgb = XGBoost.importance(bst)
+
+params1 = EvoTreeRegressor(
+    loss=:logistic, metric=:logloss,
+    nrounds=100,
+    λ = 0.0, γ=0.0, η=0.1,
+    max_depth = 4, min_weight = 1.0,
+    rowsample=1.0, colsample=1.0, nbins=250)
+
+@time model = fit_evotree(params1, train_X, train_Y, print_every_n=20);
+@time model = fit_evotree(params1, train_X, train_Y, X_eval=test_X, Y_eval=test_Y, print_every_n=20);
+@time pred_train = EvoTrees.predict(model, train_X)
+features_evo = importance(model, 1:size(train_X,2))
+sort(collect(values(features_evo)))
diff --git a/experiments/readme_plots.jl b/experiments/readme_plots.jl
index 4c38ef8e..5436c88e 100644
--- a/experiments/readme_plots.jl
+++ b/experiments/readme_plots.jl
@@ -87,7 +87,7 @@ params1 = EvoTreeRegressor(
 sqrt(mean((pred_train_L1 .- Y_train) .^ 2))
 
 x_perm = sortperm(X_train[:,1])
-plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
+plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
 plot!(X_train[:,1][x_perm], pred_train_linear[x_perm], color = "navy", linewidth = 1.5, label = "Linear")
 plot!(X_train[:,1][x_perm], pred_train_logistic[x_perm], color = "darkred", linewidth = 1.5, label = "Logistic")
plot!(X_train[:,1][x_perm], pred_train_poisson[x_perm], color = "green", linewidth = 1.5, label = "Poisson") @@ -135,8 +135,40 @@ params1 = EvoTreeRegressor( sum(pred_train_q80 .< Y_train) / length(Y_train) x_perm = sortperm(X_train[:,1]) -plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "") +plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "") plot!(X_train[:,1][x_perm], pred_train_q50[x_perm], color = "navy", linewidth = 1.5, label = "Median") plot!(X_train[:,1][x_perm], pred_train_q20[x_perm], color = "darkred", linewidth = 1.5, label = "Q20") plot!(X_train[:,1][x_perm], pred_train_q80[x_perm], color = "green", linewidth = 1.5, label = "Q80") savefig("quantiles_sinus.png") + + + +############################### +## gaussian +############################### +params1 = EvoTreeGaussian( + loss=:gaussian, metric=:gaussian, + nrounds=200, nbins=100, + λ = 0.0, γ=0.0, η=0.05, + max_depth = 5, min_weight = 1.0, + rowsample=0.8, colsample=1.0, seed=123) + +@time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10); +# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); +@time pred_train = EvoTrees.predict(model, X_train) +@time pred_train_gauss = EvoTrees.predict(params1, model, X_train) + +pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)] +pred_q80 = quantile.(pred_gauss, 0.8) +pred_q20 = quantile.(pred_gauss, 0.2) + +mean(Y_train .< pred_q80) +mean(Y_train .< pred_q20) + +x_perm = sortperm(X_train[:,1]) +plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "") +plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu") +plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "red", linewidth = 1.5, label = "sigma") +plot!(X_train[:,1][x_perm], pred_q20[x_perm, 1], color = "green", linewidth = 1.5, label = "q20") +plot!(X_train[:,1][x_perm], pred_q80[x_perm, 1], color = "green", linewidth = 1.5, label = "q80") +savefig("gaussian_sinus.png") diff --git a/gaussian_sinus.png b/gaussian_sinus.png new file mode 100644 index 00000000..35644cb4 Binary files /dev/null and b/gaussian_sinus.png differ diff --git a/quantiles_sinus.png b/quantiles_sinus.png index b0a3be6f..2e925679 100644 Binary files a/quantiles_sinus.png and b/quantiles_sinus.png differ diff --git a/regression_gaussian_v1.png b/regression_gaussian_v1.png new file mode 100644 index 00000000..6fd68c5f Binary files /dev/null and b/regression_gaussian_v1.png differ diff --git a/regression_sinus.png b/regression_sinus.png index bdf4875e..d6162807 100644 Binary files a/regression_sinus.png and b/regression_sinus.png differ diff --git a/src/MLJ.jl b/src/MLJ.jl index 23adc13e..475da6a1 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -55,7 +55,7 @@ end function predict(model::EvoTreeGaussian, fitresult, Xnew) Xnew = MLJModelInterface.matrix(Xnew) pred = predict(fitresult, Xnew) - return [Distributions.Normal(pred[i,1], sqrt(pred[i,2])) for i in 1:size(pred,1)] + return [Distributions.Normal(pred[i,1], pred[i,2]) for i in 1:size(pred,1)] end # Metadata diff --git a/src/eval.jl b/src/eval.jl 
index 6eb83639..9137885d 100644
--- a/src/eval.jl
+++ b/src/eval.jl
@@ -61,11 +61,11 @@ end
 
 # gaussian
 # pred[i][1] = μ
-# pred[i][2] = log(σ²)
+# pred[i][2] = log(σ)
 function eval_metric(::Val{:gaussian}, pred::Vector{SVector{L,T}}, Y::AbstractVector{T}, α=0.0) where {L, T <: AbstractFloat}
     eval = zero(T)
     @inbounds for i in 1:length(pred)
-        eval += pred[i][2]/2 + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(pred[i][2])))
+        eval += pred[i][2] + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(2*pred[i][2])))
     end
     eval /= length(Y)
     return eval
diff --git a/src/find_split.jl b/src/find_split.jl
index d7b7242e..5263b23d 100644
--- a/src/find_split.jl
+++ b/src/find_split.jl
@@ -1,8 +1,8 @@
 #############################################
 # Get the breaking points
 #############################################
-function get_edges(X::Matrix{T}, nbins=250) where {T}
-    edges = Vector{Vector{Float32}}(undef, size(X,2))
+function get_edges(X::AbstractMatrix{T}, nbins=250) where {T}
+    edges = Vector{Vector{T}}(undef, size(X,2))
     @threads for i in 1:size(X, 2)
         edges[i] = quantile(view(X, :,i), (1:nbins)/nbins)
         if length(edges[i]) == 0
diff --git a/src/fit.jl b/src/fit.jl
index 52321e63..d103541b 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -1,45 +1,58 @@
+function allo2(params::EvoTypes)
+    T = typeof(params)
+    # test = T(params.η)
+    return T
+end
+
+function allo3(params::EvoTrees.EvoTypes{T,U,S}) where {T,U,S}
+    # T = typeof(params)
+    test = T(params.η)
+    return test
+end
+
 # initialise evotree
-function init_evotree(params::EvoTypes,
-    X::AbstractMatrix{R}, Y::AbstractVector{S}; verbosity=1) where {R,S}
+function init_evotree(params::EvoTypes{T,U,S},
+    X::AbstractMatrix, Y::AbstractVector; verbosity=1) where {T,U,S}
 
     seed!(params.seed)
 
     K = 1
     levels = ""
+    X = convert(Matrix{T}, X)
     if typeof(params.loss) == Logistic
-        Y = Float32.(Y)
+        Y = T.(Y)
         μ = fill(logit(mean(Y)), 1)
     elseif typeof(params.loss) == Poisson
-        Y = Float32.(Y)
+        Y = T.(Y)
         μ = fill(log(mean(Y)), 1)
     elseif typeof(params.loss) == Softmax
         if typeof(Y) <: AbstractCategoricalVector
             levels = CategoricalArray(CategoricalArrays.levels(Y))
             K = length(levels)
-            μ = zeros(Float32, K)
+            μ = zeros(T, K)
             Y = MLJModelInterface.int.(Y)
         else
             levels = CategoricalArray(sort(unique(Y)))
             K = length(levels)
-            μ = zeros(Float32, K)
+            μ = zeros(T, K)
             Y = UInt32.(Y)
         end
     elseif typeof(params.loss) == Gaussian
         K = 2
-        Y = Float32.(Y)
-        μ = SVector{2}([mean(Y), log(var(Y))])
+        Y = T.(Y)
+        μ = SVector{2}([mean(Y), log(std(Y))])
     else
-        Y = Float32.(Y)
+        Y = T.(Y)
         μ = fill(mean(Y), 1)
     end
 
     # initialize preds
-    pred = zeros(SVector{K,Float32}, size(X,1))
+    pred = zeros(SVector{K,T}, size(X,1))
     for i in eachindex(pred)
         pred[i] += μ
     end
 
-    bias = Tree([TreeNode(SVector{K,Float32}(μ))])
+    bias = Tree([TreeNode(SVector{K,T}(μ))])
     evotree = GBTree([bias], params, Metric(), K, levels)
 
     X_size = size(X)
@@ -47,9 +60,9 @@ function init_evotree(params::EvoTypes,
     𝑗_ = collect(1:X_size[2])
 
     # initialize gradients and weights
-    δ, δ² = zeros(SVector{evotree.K, Float32}, X_size[1]), zeros(SVector{evotree.K, Float32}, X_size[1])
-    𝑤 = zeros(SVector{1, Float32}, X_size[1])
-    𝑤_ini = SVector{1, Float32}(1)
+    δ, δ² = zeros(SVector{evotree.K, T}, X_size[1]), zeros(SVector{evotree.K, T}, X_size[1])
+    𝑤 = zeros(SVector{1, T}, X_size[1])
+    𝑤_ini = SVector{1, T}(1)
     for i in 1:length(𝑤)
         𝑤[i] += 𝑤_ini
     end
@@ -60,24 +73,24 @@ function init_evotree(params::EvoTypes,
 
     # initialize histograms
-    hist_δ = Vector{Matrix{SVector{evotree.K, Float32}}}(undef, 2^params.max_depth-1)
-    hist_δ² =
Vector{Matrix{SVector{evotree.K, Float32}}}(undef, 2^params.max_depth-1) - hist_𝑤 = Vector{Matrix{SVector{1, Float32}}}(undef, 2^params.max_depth-1) + hist_δ = Vector{Matrix{SVector{evotree.K, T}}}(undef, 2^params.max_depth-1) + hist_δ² = Vector{Matrix{SVector{evotree.K, T}}}(undef, 2^params.max_depth-1) + hist_𝑤 = Vector{Matrix{SVector{1, T}}}(undef, 2^params.max_depth-1) # initialize train nodes - train_nodes = Vector{TrainNode{evotree.K, Float32, Int64}}(undef, 2^params.max_depth-1) + train_nodes = Vector{TrainNode{evotree.K, T, Int64}}(undef, 2^params.max_depth-1) for node in 1:2^params.max_depth-1 - train_nodes[node] = TrainNode(0, 0, SVector{evotree.K, Float32}(fill(Float32(-Inf), evotree.K)), SVector{evotree.K, Float32}(fill(Float32(-Inf), evotree.K)), SVector{1, Float32}(fill(Float32(-Inf), 1)), Float32(-Inf), [0], [0]) + train_nodes[node] = TrainNode(0, 0, SVector{evotree.K, T}(fill(T(-Inf), evotree.K)), SVector{evotree.K, T}(fill(T(-Inf), evotree.K)), SVector{1, T}(fill(T(-Inf), 1)), T(-Inf), [0], [0]) - hist_δ[node] = zeros(SVector{evotree.K, Float32}, params.nbins, X_size[2]) - hist_δ²[node] = zeros(SVector{evotree.K, Float32}, params.nbins, X_size[2]) - hist_𝑤[node] = zeros(SVector{1, Float32}, params.nbins, X_size[2]) + hist_δ[node] = zeros(SVector{evotree.K, T}, params.nbins, X_size[2]) + hist_δ²[node] = zeros(SVector{evotree.K, T}, params.nbins, X_size[2]) + hist_𝑤[node] = zeros(SVector{1, T}, params.nbins, X_size[2]) end - splits = Vector{SplitInfo{evotree.K, Float32, Int64}}(undef, X_size[2]) + splits = Vector{SplitInfo{evotree.K, T, Int64}}(undef, X_size[2]) for feat in 𝑗_ - splits[feat] = SplitInfo{evotree.K, Float32, Int}(Float32(-Inf), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{1, Float32}(zeros(1)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{evotree.K, Float32}(zeros(evotree.K)), SVector{1, Float32}(zeros(1)), Float32(-Inf), Float32(-Inf), 0, feat, 0.0) + splits[feat] = SplitInfo{evotree.K, T, Int}(T(-Inf), SVector{evotree.K, T}(zeros(evotree.K)), SVector{evotree.K, T}(zeros(evotree.K)), SVector{1, T}(zeros(1)), SVector{evotree.K, T}(zeros(evotree.K)), SVector{evotree.K, T}(zeros(evotree.K)), SVector{1, T}(zeros(1)), T(-Inf), T(-Inf), 0, feat, 0.0) end cache = (params=deepcopy(params), @@ -93,7 +106,7 @@ function init_evotree(params::EvoTypes, end -function grow_evotree!(evotree::GBTree, cache; verbosity=1) +function grow_evotree!(evotree::GBTree{L,T,S}, cache; verbosity=1) where {L,T,S} # initialize from cache params = evotree.params @@ -110,7 +123,7 @@ function grow_evotree!(evotree::GBTree, cache; verbosity=1) 𝑗 = cache.𝑗_[sample(cache.𝑗_, ceil(Int, params.colsample * X_size[2]), replace=false, ordered=true)] # reset gain to -Inf for feat in cache.𝑗_ - splits[feat].gain = Float32(-Inf) + splits[feat].gain = T(-Inf) end # build a new tree @@ -134,10 +147,10 @@ end # grow a single tree function grow_tree(δ, δ², 𝑤, hist_δ, hist_δ², hist_𝑤, - params::EvoTypes, + params::EvoTypes{T,U,S}, train_nodes::Vector{TrainNode{L,T,S}}, splits::Vector{SplitInfo{L,T,Int}}, - edges, X_bin) where {R<:Real, T<:AbstractFloat, S<:Int, L} + edges, X_bin) where {T<:AbstractFloat, U, S, L} active_id = ones(Int, 1) leaf_count = one(Int) @@ -150,7 +163,7 @@ function grow_tree(δ, δ², 𝑤, # grow nodes for id in active_id node = train_nodes[id] - if tree_depth == params.max_depth || node.∑𝑤[1] <= params.min_weight + 1e-12 + if tree_depth == params.max_depth || node.∑𝑤[1] <= params.min_weight + 1e-8 push!(tree.nodes, 
TreeNode(pred_leaf(params.loss, node, params, δ²))) else if id > 1 && id == tree.nodes[node.parent].right diff --git a/src/loss.jl b/src/loss.jl index 17ea4c50..cf55b5e2 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -67,11 +67,11 @@ end # Gaussian - http://jrmeyer.github.io/machinelearning/2017/08/18/mle.html # pred[i][1] = μ -# pred[i][2] = log(σ²) +# pred[i][2] = log(σ) function update_grads!(loss::Gaussian, α, pred::Vector{SVector{L,T}}, target::AbstractArray{T, 1}, δ::Vector{SVector{L,T}}, δ²::Vector{SVector{L,T}}, 𝑤::Vector{SVector{1,T}}) where {T <: AbstractFloat, L} @inbounds @threads for i in eachindex(δ) - δ[i] = SVector((pred[i][1] - target[i]) / max(1e-8, exp(pred[i][2])) * 𝑤[i][1], 𝑤[i][1] / 2 * (1 - (pred[i][1] - target[i])^2 / max(1e-8, exp(pred[i][2])))) - δ²[i] = SVector(𝑤[i][1] / max(1e-8, exp(pred[i][2])), 𝑤[i][1] / max(1e-8, exp(pred[i][2])) * (pred[i][1] - target[i])^2) + δ[i] = SVector((pred[i][1] - target[i]) / max(1e-8, exp(2*pred[i][2])) * 𝑤[i][1], (1 - (pred[i][1] - target[i])^2 / max(1e-8, exp(2*pred[i][2]))) * 𝑤[i][1]) + δ²[i] = SVector(𝑤[i][1] / max(1e-8, exp(2*pred[i][2])), 2*𝑤[i][1] / max(1e-8, exp(2*pred[i][2])) * (pred[i][1] - target[i])^2) end end diff --git a/src/models.jl b/src/models.jl index cfdcb52b..ea8b7b99 100644 --- a/src/models.jl +++ b/src/models.jl @@ -29,6 +29,7 @@ mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJMo end function EvoTreeRegressor(; + T::Type=Float32, loss=:linear, nrounds=10, λ=0.0, # @@ -49,7 +50,7 @@ function EvoTreeRegressor(; elseif loss == :quantile model_type = Quantile() end - model = EvoTreeRegressor(model_type, nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) + model = EvoTreeRegressor(model_type, nrounds, T(λ), T(γ), T(η), max_depth, T(min_weight), T(rowsample), T(colsample), nbins, T(α), metric, seed) return model end @@ -72,6 +73,7 @@ mutable struct EvoTreeCount{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJModelI end function EvoTreeCount(; + T::Type=Float32, loss=:poisson, nrounds=10, λ=0.0, # @@ -87,7 +89,7 @@ function EvoTreeCount(; seed=444) if loss == :poisson model_type = Poisson() end - model = EvoTreeCount(Poisson(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) + model = EvoTreeCount(Poisson(), nrounds, T(λ), T(γ), T(η), max_depth, T(min_weight), T(rowsample), T(colsample), nbins, T(α), metric, seed) return model end @@ -110,6 +112,7 @@ mutable struct EvoTreeClassifier{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJM end function EvoTreeClassifier(; + T::Type=Float32, loss=:softmax, nrounds=10, λ=0.0, # @@ -125,7 +128,7 @@ function EvoTreeClassifier(; seed=444) if loss == :softmax model_type = Softmax() end - model = EvoTreeClassifier(Softmax(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) + model = EvoTreeClassifier(Softmax(), nrounds, T(λ), T(γ), T(η), max_depth, T(min_weight), T(rowsample), T(colsample), nbins, T(α), metric, seed) return model end @@ -148,6 +151,7 @@ mutable struct EvoTreeGaussian{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJMod end function EvoTreeGaussian(; + T::Type=Float64, loss=:gaussian, nrounds=10, λ=0.0, # @@ -163,7 +167,7 @@ function EvoTreeGaussian(; seed=444) if loss == :gaussian model_type = Gaussian() end - model = EvoTreeGaussian(Gaussian(), 
nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) + model = EvoTreeGaussian(Gaussian(), nrounds, T(λ), T(γ), T(η), max_depth, T(min_weight), T(rowsample), T(colsample), nbins, T(α), metric, seed) return model end @@ -197,7 +201,7 @@ function EvoTreeRModels( elseif loss == :softmax model = EvoTreeClassifier(Softmax(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) elseif loss == :gaussian - model = EvoTreeGaussian(Gaussian(), nrounds, Float32(λ), Float32(γ), Float32(η), max_depth, Float32(min_weight), Float32(rowsample), Float32(colsample), nbins, Float32(α), metric, seed) + model = EvoTreeGaussian(Gaussian(), nrounds, Float64(λ), Float64(γ), Float64(η), max_depth, Float64(min_weight), Float64(rowsample), Float64(colsample), nbins, Float64(α), metric, seed) else throw("invalid loss") end @@ -205,4 +209,5 @@ function EvoTreeRModels( return model end -const EvoTypes = Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian} +# const EvoTypes = Union{EvoTreeRegressor,EvoTreeCount,EvoTreeClassifier,EvoTreeGaussian} +const EvoTypes{T,U,S} = Union{EvoTreeRegressor{T,U,S},EvoTreeCount{T,U,S},EvoTreeClassifier{T,U,S},EvoTreeGaussian{T,U,S}} diff --git a/src/predict.jl b/src/predict.jl index 426260a2..ba273759 100644 --- a/src/predict.jl +++ b/src/predict.jl @@ -1,5 +1,5 @@ # prediction from single tree - assign each observation to its final leaf -function predict!(pred, tree::Tree, X::AbstractMatrix{T}) where {T<:Real} +function predict!(pred, tree::Tree{L,T,S}, X::AbstractMatrix) where {L,T,S} @inbounds @threads for i in 1:size(X,1) id = 1 x = view(X, i, :) @@ -15,19 +15,19 @@ function predict!(pred, tree::Tree, X::AbstractMatrix{T}) where {T<:Real} end # prediction from single tree - assign each observation to its final leaf -function predict(tree::Tree, X::AbstractMatrix{T}, K) where T<:Real - pred = zeros(SVector{K,Float32}, size(X, 1)) +function predict(tree::Tree{L,T,S}, X::AbstractMatrix, K) where {L,T,S} + pred = zeros(SVector{K,T}, size(X, 1)) predict!(pred, tree, X) return pred end # prediction from single tree - assign each observation to its final leaf -function predict(model::GBTree, X::AbstractMatrix{T}) where T<:Real - pred = zeros(SVector{model.K,Float32}, size(X, 1)) +function predict(model::GBTree{L,T,S}, X::AbstractMatrix) where {L,T,S} + pred = zeros(SVector{model.K,T}, size(X, 1)) for tree in model.trees predict!(pred, tree, X) end - pred = reinterpret(Float32, pred) + pred = reinterpret(T, pred) if typeof(model.params.loss) == Logistic @. pred = sigmoid(pred) elseif typeof(model.params.loss) == Poisson diff --git a/test/MLJ.jl b/test/MLJ.jl index 9c05bc5e..9e07a1e6 100644 --- a/test/MLJ.jl +++ b/test/MLJ.jl @@ -50,7 +50,7 @@ mach = machine(tree_model, X, y) train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split fit!(mach, rows=train, verbosity=1) -mach.model.nrounds += 10 +mach.model.nrounds += 50 fit!(mach, rows=train, verbosity=1) pred_train = predict(mach, selectrows(X,train))
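Two closing review notes. First, the core of this release is the `log(σ²)` → `log(σ)` reparameterization in `src/loss.jl`: writing ρ = log(σ), the per-observation negative log-likelihood is, up to a constant, nll = ρ + (y − μ)² / (2e^(2ρ)), and the new `update_grads!` terms are exactly its first- and second-order derivatives. A standalone finite-difference sketch of those formulas (not part of the diff):

```julia
# Gaussian NLL under ρ = log(σ), dropping the constant log(2π)/2 term.
nll(y, μ, ρ) = ρ + (y - μ)^2 / (2 * exp(2ρ))

y, μ, ρ = 0.1, 0.0, 0.0
h = 1e-5

# first-order terms, as coded in update_grads!
g_μ(μ, ρ) = (μ - y) / exp(2ρ)           # ∂nll/∂μ
g_ρ(μ, ρ) = 1 - (y - μ)^2 / exp(2ρ)     # ∂nll/∂ρ
g_μ(μ, ρ) ≈ (nll(y, μ + h, ρ) - nll(y, μ - h, ρ)) / 2h  # true
g_ρ(μ, ρ) ≈ (nll(y, μ, ρ + h) - nll(y, μ, ρ - h)) / 2h  # true

# second-order terms are derivatives of the first-order ones
1 / exp(2ρ)              ≈ (g_μ(μ + h, ρ) - g_μ(μ - h, ρ)) / 2h  # ∂²nll/∂μ², true
2 * (y - μ)^2 / exp(2ρ)  ≈ (g_ρ(μ, ρ + h) - g_ρ(μ, ρ - h)) / 2h  # ∂²nll/∂ρ², true
```

Second, the new `const EvoTypes{T,U,S} = Union{...}` alias in `src/models.jl` is what lets `init_evotree` recover the float precision `T` from the config object alone (the `allo3` scratch function in `src/fit.jl` exercises the same trick). A sketch of the dispatch pattern — `float_type` is a hypothetical helper, not part of this diff:

```julia
using EvoTrees

# Hypothetical helper: recover the precision parameter T from any config.
float_type(::EvoTrees.EvoTypes{T,U,S}) where {T,U,S} = T

config = EvoTreeRegressor(T=Float32, loss=:linear, η=0.05, nrounds=10)
float_type(config)        # Float32
float_type(config)(0.5)   # 0.5f0 — e.g. casting hyperparameters to T
```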