Merge pull request #51 from Evovest/dev
Dev
jeremiedb authored Jul 13, 2020
2 parents 5282cda + e5d1e0e commit 309cebe
Showing 18 changed files with 299 additions and 65 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.4.8"
version = "0.4.9"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
50 changes: 48 additions & 2 deletions README.md
@@ -62,7 +62,53 @@ julia> Pkg.add("EvoTrees")
- α: float [0,1], sets the quantile level for the :quantile loss or the bias for :L1, default=0.5 (see the sketch below)
- metric: {:mse, :rmse, :mae, :logloss, :quantile}, default=:none
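
A minimal sketch of how these two parameters pair up (an editor's illustration, not part of this diff; the constructor is the one used in the MLJ example below):

```julia
using EvoTrees

# 80th-percentile quantile regression: α sets the quantile level
params_q80 = EvoTreeRegressor(loss=:quantile, α=0.8, metric=:quantile, nrounds=100)

# L1 (absolute) regression: α skews the loss; α=0.5 keeps it symmetric
params_L1 = EvoTreeRegressor(loss=:L1, α=0.5, metric=:mae, nrounds=100)
```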

-## Getting started

## MLJ Integration

See [official project page](https://github.com/alan-turing-institute/MLJ.jl) for more info.

```julia
using Statistics: mean
using StatsBase: sample
using EvoTrees
using EvoTrees: sigmoid, logit
using MLJBase

features = rand(10_000) .* 5 .- 2
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y))
Y = sigmoid(Y)
y = Y
X = MLJBase.table(X)

# @load EvoTreeRegressor
# linear regression
tree_model = EvoTreeRegressor(loss=:linear, max_depth=5, η=0.05, nrounds=10)

# set machine
mach = machine(tree_model, X, y)

# partition data
train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split

# fit data
fit!(mach, rows=train, verbosity=1)

# continue training
mach.model.nrounds += 10
fit!(mach, rows=train, verbosity=1)

# predict on train data
pred_train = predict(mach, selectrows(X,train))
mean(abs.(pred_train - selectrows(Y,train)))

# predict on test data
pred_test = predict(mach, selectrows(X,test))
mean(abs.(pred_test - selectrows(Y,test)))
```


## Getting started using internal API

Minimal example to fit a noisy sinus wave.

@@ -167,7 +213,7 @@ pred_train_q80 = predict(model, X_train)

## Gaussian Max Likelihood

-![](gaussian_likelihood.png)
+![](gaussian_sinus.png)

```julia
params1 = EvoTreeGaussian(
2 changes: 1 addition & 1 deletion experiments/benchmarks.jl
@@ -6,7 +6,7 @@ using EvoTrees
using BenchmarkTools

# prepare a dataset
-features = rand(Int(2.25e6), 100)
+features = rand(Int(1.25e6), 100)
# features = rand(100, 10)
X = features
Y = rand(size(X, 1))
54 changes: 45 additions & 9 deletions experiments/gaussian.jl
@@ -1,8 +1,9 @@
-using Plots
using Statistics
using StatsBase: sample
-using EvoTrees
using Distributions
+using Plots
+using Revise
+using EvoTrees

features = rand(Int(1.25e4), 5)
# prepare a dataset
@@ -14,6 +15,8 @@ Y[(X[:,1] .>= 0.4) .& (X[:,1] .< 0.6)] .*= 5
Y[(X[:,1] .>= 0.9)] .*= 5
𝑖 = collect(1:size(X,1))

Y .*= 0.01

# train-eval split
𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
train_size = 0.8
Expand All @@ -26,17 +29,17 @@ Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]
# train model
params1 = EvoTreeGaussian(
loss=:gaussian, metric=:gaussian,
-nrounds=400,
-λ = 0.5, γ=1.0, η=0.05,
-max_depth = 4, min_weight = 200.0,
-rowsample=0.9, colsample=0.99, nbins=255)
+nrounds=40,
+λ = 0.0, γ=0.0, η=0.05,
+max_depth = 5, min_weight = 50.0,
+rowsample=0.5, colsample=1.0, nbins=200)

@time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10);
# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10);
@time pred_train = EvoTrees.predict(model, X_train)
@time pred_train_gauss = EvoTrees.predict(params1, model, X_train)

-pred_gauss = [Distributions.Normal(pred_train[i,1], sqrt(pred_train[i,2])) for i in 1:size(pred_train,1)]
+pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)]
pred_q90 = quantile.(pred_gauss, 0.9)
pred_q10 = quantile.(pred_gauss, 0.1)

Expand All @@ -46,7 +49,40 @@ mean(Y_train .< pred_q10)
x_perm = sortperm(X_train[:,1])
plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu")
-plot!(X_train[:,1][x_perm], sqrt.(pred_train[x_perm, 2]), color = "blue", linewidth = 1.5, label = "sigma")
+plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "blue", linewidth = 1.5, label = "sigma")
plot!(X_train[:,1][x_perm], pred_q10[x_perm, 1], color = "red", linewidth = 1.5, label = "q10")
plot!(X_train[:,1][x_perm], pred_q90[x_perm, 1], color = "green", linewidth = 1.5, label = "q90")
-# savefig("regression_gaussian.png")
+savefig("regression_gaussian_v1.png")


# compare with zygote
using Zygote

pred = [0.0, log(1.0)]
target = 0.1

δ1 = (target - pred[1]) / max(1e-8, exp(2*pred[2]))
δ2 = (1 - (pred[1] - target)^2 / max(1e-8, exp(2*pred[2])))

δ²1 = 1 / max(1e-8, exp(2*pred[2]))
δ²2 = 2 / max(1e-8, exp(2*pred[2])) * (pred[1] - target)^2


lpdf(x,μ,σ) = -log(σ) - log(2π)/2 - 1/2*((x-μ)/σ)^2
lpdf(0, pred[1], pred[2])

lpdf2(x,μ,lσ) = -log(exp(lσ)) - log(2π)/2 - 1/2*((x-μ)/exp(lσ))^2
lpdf2(0, pred[1], pred[2])


n1 = Normal(0, 1)
Distributions.logpdf(n1, 0)

# gradient(lpdf, target, pred[1], pred[2])[2:end]
gradient(lpdf2, target, pred[1], pred[2])[2:end]
# Zygote.hessian expects a function of a single (possibly vector) argument:
Zygote.hessian(p -> lpdf2(target, p[1], p[2]), pred)

gradient_lpdf(x,pred) = gradient(lpdf2, x, pred[1], pred[2])[3]
hessian_lpdf(x,pred) = gradient(gradient_lpdf, x, pred)[1]
gradient_lpdf(target, pred)
hessian_lpdf(target, pred)
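
For reference, a sketch of the math behind the δ terms above (an editor's derivation, assuming the second prediction slot stores s = log σ, as the updated comment in src/eval.jl states). The per-observation Gaussian negative log-likelihood in terms of s is

```latex
\ell(y;\mu,s) = s + \frac{(y-\mu)^2}{2e^{2s}} + \tfrac{1}{2}\log(2\pi), \qquad s = \log\sigma
```

and its first and second derivatives are

```latex
\frac{\partial \ell}{\partial \mu} = \frac{\mu-y}{e^{2s}}, \qquad
\frac{\partial \ell}{\partial s} = 1 - \frac{(y-\mu)^2}{e^{2s}}, \qquad
\frac{\partial^2 \ell}{\partial \mu^2} = \frac{1}{e^{2s}}, \qquad
\frac{\partial^2 \ell}{\partial s^2} = \frac{2(y-\mu)^2}{e^{2s}}
```

Up to sign convention, these match the δ1, δ2, δ²1 and δ²2 computed above.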
102 changes: 102 additions & 0 deletions experiments/parametric_type.jl
@@ -0,0 +1,102 @@
using Statistics
using StatsBase: sample
# using XGBoost
using Revise
using EvoTrees
using BenchmarkTools

# prepare a dataset
features = rand(Int(2.25e6), 100)
# features = rand(100, 10)
X = features
Y = rand(size(X, 1))
𝑖 = collect(1:size(X,1))

# train-eval split
𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
train_size = 0.8
𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))]
𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end]

X_train, X_eval = X[𝑖_train, :], X[𝑖_eval, :]
Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]

config = EvoTrees.EvoTreeRegressor3(T=Float32,
loss=:linear, metric=:none,
nrounds=100, α = 0.5,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)


# for 1.25e5 init_evotree: 2.009 s 0.322925 seconds (2.53 k allocations: 167.345 MiB)
# for 1.25e5 no eval iter 100: 2.009 s (628514 allocations: 720.62 MiB)
# for 1.25e6 no eval iter 10: 6.200 s (44330 allocations: 2.19 GiB)
# for 1.25e6 no eval iter 100: 19.481940 seconds (635.33 k allocations: 6.679 GiB, 3.11% gc time)
# for 1.25e6 mse with eval data: 6.321 s (45077 allocations: 2.19 GiB)
@time model, cache = init_evotree(config, X_train, Y_train);
@time grow_evotree!(model, cache);
@time model = fit_evotree(config, X_train, Y_train);
@btime model = fit_evotree(config, X_train, Y_train);
@time pred_train = EvoTrees.predict(model, X_train)

@time model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);
@btime model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);

@time model = fit_evotree(config, X_train, Y_train, early_stopping_rounds=10);
@time model = fit_evotree(config, X_train, Y_train, print_every_n=2);

# @time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 5);
# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval);
@time pred_train = predict(model, X_train)


#############################
# agaricus
#############################
function readlibsvm(fname::String, shape)
    dmx = zeros(Float32, shape)
    label = Float32[]
    fi = open(fname, "r")
    cnt = 1
    for line in eachline(fi)
        line = split(line, " ")
        push!(label, parse(Float64, line[1]))
        line = line[2:end]
        for itm in line
            itm = split(itm, ":")
            dmx[cnt, parse(Int, itm[1]) + 1] = parse(Int, itm[2])
        end
        cnt += 1
    end
    close(fi)
    return (dmx, label)
end

# we use auxiliary function to read LIBSVM format into julia Matrix
train_X, train_Y = readlibsvm("data/agaricus.txt.train", (6513, 126))
test_X, test_Y = readlibsvm("data/agaricus.txt.test", (1611, 126))

#-------------Basic Training using XGBoost-----------------
# note: xgboost natively handles sparse input
# use a sparse matrix when your features are sparse (e.g. one-hot encoded vectors)
# model parameters can be passed as keyword arguments to the `xgboost` function, or as a Vector{String} / Dict()
num_round = 100
# you can pass a Julia matrix or sparse matrix directly as data,
# by calling xgboost(data, num_round, label=label, training-parameters)
metrics = ["logloss"]
@time bst = xgboost(train_X, num_round, label = train_Y, eta = 0.1, max_depth = 3, metrics = metrics, silent=0, objective = "binary:logistic")
features_xgb = XGBoost.importance(bst)

params1 = EvoTreeRegressor(
loss=:logistic, metric=:logloss,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 4, min_weight = 1.0,
rowsample=1.0, colsample=1.0, nbins=250)

@time model = fit_evotree(params1, train_X, train_Y, print_every_n=20);
@time model = fit_evotree(params1, train_X, train_Y, X_eval=test_X, Y_eval=test_Y, print_every_n=20);
@time pred_train = EvoTrees.predict(model, train_X)
features_evo = importance(model, 1:size(train_X,2))
sort(collect(values(features_evo)))
36 changes: 34 additions & 2 deletions experiments/readme_plots.jl
Expand Up @@ -87,7 +87,7 @@ params1 = EvoTreeRegressor(
sqrt(mean((pred_train_L1 .- Y_train) .^ 2))

x_perm = sortperm(X_train[:,1])
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train_linear[x_perm], color = "navy", linewidth = 1.5, label = "Linear")
plot!(X_train[:,1][x_perm], pred_train_logistic[x_perm], color = "darkred", linewidth = 1.5, label = "Logistic")
plot!(X_train[:,1][x_perm], pred_train_poisson[x_perm], color = "green", linewidth = 1.5, label = "Poisson")
@@ -135,8 +135,40 @@ params1 = EvoTreeRegressor(
sum(pred_train_q80 .< Y_train) / length(Y_train)

x_perm = sortperm(X_train[:,1])
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train_q50[x_perm], color = "navy", linewidth = 1.5, label = "Median")
plot!(X_train[:,1][x_perm], pred_train_q20[x_perm], color = "darkred", linewidth = 1.5, label = "Q20")
plot!(X_train[:,1][x_perm], pred_train_q80[x_perm], color = "green", linewidth = 1.5, label = "Q80")
savefig("quantiles_sinus.png")



###############################
## gaussian
###############################
params1 = EvoTreeGaussian(
loss=:gaussian, metric=:gaussian,
nrounds=200, nbins=100,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 5, min_weight = 1.0,
rowsample=0.8, colsample=1.0, seed=123)

@time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10);
# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10);
@time pred_train = EvoTrees.predict(model, X_train)
@time pred_train_gauss = EvoTrees.predict(params1, model, X_train)

pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)]
pred_q80 = quantile.(pred_gauss, 0.8)
pred_q20 = quantile.(pred_gauss, 0.2)

mean(Y_train .< pred_q80)
mean(Y_train .< pred_q20)

x_perm = sortperm(X_train[:,1])
plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "red", linewidth = 1.5, label = "sigma")
plot!(X_train[:,1][x_perm], pred_q20[x_perm, 1], color = "green", linewidth = 1.5, label = "q20")
plot!(X_train[:,1][x_perm], pred_q80[x_perm, 1], color = "green", linewidth = 1.5, label = "q80")
savefig("gaussian_sinus.png")
Binary file added gaussian_sinus.png
Binary file modified quantiles_sinus.png
Binary file added regression_gaussian_v1.png
Binary file modified regression_sinus.png
2 changes: 1 addition & 1 deletion src/MLJ.jl
@@ -55,7 +55,7 @@ end
function predict(model::EvoTreeGaussian, fitresult, Xnew)
    Xnew = MLJModelInterface.matrix(Xnew)
    pred = predict(fitresult, Xnew)
-    return [Distributions.Normal(pred[i,1], sqrt(pred[i,2])) for i in 1:size(pred,1)]
+    return [Distributions.Normal(pred[i,1], pred[i,2]) for i in 1:size(pred,1)]
end
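
Since `predict` now returns `Normal(μ, σ)` directly (σ rather than σ², matching the new log(σ) parameterization), downstream code can query the fitted distributions without any sqrt. A minimal usage sketch (an editor's illustration; assumes a machine `mach` fitted with `EvoTreeGaussian` as in the README example, and new data `Xnew`):

```julia
dists = predict(mach, Xnew)    # Vector of Distributions.Normal(μ, σ)
q10 = quantile.(dists, 0.1)    # per-observation 10% quantiles
q90 = quantile.(dists, 0.9)    # per-observation 90% quantiles
```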

# Metadata
4 changes: 2 additions & 2 deletions src/eval.jl
@@ -61,11 +61,11 @@ end

# gaussian
# pred[i][1] = μ
-# pred[i][2] = log(σ²)
+# pred[i][2] = log(σ)
function eval_metric(::Val{:gaussian}, pred::Vector{SVector{L,T}}, Y::AbstractVector{T}, α=0.0) where {L, T <: AbstractFloat}
    eval = zero(T)
    @inbounds for i in 1:length(pred)
-        eval += pred[i][2]/2 + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(pred[i][2])))
+        eval += pred[i][2] + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(2*pred[i][2])))
    end
    eval /= length(Y)
    return eval
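
Restated (editor's note): with pred[i] = (μᵢ, log σᵢ), the metric is the mean Gaussian negative log-likelihood, dropping the constant ½ log 2π:

```latex
\text{eval} = \frac{1}{N}\sum_{i=1}^{N}\left[\log\sigma_i + \frac{(y_i-\mu_i)^2}{2\sigma_i^2}\right]
```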
4 changes: 2 additions & 2 deletions src/find_split.jl
@@ -1,8 +1,8 @@
#############################################
# Get the breaking points
#############################################
-function get_edges(X::Matrix{T}, nbins=250) where {T}
-    edges = Vector{Vector{Float32}}(undef, size(X,2))
+function get_edges(X::AbstractMatrix{T}, nbins=250) where {T}
+    edges = Vector{Vector{T}}(undef, size(X,2))
    @threads for i in 1:size(X, 2)
        edges[i] = quantile(view(X, :,i), (1:nbins)/nbins)
        if length(edges[i]) == 0
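
A quick sketch of what the widened signature buys (an editor's illustration; `get_edges` is an internal helper, names as in the diff): the bin edges now inherit the input eltype instead of being forced to Float32, and AbstractMatrix inputs such as views are accepted.

```julia
using EvoTrees

X = rand(1_000, 3)                  # Float64 feature matrix
edges = EvoTrees.get_edges(X, 32)   # Vector{Vector{Float64}}: 32 quantile cut points per feature
```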