Merge pull request #51 from Evovest/dev
Dev
jeremiedb authored Jul 13, 2020
2 parents 5282cda + e5d1e0e commit 309cebe
Showing 18 changed files with 299 additions and 65 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.4.8"
version = "0.4.9"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
50 changes: 48 additions & 2 deletions README.md
@@ -62,7 +62,53 @@ julia> Pkg.add("EvoTrees")
- α: float [0,1], sets the quantile level for the :quantile loss or the bias for :L1, default=0.5 (see the sketch below)
- metric: {:mse, :rmse, :mae, :logloss, :quantile}, default=:none
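
A minimal sketch of how these two parameters pair up (an editor's illustration, not part of this diff; the constructor is the one used in the MLJ example below):

```julia
using EvoTrees

# 80th-percentile quantile regression: α sets the quantile level
params_q80 = EvoTreeRegressor(loss=:quantile, α=0.8, metric=:quantile, nrounds=100)

# L1 (absolute) regression: α skews the loss; α=0.5 keeps it symmetric
params_L1 = EvoTreeRegressor(loss=:L1, α=0.5, metric=:mae, nrounds=100)
```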

-## Getting started

## MLJ Integration

See [official project page](https://github.com/alan-turing-institute/MLJ.jl) for more info.

```julia
using Statistics: mean
using StatsBase: sample
using EvoTrees
using EvoTrees: sigmoid, logit
using MLJBase

features = rand(10_000) .* 5 .- 2
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y))
Y = sigmoid(Y)
y = Y
X = MLJBase.table(X)

# @load EvoTreeRegressor
# linear regression
tree_model = EvoTreeRegressor(loss=:linear, max_depth=5, η=0.05, nrounds=10)

# set machine
mach = machine(tree_model, X, y)

# partition data
train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split

# fit data
fit!(mach, rows=train, verbosity=1)

# continue training
mach.model.nrounds += 10
fit!(mach, rows=train, verbosity=1)

# predict on train data
pred_train = predict(mach, selectrows(X,train))
mean(abs.(pred_train - selectrows(Y,train)))

# predict on test data
pred_test = predict(mach, selectrows(X,test))
mean(abs.(pred_test - selectrows(Y,test)))
```


## Getting started using internal API

Minimal example to fit a noisy sinus wave.

@@ -167,7 +213,7 @@ pred_train_q80 = predict(model, X_train)

## Gaussian Max Likelihood

-![](gaussian_likelihood.png)
+![](gaussian_sinus.png)

```julia
params1 = EvoTreeGaussian(
2 changes: 1 addition & 1 deletion experiments/benchmarks.jl
@@ -6,7 +6,7 @@ using EvoTrees
using BenchmarkTools

# prepare a dataset
-features = rand(Int(2.25e6), 100)
+features = rand(Int(1.25e6), 100)
# features = rand(100, 10)
X = features
Y = rand(size(X, 1))
54 changes: 45 additions & 9 deletions experiments/gaussian.jl
@@ -1,8 +1,9 @@
-using Plots
using Statistics
using StatsBase: sample
-using EvoTrees
using Distributions
+using Plots
+using Revise
+using EvoTrees

features = rand(Int(1.25e4), 5)
# prepare a dataset
@@ -14,6 +15,8 @@ Y[(X[:,1] .>= 0.4) .& (X[:,1] .< 0.6)] .*= 5
Y[(X[:,1] .>= 0.9)] .*= 5
𝑖 = collect(1:size(X,1))

Y .*= 0.01

# train-eval split
𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
train_size = 0.8
Expand All @@ -26,17 +29,17 @@ Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]
# train model
params1 = EvoTreeGaussian(
loss=:gaussian, metric=:gaussian,
-nrounds=400,
-λ = 0.5, γ=1.0, η=0.05,
-max_depth = 4, min_weight = 200.0,
-rowsample=0.9, colsample=0.99, nbins=255)
+nrounds=40,
+λ = 0.0, γ=0.0, η=0.05,
+max_depth = 5, min_weight = 50.0,
+rowsample=0.5, colsample=1.0, nbins=200)

@time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10);
# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10);
@time pred_train = EvoTrees.predict(model, X_train)
@time pred_train_gauss = EvoTrees.predict(params1, model, X_train)

-pred_gauss = [Distributions.Normal(pred_train[i,1], sqrt(pred_train[i,2])) for i in 1:size(pred_train,1)]
+pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)]
pred_q90 = quantile.(pred_gauss, 0.9)
pred_q10 = quantile.(pred_gauss, 0.1)

Expand All @@ -46,7 +49,40 @@ mean(Y_train .< pred_q10)
x_perm = sortperm(X_train[:,1])
plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu")
-plot!(X_train[:,1][x_perm], sqrt.(pred_train[x_perm, 2]), color = "blue", linewidth = 1.5, label = "sigma")
+plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "blue", linewidth = 1.5, label = "sigma")
plot!(X_train[:,1][x_perm], pred_q10[x_perm, 1], color = "red", linewidth = 1.5, label = "q10")
plot!(X_train[:,1][x_perm], pred_q90[x_perm, 1], color = "green", linewidth = 1.5, label = "q90")
-# savefig("regression_gaussian.png")
+savefig("regression_gaussian_v1.png")


# compare with zygote
using Zygote

pred = [0.0, log(1.0)]
target = 0.1

δ1 = (target - pred[1]) / max(1e-8, exp(2*pred[2]))
δ2 = (1 - (pred[1] - target)^2 / max(1e-8, exp(2*pred[2])))

δ²1 = 1 / max(1e-8, exp(2*pred[2]))
δ²2 = 2 / max(1e-8, exp(2*pred[2])) * (pred[1] - target)^2


lpdf(x,μ,σ) = -log(σ) - log(2π)/2 - 1/2*((x-μ)/σ)^2
lpdf(0, pred[1], pred[2])

lpdf2(x,μ,lσ) = -log(exp(lσ)) - log(2π)/2 - 1/2*((x-μ)/exp(lσ))^2
lpdf2(0, pred[1], pred[2])


n1 = Normal(0, 1)
Distributions.logpdf(n1, 0)

# gradient(lpdf, target, pred[1], pred[2])[2:end]
gradient(lpdf2, target, pred[1], pred[2])[2:end]
# Zygote.hessian expects a function of a single (possibly vector) argument:
Zygote.hessian(p -> lpdf2(target, p[1], p[2]), pred)

gradient_lpdf(x,pred) = gradient(lpdf2, x, pred[1], pred[2])[3]
hessian_lpdf(x,pred) = gradient(gradient_lpdf, x, pred)[1]
gradient_lpdf(target, pred)
hessian_lpdf(target, pred)
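
For reference, a sketch of the math behind the δ terms above (an editor's derivation, assuming the second prediction slot stores s = log σ, as the updated comment in src/eval.jl states). The per-observation Gaussian negative log-likelihood in terms of s is

```latex
\ell(y;\mu,s) = s + \frac{(y-\mu)^2}{2e^{2s}} + \tfrac{1}{2}\log(2\pi), \qquad s = \log\sigma
```

and its first and second derivatives are

```latex
\frac{\partial \ell}{\partial \mu} = \frac{\mu-y}{e^{2s}}, \qquad
\frac{\partial \ell}{\partial s} = 1 - \frac{(y-\mu)^2}{e^{2s}}, \qquad
\frac{\partial^2 \ell}{\partial \mu^2} = \frac{1}{e^{2s}}, \qquad
\frac{\partial^2 \ell}{\partial s^2} = \frac{2(y-\mu)^2}{e^{2s}}
```

Up to sign convention, these match the δ1, δ2, δ²1 and δ²2 computed above.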
102 changes: 102 additions & 0 deletions experiments/parametric_type.jl
@@ -0,0 +1,102 @@
using Statistics
using StatsBase: sample
# using XGBoost
using Revise
using EvoTrees
using BenchmarkTools

# prepare a dataset
features = rand(Int(2.25e6), 100)
# features = rand(100, 10)
X = features
Y = rand(size(X, 1))
𝑖 = collect(1:size(X,1))

# train-eval split
𝑖_sample = sample(𝑖, size(𝑖, 1), replace = false)
train_size = 0.8
𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))]
𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end]

X_train, X_eval = X[𝑖_train, :], X[𝑖_eval, :]
Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval]

config = EvoTrees.EvoTreeRegressor3(T=Float32,
loss=:linear, metric=:none,
nrounds=100, α = 0.5,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=32)


# for 1.25e5 init_evotree: 2.009 s 0.322925 seconds (2.53 k allocations: 167.345 MiB)
# for 1.25e5 no eval iter 100: 2.009 s (628514 allocations: 720.62 MiB)
# for 1.25e6 no eval iter 10: 6.200 s (44330 allocations: 2.19 GiB)
# for 1.25e6 no eval iter 100: 19.481940 seconds (635.33 k allocations: 6.679 GiB, 3.11% gc time)
# for 1.25e6 mse with eval data: 6.321 s (45077 allocations: 2.19 GiB)
@time model, cache = init_evotree(config, X_train, Y_train);
@time grow_evotree!(model, cache);
@time model = fit_evotree(config, X_train, Y_train);
@btime model = fit_evotree(config, X_train, Y_train);
@time pred_train = EvoTrees.predict(model, X_train)

@time model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);
@btime model = fit_evotree(config, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n=9999, early_stopping_rounds=9999);

@time model = fit_evotree(config, X_train, Y_train, early_stopping_rounds=10);
@time model = fit_evotree(config, X_train, Y_train, print_every_n=2);

# @time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 5);
# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval);
@time pred_train = predict(model, X_train)


#############################
# agaricus
#############################
function readlibsvm(fname::String, shape)
    dmx = zeros(Float32, shape)
    label = Float32[]
    fi = open(fname, "r")
    cnt = 1
    for line in eachline(fi)
        line = split(line, " ")
        push!(label, parse(Float64, line[1]))
        line = line[2:end]
        for itm in line
            itm = split(itm, ":")
            dmx[cnt, parse(Int, itm[1]) + 1] = parse(Int, itm[2])
        end
        cnt += 1
    end
    close(fi)
    return (dmx, label)
end

# we use auxiliary function to read LIBSVM format into julia Matrix
train_X, train_Y = readlibsvm("data/agaricus.txt.train", (6513, 126))
test_X, test_Y = readlibsvm("data/agaricus.txt.test", (1611, 126))

#-------------Basic Training using XGBoost-----------------
# note: xgboost natively handles sparse input
# use a sparse matrix when your features are sparse (e.g. one-hot encoded vectors)
# model parameters can be passed as keyword arguments to the `xgboost` function, or as a Vector{String} / Dict()
num_round = 100
# you can pass a Julia matrix or sparse matrix directly as data,
# by calling xgboost(data, num_round, label=label, training-parameters)
metrics = ["logloss"]
@time bst = xgboost(train_X, num_round, label = train_Y, eta = 0.1, max_depth = 3, metrics = metrics, silent=0, objective = "binary:logistic")
features_xgb = XGBoost.importance(bst)

params1 = EvoTreeRegressor(
loss=:logistic, metric=:logloss,
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 4, min_weight = 1.0,
rowsample=1.0, colsample=1.0, nbins=250)

@time model = fit_evotree(params1, train_X, train_Y, print_every_n=20);
@time model = fit_evotree(params1, train_X, train_Y, X_eval=test_X, Y_eval=test_Y, print_every_n=20);
@time pred_train = EvoTrees.predict(model, train_X)
features_evo = importance(model, 1:size(train_X,2))
sort(collect(values(features_evo)))
36 changes: 34 additions & 2 deletions experiments/readme_plots.jl
Expand Up @@ -87,7 +87,7 @@ params1 = EvoTreeRegressor(
sqrt(mean((pred_train_L1 .- Y_train) .^ 2))

x_perm = sortperm(X_train[:,1])
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train_linear[x_perm], color = "navy", linewidth = 1.5, label = "Linear")
plot!(X_train[:,1][x_perm], pred_train_logistic[x_perm], color = "darkred", linewidth = 1.5, label = "Logistic")
plot!(X_train[:,1][x_perm], pred_train_poisson[x_perm], color = "green", linewidth = 1.5, label = "Poisson")
@@ -135,8 +135,40 @@ params1 = EvoTreeRegressor(
sum(pred_train_q80 .< Y_train) / length(Y_train)

x_perm = sortperm(X_train[:,1])
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "lightgray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot(X_train, Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train_q50[x_perm], color = "navy", linewidth = 1.5, label = "Median")
plot!(X_train[:,1][x_perm], pred_train_q20[x_perm], color = "darkred", linewidth = 1.5, label = "Q20")
plot!(X_train[:,1][x_perm], pred_train_q80[x_perm], color = "green", linewidth = 1.5, label = "Q80")
savefig("quantiles_sinus.png")



###############################
## gaussian
###############################
params1 = EvoTreeGaussian(
loss=:gaussian, metric=:gaussian,
nrounds=200, nbins=100,
λ = 0.0, γ=0.0, η=0.05,
max_depth = 5, min_weight = 1.0,
rowsample=0.8, colsample=1.0, seed=123)

@time model = fit_evotree(params1, X_train, Y_train, X_eval=X_eval, Y_eval=Y_eval, print_every_n = 10);
# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10);
@time pred_train = EvoTrees.predict(model, X_train)
@time pred_train_gauss = EvoTrees.predict(params1, model, X_train)

pred_gauss = [Distributions.Normal(pred_train[i,1], pred_train[i,2]) for i in 1:size(pred_train,1)]
pred_q80 = quantile.(pred_gauss, 0.8)
pred_q20 = quantile.(pred_gauss, 0.2)

mean(Y_train .< pred_q80)
mean(Y_train .< pred_q20)

x_perm = sortperm(X_train[:,1])
plot(X_train[:, 1], Y_train, ms = 1, mcolor = "gray", mscolor = "gray", background_color = RGB(1, 1, 1), seriestype=:scatter, xaxis = ("feature"), yaxis = ("target"), legend = true, label = "")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 1], color = "navy", linewidth = 1.5, label = "mu")
plot!(X_train[:,1][x_perm], pred_train[x_perm, 2], color = "red", linewidth = 1.5, label = "sigma")
plot!(X_train[:,1][x_perm], pred_q20[x_perm, 1], color = "green", linewidth = 1.5, label = "q20")
plot!(X_train[:,1][x_perm], pred_q80[x_perm, 1], color = "green", linewidth = 1.5, label = "q80")
savefig("gaussian_sinus.png")
Binary file added gaussian_sinus.png
Binary file modified quantiles_sinus.png
Binary file added regression_gaussian_v1.png
Binary file modified regression_sinus.png
2 changes: 1 addition & 1 deletion src/MLJ.jl
@@ -55,7 +55,7 @@ end
function predict(model::EvoTreeGaussian, fitresult, Xnew)
    Xnew = MLJModelInterface.matrix(Xnew)
    pred = predict(fitresult, Xnew)
-    return [Distributions.Normal(pred[i,1], sqrt(pred[i,2])) for i in 1:size(pred,1)]
+    return [Distributions.Normal(pred[i,1], pred[i,2]) for i in 1:size(pred,1)]
end
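
Since `predict` now returns `Normal(μ, σ)` directly (σ rather than σ², matching the new log(σ) parameterization), downstream code can query the fitted distributions without any sqrt. A minimal usage sketch (an editor's illustration; assumes a machine `mach` fitted with `EvoTreeGaussian` as in the README example, and new data `Xnew`):

```julia
dists = predict(mach, Xnew)    # Vector of Distributions.Normal(μ, σ)
q10 = quantile.(dists, 0.1)    # per-observation 10% quantiles
q90 = quantile.(dists, 0.9)    # per-observation 90% quantiles
```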

# Metadata
4 changes: 2 additions & 2 deletions src/eval.jl
@@ -61,11 +61,11 @@ end

# gaussian
# pred[i][1] = μ
-# pred[i][2] = log(σ²)
+# pred[i][2] = log(σ)
function eval_metric(::Val{:gaussian}, pred::Vector{SVector{L,T}}, Y::AbstractVector{T}, α=0.0) where {L, T <: AbstractFloat}
    eval = zero(T)
    @inbounds for i in 1:length(pred)
-        eval += pred[i][2]/2 + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(pred[i][2])))
+        eval += pred[i][2] + (Y[i] - pred[i][1])^2 / (2*max(1e-8, exp(2*pred[i][2])))
    end
    eval /= length(Y)
    return eval
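
Restated (editor's note): with pred[i] = (μᵢ, log σᵢ), the metric is the mean Gaussian negative log-likelihood, dropping the constant ½ log 2π:

```latex
\text{eval} = \frac{1}{N}\sum_{i=1}^{N}\left[\log\sigma_i + \frac{(y_i-\mu_i)^2}{2\sigma_i^2}\right]
```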
4 changes: 2 additions & 2 deletions src/find_split.jl
@@ -1,8 +1,8 @@
#############################################
# Get the breaking points
#############################################
-function get_edges(X::Matrix{T}, nbins=250) where {T}
-    edges = Vector{Vector{Float32}}(undef, size(X,2))
+function get_edges(X::AbstractMatrix{T}, nbins=250) where {T}
+    edges = Vector{Vector{T}}(undef, size(X,2))
    @threads for i in 1:size(X, 2)
        edges[i] = quantile(view(X, :,i), (1:nbins)/nbins)
        if length(edges[i]) == 0
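
A quick sketch of what the widened signature buys (an editor's illustration; `get_edges` is an internal helper, names as in the diff): the bin edges now inherit the input eltype instead of being forced to Float32, and AbstractMatrix inputs such as views are accepted.

```julia
using EvoTrees

X = rand(1_000, 3)                  # Float64 feature matrix
edges = EvoTrees.get_edges(X, 32)   # Vector{Vector{Float64}}: 32 quantile cut points per feature
```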