Commit

Merge pull request #55 from Evovest/gpu-dev
Gpu dev
jeremiedb authored Aug 11, 2020
2 parents 4020c36 + 2e379b1 commit f6cbb0f
Showing 33 changed files with 1,641 additions and 626 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,7 +1,7 @@
language: julia
julia:
- nightly
- 1.0
- 1.5
- 1.4

matrix:
8 changes: 5 additions & 3 deletions Project.toml
@@ -1,9 +1,10 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.4.9"
version = "0.5.0"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
@@ -13,12 +14,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
CategoricalArrays = "0.7, 0.8"
CUDA = "1"
CategoricalArrays = "0.8"
Distributions = "0.22, 0.23"
MLJModelInterface = "0.3"
StaticArrays = "0.12"
StatsBase = "0.32, 0.33"
julia = "1"
julia = "1.4"

[extras]
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
16 changes: 10 additions & 6 deletions README.md
@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/Evovest/EvoTrees.jl.svg?branch=master)](https://travis-ci.org/Evovest/EvoTrees.jl)

A Julia implementation of boosted trees.
Efficient histogram based algorithm with support for conventional and extended loss (notably multi-target objectives such as max likelihood methods).
Efficient histogram based algorithm with support for multiple loss functions (notably multi-target objectives such as max likelihood methods).

[R binding available](https://github.com/Evovest/EvoTrees)

Expand All @@ -17,9 +17,13 @@ Currently supports:
- multiclassification (softmax)
- Gaussian (max likelihood)

Input features are expected to be a `Matrix{Float64/Float32}`. User-friendly format conversion remains to be done (or see the integration with MLJ).

Input features is expected to be `Matrix{Float64}`. User friendly format conversion to be done.
Next priority: GPU support.
## GPU

Experimental GPU support is now provided for the linear, logistic and Gaussian objective functions. The speedup over the multi-threaded CPU histogram approach is modest at the moment (~25% vs 16 CPU threads on an RTX 2080).

Simply call `fit_evotree_gpu()` instead of `fit_evotree()` and `predict_gpu()` instead of `predict()`.
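
A minimal sketch of the GPU workflow, assuming toy data and illustrative hyper-parameters (the `EvoTreeRegressor` settings below are examples, not prescriptions):

```julia
using EvoTrees

# toy data
X_train = rand(Float32, 10_000, 5)
Y_train = rand(Float32, 10_000)

# linear, logistic and gaussian losses currently have a GPU code path
params = EvoTreeRegressor(loss=:linear, nrounds=100, max_depth=5)

model = fit_evotree_gpu(params, X_train, Y_train)
pred_train = predict_gpu(model, X_train)
```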

## Installation

@@ -112,7 +116,7 @@ mean(abs.(pred_test - selectrows(Y,test)))

Minimal example to fit a noisy sinus wave.

![](regression_sinus.png)
![](figures/regression_sinus.png)

```julia
using EvoTrees
@@ -177,7 +181,7 @@ pred_eval_L1 = predict(model, X_eval)

## Quantile Regression

![](quantiles_sinus.png)
![](figures/quantiles_sinus.png)

```julia
# q50
@@ -213,7 +217,7 @@ pred_train_q80 = predict(model, X_train)

## Gaussian Max Likelihood

![](gaussian_sinus.png)
![](figures/gaussian_sinus.png)

```julia
params1 = EvoTreeGaussian(
Binary file removed anim_fps1.gif
96 changes: 96 additions & 0 deletions experiments/GPU_loss.jl
@@ -0,0 +1,96 @@
using CUDA
# using Flux

items = Int(1e6)
δ = rand(Float32, items, 1)
δ² = rand(Float32, items, 1)
𝑀 = rand(Float32, items)
pred = rand(Float32, items, 1)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑀_gpu = CuArray(𝑀)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)

function update_grads_gpu_linear_1!(pred::AbstractMatrix{T}, target::AbstractVector{T}, δ::AbstractMatrix{T}, δ²::AbstractMatrix{T}, 𝑀::AbstractVector{T}) where {T <: AbstractFloat}
@. δ = 2f0 * (pred - target) * 𝑀
@. δ² = 2f0 * 𝑀
return
end


function kernel_linear_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
@inbounds δ[i] = 2 * (p[i] - t[i]) * 𝑀[i]
end
return
end

function kernel_linear_δ²!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
@inbounds δ[i] = 2 * 𝑀[i]
end
return
end

# launch config - 1D grid of threads over observations
function grad_linear!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑀::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_i = min(MAX_THREADS, length(t))
threads = (thread_i)
blocks = ceil.(Int, (length(t)) ./ threads)
@cuda blocks=blocks threads=threads kernel_linear_δ!(δ, p, t, 𝑀)
@cuda blocks=blocks threads=threads kernel_linear_δ²!(δ², p, t, 𝑀)
return
end

CUDA.@time update_grads_gpu_linear_1!(pred_gpu, target_gpu, δ_gpu, δ²_gpu, 𝑀_gpu)
CUDA.@time grad_linear!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑀_gpu, MAX_THREADS=1024)
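
# Sanity check (added sketch, not in the original experiment; `δ_cpu` / `δ²_cpu` are
# illustrative names): the broadcast path and the custom kernels write the same device
# arrays, so the result should match a CPU evaluation of the same formulas.
δ_cpu = 2f0 .* (pred .- target) .* 𝑀
δ²_cpu = 2f0 .* 𝑀
@assert Array(δ_gpu) ≈ δ_cpu
@assert vec(Array(δ²_gpu)) ≈ δ²_cpu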

#################################################
# Gaussian
#################################################
items = Int(1e6)
# Gaussian loss carries 2 parameters per observation (mean and log-scale), hence 2 columns
δ = zeros(Float32, items, 2)
δ² = zeros(Float32, items, 2)
𝑀 = rand(Float32, items)
pred = rand(Float32, items, 2)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑀_gpu = CuArray(𝑀)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)

function kernel_gauss_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
δ[i,1] = (p[i,1] - t[i]) / max(Cfloat(1e-8), exp(2f0 * p[i,2])) * 𝑀[i]
δ[i,2] = (1f0 - (p[i,1] - t[i])^2f0 / max(Cfloat(1e-8), exp(2f0 * p[i,2]))) * 𝑀[i]
end
return
end

function kernel_gauss_δ²!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
δ[i,1] = 𝑀[i] / max(Cfloat(1e-8), exp(2 * p[i,2]))
δ[i,2] = 2 * 𝑀[i] / max(Cfloat(1e-8), exp(2 * p[i,2])) * (p[i,1] - t[i])^2
end
return
end

# launch config - 1D grid of threads over observations
function grad_gaussian!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑀::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_i = min(MAX_THREADS, length(t))
threads = (thread_i)
blocks = ceil.(Int, (length(t)) ./ threads)
@cuda blocks=blocks threads=threads kernel_gauss_δ!(δ, p, t, 𝑀)
@cuda blocks=blocks threads=threads kernel_gauss_δ²!(δ², p, t, 𝑀)
return
end

CUDA.@time grad_gaussian!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑀_gpu, MAX_THREADS=1024)
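
# Sanity check (added sketch; `δ_cpu` is an illustrative name): recompute the
# first-order Gaussian gradients on the CPU with the same formulas as kernel_gauss_δ!
# and compare against the device result.
δ_cpu = similar(δ)
δ_cpu[:,1] .= (pred[:,1] .- target) ./ max.(1f-8, exp.(2f0 .* pred[:,2])) .* 𝑀
δ_cpu[:,2] .= (1f0 .- (pred[:,1] .- target).^2 ./ max.(1f-8, exp.(2f0 .* pred[:,2]))) .* 𝑀
@assert Array(δ_gpu) ≈ δ_cpu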
198 changes: 198 additions & 0 deletions experiments/GPU_v2.jl
@@ -0,0 +1,198 @@
# using CUDA
using CUDA
# using Flux
# using GeometricFlux

nbins = 32
ncol = 100
items = Int(1e6)
hist = zeros(Float32, nbins, ncol)
δ = rand(Float32, items)
idx = rand(1:nbins, items, ncol)
𝑖 = collect(1:items)
𝑗 = collect(1:ncol)

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)

# CPU
function hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
Threads.@threads for j in 𝑗
@inbounds for i in 𝑖
hist[idx[i,j], j] += δ[i]
end
end
return
end

function kernel_1!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
@inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]])
end
return
end

# base approach - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_1!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_1!(h, x, id, 𝑖, 𝑗)
return
end

@time hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
CUDA.@time hist_gpu_1!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
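
# Sanity check (added; assumes both accumulations above ran exactly once from their
# zero-initialized histograms): CPU and GPU results should agree up to the ordering
# of floating-point additions.
@assert Array(hist_gpu) ≈ hist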




nbins = 32
ncol = 100
items = Int(2e6)
K = 1
hist = zeros(Float32, nbins, 3, ncol)
δ = rand(Float32, items, 3)
idx = rand(1:nbins, items, ncol)
𝑖 = collect(1:items)
𝑗 = collect(1:ncol)

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)

function kernel_2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
@inbounds k1 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 1, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k1), x[𝑖[i],1])
@inbounds k2 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 2, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k2), x[𝑖[i],2])
@inbounds k3 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 3, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k3), x[𝑖[i],3])
end
return
end

# base approach - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_2!(h, x, id, 𝑖, 𝑗)
return
end

CUDA.@time hist_gpu_2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1

######################################################################################################
# best approach: loop on K indicators
######################################################################################################
function kernel_3!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
for k in 1:K
@inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
end
end
return
end

# loop on K within the kernel - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_3!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_3!(h, x, id, 𝑖, 𝑗, K)
return
end

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff2 = hist_gpu_2 - hist_gpu_1
diff2 - diff1

CUDA.@time hist_gpu_3!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
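
# Optional correctness check (added sketch; `hist_gpu_a` / `hist_gpu_b` are illustrative
# names): run the unrolled kernel and the K-loop kernel from fresh zero histograms and
# compare - they should agree up to atomic-add ordering.
hist_gpu_a = CUDA.zeros(Float32, nbins, 3, ncol)
hist_gpu_b = CUDA.zeros(Float32, nbins, 3, ncol)
hist_gpu_2!(hist_gpu_a, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
hist_gpu_3!(hist_gpu_b, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
@assert Array(hist_gpu_a) ≈ Array(hist_gpu_b)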



######################################################################################################
# 3D kernel - instead of iterating on K - Less efficient than the loop on Ks
######################################################################################################
function kernel_3D!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
if i <= length(𝑖) && j <= length(𝑗)
@inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
end
return
end

# 3D launch - threads also split across the K dimension
function hist_gpu_3D!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_k = min(MAX_THREADS, K)
thread_j = min(MAX_THREADS Γ· thread_k, length(𝑗))
thread_i = min(MAX_THREADS Γ· (thread_k * thread_j), length(𝑖))
threads = (thread_i, thread_j, thread_k)
blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
@cuda blocks=blocks threads=threads kernel_3D!(h, x, id, 𝑖, 𝑗, K)
return
end

CUDA.@time hist_gpu_3D!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1


######################################################################################################
# 3D kernel - instead of iterating on K - No collision approach - single i thread - bad!
######################################################################################################
function kernel_3D2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
if i <= length(𝑖) && j <= length(𝑗)
# @inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds h[id[𝑖[i], 𝑗[j]], k, 𝑗[j]] += x[𝑖[i],k]
end
return
end

# 3D launch with a single thread along i (attempt to avoid collisions without atomics)
function hist_gpu_3D2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_k = min(MAX_THREADS, K)
thread_j = min(MAX_THREADS Γ· thread_k, length(𝑗))
thread_i = 1
threads = (thread_i, thread_j, thread_k)
blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
@cuda blocks=blocks threads=threads kernel_3D2!(h, x, id, 𝑖, 𝑗, K)
return
end

CUDA.@time hist_gpu_3D2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
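
# Note (added): kernel_3D2! accumulates with a plain `+=` instead of CUDA.atomic_add!.
# Restricting the launch to a single i-thread per block does not remove the race:
# different blocks still process different observations and can hit the same
# (bin, k, column) cell concurrently, so this variant can produce wrong sums.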

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1