Commit

Merge pull request #55 from Evovest/gpu-dev
Gpu dev
jeremiedb authored Aug 11, 2020
2 parents 4020c36 + 2e379b1 commit f6cbb0f
Showing 33 changed files with 1,641 additions and 626 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,7 +1,7 @@
language: julia
julia:
- nightly
- 1.0
- 1.5
- 1.4

matrix:
8 changes: 5 additions & 3 deletions Project.toml
@@ -1,9 +1,10 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.4.9"
version = "0.5.0"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
@@ -13,12 +14,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
CategoricalArrays = "0.7, 0.8"
CUDA = "1"
CategoricalArrays = "0.8"
Distributions = "0.22, 0.23"
MLJModelInterface = "0.3"
StaticArrays = "0.12"
StatsBase = "0.32, 0.33"
julia = "1"
julia = "1.4"

[extras]
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
16 changes: 10 additions & 6 deletions README.md
@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/Evovest/EvoTrees.jl.svg?branch=master)](https://travis-ci.org/Evovest/EvoTrees.jl)

A Julia implementation of boosted trees.
Efficient histogram based algorithm with support for conventional and extended loss (notably multi-target objectives such as max likelihood methods).
Efficient histogram based algorithm with support for multiple loss functions (notably multi-target objectives such as max likelihood methods).

[R binding available](https://github.com/Evovest/EvoTrees)

Expand All @@ -17,9 +17,13 @@ Currently supports:
- multiclassification (softmax)
- Gaussian (max likelihood)

Input features are expected to be a `Matrix{Float64/Float32}`. User-friendly format conversion remains to be done (or see the integration with MLJ).

Input features is expected to be `Matrix{Float64}`. User friendly format conversion to be done.
Next priority: GPU support.
## GPU

Experimental GPU support is now provided for the linear, logistic and Gaussian objective functions. The speedup over the multi-threaded CPU histogram approach is modest at the moment (~25% vs 16 CPU threads on an RTX 2080).

Simply call `fit_evotree_gpu()` instead of `fit_evotree()` and `predict_gpu()` instead of `predict()`.
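
A minimal sketch of the GPU workflow, assuming toy data and illustrative hyper-parameters (the `EvoTreeRegressor` settings below are examples, not prescriptions):

```julia
using EvoTrees

# toy data
X_train = rand(Float32, 10_000, 5)
Y_train = rand(Float32, 10_000)

# linear, logistic and gaussian losses currently have a GPU code path
params = EvoTreeRegressor(loss=:linear, nrounds=100, max_depth=5)

model = fit_evotree_gpu(params, X_train, Y_train)
pred_train = predict_gpu(model, X_train)
```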

## Installation

@@ -112,7 +116,7 @@ mean(abs.(pred_test - selectrows(Y,test)))

Minimal example to fit a noisy sinus wave.

![](regression_sinus.png)
![](figures/regression_sinus.png)

```julia
using EvoTrees
@@ -177,7 +181,7 @@ pred_eval_L1 = predict(model, X_eval)

## Quantile Regression

![](quantiles_sinus.png)
![](figures/quantiles_sinus.png)

```julia
# q50
@@ -213,7 +217,7 @@ pred_train_q80 = predict(model, X_train)

## Gaussian Max Likelihood

![](gaussian_sinus.png)
![](figures/gaussian_sinus.png)

```julia
params1 = EvoTreeGaussian(
Binary file removed anim_fps1.gif
96 changes: 96 additions & 0 deletions experiments/GPU_loss.jl
@@ -0,0 +1,96 @@
using CUDA
# using Flux

items = Int(1e6)
δ = rand(Float32, items, 1)
δ² = rand(Float32, items, 1)
𝑀 = rand(Float32, items)
pred = rand(Float32, items, 1)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑀_gpu = CuArray(𝑀)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)

function update_grads_gpu_linear_1!(pred::AbstractMatrix{T}, target::AbstractVector{T}, δ::AbstractMatrix{T}, δ²::AbstractMatrix{T}, 𝑀::AbstractVector{T}) where {T <: AbstractFloat}
@. δ = 2f0 * (pred - target) * 𝑀
@. δ² = 2f0 * 𝑀
return
end


function kernel_linear_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
@inbounds δ[i] = 2 * (p[i] - t[i]) * 𝑀[i]
end
return
end

function kernel_linear_δ²!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
@inbounds δ[i] = 2 * 𝑀[i]
end
return
end

# launch config - 1D grid of threads over observations
function grad_linear!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑀::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_i = min(MAX_THREADS, length(t))
threads = (thread_i)
blocks = ceil.(Int, (length(t)) ./ threads)
@cuda blocks=blocks threads=threads kernel_linear_δ!(δ, p, t, 𝑀)
@cuda blocks=blocks threads=threads kernel_linear_δ²!(δ², p, t, 𝑀)
return
end

CUDA.@time update_grads_gpu_linear_1!(pred_gpu, target_gpu, δ_gpu, δ²_gpu, 𝑀_gpu)
CUDA.@time grad_linear!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑀_gpu, MAX_THREADS=1024)
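
# Sanity check (added sketch, not in the original experiment; `δ_cpu` / `δ²_cpu` are
# illustrative names): the broadcast path and the custom kernels write the same device
# arrays, so the result should match a CPU evaluation of the same formulas.
δ_cpu = 2f0 .* (pred .- target) .* 𝑀
δ²_cpu = 2f0 .* 𝑀
@assert Array(δ_gpu) ≈ δ_cpu
@assert vec(Array(δ²_gpu)) ≈ δ²_cpu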

#################################################
# Gaussian
#################################################
items = Int(1e6)
# Gaussian loss carries 2 parameters per observation (mean and log-scale), hence 2 columns
δ = zeros(Float32, items, 2)
δ² = zeros(Float32, items, 2)
𝑀 = rand(Float32, items)
pred = rand(Float32, items, 2)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑀_gpu = CuArray(𝑀)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)

function kernel_gauss_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
δ[i,1] = (p[i,1] - t[i]) / max(Cfloat(1e-8), exp(2f0 * p[i,2])) * 𝑀[i]
δ[i,2] = (1f0 - (p[i,1] - t[i])^2f0 / max(Cfloat(1e-8), exp(2f0 * p[i,2]))) * 𝑀[i]
end
return
end

function kernel_gauss_δ²!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑀::CuDeviceVector{T}) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if i <= length(t)
δ[i,1] = 𝑀[i] / max(Cfloat(1e-8), exp(2 * p[i,2]))
δ[i,2] = 2 * 𝑀[i] / max(Cfloat(1e-8), exp(2 * p[i,2])) * (p[i,1] - t[i])^2
end
return
end

# launch config - 1D grid of threads over observations
function grad_gaussian!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑀::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_i = min(MAX_THREADS, length(t))
threads = (thread_i)
blocks = ceil.(Int, (length(t)) ./ threads)
@cuda blocks=blocks threads=threads kernel_gauss_δ!(δ, p, t, 𝑀)
@cuda blocks=blocks threads=threads kernel_gauss_δ²!(δ², p, t, 𝑀)
return
end

CUDA.@time grad_gaussian!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑀_gpu, MAX_THREADS=1024)
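
# Sanity check (added sketch; `δ_cpu` is an illustrative name): recompute the
# first-order Gaussian gradients on the CPU with the same formulas as kernel_gauss_δ!
# and compare against the device result.
δ_cpu = similar(δ)
δ_cpu[:,1] .= (pred[:,1] .- target) ./ max.(1f-8, exp.(2f0 .* pred[:,2])) .* 𝑀
δ_cpu[:,2] .= (1f0 .- (pred[:,1] .- target).^2 ./ max.(1f-8, exp.(2f0 .* pred[:,2]))) .* 𝑀
@assert Array(δ_gpu) ≈ δ_cpu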
198 changes: 198 additions & 0 deletions experiments/GPU_v2.jl
@@ -0,0 +1,198 @@
# using CUDA
using CUDA
# using Flux
# using GeometricFlux

nbins = 32
ncol = 100
items = Int(1e6)
hist = zeros(Float32, nbins, ncol)
δ = rand(Float32, items)
idx = rand(1:nbins, items, ncol)
𝑖 = collect(1:items)
𝑗 = collect(1:ncol)

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)

# CPU
function hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
Threads.@threads for j in 𝑗
@inbounds for i in 𝑖
hist[idx[i,j], j] += δ[i]
end
end
return
end

function kernel_1!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
@inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]])
end
return
end

# base approach - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_1!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_1!(h, x, id, 𝑖, 𝑗)
return
end

@time hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
CUDA.@time hist_gpu_1!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
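
# Sanity check (added; assumes both accumulations above ran exactly once from their
# zero-initialized histograms): CPU and GPU results should agree up to the ordering
# of floating-point additions.
@assert Array(hist_gpu) ≈ hist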




nbins = 32
ncol = 100
items = Int(2e6)
K = 1
hist = zeros(Float32, nbins, 3, ncol)
δ = rand(Float32, items, 3)
idx = rand(1:nbins, items, ncol)
𝑖 = collect(1:items)
𝑗 = collect(1:ncol)

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)

function kernel_2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
@inbounds k1 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 1, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k1), x[𝑖[i],1])
@inbounds k2 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 2, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k2), x[𝑖[i],2])
@inbounds k3 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 3, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, k3), x[𝑖[i],3])
end
return
end

# base approach - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_2!(h, x, id, 𝑖, 𝑗)
return
end

CUDA.@time hist_gpu_2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1

######################################################################################################
# best approach: loop on K indicators
######################################################################################################
function kernel_3!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
if i <= length(𝑖) && j <= length(𝑗)
for k in 1:K
@inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
end
end
return
end

# loop on K within the kernel - blocks built along the cols first, then the rows (to limit collisions)
function hist_gpu_3!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_j = min(MAX_THREADS, length(𝑗))
thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖))
threads = (thread_i, thread_j)
blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
@cuda blocks=blocks threads=threads kernel_3!(h, x, id, 𝑖, 𝑗, K)
return
end

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff2 = hist_gpu_2 - hist_gpu_1
diff2 - diff1

CUDA.@time hist_gpu_3!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
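
# Optional correctness check (added sketch; `hist_gpu_a` / `hist_gpu_b` are illustrative
# names): run the unrolled kernel and the K-loop kernel from fresh zero histograms and
# compare - they should agree up to atomic-add ordering.
hist_gpu_a = CUDA.zeros(Float32, nbins, 3, ncol)
hist_gpu_b = CUDA.zeros(Float32, nbins, 3, ncol)
hist_gpu_2!(hist_gpu_a, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
hist_gpu_3!(hist_gpu_b, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
@assert Array(hist_gpu_a) ≈ Array(hist_gpu_b)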



######################################################################################################
# 3D kernel - instead of iterating on K - Less efficient than the loop on Ks
######################################################################################################
function kernel_3D!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
if i <= length(𝑖) && j <= length(𝑗)
@inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
end
return
end

# 3D launch - threads also split across the K dimension
function hist_gpu_3D!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_k = min(MAX_THREADS, K)
thread_j = min(MAX_THREADS Γ· thread_k, length(𝑗))
thread_i = min(MAX_THREADS Γ· (thread_k * thread_j), length(𝑖))
threads = (thread_i, thread_j, thread_k)
blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
@cuda blocks=blocks threads=threads kernel_3D!(h, x, id, 𝑖, 𝑗, K)
return
end

CUDA.@time hist_gpu_3D!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1


######################################################################################################
# 3D kernel - instead of iterating on K - No collision approach - single i thread - bad!
######################################################################################################
function kernel_3D2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
if i <= length(𝑖) && j <= length(𝑗)
# @inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
@inbounds h[id[𝑖[i], 𝑗[j]], k, 𝑗[j]] += x[𝑖[i],k]
end
return
end

# 3D launch with a single thread along i (attempt to avoid collisions without atomics)
function hist_gpu_3D2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
thread_k = min(MAX_THREADS, K)
thread_j = min(MAX_THREADS Γ· thread_k, length(𝑗))
thread_i = 1
threads = (thread_i, thread_j, thread_k)
blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
@cuda blocks=blocks threads=threads kernel_3D2!(h, x, id, 𝑖, 𝑗, K)
return
end

CUDA.@time hist_gpu_3D2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
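
# Note (added): kernel_3D2! accumulates with a plain `+=` instead of CUDA.atomic_add!.
# Restricting the launch to a single i-thread per block does not remove the race:
# different blocks still process different observations and can hit the same
# (bin, k, column) cell concurrently, so this variant can produce wrong sums.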

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1