Merge pull request #55 from Evovest/gpu-dev

GPU dev

Showing 33 changed files with 1,641 additions and 626 deletions.

.travis.yml
@@ -1,7 +1,7 @@
 language: julia
 julia:
   - nightly
-  - 1.0
+  - 1.5
   - 1.4

 matrix:
Project.toml
@@ -1,9 +1,10 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb <[email protected]>"]
-version = "0.4.9"
+version = "0.5.0"

 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"

@@ -13,12 +14,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

 [compat]
-CategoricalArrays = "0.7, 0.8"
+CUDA = "1"
+CategoricalArrays = "0.8"
 Distributions = "0.22, 0.23"
 MLJModelInterface = "0.3"
 StaticArrays = "0.12"
 StatsBase = "0.32, 0.33"
-julia = "1"
+julia = "1.4"

 [extras]
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Binary file not shown.
New file (GPU loss-gradient kernel experiments):
@@ -0,0 +1,96 @@
using CUDA
# using Flux

items = Int(1e6)
δ = rand(Float32, items, 1)
δ² = rand(Float32, items, 1)
𝑤 = rand(Float32, items)
pred = rand(Float32, items, 1)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑤_gpu = CuArray(𝑤)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)

# baseline: fused broadcast on device arrays, no hand-written kernel
function update_grads_gpu_linear_1!(pred::AbstractMatrix{T}, target::AbstractVector{T}, δ::AbstractMatrix{T}, δ²::AbstractMatrix{T}, 𝑤::AbstractVector{T}) where {T <: AbstractFloat}
    @. δ = 2f0 * (pred - target) * 𝑤
    @. δ² = 2f0 * 𝑤
    return
end
# hand-written kernel: first-order gradient of the squared-error loss
function kernel_linear_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑤::CuDeviceVector{T}) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(t)
        @inbounds δ[i] = 2 * (p[i] - t[i]) * 𝑤[i]
    end
    return
end

# second-order term, constant for the squared-error loss; the first argument
# receives δ² at the call site
function kernel_linear_δ²!(δ²::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑤::CuDeviceVector{T}) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(t)
        @inbounds δ²[i] = 2 * 𝑤[i]
    end
    return
end

# one thread per observation, 1D grid
function grad_linear!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑤::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
    threads = min(MAX_THREADS, length(t))
    blocks = ceil(Int, length(t) / threads)
    @cuda blocks=blocks threads=threads kernel_linear_δ!(δ, p, t, 𝑤)
    @cuda blocks=blocks threads=threads kernel_linear_δ²!(δ², p, t, 𝑤)
    return
end

CUDA.@time update_grads_gpu_linear_1!(pred_gpu, target_gpu, δ_gpu, δ²_gpu, 𝑤_gpu)
CUDA.@time grad_linear!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑤_gpu, MAX_THREADS=1024)
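As a quick sanity check (a sketch, not part of the PR), the broadcast and kernel versions should agree up to Float32 rounding:

# hypothetical host-side verification: recompute the reference on the CPU
δ_ref = 2f0 .* (pred .- target) .* 𝑤
@assert Array(δ_gpu) ≈ δ_ref
@assert vec(Array(δ²_gpu)) ≈ 2f0 .* 𝑤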
#################################################
# Gaussian
#################################################
items = Int(1e6)
# two parameters per observation: column 1 is the mean, column 2 the log-scale,
# so the prediction and gradient buffers need two columns
δ = zeros(Float32, items, 2)
δ² = zeros(Float32, items, 2)
𝑤 = rand(Float32, items)
pred = rand(Float32, items, 2)
target = rand(Float32, items)

δ_gpu = CuArray(δ)
δ²_gpu = CuArray(δ²)
𝑤_gpu = CuArray(𝑤)
pred_gpu = CuArray(pred)
target_gpu = CuArray(target)
# first-order gradients w.r.t. the mean (column 1) and the log-scale (column 2)
function kernel_gauss_δ!(δ::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑤::CuDeviceVector{T}) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(t)
        @inbounds δ[i,1] = (p[i,1] - t[i]) / max(Cfloat(1e-8), exp(2f0 * p[i,2])) * 𝑤[i]
        @inbounds δ[i,2] = (1f0 - (p[i,1] - t[i])^2 / max(Cfloat(1e-8), exp(2f0 * p[i,2]))) * 𝑤[i]
    end
    return
end

# second-order terms; the kernel reads its device arguments p and t
function kernel_gauss_δ²!(δ²::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, t::CuDeviceVector{T}, 𝑤::CuDeviceVector{T}) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(t)
        @inbounds δ²[i,1] = 𝑤[i] / max(Cfloat(1e-8), exp(2f0 * p[i,2]))
        @inbounds δ²[i,2] = 2f0 * 𝑤[i] / max(Cfloat(1e-8), exp(2f0 * p[i,2])) * (p[i,1] - t[i])^2
    end
    return
end

# one thread per observation, 1D grid; dispatches to the gaussian kernels
function grad_gaussian!(δ::CuMatrix{T}, δ²::CuMatrix{T}, p::CuMatrix{T}, t::CuVector{T}, 𝑤::CuVector{T}; MAX_THREADS=1024) where {T<:AbstractFloat}
    threads = min(MAX_THREADS, length(t))
    blocks = ceil(Int, length(t) / threads)
    @cuda blocks=blocks threads=threads kernel_gauss_δ!(δ, p, t, 𝑤)
    @cuda blocks=blocks threads=threads kernel_gauss_δ²!(δ², p, t, 𝑤)
    return
end

CUDA.@time grad_gaussian!(δ_gpu, δ²_gpu, pred_gpu, target_gpu, 𝑤_gpu, MAX_THREADS=1024)
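The gaussian kernels can be checked against a plain CPU loop (a hypothetical sketch under the two-column layout above, with column 1 the mean and column 2 the log-scale):

# hypothetical CPU reference: same formulas, sequential loop
function grad_gaussian_cpu!(δ, p, t, 𝑤)
    for i in eachindex(t)
        δ[i,1] = (p[i,1] - t[i]) / max(1f-8, exp(2f0 * p[i,2])) * 𝑤[i]
        δ[i,2] = (1f0 - (p[i,1] - t[i])^2 / max(1f-8, exp(2f0 * p[i,2]))) * 𝑤[i]
    end
    return
end

δ_ref = zeros(Float32, items, 2)
grad_gaussian_cpu!(δ_ref, pred, target, 𝑤)
@assert Array(δ_gpu) ≈ δ_ref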
New file (GPU histogram experiments):
@@ -0,0 +1,198 @@
using CUDA
# using Flux
# using GeometricFlux

nbins = 32
ncol = 100
items = Int(1e6)
hist = zeros(Float32, nbins, ncol)
δ = rand(Float32, items)
idx = rand(1:nbins, items, ncol)   # bin index of each observation, per feature
𝑖 = collect(1:items)               # observation indices
𝑗 = collect(1:ncol)                # feature indices

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)

# CPU reference: one task per feature; each observation's gradient is
# accumulated into the bin it falls into for that feature
function hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
    Threads.@threads for j in 𝑗
        @inbounds for i in 𝑖
            hist[idx[i,j], j] += δ[i]
        end
    end
    return
end
# GPU kernel: 2D thread grid over (observations, features); concurrent updates
# to the same bin are serialized with atomic_add!
function kernel_1!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    if i <= length(𝑖) && j <= length(𝑗)
        @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j])
        @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]])
    end
    return
end

# base approach - block built along the cols first, then the rows (limit collisions)
function hist_gpu_1!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
    thread_j = min(MAX_THREADS, length(𝑗))
    thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖))
    threads = (thread_i, thread_j)
    blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
    @cuda blocks=blocks threads=threads kernel_1!(h, x, id, 𝑖, 𝑗)
    return
end
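For the sizes above, this launch configuration works out to thread_j = min(1024, 100) = 100 and thread_i = min(1024 ÷ 100, 10^6) = 10, so each block holds 10 × 100 = 1000 threads and the grid is ceil.(Int, (10^6, 100) ./ (10, 100)) = (100_000, 1) blocks.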
@time hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
CUDA.@time hist_gpu_1!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
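A quick agreement check between the two (a sketch, assuming both histograms start from zeroed buffers; atomics only change summation order, so a loose tolerance suffices):

# hypothetical verification, not part of the PR
fill!(hist, 0f0); fill!(hist_gpu, 0f0)
hist_cpu!(hist, δ, idx, 𝑖, 𝑗)
hist_gpu_1!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu)
@assert isapprox(Array(hist_gpu), hist, rtol=1f-4)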
#################################################
# three statistics per bin (δ, δ², 𝑤)
#################################################
nbins = 32
ncol = 100
items = Int(2e6)
K = 3   # number of statistics accumulated per bin
hist = zeros(Float32, nbins, 3, ncol)
δ = rand(Float32, items, 3)
idx = rand(1:nbins, items, ncol)
𝑖 = collect(1:items)
𝑗 = collect(1:ncol)

hist_gpu = CuArray(hist)
δ_gpu = CuArray(δ)
idx_gpu = CuArray(idx)
𝑖_gpu = CuArray(𝑖)
𝑗_gpu = CuArray(𝑗)
# hardcoded variant: three atomic adds per (observation, feature) pair,
# one per statistic
function kernel_2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    if i <= length(𝑖) && j <= length(𝑗)
        @inbounds k1 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 1, 𝑗[j])
        @inbounds CUDA.atomic_add!(pointer(h, k1), x[𝑖[i],1])
        @inbounds k2 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 2, 𝑗[j])
        @inbounds CUDA.atomic_add!(pointer(h, k2), x[𝑖[i],2])
        @inbounds k3 = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 3, 𝑗[j])
        @inbounds CUDA.atomic_add!(pointer(h, k3), x[𝑖[i],3])
    end
    return
end

# base approach - block built along the cols first, then the rows (limit collisions)
function hist_gpu_2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat}
    thread_j = min(MAX_THREADS, length(𝑗))
    thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖))
    threads = (thread_i, thread_j)
    blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
    @cuda blocks=blocks threads=threads kernel_2!(h, x, id, 𝑖, 𝑗)
    return
end

CUDA.@time hist_gpu_2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
# scratch: snapshot the device histogram for comparison across variants
hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1
######################################################################################################
# best approach: loop on K indicators
######################################################################################################
function kernel_3!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    if i <= length(𝑖) && j <= length(𝑗)
        for k in 1:K
            @inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
            @inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
        end
    end
    return
end

# base approach - block built along the cols first, then the rows (limit collisions)
function hist_gpu_3!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
    thread_j = min(MAX_THREADS, length(𝑗))
    thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖))
    threads = (thread_i, thread_j)
    blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
    @cuda blocks=blocks threads=threads kernel_3!(h, x, id, 𝑖, 𝑗, K)
    return
end

# scratch: snapshot and compare against the previous variant
hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff2 = hist_gpu_2 - hist_gpu_1
diff2 - diff1

CUDA.@time hist_gpu_3!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
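Since kernel_2! and kernel_3! differ only in loop structure, their histograms should match; a quick sketch of that check (hypothetical, not part of the PR):

fill!(hist_gpu, 0f0)
hist_gpu_2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu)
h2 = Array(hist_gpu)
fill!(hist_gpu, 0f0)
hist_gpu_3!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3)
h3 = Array(hist_gpu)
@assert h2 ≈ h3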
######################################################################################################
# 3D kernel - threads over K instead of iterating on K - less efficient than the loop on Ks
######################################################################################################
function kernel_3D!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
    if i <= length(𝑖) && j <= length(𝑗) && k <= K
        @inbounds pt = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], k, 𝑗[j])
        @inbounds CUDA.atomic_add!(pointer(h, pt), x[𝑖[i],k])
    end
    return
end

# threads split across (i, j, k); k is allocated first so each block covers all K statistics
function hist_gpu_3D!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
    thread_k = min(MAX_THREADS, K)
    thread_j = min(MAX_THREADS ÷ thread_k, length(𝑗))
    thread_i = min(MAX_THREADS ÷ (thread_k * thread_j), length(𝑖))
    threads = (thread_i, thread_j, thread_k)
    blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
    @cuda blocks=blocks threads=threads kernel_3D!(h, x, id, 𝑖, 𝑗, K)
    return
end

CUDA.@time hist_gpu_3D!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)
hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1
######################################################################################################
# 3D kernel - no-atomics attempt with a single i thread per block - bad!
######################################################################################################
function kernel_3D2!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗, K) where {T<:AbstractFloat}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    k = threadIdx().z + (blockIdx().z - 1) * blockDim().z
    if i <= length(𝑖) && j <= length(𝑗) && k <= K
        # plain += is still racy: blocks along i run concurrently and may hit
        # the same bin, so this variant is incorrect as well as slow
        @inbounds h[id[𝑖[i], 𝑗[j]], k, 𝑗[j]] += x[𝑖[i],k]
    end
    return
end

# single i thread per block; the grid still spans all observations along i
function hist_gpu_3D2!(h::CuArray{T,3}, x::CuMatrix{T}, id::CuMatrix{Int}, 𝑖, 𝑗, K; MAX_THREADS=1024) where {T<:AbstractFloat}
    thread_k = min(MAX_THREADS, K)
    thread_j = min(MAX_THREADS ÷ thread_k, length(𝑗))
    thread_i = 1
    threads = (thread_i, thread_j, thread_k)
    blocks = ceil.(Int, (length(𝑖), length(𝑗), K) ./ threads)
    @cuda blocks=blocks threads=threads kernel_3D2!(h, x, id, 𝑖, 𝑗, K)
    return
end

CUDA.@time hist_gpu_3D2!(hist_gpu, δ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 3, MAX_THREADS=1024)

hist_gpu_1 = Array(hist_gpu)
hist_gpu_2 = Array(hist_gpu)
diff1 = hist_gpu_2 - hist_gpu_1