Skip to content

Commit

Permalink
adjoint interpolation on AMD GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Barth committed Nov 28, 2024
1 parent a8b8ef1 commit 6226676
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 45 deletions.
48 changes: 31 additions & 17 deletions ext/AMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,20 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
#c = p .- ind
c = ntuple(Val(N)) do i
p[i] - ind[i]
end

for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
p2 = Tuple(offset) .+ ind
#p2 = Tuple(offset) .+ ind
p2 = ntuple(Val(N)) do i
offset[i] + ind[i]
end

cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))
vec[i] += A[p2...] * cc
Expand All @@ -27,54 +34,61 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
return nothing
end

function interpnd!(pos::AbstractVector{<:NTuple{N}},d_A::ROCArray,vec_d) where N
function interpnd!(pos::AbstractVector{<:NTuple{N}},A_d::ROCArray,vec_d) where N
AMDGPU.@sync begin
len = length(pos)
kernel = @roc launch=false interpnd_d!(pos,d_A,vec_d)
config = launch_configuration(kernel.fun)
kernel = @roc launch=false interpnd_d!(pos,A_d,vec_d)
config = AMDGPU.launch_configuration(kernel)
groupsize = min(len, config.groupsize)
gridsize = cld(len, groupsize)
@debug gridsize,groupsize

kernel(pos,d_A,vec_d; groupsize, gridsize)
kernel(pos,A_d,vec_d; groupsize, gridsize)
end
end


function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,B) where N
index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
stride = gridGroupDim().x * workgroupDim().x

A2 .= 0

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
#c = p .- ind
c = ntuple(Val(N)) do i
p[i] - ind[i]
end

for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
p2 = Tuple(offset) .+ ind
# p2 = Tuple(offset) .+ ind
p2 = ntuple(Val(N)) do i
offset[i] + ind[i]
end

cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))

I = LinearIndices(A2)[p2...]
AMDGPU.atomic_add!(pointer(A2,I), values[i] * cc)
I = LinearIndices(B)[p2...]
B[I] += values[i] * cc
end
end

return nothing
end


function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,d_A2) where N
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,B_d) where N
B_d .= 0

AMDGPU.@sync begin
len = length(pos)
#numgridsize = ceil(Int, length(pos)/256)
# must be one
numgridsize = 1
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,d_A2)
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,B_d)
end
end

Expand Down
5 changes: 2 additions & 3 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x

# initialize before kernel launch???
A2 .= 0

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
Expand All @@ -70,6 +67,8 @@ end


function interp_adjn!(pos::AbstractVector{<:NTuple{N}},cuvalues::CuArray,d_A2) where N
A2 .= 0

CUDA.@sync begin
len = length(pos)
#numblocks = ceil(Int, length(pos)/256)
Expand Down
65 changes: 40 additions & 25 deletions test/test_interp_adjoint.jl
Original file line number Diff line number Diff line change
@@ -1,50 +1,65 @@
using AMDGPU
using CUDA
using DINCAE: interpnd, interp_adjn
using Flux
using Test
using LinearAlgebra
using Test

f(pos,B) = sum(interpnd(pos,B))

device_list = Any[identity] # CPU

if CUDA.functional()
push!(device_list,cu)
end

if AMDGPU.functional()
push!(device_list,roc)
end

pos = [(1.5f0,2.f0),(5.f0,2.3f0)]
values = [3.f0,4.f0]
sz = (6,7)
B = ones(Float32,sz)

A = interp_adjn(pos,values,sz)
A_ref = zeros(Float32,sz)
A_ref[1,2] = A_ref[2,2] = 0.5 * 3
A_ref[5,2] = 0.7 * 4
A_ref[5,3] = 0.3 * 4

@test A[1,2] ≈ 1.5
@test A[2,2] ≈ 1.5
dA_ref = zeros(Float32,sz)
dA_ref[1,2] = dA_ref[2,2] = 0.5
dA_ref[5,2] = 0.7
dA_ref[5,3] = 0.3

dA = gradient(f,pos,B)[2]

@test dA[1,2] ≈ 0.5
@test dA[2,2] ≈ 0.5

if CUDA.functional()
pos_d = cu(pos)
values_d = cu(values)
B_d = cu(B)
for device in device_list
pos_d = device(pos)
values_d = device(values)
B_d = device(B)

A_d = interp_adjn(pos_d,values_d,sz)

CUDA.@allowscalar begin
@test A_d[1,2] ≈ 1.5
@test A_d[2,2] ≈ 1.5
end
@test Array(A_d) ≈ A_ref

dA_d = gradient(f,pos_d,B_d)[2]


CUDA.@allowscalar begin
@test dA_d[1,2] ≈ 0.5
@test dA_d[2,2] ≈ 0.5
end
@test Array(dA_d) ≈ dA_ref
end

# check adjoint relationship

A = randn(Float32,size(A,1),size(A,2))
Ai = randn(Float32,length(pos))
A = randn(Float32,sz)
dAi = randn(Float32,length(pos))

@test Ai ⋅ interpnd(pos,A) ≈ A ⋅ interp_adjn(pos,Ai,size(A))
@test dAi ⋅ interpnd(pos,A) ≈ A ⋅ interp_adjn(pos,dAi,size(A))

for device in device_list[2:end]
A_d = device(A)
dAi_d = device(dAi)
pos_d = device(pos)

@test interpnd(pos,A) ≈ Array(interpnd(pos_d,A_d))
@test interp_adjn(pos,dAi,size(A)) ≈ Array(interp_adjn(pos_d,dAi_d,size(A)))

@test dAi_d ⋅ interpnd(pos_d,A_d) ≈ A_d ⋅ interp_adjn(pos_d,dAi_d,size(A))
end

0 comments on commit 6226676

Please sign in to comment.