Skip to content

Commit

Permalink
adjoint interpolation on AMD GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Barth committed Nov 28, 2024
1 parent a8b8ef1 commit 6226676
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 45 deletions.
48 changes: 31 additions & 17 deletions ext/AMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,20 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
#c = p .- ind
c = ntuple(Val(N)) do i
p[i] - ind[i]
end

for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
p2 = Tuple(offset) .+ ind
#p2 = Tuple(offset) .+ ind
p2 = ntuple(Val(N)) do i
offset[i] + ind[i]
end

cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))
vec[i] += A[p2...] * cc
Expand All @@ -27,54 +34,61 @@ function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
return nothing
end

function interpnd!(pos::AbstractVector{<:NTuple{N}},d_A::ROCArray,vec_d) where N
function interpnd!(pos::AbstractVector{<:NTuple{N}},A_d::ROCArray,vec_d) where N
AMDGPU.@sync begin
len = length(pos)
kernel = @roc launch=false interpnd_d!(pos,d_A,vec_d)
config = launch_configuration(kernel.fun)
kernel = @roc launch=false interpnd_d!(pos,A_d,vec_d)
config = AMDGPU.launch_configuration(kernel)
groupsize = min(len, config.groupsize)
gridsize = cld(len, groupsize)
@debug gridsize,groupsize

kernel(pos,d_A,vec_d; groupsize, gridsize)
kernel(pos,A_d,vec_d; groupsize, gridsize)
end
end


function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,B) where N
index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
stride = gridGroupDim().x * workgroupDim().x

A2 .= 0

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
#ind = floor.(Int,p)
ind = unsafe_trunc.(Int32,floor.(p))

# interpolation coefficients
c = p .- ind
#c = p .- ind
c = ntuple(Val(N)) do i
p[i] - ind[i]
end

for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
p2 = Tuple(offset) .+ ind
# p2 = Tuple(offset) .+ ind
p2 = ntuple(Val(N)) do i
offset[i] + ind[i]
end

cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))

I = LinearIndices(A2)[p2...]
AMDGPU.atomic_add!(pointer(A2,I), values[i] * cc)
I = LinearIndices(B)[p2...]
B[I] += values[i] * cc
end
end

return nothing
end


function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,d_A2) where N
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,B_d) where N
B_d .= 0

AMDGPU.@sync begin
len = length(pos)
#numgridsize = ceil(Int, length(pos)/256)
# must be one
numgridsize = 1
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,d_A2)
@roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,B_d)
end
end

Expand Down
5 changes: 2 additions & 3 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x

# initialize before kernel launch???
A2 .= 0

@inbounds for i = index:stride:length(pos)
p = pos[i]
ind = floor.(Int,p)
Expand All @@ -70,6 +67,8 @@ end


function interp_adjn!(pos::AbstractVector{<:NTuple{N}},cuvalues::CuArray,d_A2) where N
A2 .= 0

CUDA.@sync begin
len = length(pos)
#numblocks = ceil(Int, length(pos)/256)
Expand Down
65 changes: 40 additions & 25 deletions test/test_interp_adjoint.jl
Original file line number Diff line number Diff line change
@@ -1,50 +1,65 @@
using AMDGPU
using CUDA
using DINCAE: interpnd, interp_adjn
using Flux
using Test
using LinearAlgebra
using Test

f(pos,B) = sum(interpnd(pos,B))

device_list = Any[identity] # CPU

if CUDA.functional()
push!(device_list,cu)
end

if AMDGPU.functional()
push!(device_list,roc)
end

pos = [(1.5f0,2.f0),(5.f0,2.3f0)]
values = [3.f0,4.f0]
sz = (6,7)
B = ones(Float32,sz)

A = interp_adjn(pos,values,sz)
A_ref = zeros(Float32,sz)
A_ref[1,2] = A_ref[2,2] = 0.5 * 3
A_ref[5,2] = 0.7 * 4
A_ref[5,3] = 0.3 * 4

@test A[1,2] ≈ 1.5
@test A[2,2] ≈ 1.5
dA_ref = zeros(Float32,sz)
dA_ref[1,2] = dA_ref[2,2] = 0.5
dA_ref[5,2] = 0.7
dA_ref[5,3] = 0.3

dA = gradient(f,pos,B)[2]

@test dA[1,2] ≈ 0.5
@test dA[2,2] ≈ 0.5

if CUDA.functional()
pos_d = cu(pos)
values_d = cu(values)
B_d = cu(B)
for device in device_list
pos_d = device(pos)
values_d = device(values)
B_d = device(B)

A_d = interp_adjn(pos_d,values_d,sz)

CUDA.@allowscalar begin
@test A_d[1,2] ≈ 1.5
@test A_d[2,2] ≈ 1.5
end
@test Array(A_d) ≈ A_ref

dA_d = gradient(f,pos_d,B_d)[2]


CUDA.@allowscalar begin
@test dA_d[1,2] ≈ 0.5
@test dA_d[2,2] ≈ 0.5
end
@test Array(dA_d) ≈ dA_ref
end

# check adjoint relationship

A = randn(Float32,size(A,1),size(A,2))
Ai = randn(Float32,length(pos))
A = randn(Float32,sz)
dAi = randn(Float32,length(pos))

@test Ai ⋅ interpnd(pos,A) ≈ A ⋅ interp_adjn(pos,Ai,size(A))
@test dAi ⋅ interpnd(pos,A) ≈ A ⋅ interp_adjn(pos,dAi,size(A))

for device in device_list[2:end]
A_d = device(A)
dAi_d = device(dAi)
pos_d = device(pos)

@test interpnd(pos,A) ≈ Array(interpnd(pos_d,A_d))
@test interp_adjn(pos,dAi,size(A)) ≈ Array(interp_adjn(pos_d,dAi_d,size(A)))

@test dAi_d ⋅ interpnd(pos_d,A_d) ≈ A_d ⋅ interp_adjn(pos_d,dAi_d,size(A))
end

0 comments on commit 6226676

Please sign in to comment.