test each KA's kernel on the CPU and verify output (#227)

exanauts · Feb 11, 2022 · a2c48be · a2c48be
1 parent 12449de
commit a2c48be
Show file tree

Hide file tree

Showing 8 changed files with 254 additions and 92 deletions.
diff --git a/src/LinearSolvers/preconditioners.jl b/src/LinearSolvers/preconditioners.jl
@@ -164,14 +164,14 @@ struct BlockJacobiPreconditioner{AT,GAT,VI,GVI,MT,GMT,MI,GMI,SMT,VF,GVF} <: Abst
     end
 end
 
-function BlockJacobiPreconditioner(J::SparseMatrixCSC; nblocks=-1, device=CPU())
+function BlockJacobiPreconditioner(J::SparseMatrixCSC; nblocks=-1, device=CPU(), noverlaps=0)
     n = size(J, 1)
     npartitions = if nblocks > 0
         nblocks
     else
         div(n, 32)
     end
-    return BlockJacobiPreconditioner(J, npartitions, device)
+    return BlockJacobiPreconditioner(J, npartitions, device, noverlaps)
 end
 BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)
 

diff --git a/src/autodiff.jl b/src/autodiff.jl
@@ -80,9 +80,7 @@ any nonlinear constraint ``h(x)``.
 """
 abstract type AbstractJacobian end
 
-function jacobian!(jac::AbstractJacobian, stack::AbstractStack)
-    error("Mising method jacobian!(", typeof(jac), ", ", typeof(stack), ")")
-end
+function jacobian! end
 
 """
     AbstractHessianProd
@@ -205,20 +203,20 @@ end
 
 # Sparse Jacobian partials
 
-@kernel function partials_kernel_gpu!(@Const(J_rowPtr), @Const(J_colVal), J_nzVal, @Const(duals), @Const(coloring))
+@kernel function partials_kernel_csr!(@Const(J_rowPtr), @Const(J_colVal), J_nzVal, @Const(duals), @Const(coloring))
     i = @index(Global, Linear)
 
-    @inbounds for j in J_rowPtr[i]:J_rowPtr[i+1]-1
-        @inbounds J_nzVal[j] = duals[coloring[J_colVal[j]]+1, i]
+    for j in J_rowPtr[i]:J_rowPtr[i+1]-1
+        J_nzVal[j] = duals[coloring[J_colVal[j]]+1, i]
     end
 end
 
-@kernel function partials_kernel_cpu!(J_colptr, J_rowval, J_nzval, duals, coloring)
+@kernel function partials_kernel_csc!(J_colptr, J_rowval, J_nzval, duals, coloring)
     # CSC is column oriented: nmap is equal to number of columns
     i = @index(Global, Linear)
 
-    @inbounds for j in J_colptr[i]:J_colptr[i+1]-1
-        @inbounds J_nzval[j] = duals[coloring[i]+1, J_rowval[j]]
+    for j in J_colptr[i]:J_colptr[i+1]-1
+        J_nzval[j] = duals[coloring[i]+1, J_rowval[j]]
     end
 end
 
@@ -240,10 +238,10 @@ function partials!(jac::AbstractJacobian)
     duals_ = reshape(reinterpret(T, duals), N+1, n)
 
     if isa(device, CPU)
-        kernel! = partials_kernel_cpu!(device)
+        kernel! = partials_kernel_csc!(device)
         ev = kernel!(J.colptr, J.rowval, J.nzval, duals_, coloring, ndrange=size(J,2), dependencies=Event(device))
     elseif isa(device, GPU)
-        kernel! = partials_kernel_gpu!(device)
+        kernel! = partials_kernel_csr!(device)
         ev = kernel!(J.rowPtr, J.colVal, J.nzVal, duals_, coloring, ndrange=size(J,1), dependencies=Event(device))
     else
         error("Unknown device $device")

diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
@@ -32,16 +32,6 @@ CuSparseMatrixCSR{Tv, Int32}(A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = CuSpar
 
 # AbstractStack
 
-@kernel function _transfer_to_input!(input, map, src)
-    i = @index(Global, Linear)
-    input[map[i]] = src[i]
-end
-
-@kernel function _transfer_fr_input!(dest, input, map)
-    i = @index(Global, Linear)
-    dest[i] = input[map[i]]
-end
-
 function Base.copyto!(stack::AutoDiff.AbstractStack, map::AbstractVector{Int}, vals::VT) where {VT <: CuArray}
     @assert length(map) == length(vals)
     ndrange = (length(map),)
@@ -56,30 +46,6 @@ function Base.copyto!(dest::VT, stack::AutoDiff.AbstractStack, map::AbstractVect
     wait(ev)
 end
 
-#=
-    LinearSolvers
-=#
-function csclsvqr!(A::CUSPARSE.CuSparseMatrixCSC{Float64},
-                    b::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
-                    x::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
-                    tol::Float64,
-                    reorder::Cint,
-                    inda::Char)
-    n = size(A,1)
-    desca = CUSPARSE.CuMatrixDescriptor(
-        CUSPARSE.CUSPARSE_MATRIX_TYPE_GENERAL,
-        CUSPARSE.CUSPARSE_FILL_MODE_LOWER,
-        CUSPARSE.CUSPARSE_DIAG_TYPE_NON_UNIT, inda)
-    singularity = Ref{Cint}(1)
-    CUSOLVER.cusolverSpDcsrlsvqr(CUSOLVER.sparse_handle(), n, A.nnz, desca, A.nzVal, A.colPtr, A.rowVal, b, tol, reorder, x, singularity)
-
-    if singularity[] != -1
-        throw(SingularException(singularity[]))
-    end
-
-    x
-end
-
 # By default, no factorization routine is available
 LinearSolvers.update!(s::DirectSolver{Nothing}, J::CuSparseMatrixCSR) = nothing
 function LinearSolvers.ldiv!(::DirectSolver{Nothing},
@@ -88,27 +54,6 @@ function LinearSolvers.ldiv!(::DirectSolver{Nothing},
     CUSOLVER.csrlsvqr!(J, x, y, 1e-8, one(Cint), 'O')
     return 0
 end
-function LinearSolvers.ldiv!(::DirectSolver{Nothing},
-    y::CUDA.CuVector, J::CUSPARSE.CuSparseMatrixCSC, x::CUDA.CuVector,
-)
-    csclsvqr!(J, x, y, 1e-8, one(Cint), 'O')
-    return 0
-end
-
-#=
-    Autodiff
-=#
-
-@kernel function _extract_values_kernel(dest, src)
-    i = @index(Global, Linear)
-    dest[i] = src[i].value
-end
-
-function extract_values!(dest::CuArray, src::CuArray)
-    ndrange = (length(dest),)
-    ev = _extract_values_kernel(CUDADevice())(dest, src, ndrange=ndrange)
-    wait(ev)
-end
 
 #=
     Generic SpMV for CuSparseMatrixCSR
@@ -117,16 +62,6 @@ function ForwardDiff.npartials(vec::CuVector{ForwardDiff.Dual{T, V, N}}) where {
     return N
 end
 
-# Differentiable LinearAlgebra.mul! for ForwardDiff
-@kernel function _spmv_csr_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
-    i, k = @index(Global, NTuple)
-    Y[k, i] *= beta
-    @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
-        j = colVal[c]
-        Y[k, i] += alpha * nzVal[c] * X[k, j]
-    end
-end
-
 function LinearAlgebra.mul!(
     Y::CuArray{T, 1},
     A::CUSPARSE.CuSparseMatrixCSR,

diff --git a/src/utils.jl b/src/utils.jl
@@ -1,6 +1,4 @@
 
-abstract type AbstractArchitecture end
-
 # norm
 xnorm(x::AbstractVector) = norm(x, 2)
 
@@ -14,3 +12,82 @@ function get_jacobian_types(::CPU)
     return SMT, A
 end
 
+#=
+    Kernels utils
+=#
+
+@kernel function _transfer_to_input!(input, map, src)
+    i = @index(Global, Linear)
+    input[map[i]] = src[i]
+end
+
+@kernel function _transfer_fr_input!(dest, input, map)
+    i = @index(Global, Linear)
+    dest[i] = input[map[i]]
+end
+
+# Differentiable LinearAlgebra.mul! for ForwardDiff
+@kernel function _spmv_csr_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
+    i, k = @index(Global, NTuple)
+    Y[k, i] *= beta
+    @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
+        j = colVal[c]
+        Y[k, i] += alpha * nzVal[c] * X[k, j]
+    end
+end
+
+#=
+    CSC2CSR
+=#
+
+# Taken from
+# https://github.com/scipy/scipy/blob/3b36a574dc657d1ca116f6e230be694f3de31afc/scipy/sparse/sparsetools/csr.h#L376
+function csr2csc(n, m, Ap, Aj, Ax, Bp, Bi, Bx)
+    nnzA = Ap[n+1] - 1
+    fill!(Bp, 0)
+
+    for i in 1:nnzA
+        Bp[Aj[i]] += 1
+    end
+
+    cumsum = 1
+    for j in 1:m
+        tmp = Bp[j]
+        Bp[j] = cumsum
+        cumsum += tmp
+    end
+    Bp[m+1] = nnzA + 1
+
+    for i in 1:n
+        for c in Ap[i]:Ap[i+1]-1
+            j = Aj[c]
+            dest = Bp[j]
+            Bi[dest] = i
+            Bx[dest] = Ax[c]
+            Bp[j] += 1
+        end
+    end
+
+    last = 1
+    for j in 1:m+1
+        tmp = Bp[j]
+        Bp[j] = last
+        last = tmp
+    end
+end
+
+csc2csr(n, m, Ap, Ai, Ax, Bp, Bj, Bx) = csr2csc(m, n, Ap, Ai, Ax, Bp, Bj, Bx)
+
+function convert2csr(A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti}
+    n, m = size(A)
+    nnzA = nnz(A)
+    Ap, Ai, Ax = A.colptr, A.rowval, A.nzval
+
+    Bp = zeros(Ti, n+1)
+    Bj = zeros(Ti, nnzA)
+    Bx = zeros(Tv, nnzA)
+
+    csc2csr(n, m, Ap, Ai, Ax, Bp, Bj, Bx)
+    return Bp, Bj, Bx
+end
+
diff --git a/test/Polar/TestPolarForm.jl b/test/Polar/TestPolarForm.jl
@@ -1,7 +1,5 @@
 module TestPolarFormulation
 
-@eval Base.Experimental.@optlevel 0
-
 using Test
 
 using CUDA

diff --git a/test/TestKernels.jl b/test/TestKernels.jl
@@ -0,0 +1,138 @@
+# Tests for custom kernels
+module TestKernels
+
+using LinearAlgebra
+using Random
+using SparseArrays
+using Test
+
+using ExaPF
+using KernelAbstractions
+
+const AD = ExaPF.AutoDiff
+
+function test_utils_kernel(device, AT, SMT)
+    # transfer kernels
+    n = 10
+    m = 20
+    # FROM
+    mapping = randperm(m)[1:n]
+    dest = zeros(n)
+    src = rand(m)
+    ndrange = (n, )
+    ev = ExaPF._transfer_fr_input!(device)(dest, src, mapping; ndrange=ndrange)
+    wait(ev)
+    @test dest == src[mapping]
+
+    # TO
+    src = rand(n)
+    dest = zeros(m)
+    ndrange = (n, )
+    ev = ExaPF._transfer_to_input!(device)(dest, mapping, src; ndrange=ndrange)
+    wait(ev)
+    @test src == dest[mapping]
+end
+
+function test_polar_kernel(device, AT, SMT)
+    nb = 10
+    nl = 10
+    m = nb + 2*nl
+    vmag = rand(nb)
+    vang = rand(nb)
+    output = zeros(m)
+    f = rand(1:nb, nl)
+    t = rand(1:nb, nl)
+    ndrange = (m, 1)
+    ev = ExaPF.basis_kernel!(device)(output, vmag, vang, f, t, nl, nb; ndrange=ndrange)
+    wait(ev)
+
+    Δθ = vang[f] .- vang[t]
+    vl2 = vmag[f] .* vmag[t]
+    res = [vl2 .* cos.(Δθ);  vl2 .* sin.(Δθ); vmag .* vmag]
+    @test res == output
+end
+
+function test_autodiff_kernel(device, AT, SMT)
+    n, m = 10, 20
+    J = sprandn(n, m, .2)
+    colors = AD.SparseDiffTools.matrix_colors(J)
+    p = length(unique(colors))
+
+    # set_value!
+    x = rand(n)
+    duals = zeros(p+1, n)
+    ndrange = (n, )
+    ev = AD._set_value_kernel!(device)(duals, x; ndrange=ndrange)
+    wait(ev)
+    @test duals[1, :] == x
+
+    # Seed with coloring for Jacobian/Hessian
+    coloring = rand(1:p, n)
+    duals = zeros(p+1, n)
+    map = randperm(n)
+    ndrange = (n, p)
+    ev = AD._seed_coloring_kernel!(device)(duals, coloring, map; ndrange=ndrange)
+    wait(ev)
+
+    # Seed for Hessian-vector products
+    v = rand(n)
+    duals = zeros(2, n)
+    ndrange = (n, )
+    ev = AD._seed_kernel!(device)(duals, v, map; ndrange=ndrange)
+    wait(ev)
+    @test duals[2, map] == v
+
+    # Partials extraction
+    duals = rand(p+1, m)
+    ## CSC
+    ndrange = (m, ) # columns oriented
+    ev = AD.partials_kernel_csc!(device)(J.colptr, J.rowval, J.nzval, duals, colors; ndrange=ndrange)
+    wait(ev)
+
+    ## CSR
+    Bp, Bj, Bx = ExaPF.convert2csr(J)
+    ndrange = (n, ) # rows oriented
+    ev = AD.partials_kernel_csr!(device)(Bp, Bj, Bx, duals, colors; ndrange=ndrange)
+    wait(ev)
+
+    # Convert back to CSC and check results
+    Ap = zeros(Int, m+1)
+    Ai = zeros(Int, nnz(J))
+    Ax = zeros(nnz(J))
+    ExaPF.csr2csc(n, m, Bp, Bj, Bx, Ap, Ai, Ax)
+    @test Ax == J.nzval
+end
+
+function test_sparse_mul_kernel(device, AT, SMT)
+    n, m = 10, 20
+    p = 2
+    J = sprandn(n, m, .2)
+    x = rand(m, p)
+    y_ref = zeros(n, p)
+    mul!(y_ref, J, x, 1.0, 0.0)
+
+    # Convert to CSR
+    Bp, Bj, Bx = ExaPF.convert2csr(J)
+    # Kernel is transposing the input and the output
+    x = Array(x')
+    y = zeros(p, n)
+    ndrange = (n, p)
+    ev = ExaPF._spmv_csr_kernel!(device)(y, x, Bj, Bp, Bx, 1.0, 0.0, n, m; ndrange=ndrange)
+    wait(ev)
+
+    @test y' ≈ y_ref
+end
+
+function runtests(device, AT, SMT)
+    for name_sym in names(@__MODULE__; all = true)
+        name = string(name_sym)
+        if !startswith(name, "test_")
+            continue
+        end
+        test_func = getfield(@__MODULE__, name_sym)
+        test_func(device, AT, SMT)
+    end
+end
+
+end
+