diff --git a/src/LinearSolvers/preconditioners.jl b/src/LinearSolvers/preconditioners.jl index 80ec9ca1..a0133747 100644 --- a/src/LinearSolvers/preconditioners.jl +++ b/src/LinearSolvers/preconditioners.jl @@ -185,18 +185,21 @@ Base.eltype(::BlockJacobiPreconditioner) = Float64 # of the blocks, gemm_strided is performing too many unecessary operations, # impairing its performance. @kernel function mblock_kernel!(y, b, p_len, rp_len, part, blocks) + p = size(b, 2) i, j = @index(Global, NTuple) len = p_len[i] rlen = rp_len[i] if j <= rlen - accum = 0.0 - idxA = @inbounds part[j, i] - for k=1:len - idxB = @inbounds part[k, i] - @inbounds accum = accum + blocks[j, k, i]*b[idxB] + for ℓ=1:p + accum = 0.0 + idxA = @inbounds part[j, i] + for k=1:len + idxB = @inbounds part[k, i] + @inbounds accum = accum + blocks[j, k, i]*b[idxB,ℓ] + end + y[idxA,ℓ] = accum end - y[idxA] = accum end end @@ -223,7 +226,7 @@ function mul!(Y, C::BlockJacobiPreconditioner, B::Matrix{T}) where T blck = view(C.blocks, 1:rlen, 1:rlen, i) for j=1:C.rest_size[i] idx = part[j] - mul!(view(Y,idx,:), view(blck, j, :)', view(B, part, :), one(T), one(T)) + mul!(view(Y,idx,:), view(blck, :, j), view(B, part, :), one(T), one(T)) end end end @@ -241,6 +244,19 @@ function mul!(y, C::BlockJacobiPreconditioner, b::CuVector{T}) where T KA.synchronize(CUDABackend()) end +function mul!(Y, C::BlockJacobiPreconditioner, B::CuMatrix{T}) where T + n, p = size(B) + fill!(Y, zero(T)) + max_rlen = maximum(C.rest_size) + ndrange = (C.nblocks, max_rlen) + mblock_kernel!(CUDABackend())( + Y, B, C.culpartitions, C.curest_size, + C.cupartitions, C.cublocks, + ndrange=ndrange, + ) + KA.synchronize(CUDABackend()) +end + """ build_adjmatrix diff --git a/test/TestLinearSolvers.jl b/test/TestLinearSolvers.jl index d36a99b2..0e32461a 100644 --- a/test/TestLinearSolvers.jl +++ b/test/TestLinearSolvers.jl @@ -30,15 +30,28 @@ function test_preconditioners(device, AT, SMT) nblocks = 2 precond = LS.BlockJacobiPreconditioner(A, nblocks, device) LS.update(precond, A, device) - Iₙ = Matrix{Float64}(I, n, n) - Y = zeros(Float64, n, n) - Y2 = zeros(Float64, n, n) - mul!(Y, precond, Iₙ) - for i=1:n - eᵢ = Iₙ[:,i] - mul!(view(Y2,:,i), precond, eᵢ) + if device == ExaPF.CPU() + Iₙ = Matrix{Float64}(I, n, n) + Y = zeros(Float64, n, n) + Y2 = zeros(Float64, n, n) + mul!(Y, precond, Iₙ) + for i=1:n + eᵢ = Vector(Iₙ[:,i]) + mul!(view(Y2,:,i), precond, eᵢ) + end + @test Y ≈ Y2 + end + if device == ExaPF.GPU() + Iₙ = CuMatrix{Float64}(I, n, n) + Y = CUDA.zeros(Float64, n, n) + Y2 = CUDA.zeros(Float64, n, n) + mul!(Y, precond, Iₙ) + for i=1:n + eᵢ = CuVector(Iₙ[:,i]) + mul!(view(Y2,:,i), precond, eᵢ) + end + @test Y ≈ Y2 end - @test Y ≈ Y2 end function test_custom_bicgstab(device, AT, SMT)