diff --git a/src/LinearSolvers/preconditioners.jl b/src/LinearSolvers/preconditioners.jl index 5428dd0c..580b0758 100644 --- a/src/LinearSolvers/preconditioners.jl +++ b/src/LinearSolvers/preconditioners.jl @@ -185,18 +185,21 @@ Base.eltype(::BlockJacobiPreconditioner) = Float64 # of the blocks, gemm_strided is performing too many unecessary operations, # impairing its performance. @kernel function mblock_kernel!(y, b, p_len, rp_len, part, blocks) + p = size(b, 2) i, j = @index(Global, NTuple) len = p_len[i] rlen = rp_len[i] if j <= rlen - accum = 0.0 - idxA = @inbounds part[j, i] - for k=1:len - idxB = @inbounds part[k, i] - @inbounds accum = accum + blocks[j, k, i]*b[idxB] + for ℓ=1:p + accum = 0.0 + idxA = @inbounds part[j, i] + for k=1:len + idxB = @inbounds part[k, i] + @inbounds accum = accum + blocks[j, k, i]*b[idxB,ℓ] + end + y[idxA,ℓ] = accum end - y[idxA] = accum end end @@ -214,6 +217,22 @@ function mul!(y, C::BlockJacobiPreconditioner, b::Vector{T}) where T end end +function mul!(Y, C::BlockJacobiPreconditioner, B::Matrix{T}) where T + n, p = size(B) + fill!(Y, zero(T)) + for i=1:C.nblocks + rlen = C.lpartitions[i] + part = C.partitions[1:rlen, i] + blck = C.blocks[1:rlen, 1:rlen, i] + for rhs=1:p + for j=1:C.rest_size[i] + idx = part[j] + Y[idx,rhs] += dot(blck[j, :], B[part,rhs]) + end + end + end +end + function mul!(y, C::BlockJacobiPreconditioner, b::CuVector{T}) where T n = size(b, 1) fill!(y, zero(T)) @@ -227,6 +246,19 @@ function mul!(y, C::BlockJacobiPreconditioner, b::CuVector{T}) where T KA.synchronize(CUDABackend()) end +function mul!(Y, C::BlockJacobiPreconditioner, B::CuMatrix{T}) where T + n, p = size(B) + fill!(Y, zero(T)) + max_rlen = maximum(C.rest_size) + ndrange = (C.nblocks, max_rlen) + mblock_kernel!(CUDABackend())( + Y, B, C.culpartitions, C.curest_size, + C.cupartitions, C.cublocks, + ndrange=ndrange, + ) + KA.synchronize(CUDABackend()) +end + """ build_adjmatrix diff --git a/test/TestLinearSolvers.jl b/test/TestLinearSolvers.jl index 9f8512b7..1603dc28 100644 --- a/test/TestLinearSolvers.jl +++ b/test/TestLinearSolvers.jl @@ -19,6 +19,29 @@ function generate_random_system(n::Int, m::Int) return spA, b, x♯ end +function test_preconditioners(device, AT, SMT) + n, m = 100, 100 + A, b, x♯ = generate_random_system(n, m) + # Transfer data to device + A = A |> SMT + b = b |> AT + x♯ = x♯ |> AT + x = similar(b); r = similar(b) + nblocks = 2 + precond = LS.BlockJacobiPreconditioner(A, nblocks, device) + LS.update(precond, A, device) + Iₙ = Matrix{Float64}(I, n, n) |> AT + Y = zeros(Float64, n, n) |> AT + Y2 = zeros(Float64, n, n) |> AT + mul!(Y, precond, Iₙ) + for i=1:n + eᵢ = Iₙ[:,i] |> AT + mul!(view(Y2,:,i), precond, eᵢ) + end + @test Y ≈ Y2 + end +end + function test_custom_bicgstab(device, AT, SMT) n, m = 100, 100 A, b, x♯ = generate_random_system(n, m)