diff --git a/Project.toml b/Project.toml
index d3d48add..781781ba 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
+KrylovPreconditioners = "45d422c2-293f-44ce-8315-2cb988662dec"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -30,6 +31,7 @@ CUDA = "4.1, 5"
 ForwardDiff = "0.10"
 KernelAbstractions = "0.9"
 Krylov = "0.9"
+KrylovPreconditioners = "0.1"
 LazyArtifacts = "1.9"
 LightGraphs = "1.3"
 LinearAlgebra = "1.9"
diff --git a/ext/ExaPFAMDGPUExt.jl b/ext/ExaPFAMDGPUExt.jl
index 06ade3b2..9ec729aa 100644
--- a/ext/ExaPFAMDGPUExt.jl
+++ b/ext/ExaPFAMDGPUExt.jl
@@ -36,5 +36,4 @@ List all linear solvers available solving the power flow on an NVIDIA GPU.
 ExaPF.list_solvers(::ROCBackend) = [LS.BICGSTAB, LS.DQGMRES, LS.EigenBICGSTAB, LS.KrylovBICGSTAB]
 
 include("amdgpu_wrapper.jl")
-include("amdgpu_preconditioner.jl")
 end
diff --git a/ext/ExaPFCUDAExt.jl b/ext/ExaPFCUDAExt.jl
index de18eeef..b675034a 100644
--- a/ext/ExaPFCUDAExt.jl
+++ b/ext/ExaPFCUDAExt.jl
@@ -31,5 +31,4 @@ List all linear solvers available solving the power flow on an NVIDIA GPU.
 ExaPF.list_solvers(::CUDABackend) = [LS.DirectSolver, LS.BICGSTAB, LS.DQGMRES, LS.EigenBICGSTAB, LS.KrylovBICGSTAB]
 
 include("cuda_wrapper.jl")
-include("cuda_preconditioner.jl")
 end
diff --git a/ext/amdgpu_preconditioner.jl b/ext/amdgpu_preconditioner.jl
deleted file mode 100644
index 69202e41..00000000
--- a/ext/amdgpu_preconditioner.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-LS.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = ExaPF.LS.BlockJacobiPreconditioner(SparseMatrixCSC(J); device=ROCBackend(), options...)
-
-function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend)
-    nblocks = p.nblocks
-    fillblock_gpu_kernel! = ExaPF.LS._fillblock_gpu!(device)
-    # Fill Block Jacobi" begin
-    fillblock_gpu_kernel!(
-        p.cublocks, size(p.id,1),
-        p.cupartitions, p.cumap,
-        j_rowptr, j_colval, j_nzval,
-        p.cupart, p.culpartitions, p.id,
-        ndrange=nblocks,
-    )
-    KA.synchronize(device)
-    # Invert blocks begin
-    blocklist = Array{ROCArray{Float64,2}}(undef, nblocks)
-    for b in 1:nblocks
-        blocklist[b] = p.cublocks[:,:,b]
-    end
-    AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(blocklist)
-    AMDGPU.@sync pivot, info, blocklist = AMDGPU.rocSOLVER.getri_batched!(blocklist, pivot)
-    for b in 1:nblocks
-        p.cublocks[:,:,b] .= blocklist[b]
-    end
-    return
-end
-
-"""
-    function update(J::CuSparseMatrixCSR, p)
-
-Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for the GPU
-
-1) The dense blocks `cuJs` are filled from the sparse Jacobian `J`
-2) To a batch inversion of the dense blocks using CUBLAS
-3) Extract the preconditioner matrix `p.P` from the dense blocks `cuJs`
-
-"""
-function LS.update(p, J::rocSPARSE.ROCSparseMatrixCSR, device::ROCBackend)
-    _update_gpu(p, J.rowPtr, J.colVal, J.nzVal, device)
-end
diff --git a/ext/cuda_preconditioner.jl b/ext/cuda_preconditioner.jl
deleted file mode 100644
index 6f4e62c4..00000000
--- a/ext/cuda_preconditioner.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-LS.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = ExaPF.LS.BlockJacobiPreconditioner(SparseMatrixCSC(J); device=CUDABackend(), options...)
-
-function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend)
-    nblocks = p.nblocks
-    fillblock_gpu_kernel! = ExaPF.LS._fillblock_gpu!(device)
-    # Fill Block Jacobi" begin
-    fillblock_gpu_kernel!(
-        p.cublocks, size(p.id,1),
-        p.cupartitions, p.cumap,
-        j_rowptr, j_colval, j_nzval,
-        p.cupart, p.culpartitions, p.id,
-        ndrange=nblocks,
-    )
-    KA.synchronize(device)
-    # Invert blocks begin
-    blocklist = Array{CuArray{Float64,2}}(undef, nblocks)
-    for b in 1:nblocks
-        blocklist[b] = p.cublocks[:,:,b]
-    end
-    CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(blocklist, true)
-    CUDA.@sync pivot, info, blocklist = CUDA.CUBLAS.getri_batched(blocklist, pivot)
-    for b in 1:nblocks
-        p.cublocks[:,:,b] .= blocklist[b]
-    end
-    return
-end
-
-"""
-    function update(J::CuSparseMatrixCSR, p)
-
-Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for the GPU
-
-1) The dense blocks `cuJs` are filled from the sparse Jacobian `J`
-2) To a batch inversion of the dense blocks using CUBLAS
-3) Extract the preconditioner matrix `p.P` from the dense blocks `cuJs`
-
-"""
-function LS.update(p, J::CUSPARSE.CuSparseMatrixCSR, device::CUDABackend)
-    _update_gpu(p, J.rowPtr, J.colVal, J.nzVal, device)
-end
diff --git a/src/LinearSolvers/LinearSolvers.jl b/src/LinearSolvers/LinearSolvers.jl
index 729dbef7..366b119a 100644
--- a/src/LinearSolvers/LinearSolvers.jl
+++ b/src/LinearSolvers/LinearSolvers.jl
@@ -17,6 +17,7 @@ import ..ExaPF: xnorm
 
 import Base.size, Base.sizeof, Base.format_bytes
 import Krylov: KrylovStats, allocate_if, ksizeof, FloatOrComplex, ktimer, matrix_to_vector, kdisplay, mulorldiv!
+using KrylovPreconditioners
 
 const KA = KernelAbstractions
 
@@ -32,7 +33,6 @@ export DirectSolver, BICGSTAB, EigenBICGSTAB, KrylovBICGSTAB
     Diverged,
 )
 
-include("preconditioners.jl")
 include("bicgstab.jl")
 include("bicgstab_eigen.jl")
 
@@ -144,7 +144,7 @@ Custom BICGSTAB implementation to solve iteratively the linear system
 ``A x = y``.
 """
 struct BICGSTAB <: AbstractIterativeLinearSolver
-    precond::AbstractPreconditioner
+    precond::AbstractKrylovPreconditioner
     maxiter::Int
     tol::Float64
     verbose::Bool
@@ -174,7 +174,7 @@ Julia's port of Eigen's BICGSTAB to solve iteratively the linear system
 ``A x = y``.
""" struct EigenBICGSTAB <: AbstractIterativeLinearSolver - precond::AbstractPreconditioner + precond::AbstractKrylovPreconditioner maxiter::Int tol::Float64 verbose::Bool @@ -206,7 +206,7 @@ Wrap `Krylov.jl`'s DQGMRES algorithm to solve iteratively the linear system """ struct DQGMRES <: AbstractIterativeLinearSolver inner::Krylov.DqgmresSolver - precond::AbstractPreconditioner + precond::AbstractKrylovPreconditioner memory::Int verbose::Bool end @@ -238,7 +238,7 @@ Wrap `Krylov.jl`'s BICGSTAB algorithm to solve iteratively the linear system """ struct KrylovBICGSTAB <: AbstractIterativeLinearSolver inner::Krylov.BicgstabSolver - precond::AbstractPreconditioner + precond::AbstractKrylovPreconditioner verbose::Int atol::Float64 rtol::Float64 @@ -255,7 +255,6 @@ end function ldiv!(solver::KrylovBICGSTAB, y::AbstractVector, J::AbstractMatrix, x::AbstractVector, ) - _allowscalar(J) do Krylov.bicgstab!( solver.inner, J, x; N=solver.precond, @@ -264,9 +263,8 @@ function ldiv!(solver::KrylovBICGSTAB, verbose=solver.verbose, history=true, ) - end copyto!(y, solver.inner.x) - return length(solver.inner.stats.residuals) + return solver.inner.stats.niter end """ diff --git a/src/LinearSolvers/preconditioners.jl b/src/LinearSolvers/preconditioners.jl deleted file mode 100644 index 9b978a75..00000000 --- a/src/LinearSolvers/preconditioners.jl +++ /dev/null @@ -1,326 +0,0 @@ -import LinearAlgebra: ldiv!, \, *, mul! - -""" - AbstractPreconditioner - -Preconditioners for the iterative solvers mostly focused on GPUs - -""" -abstract type AbstractPreconditioner end - -""" - overlap(Graph, subset, level) - -Given subset embedded within Graph, compute subset2 such that -subset2 contains subset and all of its adjacent vertices. -""" -function overlap(Graph, subset; level=1) - @assert level > 0 - subset2 = [LightGraphs.neighbors(Graph, v) for v in subset] - subset2 = reduce(vcat, subset2) - subset2 = unique(vcat(subset, subset2)) - - level -= 1 - if level == 0 - return subset2 - else - return overlap(Graph, subset2, level=level) - end -end - -""" - BlockJacobiPreconditioner - -Overlapping-Schwarz preconditioner. - -### Attributes - -* `nblocks::Int64`: Number of partitions or blocks. -* `blocksize::Int64`: Size of each block. -* `partitions::Vector{Vector{Int64}}``: `npart` partitions stored as lists -* `cupartitions`: `partitions` transfered to the GPU -* `lpartitions::Vector{Int64}``: Length of each partitions. -* `culpartitions::Vector{Int64}``: Length of each partitions, on the GPU. 
-* `blocks`: Dense blocks of the block-Jacobi
-* `cublocks`: `Js` transfered to the GPU
-* `map`: The partitions as a mapping to construct views
-* `cumap`: `cumap` transferred to the GPU`
-* `part`: Partitioning as output by Metis
-* `cupart`: `part` transferred to the GPU
-"""
-struct BlockJacobiPreconditioner{AT,GAT,VI,GVI,GMT,MI,GMI} <: AbstractPreconditioner
-    nblocks::Int64
-    blocksize::Int64
-    partitions::MI
-    cupartitions::GMI
-    lpartitions::VI
-    culpartitions::GVI
-    rest_size::VI
-    curest_size::GVI
-    blocks::AT
-    cublocks::GAT
-    map::VI
-    cumap::GVI
-    part::VI
-    cupart::GVI
-    id::GMT
-end
-
-function BlockJacobiPreconditioner(J, npart, device=CPU(), olevel=0) where {}
-    if npart < 2
-        error("Number of partitions `npart` should be at" *
-              "least 2 for partitioning in Metis")
-    end
-    adj = build_adjmatrix(SparseMatrixCSC(J))
-    g = LightGraphs.Graph(adj)
-    part = Metis.partition(g, npart)
-    partitions = Vector{Vector{Int64}}()
-    for i in 1:npart
-        push!(partitions, [])
-    end
-    for (i,v) in enumerate(part)
-        push!(partitions[v], i)
-    end
-    # We keep track of the partition size pre-overlap.
-    # This will allow us to implement the RAS update.
-    rest_size = length.(partitions)
-    # overlap
-    if olevel > 0
-        for i in 1:npart
-            partitions[i] = overlap(g, partitions[i], level=olevel)
-        end
-    end
-    lpartitions = length.(partitions)
-    blocksize = maximum(length.(partitions))
-    blocks = zeros(Float64, blocksize, blocksize, npart)
-    # Get partitions into bit typed structure
-    bpartitions = zeros(Int64, blocksize, npart)
-    bpartitions .= 0.0
-    for i in 1:npart
-        bpartitions[1:length(partitions[i]),i] .= Vector{Int64}(partitions[i])
-    end
-    id = Matrix{Float64}(I, blocksize, blocksize)
-    for i in 1:npart
-        blocks[:,:,i] .= id
-    end
-    nmap = 0
-    for b in partitions
-        nmap += length(b)
-    end
-    map = zeros(Int64, nmap)
-    part = zeros(Int64, nmap)
-    for b in 1:npart
-        for (i,el) in enumerate(partitions[b])
-            map[el] = i
-            part[el] = b
-        end
-    end
-
-    id = adapt(device, id)
-    cubpartitions = adapt(device, bpartitions)
-    culpartitions = adapt(device, lpartitions)
-    curest_size = adapt(device, rest_size)
-    cublocks = adapt(device, blocks)
-    cumap = adapt(device, map)
-    cupart = adapt(device, part)
-    return BlockJacobiPreconditioner(
-        npart, blocksize, bpartitions,
-        cubpartitions, lpartitions,
-        culpartitions, rest_size,
-        curest_size, blocks,
-        cublocks, map,
-        cumap, part,
-        cupart, id
-    )
-end
-
-function BlockJacobiPreconditioner(J::SparseMatrixCSC; nblocks=-1, device=CPU(), noverlaps=0)
-    n = size(J, 1)
-    npartitions = if nblocks > 0
-        nblocks
-    else
-        div(n, 32)
-    end
-    if npartitions < 2
-        npartitions = 2
-    end
-    return BlockJacobiPreconditioner(J, npartitions, device, noverlaps)
-end
-
-Base.eltype(::BlockJacobiPreconditioner) = Float64
-
-# NOTE: Custom kernel to implement blocks - vector multiplication.
-# The blocks have very unbalanced sizes, leading to imbalances
-# between the different threads.
-# CUBLAS.gemm_strided_batched has been tested has well, but is
-# overall 3x slower than this custom kernel : due to the various sizes
-# of the blocks, gemm_strided is performing too many unecessary operations,
-# impairing its performance.
-@kernel function mblock_kernel!(y, b, p_len, rp_len, part, blocks)
-    p = size(b, 2)
-    i, j = @index(Global, NTuple)
-    len = p_len[i]
-    rlen = rp_len[i]
-
-    if j <= rlen
-        for ℓ=1:p
-            accum = 0.0
-            idxA = @inbounds part[j, i]
-            for k=1:len
-                idxB = @inbounds part[k, i]
-                @inbounds accum = accum + blocks[j, k, i]*b[idxB,ℓ]
-            end
-            y[idxA,ℓ] = accum
-        end
-    end
-end
-
-function mul!(y, C::BlockJacobiPreconditioner, b::Vector{T}) where T
-    n = size(b, 1)
-    fill!(y, zero(T))
-    for i=1:C.nblocks
-        rlen = C.lpartitions[i]
-        part = C.partitions[1:rlen, i]
-        blck = C.blocks[1:rlen, 1:rlen, i]
-        for j=1:C.rest_size[i]
-            idx = part[j]
-            y[idx] += dot(blck[j, :], b[part])
-        end
-    end
-end
-
-function mul!(Y, C::BlockJacobiPreconditioner, B::Matrix{T}) where T
-    n, p = size(B)
-    fill!(Y, zero(T))
-    for i=1:C.nblocks
-        rlen = C.lpartitions[i]
-        part = C.partitions[1:rlen, i]
-        blck = C.blocks[1:rlen, 1:rlen, i]
-        for rhs=1:p
-            for j=1:C.rest_size[i]
-                idx = part[j]
-                Y[idx,rhs] += dot(blck[j, :], B[part,rhs])
-            end
-        end
-    end
-end
-
-function mul!(y, C::BlockJacobiPreconditioner, b::AbstractVector{T}) where T
-    device = KA.get_backend(b)
-    n = size(b, 1)
-    fill!(y, zero(T))
-    max_rlen = maximum(C.rest_size)
-    ndrange = (C.nblocks, max_rlen)
-    mblock_kernel!(device)(
-        y, b, C.culpartitions, C.curest_size,
-        C.cupartitions, C.cublocks,
-        ndrange=ndrange,
-    )
-    KA.synchronize(device)
-end
-
-function mul!(Y, C::BlockJacobiPreconditioner, B::AbstractMatrix{T}) where T
-    device = KA.get_backend(B)
-    n, p = size(B)
-    fill!(Y, zero(T))
-    max_rlen = maximum(C.rest_size)
-    ndrange = (C.nblocks, max_rlen)
-    mblock_kernel!(device)(
-        Y, B, C.culpartitions, C.curest_size,
-        C.cupartitions, C.cublocks,
-        ndrange=ndrange,
-    )
-    KA.synchronize(device)
-end
-
-"""
-    build_adjmatrix
-
-Build the adjacency matrix of a matrix A corresponding to the undirected graph
-
-"""
-function build_adjmatrix(A)
-    rows = Int64[]
-    cols = Int64[]
-    vals = Float64[]
-    rowsA = rowvals(A)
-    m, n = size(A)
-    for i = 1:n
-        for j in nzrange(A, i)
-            push!(rows, rowsA[j])
-            push!(cols, i)
-            push!(vals, 1.0)
-
-            push!(rows, i)
-            push!(cols, rowsA[j])
-            push!(vals, 1.0)
-        end
-    end
-    return sparse(rows,cols,vals,size(A,1),size(A,2))
-end
-
-"""
-    _fillblock_gpu
-
-Fill the dense blocks of the preconditioner from the sparse CSR matrix arrays
-
-"""
-@kernel function _fillblock_gpu!(blocks, blocksize, partition, map, rowPtr, colVal, nzVal, part, lpartitions, id)
-    b = @index(Global, Linear)
-    for i in 1:blocksize
-        for j in 1:blocksize
-            blocks[i,j,b] = id[i,j]
-        end
-    end
-
-    @inbounds for k in 1:lpartitions[b]
-        # select row
-        i = partition[k, b]
-        # iterate matrix
-        for row_ptr in rowPtr[i]:(rowPtr[i + 1] - 1)
-            # retrieve column value
-            col = colVal[row_ptr]
-            # iterate partition list and see if pertains to it
-            for j in 1:lpartitions[b]
-                if col == partition[j, b]
-                    @inbounds blocks[k, j, b] = nzVal[row_ptr]
-                end
-            end
-        end
-    end
-end
-
-"""
-    function update(J::SparseMatrixCSC, p)
-
-Update the preconditioner `p` from the sparse Jacobian `J` in CSC format for the CPU
-
-Note that this implements the same algorithm as for the GPU and becomes very slow on CPU with growing number of blocks.
- -""" -function update(p, J::SparseMatrixCSC, device) - # TODO: Enabling threading leads to a crash here - for b in 1:p.nblocks - p.blocks[:,:,b] = p.id[:,:] - for k in 1:p.lpartitions[b] - i = p.partitions[k,b] - for j in J.colptr[i]:J.colptr[i+1]-1 - if b == p.part[J.rowval[j]] - p.blocks[p.map[J.rowval[j]], p.map[i], b] = J.nzval[j] - end - end - end - end - for b in 1:p.nblocks - # Invert blocks - p.blocks[:,:,b] .= inv(p.blocks[:,:,b]) - end -end - -function Base.show(precond::BlockJacobiPreconditioner) - npartitions = precond.npart - nblock = precond.nblocks - println("#partitions: $npartitions, Blocksize: n = ", nblock, - " Mbytes = ", (nblock*nblock*npartitions*8.0)/(1024.0*1024.0)) - println("Block Jacobi block size: $(precond.nJs)") -end diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl index 1cb7e071..95ca2372 100644 --- a/src/Polar/newton.jl +++ b/src/Polar/newton.jl @@ -38,7 +38,8 @@ struct ConvergenceStatus norm_residuals::Float64 n_linear_solves::Int time_jacobian::Float64 - time_linear_solver::Float64 + time_linear_solver_update::Float64 + time_linear_solver_ldiv::Float64 time_total::Float64 end @@ -46,7 +47,9 @@ function Base.show(io::IO, conv::ConvergenceStatus) println(io, "Power flow has converged: ", conv.has_converged) @printf(io, " * #iterations: %d\n", conv.n_iterations) @printf(io, " * Time Jacobian (s) ........: %1.4f\n", conv.time_jacobian) - @printf(io, " * Time linear solver (s) ...: %1.4f\n", conv.time_linear_solver) + @printf(io, " * Time linear solver (s) ...: %1.4f\n", conv.time_linear_solver_ldiv + conv.time_linear_solver_update) + @printf(io, " * update (s) ............: %1.4f\n", conv.time_linear_solver_update) + @printf(io, " * ldiv (s) ..............: %1.4f\n", conv.time_linear_solver_ldiv) @printf(io, " * Time total (s) ...........: %1.4f\n", conv.time_total) end @@ -126,7 +129,8 @@ function nlsolve!( linsol_iters = Int[] time_total = 0.0 time_jacobian = 0.0 - time_linear_solver = 0.0 + time_linear_solver_update = 0.0 + time_linear_solver_ldiv = 0.0 map = jac.map x = view(stack.input, map) @@ -153,8 +157,10 @@ function nlsolve!( end # Update - time_linear_solver += @elapsed begin + time_linear_solver_update += @elapsed begin LS.update!(linear_solver, J) + end + time_linear_solver_ldiv += @elapsed begin n_iters = LS.ldiv!(linear_solver, Δx, J, residual) end x .= x .- Δx @@ -167,7 +173,8 @@ function nlsolve!( time_total = time() - tic return ConvergenceStatus( - converged, iter, normF, sum(linsol_iters), time_jacobian, time_linear_solver, time_total, + converged, iter, normF, sum(linsol_iters), + time_jacobian, time_linear_solver_update, time_linear_solver_ldiv, time_total, ) end diff --git a/test/Project.toml b/test/Project.toml index 156c2dea..7468472c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -5,6 +5,7 @@ FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7" +KrylovPreconditioners = "45d422c2-293f-44ce-8315-2cb988662dec" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/test/TestLinearSolvers.jl b/test/TestLinearSolvers.jl index 228ca7f3..7941afad 100644 --- a/test/TestLinearSolvers.jl +++ b/test/TestLinearSolvers.jl @@ -7,6 +7,7 @@ using Random using SparseArrays using ExaPF +using KrylovPreconditioners const LS = ExaPF.LinearSolvers function 
@@ -28,7 +29,7 @@ function test_preconditioners(device, AT, SMT)
     x♯ = x♯ |> AT
     x = similar(b); r = similar(b)
     nblocks = 2
-    precond = LS.BlockJacobiPreconditioner(A, nblocks, device)
+    precond = BlockJacobiPreconditioner(A, nblocks, device)
     LS.update(precond, A, device)
     Iₙ = Matrix{Float64}(I, n, n) |> AT
     Y = zeros(Float64, n, n) |> AT
@@ -50,7 +51,7 @@ function test_custom_bicgstab(device, AT, SMT)
     x♯ = x♯ |> AT
     x = similar(b); r = similar(b)
     nblocks = 2
-    precond = LS.BlockJacobiPreconditioner(A, nblocks, device)
+    precond = BlockJacobiPreconditioner(A, nblocks, device)
     LS.update(precond, A, device)
     linear_solver = LS.BICGSTAB(A; P=precond)
     n_iters = LS.ldiv!(linear_solver, x, A, b)
@@ -72,7 +73,7 @@ function test_all_linear_solvers(device, AT, SMT)
     # Init preconditioner
     nblocks = 2
     for olevel in [0, 1]
-        precond = LS.BlockJacobiPreconditioner(A; nblocks=nblocks, device=device, noverlaps=olevel)
+        precond = BlockJacobiPreconditioner(A; nblocks=nblocks, device=device, noverlaps=olevel)
         # Test printing
         println(devnull, precond)
         LS.update(precond, A, device)
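
A minimal usage sketch of the migrated API, mirroring the updated calls in `test/TestLinearSolvers.jl`: the block-Jacobi preconditioner is now the one exported by KrylovPreconditioners.jl rather than ExaPF's in-tree `LS.BlockJacobiPreconditioner`, while the solver wrappers and `LS.update` are invoked as before. The random test system below is illustrative only and is not taken from the test suite.

```julia
using LinearAlgebra, SparseArrays
using ExaPF
using KrylovPreconditioners
using KernelAbstractions: CPU

const LS = ExaPF.LinearSolvers

# Illustrative sparse system standing in for a power flow Jacobian;
# the diagonal shift keeps it well conditioned for BICGSTAB.
n = 100
A = sprandn(n, n, 0.05) + 10.0I
b = rand(n)
x = zeros(n)

# Construct and factorize the preconditioner; BlockJacobiPreconditioner
# is now exported by KrylovPreconditioners.jl.
nblocks = 2
device = CPU()
precond = BlockJacobiPreconditioner(A, nblocks, device)
LS.update(precond, A, device)

# The solver wrappers are unchanged: pass the preconditioner through `P`.
linear_solver = LS.BICGSTAB(A; P=precond)
n_iters = LS.ldiv!(linear_solver, x, A, b)
```

With the split timers in `src/Polar/newton.jl`, `ConvergenceStatus` now reports the `LS.update!` and `LS.ldiv!` phases separately, making it visible whether the preconditioner update or the Krylov solve dominates the linear-solver time.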