Use KrylovPreconditioners
michel2323 committed Dec 5, 2023
1 parent: 7e3b639 · commit: e657b19
Showing 21 changed files with 176 additions and 506 deletions.
4 changes: 3 additions & 1 deletion Project.toml
@@ -1,13 +1,14 @@
name = "ExaPF"
uuid = "0cf0e50c-a82e-488f-ac7e-41ffdff1b8aa"
authors = ["Adrian Maldonado <[email protected]>", "Michel Schanen <[email protected]>", "François Pacaud <[email protected]>"]
version = "0.10.1"
version = "0.11.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
KrylovPreconditioners = "45d422c2-293f-44ce-8315-2cb988662dec"
LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -30,6 +31,7 @@ CUDA = "4.1, 5"
ForwardDiff = "0.10"
KernelAbstractions = "0.9"
Krylov = "0.9"
KrylovPreconditioners = "0.2"
LazyArtifacts = "1.9"
LightGraphs = "1.3"
LinearAlgebra = "1.9"
70 changes: 70 additions & 0 deletions benchmark.jl
@@ -0,0 +1,70 @@
using CUDA
using KernelAbstractions
using ExaPF
import ExaPF: AutoDiff
using LazyArtifacts
using LinearAlgebra
using KrylovPreconditioners
using PProf
using Profile


const PS = ExaPF.PowerSystem
const LS = ExaPF.LinearSolvers
# datafile = joinpath(artifact"ExaData", "ExaData", "matpower", "case_ACTIVSg70k.m")
# datafile = joinpath(artifact"ExaData", "ExaData", "case1354.m")
datafile = joinpath(artifact"ExaData", "ExaData", "case9241pegase.m")
polar = ExaPF.PolarForm(datafile, CPU())
stack = ExaPF.NetworkStack(polar)
ExaPF.init!(polar, stack)
@time convergence = run_pf(polar, stack; verbose=1)

pf = PS.PowerNetwork(datafile)
polar_gpu = ExaPF.PolarForm(pf, CUDABackend())
stack_gpu = ExaPF.NetworkStack(polar_gpu)
basis_gpu = ExaPF.PolarBasis(polar_gpu)
pflow_gpu = ExaPF.PowerFlowBalance(polar_gpu) ∘ basis_gpu
mapx = ExaPF.mapping(polar, State());
jx_gpu = ExaPF.Jacobian(polar_gpu, pflow_gpu, mapx)
direct_linear_solver = LS.DirectSolver(jx_gpu.J)
pf_algo = NewtonRaphson(; verbose=1, tol=1e-10)


jac_gpu = jx_gpu.J;

npartitions = div(size(jac_gpu,1), 32);
precond = BlockJacobiPreconditioner(jac_gpu, npartitions, CUDABackend(), 0);
# precond = KrylovPreconditioners.kp_ilu0(jac_gpu)
# linear_solver = ExaPF.KrylovBICGSTAB(jac_gpu; P=precond, ldiv=false, scaling=true, atol=1e-10, verbose=0);
# linear_solver = ExaPF.KrylovBICGSTAB(jac_gpu; P=precond, ldiv=false, scaling=false, atol=1e-10, verbose=0);
# linear_solver = ExaPF.KrylovBICGSTAB(jac_gpu; P=precond, ldiv=false, scaling=false, atol=1e-10, verbose=0, maxiter=500);
linear_solver = ExaPF.KrylovBICGSTAB(
jac_gpu; P=precond, ldiv=false, scaling=true,
rtol=1e-7, atol=1e-7, verbose=0
);
pf_algo = NewtonRaphson(; verbose=1, tol=1e-7, maxiter=20)
ExaPF.init!(polar_gpu, stack_gpu)
reset_timer!(linear_solver.precond)
@time convergence = ExaPF.nlsolve!(pf_algo, jx_gpu, stack_gpu; linear_solver=linear_solver)
@show get_timer(linear_solver.precond)
ExaPF.init!(polar_gpu, stack_gpu)
@time convergence = ExaPF.nlsolve!(pf_algo, jx_gpu, stack_gpu; linear_solver=direct_linear_solver)
@show linear_solver.inner.stats.niter
@show convergence
# Profiling
ExaPF.init!(polar_gpu, stack_gpu)
Profile.clear()
Profile.@profile begin
linear_solver.precond.timer_update = 0.0
convergence = ExaPF.nlsolve!(pf_algo, jx_gpu, stack_gpu; linear_solver=linear_solver)
@show linear_solver.precond.timer_update
end
Profile.clear()
ExaPF.init!(polar_gpu, stack_gpu)
Profile.@profile begin
linear_solver.precond.timer_update = 0.0
convergence = ExaPF.nlsolve!(pf_algo, jx_gpu, stack_gpu; linear_solver=linear_solver)
@show linear_solver.precond.timer_update
end
PProf.pprof()
@show convergence
4 changes: 2 additions & 2 deletions benchmark/benchmarks.jl
@@ -230,7 +230,7 @@ function benchmark_bicgstab(polar, config, noverlaps, nblocks)
n = size(J, 1)
npartitions = max(ceil(Int, n / nblocks), 2)
precond = LS.BlockJacobiPreconditioner(J, npartitions, polar.device, noverlaps)
algo = LS.KrylovBICGSTAB(J; P=precond)
algo = LS.Bicgstab(J; P=precond)
# Update preconditioner
LS.update!(algo, J)
# RHS
@@ -319,7 +319,7 @@ function run_benchmarks_bicgstab(polar, config=DEFAULT_CONFIG)
nblocks = ref_nblocks[best_time]
config_pf[:noverlaps] = olevel
config_pf[:npartitions] = ceil(Int, nx / nblocks)
res = benchmark_powerflow(polar, config, LS.KrylovBICGSTAB)
res = benchmark_powerflow(polar, config, LS.Bicgstab)
push!(names, "powerflow_bicgstab_$(nblocks)blk_$(olevel)overlap")
push!(timings, res.time)
push!(iters, res.it)
31 changes: 2 additions & 29 deletions docs/src/lib/linearsolver.md
@@ -24,18 +24,8 @@ DirectSolver
## Iterative solvers

```@docs
KrylovBICGSTAB
DQGMRES
BICGSTAB
EigenBICGSTAB
```

`ExaPF.jl` is shipped with a custom BICGSTAB implementation.
However, we highly recommend using `KrylovBICGSTAB` instead,
which has proven to be more robust.
```@docs
bicgstab
Bicgstab
Dqgmres
```

Available linear solvers can be queried with
@@ -60,20 +50,3 @@ BlockGmresSolver
block_gmres
block_gmres!
```

## Preconditioning

To solve linear systems with iterative methods, `ExaPF`
provides an implementation of a block-Jacobi preconditioner,
portable on GPU.

```@docs
AbstractPreconditioner
```

### Block-Jacobi preconditioner

```@docs
BlockJacobiPreconditioner
update
```
1 change: 0 additions & 1 deletion docs/src/man/autodiff.md
@@ -1,5 +1,4 @@
```@meta
CurrentModule = ExaPF
DocTestSetup = quote
using ExaPF
const AD = ExaPF.AD
6 changes: 3 additions & 3 deletions docs/src/man/benchmark.md
@@ -10,15 +10,15 @@ DocTestFilters = [r"ExaPF"]
# Benchmark

For the purpose of performance regression testing, ExaPF provides a lightweight benchmark script. It allows testing the various configurations of the linear solvers used in the Newton-Raphson algorithm and running them on specific hardware. The main Julia script [benchmark/benchmarks.jl](https://github.com/exanauts/ExaPF.jl/tree/main/benchmark/benchmarks.jl) takes all its options from the command line.
The benchmark script takes as input a linear solver (e.g. `KrylovBICGSTAB`), a target architecture as a `KernelAbstractions` object (CPU or CUDABackend), and a case filename which is included in the `ExaData` artifact. An exhaustive list of all available linear solvers can be obtained via [`ExaPF.LinearSolvers.list_solvers`](@ref).
The benchmark script takes as input a linear solver (e.g. `Bicgstab`), a target architecture as a `KernelAbstractions` object (CPU or CUDABackend), and a case filename which is included in the `ExaData` artifact. An exhaustive list of all available linear solvers can be obtained via [`ExaPF.LinearSolvers.list_solvers`](@ref).
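As an aside (not part of the diff), that query can be issued directly; with the CUDA extension loaded, the GPU method shown later in this commit (ext/ExaPFCUDAExt.jl) is used:

```julia
using CUDA   # provides the CUDABackend device type
using ExaPF

# List the linear solvers available on an NVIDIA GPU; per ext/ExaPFCUDAExt.jl
# in this commit, this returns
# [LS.DirectSolver, LS.BICGSTAB, LS.DQGMRES, LS.EigenBICGSTAB, LS.KrylovBICGSTAB].
ExaPF.list_solvers(CUDABackend())
```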

Running
```
julia --project benchmark/benchmarks.jl KrylovBICGSTAB CUDABackend case300.m
julia --project benchmark/benchmarks.jl Bicgstab CUDABackend case300.m
```
yields
```
KrylovBICGSTAB, CUDABackend, case300.m, 69.0, 3.57, 43.7, true
Bicgstab, CUDABackend, case300.m, 69.0, 3.57, 43.7, true
```
The first three fields are the settings of the benchmark run. They are followed by three timings in milliseconds:
1. the time taken by the Newton-Raphson algorithm to solve the power flow,
1 change: 0 additions & 1 deletion docs/src/man/formulations.md
@@ -1,4 +1,3 @@

```@meta
CurrentModule = ExaPF
DocTestSetup = quote
2 changes: 2 additions & 0 deletions docs/src/man/linearsolver.md
@@ -34,6 +34,8 @@ systems. That's why this package comes with a block Jacobi preconditioner
that is tailored towards GPUs and is proven to work well with power flow
problems.

The block-Jacobi preconditioner used in ExaPF has been added to [`KrylovPreconditioners.jl`](https://github.com/JuliaSmoothOptimizers/KrylovPreconditioners.jl).

The Jacobian is partitioned into a dense block diagonal structure using `Metis.jl`, where each block is inverted to build our preconditioner `P`.

![Dense block Jacobi preconditioner \label{fig:preconditioner}](../figures/gpublocks.png)
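As a pointer for readers of this diff (not part of the committed docs), here is a minimal sketch of how the relocated preconditioner is now wired up on an NVIDIA GPU; `J`, `npartitions`, and `noverlap` are illustrative names, and every call mirrors one appearing elsewhere in this commit (benchmark.jl, benchmark/benchmarks.jl, docs/src/quickstart.md):

```julia
using CUDA
using KrylovPreconditioners
using ExaPF
const LS = ExaPF.LinearSolvers

# `J` is assumed to be the power-flow Jacobian on the GPU (a CuSparseMatrixCSR),
# e.g. `J = jx_gpu.J` as in the new benchmark.jl.
npartitions = max(div(size(J, 1), 32), 2)   # roughly 32 rows per Metis block
noverlap = 0                                # no overlap between blocks

# Block-Jacobi preconditioner, now provided by KrylovPreconditioners.jl.
precond = BlockJacobiPreconditioner(J, npartitions, CUDABackend(), noverlap)

# Attach it to ExaPF's Krylov.jl-based BICGSTAB wrapper (`Bicgstab` in the updated docs),
# then refresh the block inverses whenever the Jacobian values change.
linear_solver = LS.Bicgstab(J; P=precond)
LS.update!(linear_solver, J)
```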
4 changes: 2 additions & 2 deletions docs/src/quickstart.md
@@ -230,11 +230,11 @@ for GPU usage. To build an instance with 8 blocks, just write
```@repl quickstart
npartitions = 8;
jac_gpu = jx_gpu.J;
precond = LS.BlockJacobiPreconditioner(jac_gpu, npartitions, CUDABackend());
precond = BlockJacobiPreconditioner(jac_gpu, npartitions, CUDABackend());
```
You can attach the preconditioner to a BICGSTAB algorithm simply as
```@repl quickstart
linear_solver = ExaPF.KrylovBICGSTAB(jac_gpu; P=precond);
linear_solver = ExaPF.Bicgstab(jac_gpu; P=precond);
```
(this will use the BICGSTAB algorithm implemented in
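To round out the excerpt (an editorial sketch, not part of the quickstart diff), the preconditioned solver built above is handed to the Newton-Raphson driver exactly as the new top-level benchmark.jl does; `polar_gpu`, `stack_gpu`, `jx_gpu`, and `linear_solver` are assumed to be the objects constructed earlier:

```julia
# Newton-Raphson with the preconditioned BICGSTAB as the inner linear solver,
# mirroring the calls in benchmark.jl above.
pf_algo = NewtonRaphson(; verbose=1, tol=1e-7, maxiter=20)
ExaPF.init!(polar_gpu, stack_gpu)
convergence = ExaPF.nlsolve!(pf_algo, jx_gpu, stack_gpu; linear_solver=linear_solver)
@show convergence
```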
13 changes: 6 additions & 7 deletions ext/ExaPFAMDGPUExt.jl
@@ -10,23 +10,23 @@ using ForwardDiff
using LinearAlgebra
using KernelAbstractions
using SparseArrays
using KrylovPreconditioners

const KA = KernelAbstractions
const LS = ExaPF.LinearSolvers
const PS = ExaPF.PowerSystem
const AD = ExaPF.AutoDiff
const KP = KrylovPreconditioners

LS.DirectSolver(J::ROCSparseMatrixCSR; options...) = ExaPF.LS.DirectSolver(nothing)
LS.update!(solver::ExaPF.LS.AbstractIterativeLinearSolver, J::ROCSparseMatrixCSR) = ExaPF.LS.update(solver.precond, J, ROCBackend())
LS.update!(solver::ExaPF.LS.AbstractIterativeLinearSolver, J::ROCSparseMatrixCSR) = KP.update!(solver.precond, J)
LS._get_type(J::ROCSparseMatrixCSR) = ROCArray{Float64, 1, AMDGPU.Mem.HIPBuffer}
LS.default_linear_solver(A::ROCSparseMatrixCSR, device::ROCBackend) = ExaPF.LS.KrylovBICGSTAB(A)
function LS._allowscalar(f::Function, J::ROCSparseMatrixCSR)
AMDGPU.allowscalar(true)
f()
AMDGPU.allowscalar(false)
end
ExaPF._iscsr(::ROCSparseMatrixCSR) = true
ExaPF._iscsc(::ROCSparseMatrixCSR) = false
function LS.scaling!(::LS.KrylovBICGSTAB,A::ROCSparseMatrixCSR,b)
KP.scaling_csr!(A,b)
end

"""
list_solvers(::ROCBackend)
@@ -36,5 +36,4 @@ List all linear solvers available for solving the power flow on an AMD GPU.
ExaPF.list_solvers(::ROCBackend) = [LS.BICGSTAB, LS.DQGMRES, LS.EigenBICGSTAB, LS.KrylovBICGSTAB]

include("amdgpu_wrapper.jl")
include("amdgpu_preconditioner.jl")
end
10 changes: 7 additions & 3 deletions ext/ExaPFCUDAExt.jl
@@ -10,19 +10,24 @@ using ForwardDiff
using LinearAlgebra
using KernelAbstractions
using SparseArrays
using KrylovPreconditioners

const KA = KernelAbstractions
const LS = ExaPF.LinearSolvers
const PS = ExaPF.PowerSystem
const AD = ExaPF.AutoDiff
const KP = KrylovPreconditioners

LS.DirectSolver(J::CuSparseMatrixCSR; options...) = ExaPF.LS.DirectSolver(nothing)
LS.update!(solver::ExaPF.LS.AbstractIterativeLinearSolver, J::CuSparseMatrixCSR) = ExaPF.LS.update(solver.precond, J, CUDABackend())
LS.update!(solver::ExaPF.LS.AbstractIterativeLinearSolver, J::CuSparseMatrixCSR) = KP.update!(solver.precond, J)
LS.update!(solver::ExaPF.LS.DirectSolver, J::CuSparseMatrixCSR) = lu!(solver.factorization, J)
LS._get_type(J::CuSparseMatrixCSR) = CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}
LS.default_linear_solver(A::CuSparseMatrixCSR, device::CUDABackend) = ExaPF.LS.DirectSolver(A)
LS._allowscalar(f::Function, J::CuSparseMatrixCSR) = CUDA.allowscalar(f)
ExaPF._iscsr(::CuSparseMatrixCSR) = true
ExaPF._iscsc(::CuSparseMatrixCSR) = false
function LS.scaling!(::LS.KrylovBICGSTAB,A::CuSparseMatrixCSR,b)
KP.scaling_csr!(A,b)
end
"""
list_solvers(::CUDABackend)
@@ -31,5 +36,4 @@ List all linear solvers available for solving the power flow on an NVIDIA GPU.
ExaPF.list_solvers(::CUDABackend) = [LS.DirectSolver, LS.BICGSTAB, LS.DQGMRES, LS.EigenBICGSTAB, LS.KrylovBICGSTAB]

include("cuda_wrapper.jl")
include("cuda_preconditioner.jl")
end
40 changes: 0 additions & 40 deletions ext/amdgpu_preconditioner.jl

This file was deleted.

40 changes: 0 additions & 40 deletions ext/cuda_preconditioner.jl

This file was deleted.

2 changes: 1 addition & 1 deletion ext/cuda_wrapper.jl
@@ -53,7 +53,7 @@ end
# By default, no factorization routine is available
LS.update!(s::LS.DirectSolver{Nothing}, J::CuSparseMatrixCSR) = nothing
function LS.ldiv!(::LS.DirectSolver{Nothing},
y::CuVector, J::CuSparseMatrixCSR, x::CuVector,
y::CuVector, J::CuSparseMatrixCSR, x::CuVector; options...
)
CUSOLVER.csrlsvqr!(J, x, y, 1e-8, one(Cint), 'O')
return 0
(Diffs for the remaining changed files are not shown.)
