Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mixed precision double diagonal gpu ilu0 variants #5688

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists_files.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernelEnums.hpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
Expand Down
20 changes: 10 additions & 10 deletions opm/simulators/linalg/PreconditionerFactory_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,10 +357,10 @@ struct StandardPreconditioners {
F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
using field_type = typename V::field_type;
using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);

auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);
auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
Expand All @@ -370,10 +370,10 @@ struct StandardPreconditioners {
F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
using field_type = typename V::field_type;
using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);

auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
Expand Down Expand Up @@ -650,27 +650,27 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);

using field_type = typename V::field_type;
using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;

return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
});

F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
using field_type = typename V::field_type;
using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
});

F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);

using block_type = typename V::block_type;
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
Expand All @@ -679,7 +679,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuDILU>;
using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
converted->setUnderlyingPreconditioner(adapted);
return converted;
});
Expand Down
72 changes: 59 additions & 13 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace Opm::gpuistl
{

template <class M, class X, class Y, int l>
GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
Expand All @@ -52,9 +52,12 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
, m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
, m_splitMatrix(splitMatrix)
, m_tuneThreadBlockSizes(tuneKernels)
, m_storeFactorizationAsFloat(storeFactorizationAsFloat)

{

OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme),
fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme));
m_mixedPrecisionScheme = static_cast<MixedPrecisionScheme>(mixedPrecisionScheme);

// TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
Expand Down Expand Up @@ -82,14 +85,16 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
m_cpuMatrix, m_reorderedToNatural);
}

if (m_storeFactorizationAsFloat) {
if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT) {
if (!m_splitMatrix){
OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
}
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedDiagFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
// The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
}
}

computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
Expand Down Expand Up @@ -123,8 +128,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, float>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
Expand All @@ -135,8 +140,20 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
d.data(),
v.data(),
lowerSolveThreadBlockSize);
}else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand Down Expand Up @@ -170,7 +187,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat){
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
Expand All @@ -181,6 +198,17 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
} else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
Expand Down Expand Up @@ -271,8 +299,8 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, true>(
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand All @@ -289,9 +317,27 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
factorizationBlockSize);
} else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
nullptr,
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
factorizationBlockSize);
} else {
// TODO: should this be field type twice or field type then float in the template?
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, false>(
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MixedPrecisionScheme::DEFAULT>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand Down
5 changes: 3 additions & 2 deletions opm/simulators/linalg/gpuistl/GpuDILU.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp>
#include <vector>


Expand Down Expand Up @@ -62,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
//! \param A The matrix to operate on.
//! \param w The relaxation factor.
//!
explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);

//! \brief Prepare the preconditioner.
//! \note Does nothing at the time being.
Expand Down Expand Up @@ -145,7 +146,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
bool m_tuneThreadBlockSizes;
//! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision
bool m_storeFactorizationAsFloat;
MixedPrecisionScheme m_mixedPrecisionScheme;
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
//! The default value of -1 indicates that we have not calibrated and selected a value yet
int m_upperSolveThreadBlockSize = -1;
Expand Down
Loading