[Hackathon No.110] Enhance the sparse.matmul API for Paddle #60040

Closed · wants to merge 8 commits
47 changes: 27 additions & 20 deletions paddle/fluid/platform/dynload/cusparse.h
@@ -30,26 +30,33 @@ namespace dynload {

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseCreateDnVec); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseDestroyDnVec); \
__macro(cusparseSpMV_bufferSize); \
__macro(cusparseSpMV);
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseCreateDnVec); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseDestroyDnVec); \
__macro(cusparseSpMV_bufferSize); \
__macro(cusparseSpMV); \
__macro(cusparseSpGEMM_createDescr); \
__macro(cusparseSpGEMM_workEstimation); \
__macro(cusparseSpGEMM_compute); \
__macro(cusparseSpGEMM_destroyDescr); \
__macro(cusparseSpMatGetSize); \
__macro(cusparseCsrSetPointers); \
__macro(cusparseSpGEMM_copy);

CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
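
Each name passed to CUSPARSE_ROUTINE_EACH becomes a dynamic-load wrapper: the symbol is resolved from the cuSPARSE shared library at first call rather than linked statically, which is why new cusparseSpGEMM_* entry points can be added here without changing link-time dependencies. A minimal sketch of what each __macro(name) entry expands to (an assumption for illustration; the real PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP in Paddle adds error handling and once-only initialization):

#include <dlfcn.h>

#define SKETCH_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)           \
  struct DynLoad__##__name {                                        \
    template <typename... Args>                                     \
    auto operator()(Args... args) {                                 \
      using FuncT = decltype(&::__name);                            \
      /* Resolve the symbol from the dlopen'ed libcusparse once. */ \
      static void* p_func = dlsym(cusparse_dso_handle, #__name);    \
      return reinterpret_cast<FuncT>(p_func)(args...);              \
    }                                                               \
  };                                                                \
  extern DynLoad__##__name __name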
47 changes: 27 additions & 20 deletions paddle/phi/backends/dynload/cusparse.h
@@ -42,26 +42,33 @@ extern void *cusparse_dso_handle;

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseCreateDnVec); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseDestroyDnVec); \
__macro(cusparseSpMV_bufferSize); \
__macro(cusparseSpMV);
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseCreateDnVec); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseDestroyDnVec); \
__macro(cusparseSpMV_bufferSize); \
__macro(cusparseSpMV); \
__macro(cusparseSpGEMM_createDescr); \
__macro(cusparseSpGEMM_workEstimation); \
__macro(cusparseSpGEMM_compute); \
__macro(cusparseSpGEMM_destroyDescr); \
__macro(cusparseSpMatGetSize); \
__macro(cusparseCsrSetPointers); \
__macro(cusparseSpGEMM_copy);

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
9 changes: 9 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -28,6 +28,15 @@ class SparseBlas {
public:
explicit SparseBlas(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}

template <typename T, typename TensorType>
void SPMM(bool transa,
bool transb,
T alpha,
const TensorType& mat_a,
const TensorType& mat_b,
T beta,
TensorType* mat_out) const;

template <typename T, typename TensorType>
void SPMM(bool transa,
bool transb,
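
The declaration added above is a sparse × sparse → sparse overload of SPMM (both operands and the output share TensorType, e.g. SparseCsrTensor), alongside the existing sparse × dense overload that follows it. A hedged usage sketch — the names dev_ctx, x, y, and out are illustrative, and GetSparseBlas is the existing helper in this header:

// Sketch: out = 1.0 * x * y + 0.0 * out, with x, y, out all SparseCsrTensor.
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<phi::GPUContext, T>(dev_ctx);
sparse_blas.SPMM(false,                // transa
                 false,                // transb
                 static_cast<T>(1.0),  // alpha
                 x,
                 y,
                 static_cast<T>(0.0),  // beta
                 &out);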
169 changes: 167 additions & 2 deletions paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
@@ -23,6 +23,7 @@
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/visit_type.h"
@@ -89,6 +90,8 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,

int64_t batch_nnz = x.nnz() / batch_size;
cudaDataType_t gpu_type = GetGpuDataType<T>();
// The cuSPARSE index type must match IntT, the element type of the
// crows/cols arrays passed below, not the value type T.
cusparseIndexType_t index_type =
std::is_same<IntT, int32_t>::value ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseCreateCsr(descriptor,
M,
@@ -97,8 +100,8 @@
const_cast<IntT*>(crows_data),
const_cast<IntT*>(cols_data),
const_cast<T*>(values_data),
CUSPARSE_INDEX_64I,
CUSPARSE_INDEX_64I,
index_type,
index_type,
CUSPARSE_INDEX_BASE_ZERO,
gpu_type);
});
@@ -309,6 +312,32 @@ class CuSparseDnVecDescriptor {
cusparseDnVecDescr_t descriptor_;
};

/************* SpGEMM DESCRIPTOR ************/
template <typename T>
class CuSparseSpGEMMDescriptor {
public:
explicit CuSparseSpGEMMDescriptor(const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_createDescr(&descriptor_);
});
VLOG(6) << "Create cusparseSpGEMMDescr_t " << &descriptor_;
}

~CuSparseSpGEMMDescriptor() {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_destroyDescr(descriptor_);
});
VLOG(6) << "Destroy cusparseSpGEMMDescr_t " << &descriptor_;
}

const cusparseSpGEMMDescr_t& descriptor() const { return descriptor_; }

private:
const phi::GPUContext& dev_ctx_;
cusparseSpGEMMDescr_t descriptor_;
};
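
Like the SpMat/DnMat/DnVec wrappers earlier in this file, this class ties the cuSPARSE descriptor's lifetime to C++ scope: cusparseSpGEMM_createDescr in the constructor, cusparseSpGEMM_destroyDescr in the destructor. A short usage sketch:

{
  auto spgemm_descriptor = CuSparseSpGEMMDescriptor<T>(dev_ctx);
  // Pass spgemm_descriptor.descriptor() to the cusparseSpGEMM_* calls.
}  // cusparseSpGEMM_destroyDescr runs automatically here.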

/************* SPARSE*DENSE->DENSE MATMUL ************/
template <>
template <typename T, typename TensorType>
@@ -414,6 +443,142 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
});
}

/************* SPARSE*SPARSE->SPARSE MATMUL ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMM(bool transa,
bool transb,
T alpha,
const TensorType& mat_a,
const TensorType& mat_b,
T beta,
TensorType* mat_out) const {
auto dims = mat_out->dims();
DenseTensor *mat_out_crows = mat_out->mutable_crows(),
*mat_out_cols = mat_out->mutable_cols(),
*mat_out_values = mat_out->mutable_values();
MetaTensor meta_out_crows(mat_out_crows), meta_out_cols(mat_out_cols),
meta_out_values(mat_out_values);
meta_out_crows.set_dtype(mat_a.crows().dtype());
meta_out_cols.set_dtype(mat_a.cols().dtype());
meta_out_values.set_dtype(mat_a.values().dtype());
meta_out_crows.set_dims(common::make_ddim({dims[dims.size() - 2] + 1}));
int* out_crows = dev_ctx_.template Alloc<int>(mat_out_crows);
dev_ctx_.template Alloc<int>(mat_out_cols);
dev_ctx_.template Alloc<T>(mat_out_values);
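// crows has a fixed size of (num_rows + 1), so it can be allocated up front;
// cols and values depend on the output nnz, which is only known after
// cusparseSpGEMM_compute, so they are resized and reallocated further below.
// Note: Alloc<int> assumes int32 output indices here, even though the dtypes
// above are copied from mat_a's index tensors.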

auto a_descriptor = CuSparseSpMatDescriptor<T>(mat_a, dev_ctx_);
auto b_descriptor = CuSparseSpMatDescriptor<T>(mat_b, dev_ctx_);
auto out_descriptor = CuSparseSpMatDescriptor<T>(*mat_out, dev_ctx_);
auto spgemm_descriptor = CuSparseSpGEMMDescriptor<T>(dev_ctx_);

cudaDataType_t gpu_type = GetGpuDataType<T>();
size_t buffer_size1 = 0, buffer_size2 = 0;

dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_workEstimation(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPGEMM_DEFAULT,
spgemm_descriptor.descriptor(),
&buffer_size1,
nullptr);
});
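// cuSPARSE SpGEMM uses a two-phase query/execute protocol: the call above,
// with a null external buffer, only reports the required workspace size.
// After allocating tmp_buffer1, cusparseSpGEMM_workEstimation is invoked
// again below with the real buffer to carry out the estimation.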
phi::Allocator::AllocationPtr tmp_buffer1 = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size1,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
void* tmp_buffer_ptr1 = tmp_buffer1->ptr();

dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_workEstimation(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPGEMM_DEFAULT,
spgemm_descriptor.descriptor(),
&buffer_size1,
tmp_buffer_ptr1);
});
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_compute(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPGEMM_DEFAULT,
spgemm_descriptor.descriptor(),
&buffer_size2,
nullptr);
});
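// Same two-phase pattern for cusparseSpGEMM_compute: the call above with a
// null buffer queried buffer_size2; the repeat call below performs the
// actual multiplication using workspace tmp_buffer2.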
phi::Allocator::AllocationPtr tmp_buffer2 = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size2,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
void* tmp_buffer_ptr2 = tmp_buffer2->ptr();

dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_compute(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPGEMM_DEFAULT,
spgemm_descriptor.descriptor(),
&buffer_size2,
tmp_buffer_ptr2);
});

int64_t C_num_rows1, C_num_cols1, C_nnz1;
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpMatGetSize(
out_descriptor.descriptor(), &C_num_rows1, &C_num_cols1, &C_nnz1);
});
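// The output dimensions and nnz are now known, so cols and values can be
// given their final sizes before cusparseSpGEMM_copy materializes the
// result into the output CSR arrays.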

meta_out_cols.set_dims(common::make_ddim({C_nnz1}));
meta_out_values.set_dims(common::make_ddim({C_nnz1}));
T* out_values = dev_ctx_.template Alloc<T>(mat_out_values);
int* out_cols = dev_ctx_.template Alloc<int>(mat_out_cols);

dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseCsrSetPointers(
out_descriptor.descriptor(), out_crows, out_cols, out_values);
});

dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpGEMM_copy(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPGEMM_DEFAULT,
spgemm_descriptor.descriptor());
});
}
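
For context, a hedged sketch of how a CSR × CSR kernel might drive the SPMM path above — the kernel name is hypothetical and, as an assumption, the caller is taken to have already set out's dims and index dtypes; none of this is part of the diff:

template <typename T, typename Context>
void MatmulCsrCsrSketch(const Context& dev_ctx,
                        const SparseCsrTensor& x,
                        const SparseCsrTensor& y,
                        SparseCsrTensor* out) {
  // out = 1.0 * x @ y + 0.0 * out; out->dims() must already be set.
  auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
  sparse_blas.SPMM(false, false, static_cast<T>(1.0), x, y,
                   static_cast<T>(0.0), out);
}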

/************* DENSE*DENSE->SPARSE MATMUL ************/
#if CUDA_VERSION >= 11030
template <>