From 6c3c0ad42c832be1914d1f1cb6e213891a9dd15f Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 22:38:00 -0400 Subject: [PATCH 01/11] matmul --- matmul/README | 9 ++ matmul/fp16_conversion.h | 116 +++++++++++++++++++++ matmul/frontier/gemm.cu | 200 +++++++++++++++++++++++++++++++++++++ matmul/frontier/makefile | 13 +++ matmul/perlmutter/gemm.cu | 190 +++++++++++++++++++++++++++++++++++ matmul/perlmutter/makefile | 12 +++ 6 files changed, 540 insertions(+) create mode 100644 matmul/README create mode 100644 matmul/fp16_conversion.h create mode 100644 matmul/frontier/gemm.cu create mode 100644 matmul/frontier/makefile create mode 100644 matmul/perlmutter/gemm.cu create mode 100644 matmul/perlmutter/makefile diff --git a/matmul/README b/matmul/README new file mode 100644 index 0000000..962c0eb --- /dev/null +++ b/matmul/README @@ -0,0 +1,9 @@ +(Perlmutter) Before compiling do this: +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 +export CRAY_ACCEL_TARGET=nvidia80 + +(Perlmutter) Before running do this: +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 + +(Frontier) Before compiling do this: +module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a diff --git a/matmul/fp16_conversion.h b/matmul/fp16_conversion.h new file mode 100644 index 0000000..8ff5d29 --- /dev/null +++ b/matmul/fp16_conversion.h @@ -0,0 +1,116 @@ +// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This code modified from the public domain code here: +// https://gist.github.com/rygorous/2156668 +// The URL above includes more robust conversion routines +// that handle Inf and NaN correctly. +// +// It is recommended to use the more robust versions in production code. 
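//
// The FP32/FP16 unions below expose the sign, exponent, and mantissa bit
// fields of IEEE single- and half-precision values, so the conversions can
// be done with plain integer arithmetic.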
+

// code from: https://github.com/parallel-forall/code-samples/tree/master/posts/mixed-precision

typedef unsigned uint;

union FP32
{
    uint u;
    float f;
    struct
    {
        uint Mantissa : 23;
        uint Exponent : 8;
        uint Sign : 1;
    };
};

union FP16
{
    unsigned short u;
    struct
    {
        uint Mantissa : 10;
        uint Exponent : 5;
        uint Sign : 1;
    };
};

// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
static half approx_float_to_half(float fl)
{
    FP32 f32infty = { 255 << 23 };
    FP32 f16max = { (127 + 16) << 23 };
    FP32 magic = { 15 << 23 };
    FP32 expinf = { (255 ^ 31) << 23 };
    uint sign_mask = 0x80000000u;
    FP16 o = { 0 };

    FP32 f = *((FP32*)&fl);

    uint sign = f.u & sign_mask;
    f.u ^= sign;

    if (!(f.f < f32infty.u)) // Inf or NaN
        o.u = f.u ^ expinf.u;
    else
    {
        if (f.f > f16max.f) f.f = f16max.f;
        f.f *= magic.f;
    }

    o.u = f.u >> 13; // Take the mantissa bits
    o.u |= sign >> 16;
    return *((half*)&o);
}

// from half->float code - just for verification.
static float half_to_float(half hf)
{
    FP16 h = *((FP16*)&hf);

    static const FP32 magic = { 113 << 23 };
    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
    FP32 o;

    o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
    uint exp = shifted_exp & o.u; // just the exponent
    o.u += (127 - 15) << 23; // exponent adjust

    // handle exponent special cases
    if (exp == shifted_exp) // Inf/NaN?
        o.u += (128 - 16) << 23; // extra exp adjust
    else if (exp == 0) // Zero/Denormal?
    {
        o.u += 1 << 23; // extra exp adjust
        o.f -= magic.f; // renormalize
    }

    o.u |= (h.u & 0x8000) << 16; // sign bit
    return o.f;
}
diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu
new file mode 100644
index 0000000..8ab42e0
--- /dev/null
+++ b/matmul/frontier/gemm.cu
@@ -0,0 +1,200 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <ctime>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hipblas.h>
+#include "../fp16_conversion.h"

using namespace std;

#define FP16MM

const char* hipblasGetErrorString(hipblasStatus_t status)
{
    switch(status)
    {
        case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS";
        case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED";
        case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED";
        case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE";
        case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH";
        case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR";
        case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED";
        case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR";
        case HIPBLAS_STATUS_NOT_SUPPORTED: return "HIPBLAS_STATUS_NOT_SUPPORTED";
        case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return "HIPBLAS_STATUS_HANDLE_IS_NULLPTR";
        case HIPBLAS_STATUS_INVALID_ENUM: return "HIPBLAS_STATUS_INVALID_ENUM";
        case HIPBLAS_STATUS_UNKNOWN: return "HIPBLAS_STATUS_UNKNOWN";
    }
    return "unknown error";
}

// Convenience function for checking HIP runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
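// Example (illustrative): checkHip(hipMalloc(&d_A, bytes));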
+inline +hipError_t checkHip(hipError_t result) +{ + if (result != hipSuccess) { + fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result)); + assert(result == hipSuccess); + } + return result; +} + +inline +hipblasStatus_t checkCublas(hipblasStatus_t result) +{ + if (result != HIPBLAS_STATUS_SUCCESS) { + fprintf(stderr, "HIP Runtime Error: %s\n", hipblasGetErrorString(result)); + assert(result == HIPBLAS_STATUS_SUCCESS); + } + return result; +} + +// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { + int a=1; + + for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + A[i] = (float)rand()/(float)(RAND_MAX/a); + } +} + +int main(int argc, char ** argv){ + + + int min_m_k_n = 1024; + int max_m_k_n = 16384*2; + int repeats = 100; + int verbose = 1; + +#ifndef FP16MM + cout << "\nhipblasSgemm test result:\n" << endl; +#else + cout << "\nhipblasHgemm test result:\n" << endl; +#endif + + if(verbose) + cout << "running with" + << " min_m_k_n: " << min_m_k_n + << " max_m_k_n: " << max_m_k_n + << " repeats: " << repeats + << endl; + + hipblasStatus_t stat; + hipblasHandle_t handle; + + checkCublas(hipblasCreate(&handle)); + + if(verbose) cout << "allocating device variables" << endl; + + // Allocate 3 arrays on CPU + + float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + + CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); + +#ifndef FP16MM + // Allocate 3 arrays on GPU + float *d_A, *d_B, *d_C; + checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); + checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); + checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); + + checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const float alf = 1.0f; + const float bet = 0.0f; + const float *alpha = &alf; + const float *beta = &bet; + +#else + + uint16_t *d_A, *d_B, *d_C; + checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + half temp_a = approx_float_to_half(h_A[i]); + half temp_b = approx_float_to_half(h_B[i]); + half temp_c = approx_float_to_half(h_C[i]); + d_A[i] = *((uint16_t*) &temp_a); + d_B[i] = *((uint16_t*) &temp_b); + d_C[i] = *((uint16_t*) &temp_c); + } + + int lda, ldb, ldc, m, n, k; + half temp_alf = approx_float_to_half(1.0f); + half temp_bet = approx_float_to_half(0.0f); + const uint16_t alf = *((uint16_t*) &temp_alf); + const uint16_t bet = *((uint16_t*) &temp_bet); + const uint16_t *alpha = &alf; + const uint16_t *beta = &bet; + +#endif + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + + for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){ + double sum = 0.0; + for(int rep = 0; rep < repeats; rep++){ + hipEventRecord(start, 0); + m=n=k=size; + lda = m; + ldb = k; + ldc = m; +#ifndef FP16MM + stat = 
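// column-major GEMM: C = alpha*A*B + beta*C; alpha = 1 and beta = 0 here, so C = A*B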
hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
+#else
+      stat = hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
+#endif
+      hipEventRecord(stop,0);
+      hipEventSynchronize(stop);
+      if(stat != HIPBLAS_STATUS_SUCCESS){
+        cerr << "hipblasSgemmBatched failed" << endl;
+        exit(1);
+      }
+      assert(!hipGetLastError());
+
+      float elapsed;
+      hipEventElapsedTime(&elapsed, start, stop);
+      elapsed /= 1000.0f;
+      if (rep >= 25) {
+        sum += elapsed;
+      }
+    }
+#ifndef FP16MM
+    cout << "float32: size "
+#else
+    cout << "float16: size "
+#endif
+    << size << " average: " << sum/75 << " s "<< endl;
+
+  }
+
+  //Free GPU memory
+  hipFree(d_A);
+  hipFree(d_B);
+  hipFree(d_C);
+
+  // Free CPU memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+
+  return 0;
+}
+
diff --git a/matmul/frontier/makefile b/matmul/frontier/makefile
new file mode 100644
index 0000000..65f6cde
--- /dev/null
+++ b/matmul/frontier/makefile
@@ -0,0 +1,13 @@
+CC = cc
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lhipblas
+
+all: gemm.x
+
+gemm.x: gemm.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o gemm.x gemm.cu
+
+clean:
+	rm -f gemm.x
+
diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu
new file mode 100644
index 0000000..91436d3
--- /dev/null
+++ b/matmul/perlmutter/gemm.cu
@@ -0,0 +1,190 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <cublas_v2.h>
+#include <curand.h>
+#include "../fp16_conversion.h"

using namespace std;

#define FP16MM

const char* cublasGetErrorString(cublasStatus_t status)
{
    switch(status)
    {
        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "unknown error";
}

// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
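// Example (illustrative): checkCuda(cudaMallocManaged(&d_A, bytes));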
+inline +cudaError_t checkCuda(cudaError_t result) +{ + if (result != cudaSuccess) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } + return result; +} + +inline +cublasStatus_t checkCublas(cublasStatus_t result) +{ + if (result != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result)); + assert(result == CUBLAS_STATUS_SUCCESS); + } + return result; +} + +// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { + int a=1; + + for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + A[i] = (float)rand()/(float)(RAND_MAX/a); + } +} + +int main(int argc, char ** argv){ + + + int min_m_k_n = 1024; + int max_m_k_n = 16384*2; + int repeats = 100; + int verbose = 1; + +#ifndef FP16MM + cout << "\ncublasSgemm test result:\n" << endl; +#else + cout << "\ncublasHgemm test result:\n" << endl; +#endif + + if(verbose) + cout << "running with" + << " min_m_k_n: " << min_m_k_n + << " max_m_k_n: " << max_m_k_n + << " repeats: " << repeats + << endl; + + cublasStatus_t stat; + cublasHandle_t handle; + + checkCublas(cublasCreate(&handle)); + + if(verbose) cout << "allocating device variables" << endl; + + // Allocate 3 arrays on CPU + + float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + + CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); + +#ifndef FP16MM + // Allocate 3 arrays on GPU + float *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); + + checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const float alf = 1.0f; + const float bet = 0.0f; + const float *alpha = &alf; + const float *beta = &bet; + +#else + + __half *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + d_A[i] = approx_float_to_half(h_A[i]); + d_B[i] = approx_float_to_half(h_B[i]); + d_C[i] = approx_float_to_half(h_C[i]); + } + + int lda, ldb, ldc, m, n, k; + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha = &alf; + const __half *beta = &bet; + +#endif + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){ + double sum = 0.0; + for(int rep = 0; rep < repeats; rep++){ + cudaEventRecord(start, 0); + m=n=k=size; + lda = m; + ldb = k; + ldc = m; +#ifndef FP16MM + stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); +#else + stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, 
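// C is read (scaled by beta) and overwritten; with beta = 0 it is a pure output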
d_C, ldc); +#endif + cudaEventRecord(stop,0); + cudaEventSynchronize(stop); + if(stat != CUBLAS_STATUS_SUCCESS){ + cerr << "cublasSgemmBatched failed" << endl; + exit(1); + } + assert(!cudaGetLastError()); + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed /= 1000.0f; + if (rep >= 25) { + sum += elapsed; + } + } +#ifndef FP16MM + cout << "float32: size " +#else + cout << "float16: size " +#endif + << size << " average: " << sum/75 << " s "<< endl; + + } + + //Free GPU memory + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + // Free CPU memory + free(h_A); + free(h_B); + free(h_C); + + return 0; +} diff --git a/matmul/perlmutter/makefile b/matmul/perlmutter/makefile new file mode 100644 index 0000000..abccbf3 --- /dev/null +++ b/matmul/perlmutter/makefile @@ -0,0 +1,12 @@ +CC = cc +CFLAGS = -std=c++11 -O3 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 +LDFLAGS = -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 -lcublas -lcurand + +all: gemm.x + +gemm.x: gemm.cu + ${CC} ${CFLAGS} ${LDFLAGS} -o gemm.x gemm.cu + +clean: + rm -f gemm.x + From 5893fe97670ec33aeb427fd5b4146b18fe4f5b0f Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 23:32:22 -0400 Subject: [PATCH 02/11] perlmutter update --- matmul/perlmutter/gemm.cu | 80 +++++++++++---------------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 91436d3..401c654 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -8,8 +8,6 @@ using namespace std; -#define FP16MM - const char* cublasGetErrorString(cublasStatus_t status) { switch(status) @@ -51,11 +49,11 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(__half *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = (float)rand()/(float)(RAND_MAX/a); + A[i] = approx_float_to_half((float)rand()/(float)(RAND_MAX/a)); } } @@ -67,11 +65,7 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; -#ifndef FP16MM - cout << "\ncublasSgemm test result:\n" << endl; -#else cout << "\ncublasHgemm test result:\n" << endl; -#endif if(verbose) cout << "running with" @@ -89,51 +83,29 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on CPU - float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); - float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); - float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + __half *h_A = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + __half *h_B = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + __half *h_C = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); - -#ifndef FP16MM - // Allocate 3 arrays on GPU - float *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); - - checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * 
max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - - int lda, ldb, ldc, m, n, k; - const float alf = 1.0f; - const float bet = 0.0f; - const float *alpha = &alf; - const float *beta = &bet; -#else - - __half *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); - - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { - d_A[i] = approx_float_to_half(h_A[i]); - d_B[i] = approx_float_to_half(h_B[i]); - d_C[i] = approx_float_to_half(h_C[i]); - } - - int lda, ldb, ldc, m, n, k; - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha = &alf; - const __half *beta = &bet; - -#endif + // Allocate 3 arrays on GPU + __half *d_A, *d_B, *d_C; + checkCuda(cudaMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + + checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha = &alf; + const __half *beta = &bet; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -147,11 +119,9 @@ int main(int argc, char ** argv){ lda = m; ldb = k; ldc = m; -#ifndef FP16MM - stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#else - stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#endif + + stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); + cudaEventRecord(stop,0); cudaEventSynchronize(stop); if(stat != CUBLAS_STATUS_SUCCESS){ @@ -167,11 +137,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } -#ifndef FP16MM - cout << "float32: size " -#else cout << "float16: size " -#endif << size << " average: " << sum/75 << " s "<< endl; } From 6869225c14c8516ec67e70a16a3e473ece0fdbc5 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 20:57:34 -0700 Subject: [PATCH 03/11] fix --- matmul/perlmutter/gemm.cu | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 401c654..1c169c3 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -49,11 +49,11 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(__half *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = approx_float_to_half((float)rand()/(float)(RAND_MAX/a)); + A[i] = (float)rand()/(float)(RAND_MAX/a); } } @@ -83,9 +83,9 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on CPU - __half *h_A = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); - __half *h_B = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); - __half *h_C = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + float 
*h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); @@ -93,13 +93,15 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on GPU __half *d_A, *d_B, *d_C; - checkCuda(cudaMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + d_A[i] = approx_float_to_half(h_A[i]); + d_B[i] = approx_float_to_half(h_B[i]); + d_C[i] = approx_float_to_half(h_C[i]); + } int lda, ldb, ldc, m, n, k; const __half alf = approx_float_to_half(1.0); From 52892775919b00b51e78a9d23fbc544cb58eaa88 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 22:41:58 -0700 Subject: [PATCH 04/11] bfloat16 and gemm ex --- matmul/perlmutter/gemm.cu | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 1c169c3..c602894 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -65,8 +65,6 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; - cout << "\ncublasHgemm test result:\n" << endl; - if(verbose) cout << "running with" << " min_m_k_n: " << min_m_k_n @@ -92,22 +90,22 @@ int main(int argc, char ** argv){ CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); // Allocate 3 arrays on GPU - __half *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + nv_bfloat16 *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { - d_A[i] = approx_float_to_half(h_A[i]); - d_B[i] = approx_float_to_half(h_B[i]); - d_C[i] = approx_float_to_half(h_C[i]); + d_A[i] = __float2bfloat16(h_A[i]); + d_B[i] = __float2bfloat16(h_B[i]); + d_C[i] = __float2bfloat16(h_C[i]); } int lda, ldb, ldc, m, n, k; - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha = &alf; - const __half *beta = &bet; + const nv_bfloat16 alf = __float2bfloat16(1.0f); + const nv_bfloat16 bet = __float2bfloat16(0.0f); + const nv_bfloat16 *alpha = &alf; + const nv_bfloat16 *beta = &bet; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -122,13 +120,13 @@ int main(int argc, char ** argv){ ldb = k; ldc = m; - stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, 
d_B, ldb, beta, d_C, ldc); + stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, CUDA_R_16BF, lda, d_B, CUDA_R_16BF, ldb, beta, d_C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); cudaEventRecord(stop,0); cudaEventSynchronize(stop); if(stat != CUBLAS_STATUS_SUCCESS){ - cerr << "cublasSgemmBatched failed" << endl; - exit(1); + fprintf(stderr, "CuBLAS Error: %s\n", cublasGetErrorString(stat)); + exit(1); } assert(!cudaGetLastError()); @@ -139,7 +137,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } - cout << "float16: size " + cout << "bfloat16: size " << size << " average: " << sum/75 << " s "<< endl; } From ed4c177bb0995a7187c3ee146e7b61e8712ae7fd Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sat, 13 Apr 2024 11:36:55 -0400 Subject: [PATCH 05/11] bfloat16 and gemm_ex --- matmul/frontier/gemm.cu | 110 ++++++++++++++++----------------------- matmul/frontier/makefile | 2 +- 2 files changed, 45 insertions(+), 67 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 8ab42e0..03fa9aa 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -6,28 +6,29 @@ #include #include #include +#include #include "../fp16_conversion.h" using namespace std; -#define FP16MM - -const char* hipblasGetErrorString(hipblasStatus_t status) +const char* rocblasGetErrorString(rocblas_status status) { switch(status) { - case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS"; - case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED"; - case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED"; - case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE"; - case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH"; - case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR"; - case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED"; - case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR"; - case HIPBLAS_STATUS_NOT_SUPPORTED: return "HIPBLAS_STATUS_NOT_SUPPORTED"; - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return "HIPBLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return "HIPBLAS_STATUS_INVALID_ENUM"; - case HIPBLAS_STATUS_UNKNOWN: return "HIPBLAS_STATUS_UNKNOWN"; + case rocblas_status_success: return "rocblas_status_success"; + case rocblas_status_invalid_handle: return "rocblas_status_invalid_handle"; + case rocblas_status_not_implemented: return "rocblas_status_not_implemented"; + case rocblas_status_invalid_pointer: return "rocblas_status_invalid_pointer"; + case rocblas_status_invalid_size: return "rocblas_status_invalid_size"; + case rocblas_status_memory_error: return "rocblas_status_memory_error"; + case rocblas_status_internal_error: return "rocblas_status_internal_error"; + case rocblas_status_perf_degraded: return "rocblas_status_perf_degraded"; + case rocblas_status_size_query_mismatch: return "rocblas_status_size_query_mismatch"; + case rocblas_status_size_increased: return "rocblas_status_size_increased"; + case rocblas_status_size_unchanged: return "rocblas_status_size_unchanged"; + case rocblas_status_invalid_value: return "rocblas_status_invalid_value"; + case rocblas_status_continue: return "rocblas_status_continue"; + case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail"; } return "unknown error"; } @@ -45,11 +46,11 @@ hipError_t checkHip(hipError_t result) } inline -hipblasStatus_t checkCublas(hipblasStatus_t result) +rocblas_status 
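// renamed from checkCublas: same assert-on-failure wrapper, now for rocBLAS status codes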
checkRocblas(rocblas_status result) { - if (result != HIPBLAS_STATUS_SUCCESS) { - fprintf(stderr, "HIP Runtime Error: %s\n", hipblasGetErrorString(result)); - assert(result == HIPBLAS_STATUS_SUCCESS); + if (result != rocblas_status_success) { + fprintf(stderr, "ROCM Runtime Error: %s\n", rocblasGetErrorString(result)); + assert(result == rocblas_status_success); } return result; } @@ -59,8 +60,8 @@ void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = (float)rand()/(float)(RAND_MAX/a); - } + A[i] = (float)rand()/(float)(RAND_MAX/a); + } } int main(int argc, char ** argv){ @@ -71,12 +72,6 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; -#ifndef FP16MM - cout << "\nhipblasSgemm test result:\n" << endl; -#else - cout << "\nhipblasHgemm test result:\n" << endl; -#endif - if(verbose) cout << "running with" << " min_m_k_n: " << min_m_k_n @@ -84,10 +79,10 @@ int main(int argc, char ** argv){ << " repeats: " << repeats << endl; - hipblasStatus_t stat; - hipblasHandle_t handle; + rocblas_status stat; + rocblas_handle handle; - checkCublas(hipblasCreate(&handle)); + checkRocblas(rocblas_create_handle(&handle)); if(verbose) cout << "allocating device variables" << endl; @@ -101,29 +96,18 @@ int main(int argc, char ** argv){ CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); -#ifndef FP16MM // Allocate 3 arrays on GPU - float *d_A, *d_B, *d_C; - checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); - checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); - checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); - - checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - - int lda, ldb, ldc, m, n, k; - const float alf = 1.0f; - const float bet = 0.0f; - const float *alpha = &alf; - const float *beta = &bet; - -#else + uint16_t *d_A, *d_B, *d_C, *d_D; + checkHip(hipMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - uint16_t *d_A, *d_B, *d_C; - checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + // rocblas_gemm_ex requries D array too + checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + + checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); @@ -134,7 +118,7 @@ int main(int argc, char ** argv){ d_C[i] = *((uint16_t*) &temp_c); } - int lda, ldb, ldc, m, n, k; + int lda, ldb, ldd, ldc, m, n, k; half temp_alf = approx_float_to_half(1.0f); half temp_bet = approx_float_to_half(0.0f); const uint16_t alf = *((uint16_t*) &temp_alf); @@ -142,8 +126,6 @@ int main(int argc, char ** argv){ const uint16_t *alpha = &alf; const uint16_t *beta = &bet; -#endif - 
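// HIP events bracket each GEMM call to time it on the GPU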
hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); @@ -156,15 +138,15 @@ int main(int argc, char ** argv){ lda = m; ldb = k; ldc = m; -#ifndef FP16MM - stat = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#else - stat = hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#endif + ldd = m; + + + stat = rocblas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_none, m, n, k, alpha, d_A, rocblas_datatype_bf16_r, lda, d_B, rocblas_datatype_bf16_r, ldb, beta, d_C, rocblas_datatype_bf16_r, ldc, d_D, rocblas_datatype_bf16_r, ldd, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, 0); + hipEventRecord(stop,0); hipEventSynchronize(stop); - if(stat != HIPBLAS_STATUS_SUCCESS){ - cerr << "hipblasSgemmBatched failed" << endl; + if(stat != rocblas_status_success){ + fprintf(stderr, "RocBLAS Error: %s\n", rocblasGetErrorString(stat)); exit(1); } assert(!hipGetLastError()); @@ -176,11 +158,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } -#ifndef FP16MM - cout << "float32: size " -#else - cout << "float16: size " -#endif + cout << "bfloat16: size " << size << " average: " << sum/75 << " s "<< endl; } diff --git a/matmul/frontier/makefile b/matmul/frontier/makefile index 65f6cde..3324a22 100644 --- a/matmul/frontier/makefile +++ b/matmul/frontier/makefile @@ -1,7 +1,7 @@ CC = cc INC = -I${ROCM_PATH}/include CFLAGS = -std=c++11 -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lhipblas +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrocblas all: gemm.x From 22e8aad7a147744f5bc1b09792a4d5e85dee9fc1 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 12:34:23 -0400 Subject: [PATCH 06/11] rocm 6.0 --- matmul/README | 4 +++- matmul/frontier/gemm.cu | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/matmul/README b/matmul/README index 962c0eb..b09f8f8 100644 --- a/matmul/README +++ b/matmul/README @@ -6,4 +6,6 @@ export CRAY_ACCEL_TARGET=nvidia80 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 (Frontier) Before compiling do this: -module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a +module load PrgEnv-cray amd-mixed/6.0 craype-accel-amd-gfx90a +module load cpe/23.05 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 03fa9aa..12f63a8 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -1,3 +1,5 @@ +#define __HIP_PLATFORM_AMD__ + #include #include #include @@ -5,8 +7,7 @@ #include #include #include -#include -#include +#include #include "../fp16_conversion.h" using namespace std; @@ -29,6 +30,8 @@ const char* rocblasGetErrorString(rocblas_status status) case rocblas_status_invalid_value: return "rocblas_status_invalid_value"; case rocblas_status_continue: return "rocblas_status_continue"; case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail"; + case rocblas_status_excluded_from_build: return "rocblas_status_excluded_from_build"; + case rocblas_status_arch_mismatch: return "rocblas_status_arch_mismatch"; } return "unknown error"; } @@ -68,7 +71,7 @@ int main(int argc, char ** argv){ int min_m_k_n = 1024; - int max_m_k_n = 16384*2; + int max_m_k_n = 16384*4; int repeats = 100; int verbose = 1; From 28ae13738a9a8ecdbab932448be08092d68dde1c Mon Sep 17 
00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 22:09:22 -0400 Subject: [PATCH 07/11] don't need the memcpys --- matmul/frontier/gemm.cu | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 12f63a8..5fafeb7 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -71,7 +71,9 @@ int main(int argc, char ** argv){ int min_m_k_n = 1024; - int max_m_k_n = 16384*4; + int max_m_k_n = 16384*2; + // int min_m_k_n = 50000; + // int max_m_k_n = 50000; int repeats = 100; int verbose = 1; @@ -105,13 +107,9 @@ int main(int argc, char ** argv){ checkHip(hipMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); checkHip(hipMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - // rocblas_gemm_ex requries D array too + // rocblas_gemm_ex requires D array too checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); half temp_b = approx_float_to_half(h_B[i]); From 57286616116de647967f2452cbb68970fd03ac94 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 23:22:37 -0400 Subject: [PATCH 08/11] fixing integer overflow --- matmul/frontier/gemm.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 5fafeb7..002fbd9 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -59,21 +59,18 @@ rocblas_status checkRocblas(rocblas_status result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { - int a=1; +void CPU_fill_rand(float *A, unsigned long nr_rows_A, unsigned long nr_cols_A) { + int a=1; - for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + for (unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ A[i] = (float)rand()/(float)(RAND_MAX/a); } } int main(int argc, char ** argv){ - - int min_m_k_n = 1024; - int max_m_k_n = 16384*2; - // int min_m_k_n = 50000; - // int max_m_k_n = 50000; + unsigned long long min_m_k_n = 1024; + unsigned long long max_m_k_n = 16384*4; int repeats = 100; int verbose = 1; @@ -110,7 +107,7 @@ int main(int argc, char ** argv){ // rocblas_gemm_ex requires D array too checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + for (unsigned long long i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); half temp_b = approx_float_to_half(h_B[i]); half temp_c = approx_float_to_half(h_C[i]); From fba25c6aa230481423063c2947e0076afe9ab90c Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 23:27:31 -0400 Subject: [PATCH 09/11] more --- matmul/frontier/gemm.cu | 2 +- matmul/perlmutter/gemm.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 002fbd9..c271fd4 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -59,7 +59,7 @@ rocblas_status checkRocblas(rocblas_status result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, unsigned long nr_rows_A, 
unsigned long nr_cols_A) { +void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr_cols_A) { int a=1; for (unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index c602894..ae52df9 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -49,10 +49,10 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr_cols_A) { int a=1; - for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + for(unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ A[i] = (float)rand()/(float)(RAND_MAX/a); } } @@ -60,8 +60,8 @@ void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int main(int argc, char ** argv){ - int min_m_k_n = 1024; - int max_m_k_n = 16384*2; + unsigned long long min_m_k_n = 1024; + unsigned long long max_m_k_n = 16384*2; int repeats = 100; int verbose = 1; @@ -95,7 +95,7 @@ int main(int argc, char ** argv){ checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + for (unsigned long long i = 0; i < max_m_k_n * max_m_k_n; i++) { d_A[i] = __float2bfloat16(h_A[i]); d_B[i] = __float2bfloat16(h_B[i]); d_C[i] = __float2bfloat16(h_C[i]); From 648bf183ea97e56a8818faad596691d04e7c58fa Mon Sep 17 00:00:00 2001 From: adityaranjan Date: Mon, 22 Apr 2024 16:00:55 -0400 Subject: [PATCH 10/11] license --- matmul/LICENSE | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 matmul/LICENSE diff --git a/matmul/LICENSE b/matmul/LICENSE new file mode 100644 index 0000000..f32dc2c --- /dev/null +++ b/matmul/LICENSE @@ -0,0 +1,43 @@ +MIT License + +Copyright (c) 2017 He Ma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Copyright (c) 2024, Parallel Software and Systems Group, University of +Maryland. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. From a90f80726d6c1932f39ec22c7e871c7de3f80f0e Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sat, 6 Jul 2024 10:37:09 -0400 Subject: [PATCH 11/11] minor --- matmul/frontier/gemm.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index c271fd4..43151ba 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -69,8 +69,8 @@ void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr int main(int argc, char ** argv){ - unsigned long long min_m_k_n = 1024; - unsigned long long max_m_k_n = 16384*4; + unsigned long long min_m_k_n = 16384*2; + unsigned long long max_m_k_n = 16384*2; int repeats = 100; int verbose = 1;
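
A note on reading the output of either benchmark: each size prints the average
kernel time in seconds over the last 75 of 100 repetitions. Since a square GEMM
with m = n = k = size performs 2*size^3 floating-point operations, that time
converts directly to throughput. The sketch below is illustrative only (it is
not part of the patches; the file name and the example numbers are made up):

    // tflops.cpp -- turn a gemm.x timing into TFLOP/s
    #include <cstdio>

    // C = alpha*A*B + beta*C with m = n = k = size costs
    // 2*size^3 flops (size^3 multiplies and size^3 adds).
    static double gemm_tflops(unsigned long long size, double avg_seconds) {
        double flops = 2.0 * (double)size * (double)size * (double)size;
        return flops / avg_seconds / 1e12;
    }

    int main() {
        unsigned long long size = 16384ULL * 2; // the size both benchmarks settle on
        double avg = 0.25;                      // seconds, as printed by gemm.x (example value)
        std::printf("size %llu: %.2f TFLOP/s\n", size, gemm_tflops(size, avg));
        return 0;
    }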