Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GEMM Benchmark #12

Open
wants to merge 11 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions matmul/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
MIT License

Copyright (c) 2017 He Ma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


Copyright (c) 2024, Parallel Software and Systems Group, University of
Maryland.

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
11 changes: 11 additions & 0 deletions matmul/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
(Perlmutter) Before compiling do this:
module load PrgEnv-cray cudatoolkit craype-accel-nvidia80
export CRAY_ACCEL_TARGET=nvidia80

(Perlmutter) Before running do this:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64

(Frontier) Before compiling do this:
module load PrgEnv-cray amd-mixed/6.0 craype-accel-amd-gfx90a
module load cpe/23.05
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
116 changes: 116 additions & 0 deletions matmul/fp16_conversion.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// This code modified from the public domain code here:
// https://gist.github.com/rygorous/2156668
// The URL above includes more robust conversion routines
// that handle Inf and NaN correctly.
//
// It is recommended to use the more robust versions in production code.

// code from: https://github.com/parallel-forall/code-samples/tree/master/posts/mixed-precision

typedef unsigned uint;

union FP32
{
uint u;
float f;
struct
{
uint Mantissa : 23;
uint Exponent : 8;
uint Sign : 1;
};
};

union FP16
{
unsigned short u;
struct
{
uint Mantissa : 10;
uint Exponent : 5;
uint Sign : 1;
};
};

// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
static half approx_float_to_half(float fl)
{
FP32 f32infty = { 255 << 23 };
FP32 f16max = { (127 + 16) << 23 };
FP32 magic = { 15 << 23 };
FP32 expinf = { (255 ^ 31) << 23 };
uint sign_mask = 0x80000000u;
FP16 o = { 0 };

FP32 f = *((FP32*)&fl);

uint sign = f.u & sign_mask;
f.u ^= sign;

if (!(f.f < f32infty.u)) // Inf or NaN
o.u = f.u ^ expinf.u;
else
{
if (f.f > f16max.f) f.f = f16max.f;
f.f *= magic.f;
}

o.u = f.u >> 13; // Take the mantissa bits
o.u |= sign >> 16;
return *((half*)&o);
}

// from half->float code - just for verification.
static float half_to_float(half hf)
{
FP16 h = *((FP16*)&hf);

static const FP32 magic = { 113 << 23 };
static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;

o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
uint exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust

// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) // Zero/Denormal?
{
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}

o.u |= (h.u & 0x8000) << 16; // sign bit
return o.f;
}
176 changes: 176 additions & 0 deletions matmul/frontier/gemm.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#define __HIP_PLATFORM_AMD__

#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <assert.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <rocblas/rocblas.h>
#include "../fp16_conversion.h"

using namespace std;

const char* rocblasGetErrorString(rocblas_status status)
{
switch(status)
{
case rocblas_status_success: return "rocblas_status_success";
case rocblas_status_invalid_handle: return "rocblas_status_invalid_handle";
case rocblas_status_not_implemented: return "rocblas_status_not_implemented";
case rocblas_status_invalid_pointer: return "rocblas_status_invalid_pointer";
case rocblas_status_invalid_size: return "rocblas_status_invalid_size";
case rocblas_status_memory_error: return "rocblas_status_memory_error";
case rocblas_status_internal_error: return "rocblas_status_internal_error";
case rocblas_status_perf_degraded: return "rocblas_status_perf_degraded";
case rocblas_status_size_query_mismatch: return "rocblas_status_size_query_mismatch";
case rocblas_status_size_increased: return "rocblas_status_size_increased";
case rocblas_status_size_unchanged: return "rocblas_status_size_unchanged";
case rocblas_status_invalid_value: return "rocblas_status_invalid_value";
case rocblas_status_continue: return "rocblas_status_continue";
case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail";
case rocblas_status_excluded_from_build: return "rocblas_status_excluded_from_build";
case rocblas_status_arch_mismatch: return "rocblas_status_arch_mismatch";
}
return "unknown error";
}

// Convenience function for checking HIP runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
hipError_t checkHip(hipError_t result)
{
if (result != hipSuccess) {
fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result));
assert(result == hipSuccess);
}
return result;
}

inline
rocblas_status checkRocblas(rocblas_status result)
{
if (result != rocblas_status_success) {
fprintf(stderr, "ROCM Runtime Error: %s\n", rocblasGetErrorString(result));
assert(result == rocblas_status_success);
}
return result;
}

// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU
void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr_cols_A) {
int a=1;

for (unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){
A[i] = (float)rand()/(float)(RAND_MAX/a);
}
}

int main(int argc, char ** argv){

unsigned long long min_m_k_n = 16384*2;
unsigned long long max_m_k_n = 16384*2;
int repeats = 100;
int verbose = 1;

if(verbose)
cout << "running with"
<< " min_m_k_n: " << min_m_k_n
<< " max_m_k_n: " << max_m_k_n
<< " repeats: " << repeats
<< endl;

rocblas_status stat;
rocblas_handle handle;

checkRocblas(rocblas_create_handle(&handle));

if(verbose) cout << "allocating device variables" << endl;

// Allocate 3 arrays on CPU

float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));
float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));
float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));

CPU_fill_rand(h_A, max_m_k_n, max_m_k_n);
CPU_fill_rand(h_B, max_m_k_n, max_m_k_n);
CPU_fill_rand(h_C, max_m_k_n, max_m_k_n);

// Allocate 3 arrays on GPU
uint16_t *d_A, *d_B, *d_C, *d_D;
checkHip(hipMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t)));
checkHip(hipMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t)));
checkHip(hipMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t)));

// rocblas_gemm_ex requires D array too
checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t)));

for (unsigned long long i = 0; i < max_m_k_n * max_m_k_n; i++) {
half temp_a = approx_float_to_half(h_A[i]);
half temp_b = approx_float_to_half(h_B[i]);
half temp_c = approx_float_to_half(h_C[i]);
d_A[i] = *((uint16_t*) &temp_a);
d_B[i] = *((uint16_t*) &temp_b);
d_C[i] = *((uint16_t*) &temp_c);
}

int lda, ldb, ldd, ldc, m, n, k;
half temp_alf = approx_float_to_half(1.0f);
half temp_bet = approx_float_to_half(0.0f);
const uint16_t alf = *((uint16_t*) &temp_alf);
const uint16_t bet = *((uint16_t*) &temp_bet);
const uint16_t *alpha = &alf;
const uint16_t *beta = &bet;

hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);

for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){
double sum = 0.0;
for(int rep = 0; rep < repeats; rep++){
hipEventRecord(start, 0);
m=n=k=size;
lda = m;
ldb = k;
ldc = m;
ldd = m;


stat = rocblas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_none, m, n, k, alpha, d_A, rocblas_datatype_bf16_r, lda, d_B, rocblas_datatype_bf16_r, ldb, beta, d_C, rocblas_datatype_bf16_r, ldc, d_D, rocblas_datatype_bf16_r, ldd, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, 0);

hipEventRecord(stop,0);
hipEventSynchronize(stop);
if(stat != rocblas_status_success){
fprintf(stderr, "RocBLAS Error: %s\n", rocblasGetErrorString(stat));
exit(1);
}
assert(!hipGetLastError());

float elapsed;
hipEventElapsedTime(&elapsed, start, stop);
elapsed /= 1000.0f;
if (rep >= 25) {
sum += elapsed;
}
}
cout << "bfloat16: size "
<< size << " average: " << sum/75 << " s "<< endl;

}

//Free GPU memory
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);

// Free CPU memory
free(h_A);
free(h_B);
free(h_C);

return 0;
}

13 changes: 13 additions & 0 deletions matmul/frontier/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
CC = cc
INC = -I${ROCM_PATH}/include
CFLAGS = -std=c++11 -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip
LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrocblas

all: gemm.x

gemm.x: gemm.cu
${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o gemm.x gemm.cu

clean:
rm -f gemm.x

Loading