From 6c3c0ad42c832be1914d1f1cb6e213891a9dd15f Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 22:38:00 -0400 Subject: [PATCH 01/11] matmul --- matmul/README | 9 ++ matmul/fp16_conversion.h | 116 +++++++++++++++++++++ matmul/frontier/gemm.cu | 200 +++++++++++++++++++++++++++++++++++++ matmul/frontier/makefile | 13 +++ matmul/perlmutter/gemm.cu | 190 +++++++++++++++++++++++++++++++++++ matmul/perlmutter/makefile | 12 +++ 6 files changed, 540 insertions(+) create mode 100644 matmul/README create mode 100644 matmul/fp16_conversion.h create mode 100644 matmul/frontier/gemm.cu create mode 100644 matmul/frontier/makefile create mode 100644 matmul/perlmutter/gemm.cu create mode 100644 matmul/perlmutter/makefile diff --git a/matmul/README b/matmul/README new file mode 100644 index 0000000..962c0eb --- /dev/null +++ b/matmul/README @@ -0,0 +1,9 @@ +(Perlmutter) Before compiling do this: +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 +export CRAY_ACCEL_TARGET=nvidia80 + +(Perlmutter) Before running do this: +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 + +(Frontier) Before compiling do this: +module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a diff --git a/matmul/fp16_conversion.h b/matmul/fp16_conversion.h new file mode 100644 index 0000000..8ff5d29 --- /dev/null +++ b/matmul/fp16_conversion.h @@ -0,0 +1,116 @@ +// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This code modified from the public domain code here: +// https://gist.github.com/rygorous/2156668 +// The URL above includes more robust conversion routines +// that handle Inf and NaN correctly. +// +// It is recommended to use the more robust versions in production code. 
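//
// The FP32/FP16 unions below expose the sign, exponent, and mantissa bit
// fields of IEEE single- and half-precision values, so the conversions can
// be done with plain integer arithmetic.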
+

// code from: https://github.com/parallel-forall/code-samples/tree/master/posts/mixed-precision

typedef unsigned uint;

union FP32
{
    uint u;
    float f;
    struct
    {
        uint Mantissa : 23;
        uint Exponent : 8;
        uint Sign : 1;
    };
};

union FP16
{
    unsigned short u;
    struct
    {
        uint Mantissa : 10;
        uint Exponent : 5;
        uint Sign : 1;
    };
};

// Approximate solution. This is faster but converts some sNaNs to
// infinity and doesn't round correctly. Handle with care.
static half approx_float_to_half(float fl)
{
    FP32 f32infty = { 255 << 23 };
    FP32 f16max = { (127 + 16) << 23 };
    FP32 magic = { 15 << 23 };
    FP32 expinf = { (255 ^ 31) << 23 };
    uint sign_mask = 0x80000000u;
    FP16 o = { 0 };

    FP32 f = *((FP32*)&fl);

    uint sign = f.u & sign_mask;
    f.u ^= sign;

    if (!(f.f < f32infty.u)) // Inf or NaN
        o.u = f.u ^ expinf.u;
    else
    {
        if (f.f > f16max.f) f.f = f16max.f;
        f.f *= magic.f;
    }

    o.u = f.u >> 13; // Take the mantissa bits
    o.u |= sign >> 16;
    return *((half*)&o);
}

// from half->float code - just for verification.
static float half_to_float(half hf)
{
    FP16 h = *((FP16*)&hf);

    static const FP32 magic = { 113 << 23 };
    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
    FP32 o;

    o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
    uint exp = shifted_exp & o.u; // just the exponent
    o.u += (127 - 15) << 23; // exponent adjust

    // handle exponent special cases
    if (exp == shifted_exp) // Inf/NaN?
        o.u += (128 - 16) << 23; // extra exp adjust
    else if (exp == 0) // Zero/Denormal?
    {
        o.u += 1 << 23; // extra exp adjust
        o.f -= magic.f; // renormalize
    }

    o.u |= (h.u & 0x8000) << 16; // sign bit
    return o.f;
}
diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu
new file mode 100644
index 0000000..8ab42e0
--- /dev/null
+++ b/matmul/frontier/gemm.cu
@@ -0,0 +1,200 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <ctime>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hipblas.h>
+#include "../fp16_conversion.h"

using namespace std;

#define FP16MM

const char* hipblasGetErrorString(hipblasStatus_t status)
{
    switch(status)
    {
        case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS";
        case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED";
        case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED";
        case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE";
        case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH";
        case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR";
        case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED";
        case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR";
        case HIPBLAS_STATUS_NOT_SUPPORTED: return "HIPBLAS_STATUS_NOT_SUPPORTED";
        case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return "HIPBLAS_STATUS_HANDLE_IS_NULLPTR";
        case HIPBLAS_STATUS_INVALID_ENUM: return "HIPBLAS_STATUS_INVALID_ENUM";
        case HIPBLAS_STATUS_UNKNOWN: return "HIPBLAS_STATUS_UNKNOWN";
    }
    return "unknown error";
}

// Convenience function for checking HIP runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
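// Example (illustrative): checkHip(hipMalloc(&d_A, bytes));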
+inline +hipError_t checkHip(hipError_t result) +{ + if (result != hipSuccess) { + fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result)); + assert(result == hipSuccess); + } + return result; +} + +inline +hipblasStatus_t checkCublas(hipblasStatus_t result) +{ + if (result != HIPBLAS_STATUS_SUCCESS) { + fprintf(stderr, "HIP Runtime Error: %s\n", hipblasGetErrorString(result)); + assert(result == HIPBLAS_STATUS_SUCCESS); + } + return result; +} + +// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { + int a=1; + + for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + A[i] = (float)rand()/(float)(RAND_MAX/a); + } +} + +int main(int argc, char ** argv){ + + + int min_m_k_n = 1024; + int max_m_k_n = 16384*2; + int repeats = 100; + int verbose = 1; + +#ifndef FP16MM + cout << "\nhipblasSgemm test result:\n" << endl; +#else + cout << "\nhipblasHgemm test result:\n" << endl; +#endif + + if(verbose) + cout << "running with" + << " min_m_k_n: " << min_m_k_n + << " max_m_k_n: " << max_m_k_n + << " repeats: " << repeats + << endl; + + hipblasStatus_t stat; + hipblasHandle_t handle; + + checkCublas(hipblasCreate(&handle)); + + if(verbose) cout << "allocating device variables" << endl; + + // Allocate 3 arrays on CPU + + float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + + CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); + +#ifndef FP16MM + // Allocate 3 arrays on GPU + float *d_A, *d_B, *d_C; + checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); + checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); + checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); + + checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const float alf = 1.0f; + const float bet = 0.0f; + const float *alpha = &alf; + const float *beta = &bet; + +#else + + uint16_t *d_A, *d_B, *d_C; + checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + half temp_a = approx_float_to_half(h_A[i]); + half temp_b = approx_float_to_half(h_B[i]); + half temp_c = approx_float_to_half(h_C[i]); + d_A[i] = *((uint16_t*) &temp_a); + d_B[i] = *((uint16_t*) &temp_b); + d_C[i] = *((uint16_t*) &temp_c); + } + + int lda, ldb, ldc, m, n, k; + half temp_alf = approx_float_to_half(1.0f); + half temp_bet = approx_float_to_half(0.0f); + const uint16_t alf = *((uint16_t*) &temp_alf); + const uint16_t bet = *((uint16_t*) &temp_bet); + const uint16_t *alpha = &alf; + const uint16_t *beta = &bet; + +#endif + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + + for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){ + double sum = 0.0; + for(int rep = 0; rep < repeats; rep++){ + hipEventRecord(start, 0); + m=n=k=size; + lda = m; + ldb = k; + ldc = m; +#ifndef FP16MM + stat = 
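// column-major GEMM: C = alpha*A*B + beta*C; alpha = 1 and beta = 0 here, so C = A*B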
hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
+#else
+      stat = hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
+#endif
+      hipEventRecord(stop,0);
+      hipEventSynchronize(stop);
+      if(stat != HIPBLAS_STATUS_SUCCESS){
+        cerr << "hipblasSgemmBatched failed" << endl;
+        exit(1);
+      }
+      assert(!hipGetLastError());
+
+      float elapsed;
+      hipEventElapsedTime(&elapsed, start, stop);
+      elapsed /= 1000.0f;
+      if (rep >= 25) {
+        sum += elapsed;
+      }
+    }
+#ifndef FP16MM
+    cout << "float32: size "
+#else
+    cout << "float16: size "
+#endif
+    << size << " average: " << sum/75 << " s "<< endl;
+
+  }
+
+  //Free GPU memory
+  hipFree(d_A);
+  hipFree(d_B);
+  hipFree(d_C);
+
+  // Free CPU memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+
+  return 0;
+}
+
diff --git a/matmul/frontier/makefile b/matmul/frontier/makefile
new file mode 100644
index 0000000..65f6cde
--- /dev/null
+++ b/matmul/frontier/makefile
@@ -0,0 +1,13 @@
+CC = cc
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lhipblas
+
+all: gemm.x
+
+gemm.x: gemm.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o gemm.x gemm.cu
+
+clean:
+	rm -f gemm.x
+
diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu
new file mode 100644
index 0000000..91436d3
--- /dev/null
+++ b/matmul/perlmutter/gemm.cu
@@ -0,0 +1,190 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <cublas_v2.h>
+#include <curand.h>
+#include "../fp16_conversion.h"

using namespace std;

#define FP16MM

const char* cublasGetErrorString(cublasStatus_t status)
{
    switch(status)
    {
        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "unknown error";
}

// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
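// Example (illustrative): checkCuda(cudaMallocManaged(&d_A, bytes));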
+inline +cudaError_t checkCuda(cudaError_t result) +{ + if (result != cudaSuccess) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } + return result; +} + +inline +cublasStatus_t checkCublas(cublasStatus_t result) +{ + if (result != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result)); + assert(result == CUBLAS_STATUS_SUCCESS); + } + return result; +} + +// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { + int a=1; + + for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + A[i] = (float)rand()/(float)(RAND_MAX/a); + } +} + +int main(int argc, char ** argv){ + + + int min_m_k_n = 1024; + int max_m_k_n = 16384*2; + int repeats = 100; + int verbose = 1; + +#ifndef FP16MM + cout << "\ncublasSgemm test result:\n" << endl; +#else + cout << "\ncublasHgemm test result:\n" << endl; +#endif + + if(verbose) + cout << "running with" + << " min_m_k_n: " << min_m_k_n + << " max_m_k_n: " << max_m_k_n + << " repeats: " << repeats + << endl; + + cublasStatus_t stat; + cublasHandle_t handle; + + checkCublas(cublasCreate(&handle)); + + if(verbose) cout << "allocating device variables" << endl; + + // Allocate 3 arrays on CPU + + float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + + CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); + CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); + +#ifndef FP16MM + // Allocate 3 arrays on GPU + float *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); + + checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const float alf = 1.0f; + const float bet = 0.0f; + const float *alpha = &alf; + const float *beta = &bet; + +#else + + __half *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + d_A[i] = approx_float_to_half(h_A[i]); + d_B[i] = approx_float_to_half(h_B[i]); + d_C[i] = approx_float_to_half(h_C[i]); + } + + int lda, ldb, ldc, m, n, k; + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha = &alf; + const __half *beta = &bet; + +#endif + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){ + double sum = 0.0; + for(int rep = 0; rep < repeats; rep++){ + cudaEventRecord(start, 0); + m=n=k=size; + lda = m; + ldb = k; + ldc = m; +#ifndef FP16MM + stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); +#else + stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, 
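// C is read (scaled by beta) and overwritten; with beta = 0 it is a pure output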
d_C, ldc); +#endif + cudaEventRecord(stop,0); + cudaEventSynchronize(stop); + if(stat != CUBLAS_STATUS_SUCCESS){ + cerr << "cublasSgemmBatched failed" << endl; + exit(1); + } + assert(!cudaGetLastError()); + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed /= 1000.0f; + if (rep >= 25) { + sum += elapsed; + } + } +#ifndef FP16MM + cout << "float32: size " +#else + cout << "float16: size " +#endif + << size << " average: " << sum/75 << " s "<< endl; + + } + + //Free GPU memory + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + // Free CPU memory + free(h_A); + free(h_B); + free(h_C); + + return 0; +} diff --git a/matmul/perlmutter/makefile b/matmul/perlmutter/makefile new file mode 100644 index 0000000..abccbf3 --- /dev/null +++ b/matmul/perlmutter/makefile @@ -0,0 +1,12 @@ +CC = cc +CFLAGS = -std=c++11 -O3 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 +LDFLAGS = -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 -lcublas -lcurand + +all: gemm.x + +gemm.x: gemm.cu + ${CC} ${CFLAGS} ${LDFLAGS} -o gemm.x gemm.cu + +clean: + rm -f gemm.x + From 5893fe97670ec33aeb427fd5b4146b18fe4f5b0f Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 23:32:22 -0400 Subject: [PATCH 02/11] perlmutter update --- matmul/perlmutter/gemm.cu | 80 +++++++++++---------------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 91436d3..401c654 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -8,8 +8,6 @@ using namespace std; -#define FP16MM - const char* cublasGetErrorString(cublasStatus_t status) { switch(status) @@ -51,11 +49,11 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(__half *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = (float)rand()/(float)(RAND_MAX/a); + A[i] = approx_float_to_half((float)rand()/(float)(RAND_MAX/a)); } } @@ -67,11 +65,7 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; -#ifndef FP16MM - cout << "\ncublasSgemm test result:\n" << endl; -#else cout << "\ncublasHgemm test result:\n" << endl; -#endif if(verbose) cout << "running with" @@ -89,51 +83,29 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on CPU - float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); - float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); - float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + __half *h_A = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + __half *h_B = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + __half *h_C = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); - -#ifndef FP16MM - // Allocate 3 arrays on GPU - float *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); - - checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * 
max_m_k_n * sizeof(float),cudaMemcpyHostToDevice)); - - int lda, ldb, ldc, m, n, k; - const float alf = 1.0f; - const float bet = 0.0f; - const float *alpha = &alf; - const float *beta = &bet; -#else - - __half *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); - - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { - d_A[i] = approx_float_to_half(h_A[i]); - d_B[i] = approx_float_to_half(h_B[i]); - d_C[i] = approx_float_to_half(h_C[i]); - } - - int lda, ldb, ldc, m, n, k; - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha = &alf; - const __half *beta = &bet; - -#endif + // Allocate 3 arrays on GPU + __half *d_A, *d_B, *d_C; + checkCuda(cudaMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + + checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + + int lda, ldb, ldc, m, n, k; + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha = &alf; + const __half *beta = &bet; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -147,11 +119,9 @@ int main(int argc, char ** argv){ lda = m; ldb = k; ldc = m; -#ifndef FP16MM - stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#else - stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#endif + + stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); + cudaEventRecord(stop,0); cudaEventSynchronize(stop); if(stat != CUBLAS_STATUS_SUCCESS){ @@ -167,11 +137,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } -#ifndef FP16MM - cout << "float32: size " -#else cout << "float16: size " -#endif << size << " average: " << sum/75 << " s "<< endl; } From 6869225c14c8516ec67e70a16a3e473ece0fdbc5 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 20:57:34 -0700 Subject: [PATCH 03/11] fix --- matmul/perlmutter/gemm.cu | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 401c654..1c169c3 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -49,11 +49,11 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(__half *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = approx_float_to_half((float)rand()/(float)(RAND_MAX/a)); + A[i] = (float)rand()/(float)(RAND_MAX/a); } } @@ -83,9 +83,9 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on CPU - __half *h_A = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); - __half *h_B = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); - __half *h_C = (__half *)malloc(max_m_k_n * max_m_k_n * sizeof(__half)); + float 
*h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); + float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float)); CPU_fill_rand(h_A, max_m_k_n, max_m_k_n); CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); @@ -93,13 +93,15 @@ int main(int argc, char ** argv){ // Allocate 3 arrays on GPU __half *d_A, *d_B, *d_C; - checkCuda(cudaMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); - checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(__half),cudaMemcpyHostToDevice)); + for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + d_A[i] = approx_float_to_half(h_A[i]); + d_B[i] = approx_float_to_half(h_B[i]); + d_C[i] = approx_float_to_half(h_C[i]); + } int lda, ldb, ldc, m, n, k; const __half alf = approx_float_to_half(1.0); From 52892775919b00b51e78a9d23fbc544cb58eaa88 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Fri, 12 Apr 2024 22:41:58 -0700 Subject: [PATCH 04/11] bfloat16 and gemm ex --- matmul/perlmutter/gemm.cu | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index 1c169c3..c602894 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -65,8 +65,6 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; - cout << "\ncublasHgemm test result:\n" << endl; - if(verbose) cout << "running with" << " min_m_k_n: " << min_m_k_n @@ -92,22 +90,22 @@ int main(int argc, char ** argv){ CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); // Allocate 3 arrays on GPU - __half *d_A, *d_B, *d_C; - checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half))); - checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half))); + nv_bfloat16 *d_A, *d_B, *d_C; + checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); + checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); + checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { - d_A[i] = approx_float_to_half(h_A[i]); - d_B[i] = approx_float_to_half(h_B[i]); - d_C[i] = approx_float_to_half(h_C[i]); + d_A[i] = __float2bfloat16(h_A[i]); + d_B[i] = __float2bfloat16(h_B[i]); + d_C[i] = __float2bfloat16(h_C[i]); } int lda, ldb, ldc, m, n, k; - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha = &alf; - const __half *beta = &bet; + const nv_bfloat16 alf = __float2bfloat16(1.0f); + const nv_bfloat16 bet = __float2bfloat16(0.0f); + const nv_bfloat16 *alpha = &alf; + const nv_bfloat16 *beta = &bet; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -122,13 +120,13 @@ int main(int argc, char ** argv){ ldb = k; ldc = m; - stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, 
d_B, ldb, beta, d_C, ldc); + stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, CUDA_R_16BF, lda, d_B, CUDA_R_16BF, ldb, beta, d_C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); cudaEventRecord(stop,0); cudaEventSynchronize(stop); if(stat != CUBLAS_STATUS_SUCCESS){ - cerr << "cublasSgemmBatched failed" << endl; - exit(1); + fprintf(stderr, "CuBLAS Error: %s\n", cublasGetErrorString(stat)); + exit(1); } assert(!cudaGetLastError()); @@ -139,7 +137,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } - cout << "float16: size " + cout << "bfloat16: size " << size << " average: " << sum/75 << " s "<< endl; } From ed4c177bb0995a7187c3ee146e7b61e8712ae7fd Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sat, 13 Apr 2024 11:36:55 -0400 Subject: [PATCH 05/11] bfloat16 and gemm_ex --- matmul/frontier/gemm.cu | 110 ++++++++++++++++----------------------- matmul/frontier/makefile | 2 +- 2 files changed, 45 insertions(+), 67 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 8ab42e0..03fa9aa 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -6,28 +6,29 @@ #include #include #include +#include #include "../fp16_conversion.h" using namespace std; -#define FP16MM - -const char* hipblasGetErrorString(hipblasStatus_t status) +const char* rocblasGetErrorString(rocblas_status status) { switch(status) { - case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS"; - case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED"; - case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED"; - case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE"; - case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH"; - case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR"; - case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED"; - case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR"; - case HIPBLAS_STATUS_NOT_SUPPORTED: return "HIPBLAS_STATUS_NOT_SUPPORTED"; - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return "HIPBLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return "HIPBLAS_STATUS_INVALID_ENUM"; - case HIPBLAS_STATUS_UNKNOWN: return "HIPBLAS_STATUS_UNKNOWN"; + case rocblas_status_success: return "rocblas_status_success"; + case rocblas_status_invalid_handle: return "rocblas_status_invalid_handle"; + case rocblas_status_not_implemented: return "rocblas_status_not_implemented"; + case rocblas_status_invalid_pointer: return "rocblas_status_invalid_pointer"; + case rocblas_status_invalid_size: return "rocblas_status_invalid_size"; + case rocblas_status_memory_error: return "rocblas_status_memory_error"; + case rocblas_status_internal_error: return "rocblas_status_internal_error"; + case rocblas_status_perf_degraded: return "rocblas_status_perf_degraded"; + case rocblas_status_size_query_mismatch: return "rocblas_status_size_query_mismatch"; + case rocblas_status_size_increased: return "rocblas_status_size_increased"; + case rocblas_status_size_unchanged: return "rocblas_status_size_unchanged"; + case rocblas_status_invalid_value: return "rocblas_status_invalid_value"; + case rocblas_status_continue: return "rocblas_status_continue"; + case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail"; } return "unknown error"; } @@ -45,11 +46,11 @@ hipError_t checkHip(hipError_t result) } inline -hipblasStatus_t checkCublas(hipblasStatus_t result) +rocblas_status 
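// renamed from checkCublas: same assert-on-failure wrapper, now for rocBLAS status codes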
checkRocblas(rocblas_status result) { - if (result != HIPBLAS_STATUS_SUCCESS) { - fprintf(stderr, "HIP Runtime Error: %s\n", hipblasGetErrorString(result)); - assert(result == HIPBLAS_STATUS_SUCCESS); + if (result != rocblas_status_success) { + fprintf(stderr, "ROCM Runtime Error: %s\n", rocblasGetErrorString(result)); + assert(result == rocblas_status_success); } return result; } @@ -59,8 +60,8 @@ void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int a=1; for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ - A[i] = (float)rand()/(float)(RAND_MAX/a); - } + A[i] = (float)rand()/(float)(RAND_MAX/a); + } } int main(int argc, char ** argv){ @@ -71,12 +72,6 @@ int main(int argc, char ** argv){ int repeats = 100; int verbose = 1; -#ifndef FP16MM - cout << "\nhipblasSgemm test result:\n" << endl; -#else - cout << "\nhipblasHgemm test result:\n" << endl; -#endif - if(verbose) cout << "running with" << " min_m_k_n: " << min_m_k_n @@ -84,10 +79,10 @@ int main(int argc, char ** argv){ << " repeats: " << repeats << endl; - hipblasStatus_t stat; - hipblasHandle_t handle; + rocblas_status stat; + rocblas_handle handle; - checkCublas(hipblasCreate(&handle)); + checkRocblas(rocblas_create_handle(&handle)); if(verbose) cout << "allocating device variables" << endl; @@ -101,29 +96,18 @@ int main(int argc, char ** argv){ CPU_fill_rand(h_B, max_m_k_n, max_m_k_n); CPU_fill_rand(h_C, max_m_k_n, max_m_k_n); -#ifndef FP16MM // Allocate 3 arrays on GPU - float *d_A, *d_B, *d_C; - checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float))); - checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float))); - checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float))); - - checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),hipMemcpyHostToDevice)); - - int lda, ldb, ldc, m, n, k; - const float alf = 1.0f; - const float bet = 0.0f; - const float *alpha = &alf; - const float *beta = &bet; - -#else + uint16_t *d_A, *d_B, *d_C, *d_D; + checkHip(hipMalloc(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + checkHip(hipMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - uint16_t *d_A, *d_B, *d_C; - checkHip(hipMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + // rocblas_gemm_ex requries D array too + checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); + + checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); + checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); @@ -134,7 +118,7 @@ int main(int argc, char ** argv){ d_C[i] = *((uint16_t*) &temp_c); } - int lda, ldb, ldc, m, n, k; + int lda, ldb, ldd, ldc, m, n, k; half temp_alf = approx_float_to_half(1.0f); half temp_bet = approx_float_to_half(0.0f); const uint16_t alf = *((uint16_t*) &temp_alf); @@ -142,8 +126,6 @@ int main(int argc, char ** argv){ const uint16_t *alpha = &alf; const uint16_t *beta = &bet; -#endif - 
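// HIP events bracket each GEMM call to time it on the GPU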
hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); @@ -156,15 +138,15 @@ int main(int argc, char ** argv){ lda = m; ldb = k; ldc = m; -#ifndef FP16MM - stat = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#else - stat = hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); -#endif + ldd = m; + + + stat = rocblas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_none, m, n, k, alpha, d_A, rocblas_datatype_bf16_r, lda, d_B, rocblas_datatype_bf16_r, ldb, beta, d_C, rocblas_datatype_bf16_r, ldc, d_D, rocblas_datatype_bf16_r, ldd, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, 0); + hipEventRecord(stop,0); hipEventSynchronize(stop); - if(stat != HIPBLAS_STATUS_SUCCESS){ - cerr << "hipblasSgemmBatched failed" << endl; + if(stat != rocblas_status_success){ + fprintf(stderr, "RocBLAS Error: %s\n", rocblasGetErrorString(stat)); exit(1); } assert(!hipGetLastError()); @@ -176,11 +158,7 @@ int main(int argc, char ** argv){ sum += elapsed; } } -#ifndef FP16MM - cout << "float32: size " -#else - cout << "float16: size " -#endif + cout << "bfloat16: size " << size << " average: " << sum/75 << " s "<< endl; } diff --git a/matmul/frontier/makefile b/matmul/frontier/makefile index 65f6cde..3324a22 100644 --- a/matmul/frontier/makefile +++ b/matmul/frontier/makefile @@ -1,7 +1,7 @@ CC = cc INC = -I${ROCM_PATH}/include CFLAGS = -std=c++11 -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lhipblas +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrocblas all: gemm.x From 22e8aad7a147744f5bc1b09792a4d5e85dee9fc1 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 12:34:23 -0400 Subject: [PATCH 06/11] rocm 6.0 --- matmul/README | 4 +++- matmul/frontier/gemm.cu | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/matmul/README b/matmul/README index 962c0eb..b09f8f8 100644 --- a/matmul/README +++ b/matmul/README @@ -6,4 +6,6 @@ export CRAY_ACCEL_TARGET=nvidia80 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64 (Frontier) Before compiling do this: -module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a +module load PrgEnv-cray amd-mixed/6.0 craype-accel-amd-gfx90a +module load cpe/23.05 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 03fa9aa..12f63a8 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -1,3 +1,5 @@ +#define __HIP_PLATFORM_AMD__ + #include #include #include @@ -5,8 +7,7 @@ #include #include #include -#include -#include +#include #include "../fp16_conversion.h" using namespace std; @@ -29,6 +30,8 @@ const char* rocblasGetErrorString(rocblas_status status) case rocblas_status_invalid_value: return "rocblas_status_invalid_value"; case rocblas_status_continue: return "rocblas_status_continue"; case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail"; + case rocblas_status_excluded_from_build: return "rocblas_status_excluded_from_build"; + case rocblas_status_arch_mismatch: return "rocblas_status_arch_mismatch"; } return "unknown error"; } @@ -68,7 +71,7 @@ int main(int argc, char ** argv){ int min_m_k_n = 1024; - int max_m_k_n = 16384*2; + int max_m_k_n = 16384*4; int repeats = 100; int verbose = 1; From 28ae13738a9a8ecdbab932448be08092d68dde1c Mon Sep 17 
00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 22:09:22 -0400 Subject: [PATCH 07/11] don't need the memcpys --- matmul/frontier/gemm.cu | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 12f63a8..5fafeb7 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -71,7 +71,9 @@ int main(int argc, char ** argv){ int min_m_k_n = 1024; - int max_m_k_n = 16384*4; + int max_m_k_n = 16384*2; + // int min_m_k_n = 50000; + // int max_m_k_n = 50000; int repeats = 100; int verbose = 1; @@ -105,13 +107,9 @@ int main(int argc, char ** argv){ checkHip(hipMalloc(&d_B, max_m_k_n * max_m_k_n * sizeof(uint16_t))); checkHip(hipMalloc(&d_C, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - // rocblas_gemm_ex requries D array too + // rocblas_gemm_ex requires D array too checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - checkHip(hipMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - checkHip(hipMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(uint16_t),hipMemcpyHostToDevice)); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); half temp_b = approx_float_to_half(h_B[i]); From 57286616116de647967f2452cbb68970fd03ac94 Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 23:22:37 -0400 Subject: [PATCH 08/11] fixing integer overflow --- matmul/frontier/gemm.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 5fafeb7..002fbd9 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -59,21 +59,18 @@ rocblas_status checkRocblas(rocblas_status result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { - int a=1; +void CPU_fill_rand(float *A, unsigned long nr_rows_A, unsigned long nr_cols_A) { + int a=1; - for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + for (unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ A[i] = (float)rand()/(float)(RAND_MAX/a); } } int main(int argc, char ** argv){ - - int min_m_k_n = 1024; - int max_m_k_n = 16384*2; - // int min_m_k_n = 50000; - // int max_m_k_n = 50000; + unsigned long long min_m_k_n = 1024; + unsigned long long max_m_k_n = 16384*4; int repeats = 100; int verbose = 1; @@ -110,7 +107,7 @@ int main(int argc, char ** argv){ // rocblas_gemm_ex requires D array too checkHip(hipMalloc(&d_D, max_m_k_n * max_m_k_n * sizeof(uint16_t))); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + for (unsigned long long i = 0; i < max_m_k_n * max_m_k_n; i++) { half temp_a = approx_float_to_half(h_A[i]); half temp_b = approx_float_to_half(h_B[i]); half temp_c = approx_float_to_half(h_C[i]); From fba25c6aa230481423063c2947e0076afe9ab90c Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sun, 14 Apr 2024 23:27:31 -0400 Subject: [PATCH 09/11] more --- matmul/frontier/gemm.cu | 2 +- matmul/perlmutter/gemm.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index 002fbd9..c271fd4 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -59,7 +59,7 @@ rocblas_status checkRocblas(rocblas_status result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, unsigned long nr_rows_A, 
unsigned long nr_cols_A) { +void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr_cols_A) { int a=1; for (unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ diff --git a/matmul/perlmutter/gemm.cu b/matmul/perlmutter/gemm.cu index c602894..ae52df9 100644 --- a/matmul/perlmutter/gemm.cu +++ b/matmul/perlmutter/gemm.cu @@ -49,10 +49,10 @@ cublasStatus_t checkCublas(cublasStatus_t result) } // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU -void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { +void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr_cols_A) { int a=1; - for(int i = 0; i < nr_rows_A * nr_cols_A; i++){ + for(unsigned long long i = 0; i < nr_rows_A * nr_cols_A; i++){ A[i] = (float)rand()/(float)(RAND_MAX/a); } } @@ -60,8 +60,8 @@ void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) { int main(int argc, char ** argv){ - int min_m_k_n = 1024; - int max_m_k_n = 16384*2; + unsigned long long min_m_k_n = 1024; + unsigned long long max_m_k_n = 16384*2; int repeats = 100; int verbose = 1; @@ -95,7 +95,7 @@ int main(int argc, char ** argv){ checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(nv_bfloat16))); - for (int i = 0; i < max_m_k_n * max_m_k_n; i++) { + for (unsigned long long i = 0; i < max_m_k_n * max_m_k_n; i++) { d_A[i] = __float2bfloat16(h_A[i]); d_B[i] = __float2bfloat16(h_B[i]); d_C[i] = __float2bfloat16(h_C[i]); From 648bf183ea97e56a8818faad596691d04e7c58fa Mon Sep 17 00:00:00 2001 From: adityaranjan Date: Mon, 22 Apr 2024 16:00:55 -0400 Subject: [PATCH 10/11] license --- matmul/LICENSE | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 matmul/LICENSE diff --git a/matmul/LICENSE b/matmul/LICENSE new file mode 100644 index 0000000..f32dc2c --- /dev/null +++ b/matmul/LICENSE @@ -0,0 +1,43 @@ +MIT License + +Copyright (c) 2017 He Ma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Copyright (c) 2024, Parallel Software and Systems Group, University of +Maryland. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. From a90f80726d6c1932f39ec22c7e871c7de3f80f0e Mon Sep 17 00:00:00 2001 From: Aditya Ranjan Date: Sat, 6 Jul 2024 10:37:09 -0400 Subject: [PATCH 11/11] minor --- matmul/frontier/gemm.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matmul/frontier/gemm.cu b/matmul/frontier/gemm.cu index c271fd4..43151ba 100644 --- a/matmul/frontier/gemm.cu +++ b/matmul/frontier/gemm.cu @@ -69,8 +69,8 @@ void CPU_fill_rand(float *A, unsigned long long nr_rows_A, unsigned long long nr int main(int argc, char ** argv){ - unsigned long long min_m_k_n = 1024; - unsigned long long max_m_k_n = 16384*4; + unsigned long long min_m_k_n = 16384*2; + unsigned long long max_m_k_n = 16384*2; int repeats = 100; int verbose = 1;
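
A note on reading the output of either benchmark: each size prints the average
kernel time in seconds over the last 75 of 100 repetitions. Since a square GEMM
with m = n = k = size performs 2*size^3 floating-point operations, that time
converts directly to throughput. The sketch below is illustrative only (it is
not part of the patches; the file name and the example numbers are made up):

    // tflops.cpp -- turn a gemm.x timing into TFLOP/s
    #include <cstdio>

    // C = alpha*A*B + beta*C with m = n = k = size costs
    // 2*size^3 flops (size^3 multiplies and size^3 adds).
    static double gemm_tflops(unsigned long long size, double avg_seconds) {
        double flops = 2.0 * (double)size * (double)size * (double)size;
        return flops / avg_seconds / 1e12;
    }

    int main() {
        unsigned long long size = 16384ULL * 2; // the size both benchmarks settle on
        double avg = 0.25;                      // seconds, as printed by gemm.x (example value)
        std::printf("size %llu: %.2f TFLOP/s\n", size, gemm_tflops(size, avg));
        return 0;
    }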