OpenMPTarget backend now enabled; time to debug correctness
cjknight committed Oct 3, 2023
1 parent 3456905 commit d3e9989
Showing 13 changed files with 352 additions and 28 deletions.
3 changes: 2 additions & 1 deletion examples/gpu/polymer_async/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -9,7 +11,6 @@
 #lib.logger.TIMER_LEVEL=lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
3 changes: 2 additions & 1 deletion examples/gpu/polymer_async/1_6-31g_inp_gpu_simple.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -6,7 +8,6 @@
 from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
 from pyscf.mcscf import avas
 
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_init()
 
 lib.logger.TIMER_LEVEL=lib.logger.INFO
3 changes: 2 additions & 1 deletion examples/gpu/polymer_sync/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -14,7 +16,6 @@
 lib.logger.TIMER_LEVEL = lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
80 changes: 80 additions & 0 deletions gpu/mini-apps/openmp/c++/Makefile
@@ -0,0 +1,80 @@
# Use NVIDIA compilers w/ ALCF provided OpenMPI
#
# module load nvhpc-nompi
#
# Definition of MACROS

PATH_TO_NVCC = $(shell which nvcc)
PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev)
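# (rev | cut -d '/' -f 4- | rev drops the last three path components, i.e. compilers/bin/nvcc, leaving the NVHPC install root)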

$(info PATH_TO_NVHPC= [${PATH_TO_NVHPC}])

CUDA = ${PATH_TO_NVHPC}/cuda

PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev)

$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)])

PYTHON_INC=$(shell python -m pybind11 --includes)
PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags)
PYTHON_LIB+=-lpython3

CXX = CC
CXXFLAGS = -g -O3 -std=c++0x
CXXFLAGS += -D_SINGLE_PRECISION
CXXFLAGS += -mp=gpu -gpu=cc80,cuda11.0
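# -mp=gpu enables OpenMP target offload in nvhpc; -gpu=cc80,cuda11.0 targets compute capability 8.0 (the A100s on Polaris) against CUDA 11.0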

CXXFLAGS += $(PYTHON_INC)
CXXFLAGS += -I../../src -D_USE_GPU -D_GPU_OPENMP

CUDA_CXX = $(CXX)
CUDA_CXXFLAGS = $(CXXFLAGS)

CPP = cpp -P -traditional
CPPFLAGS =

LD = $(CXX)
LIB = ../../src/libgpu.so $(PYTHON_LIB)
LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran

BINROOT=./
EX=vecadd
SHELL=/bin/sh

# -- subset of src files with cuda kernels
CUDA_SRC =
CUDA_OBJ = $(CUDA_SRC:.cpp=.o)

SRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
INC = $(wildcard *.h)
OBJ = $(SRC:.cpp=.o)

#
# -- target : Dependencies
# -- Rule to create target

$(EX): $(OBJ) $(CUDA_OBJ)
	$(LD) -o $@ $(CXXFLAGS) $(OBJ) $(CUDA_OBJ) $(LIB)

####################################################################

$(OBJ): %.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $<

$(CUDA_OBJ): %.o: %.cpp
	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@

#
# -- Remove *.o and *~ from the directory
clean:
	rm -f *.o *~
#
# -- Remove *.o, *~, and executable from the directory
realclean:
	rm -f *.o *~ ./$(EX)

#
# -- Simple dependencies

$(OBJ) : $(INC)
$(CUDA_OBJ) : $(INC)
130 changes: 130 additions & 0 deletions gpu/mini-apps/openmp/c++/main.cpp
@@ -0,0 +1,130 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cassert>
#include "mpi.h"

#include <omp.h>

#include "pm.h"

#define _N 1024
#define _LOCAL_SIZE 64

#ifdef _SINGLE_PRECISION
typedef float real_t;
#else
typedef double real_t;
#endif

using namespace PM_NS;

// ----------------------------------------------------------------

void _vecadd(real_t * a, real_t * b, real_t * c, int N)
{
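// a, b, and c are device pointers (allocated with pm->dev_malloc below), so is_device_ptr
// tells the runtime to use them directly rather than mapping host buffers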

#pragma omp target teams distribute parallel for is_device_ptr(a, b, c)
for(int i=0; i<N; ++i) {
c[i] = a[i] + b[i];
}

}

// ----------------------------------------------------------------

int main( int argc, char* argv[] )
{
MPI_Init(&argc, &argv);

int me,nranks;
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &me);

const int N = _N;

class PM * pm = new PM();

real_t * a = (real_t*) malloc(N*sizeof(real_t));
real_t * b = (real_t*) malloc(N*sizeof(real_t));
real_t * c = (real_t*) malloc(N*sizeof(real_t));

// Initialize host
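// a[i] + b[i] = sin^2(i) + cos^2(i) = 1, so every element of c should come back as 1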
for(int i=0; i<N; ++i) {
a[i] = sin(i)*sin(i);
b[i] = cos(i)*cos(i);
c[i] = -1.0;
}

int num_devices = pm->dev_num_devices();

if(me == 0) {
printf("# of devices= %i\n",num_devices);
pm->dev_properties(num_devices);
}

// Device ID

int device_id = me % num_devices;
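// (GPU selection is presumably handled by set_affinity_gpu_polaris.sh via CUDA_VISIBLE_DEVICES rather than in code)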
for(int i=0; i<nranks; ++i) {
if(i == me) {
printf("Rank %i running on GPU %i!\n",me,device_id);
}
MPI_Barrier(MPI_COMM_WORLD);
}

#ifdef _SINGLE_PRECISION
if(me == 0) printf("Using single-precision\n\n");
#else
if(me == 0) printf("Using double-precision\n\n");
#endif

// Create device buffers and transfer data to device

real_t * d_a = (real_t *) pm->dev_malloc(N*sizeof(real_t));
real_t * d_b = (real_t *) pm->dev_malloc(N*sizeof(real_t));
real_t * d_c = (real_t *) pm->dev_malloc(N*sizeof(real_t));

pm->dev_push(d_a, a, N);
pm->dev_push(d_b, b, N);
pm->dev_push(d_c, c, N);

// Execute kernel

_vecadd(d_a, d_b, d_c, N);

// Transfer data from device

pm->dev_pull(d_c, c, N);

// Check result on host

double diff = 0;
for(int i=0; i<N; ++i) diff += (double) c[i];
diff = diff/(double) N - 1.0;

double diffsq = diff * diff;

double sum = 0.0;
MPI_Reduce(&diffsq, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); // sum squared errors across ranks

if(me == 0) {
if(sum < 1e-6) printf("\nResult is CORRECT!! :)\n");
else printf("\nResult is WRONG!! :(\n");
}

// Clean up

free(a);
free(b);
free(c);

pm->dev_free(d_a);
pm->dev_free(d_b);
pm->dev_free(d_c);

delete pm;

MPI_Finalize();
}
8 changes: 8 additions & 0 deletions gpu/mini-apps/openmp/c++/set_affinity_gpu_polaris.sh
@@ -0,0 +1,8 @@
#!/bin/bash
num_gpus=4
# need to assign GPUs in reverse order due to topology
# See Polaris Device Affinity Information https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview/index.html
gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus}))
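# e.g. with num_gpus=4, local ranks 0,1,2,3 map to GPUs 3,2,1,0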
export CUDA_VISIBLE_DEVICES=$gpu
#echo "RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}"
exec "$@"
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/c++/submit.sh
@@ -0,0 +1,20 @@
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -l walltime=0:30:00
#PBS -q debug
#PBS -A Catalyst
#PBS -l filesystems=home:grand:eagle

#cd ${PBS_O_WORKDIR}

# MPI example w/ 8 MPI ranks per node spread evenly across cores
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=8
NDEPTH=8
NTHREADS=1

NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"

mpiexec -n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth ./vecadd
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/python/main.py
@@ -0,0 +1,20 @@
from mrh.my_pyscf.gpu import libgpu

import pyscf # -- importing pyscf here contaminates a path, preventing an OpenMP runtime that supports GPUs from being picked up
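# (presumably the pyscf import loads a CPU-only OpenMP runtime ahead of NVHPC's libnvomp;
#  running under LD_DEBUG=libs would show which runtime is actually picked up)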
from gpu4pyscf import patch_pyscf

from pyscf import gto, scf, tools, mcscf, lib
from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
from pyscf.mcscf import avas

gpu = libgpu.libgpu_create_device()

num_gpus = libgpu.libgpu_get_num_devices(gpu)
print("num_gpus= ", num_gpus)

libgpu.libgpu_dev_properties(gpu, num_gpus)

gpu_id = 0
libgpu.libgpu_set_device(gpu, gpu_id)

libgpu.libgpu_destroy_device(gpu)
37 changes: 37 additions & 0 deletions gpu/mini-apps/openmp/python/submit_polaris.sh
@@ -0,0 +1,37 @@
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -l walltime=0:30:00
#PBS -q debug
#PBS -A Catalyst
#PBS -l filesystems=home:grand:eagle

cd /lus/grand/projects/LASSCF_gpudev/knight/soft/mrh/gpu/mini-apps/openmp/python

# MPI example w/ 1 MPI rank per node and 32 OpenMP threads per rank
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=1
NTHREADS=32
NDEPTH=${NTHREADS}

NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"

#MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} "
MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth "

OMP_ARGS=" "
#OMP_ARGS="--env OMP_NUM_THREADS=${NTHREADS} "
OMP_ARGS=" --env OMP_NUM_THREADS=${NTHREADS} --env OMP_PROC_BIND=spread --env OMP_PLACES=threads "
#OMP_ARGS+=" --env OMP_WAIT_POLICY=ACTIVE "

INPUT="main.py"

#export CUDA_VISIBLE_DEVICES=0

EXE="python ${INPUT} "

#python -m cProfile -o out.prof ${INPUT}
#{ time ${EXE} ;} 2>&1 | tee profile.txt
{ time mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} ;} 2>&1 | tee profile.txt
#nsys profile --stats=true -t cuda,nvtx mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} 2>&1 | tee profile.txt
13 changes: 11 additions & 2 deletions gpu/src/arch/polaris-nvhpc-openmp
@@ -2,6 +2,13 @@ INSTALL = ../../my_pyscf/gpu
 
 PYTHON_INC=$(shell python -m pybind11 --includes)
 
+PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev)
+
+$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)])
+
+PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags)
+PYTHON_LIB+=-lpython3
+
 PATH_TO_NVCC = $(shell which nvcc)
 PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev)
 
@@ -27,9 +34,11 @@ CXXFLAGS += -D_CUDA_NVTX
 LD = $(CXX)
 LDFLAGS = $(GPU_FLAGS) -fPIC -shared
 LIB = -lstdc++
+LIB += $(PYTHON_LIB)
 LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran
 LIB += -L$(PATH_TO_NVHPC)/math_libs/lib64 -lcublas -lcublasLt
 LIB += -L$(PATH_TO_NVHPC)/cuda/lib64 -lnvToolsExt
 #LIB += -lnvomp
-#LIB += -L$(PATH_TO_NVHPC)/compilers/lib
-#LIB += -lacchost -laccdevaux -laccdevice -lcudadevice -latomic -lnvhpcatm -lnvf -lnvomp -lnvcpumath-avx2 -lnvc -lnvcpumath -lcudart -lcuda
+
+LIB += -L$(PATH_TO_NVHPC)/compilers/lib
+LIB += -lacchost -laccdevaux -laccdevice -lcudadevice -latomic -lnvhpcatm -lnvf -lnvomp -lnvcpumath-avx2 -lnvc -lnvcpumath -lcudart -lcuda
2 changes: 1 addition & 1 deletion gpu/src/device_openmp.cpp
@@ -94,7 +94,7 @@ void Device::init_get_jk(py::array_t<double> _eri1, py::array_t<double> _dmtril,
   nvtxRangePushA("Create handle");
 #endif
   cublasCreate(&handle);
-  _OMP_CHECK_ERRORS();
+  _CUDA_CHECK_ERRORS();
 #ifdef _CUDA_NVTX
   nvtxRangePop();
 #endif