OpenMPTarget backend now enabled; time to debug correctness
cjknight committed Oct 3, 2023
1 parent 3456905 commit d3e9989
Showing 13 changed files with 352 additions and 28 deletions.
3 changes: 2 additions & 1 deletion examples/gpu/polymer_async/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -9,7 +11,6 @@
 #lib.logger.TIMER_LEVEL=lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
3 changes: 2 additions & 1 deletion examples/gpu/polymer_async/1_6-31g_inp_gpu_simple.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -6,7 +8,6 @@
 from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
 from pyscf.mcscf import avas
 
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_init()
 
 lib.logger.TIMER_LEVEL=lib.logger.INFO
3 changes: 2 additions & 1 deletion examples/gpu/polymer_sync/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -14,7 +16,6 @@
 lib.logger.TIMER_LEVEL = lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
80 changes: 80 additions & 0 deletions gpu/mini-apps/openmp/c++/Makefile
@@ -0,0 +1,80 @@
# Use NVIDIA compilers w/ ALCF provided OpenMPI
#
# module load nvhpc-nompi
#
# Definition of MACROS

PATH_TO_NVCC = $(shell which nvcc)
PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev)
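# (rev | cut -d '/' -f 4- | rev drops the last three path components, i.e. compilers/bin/nvcc, leaving the NVHPC install root)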

$(info PATH_TO_NVHPC= [${PATH_TO_NVHPC}])

CUDA = ${PATH_TO_NVHPC}/cuda

PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev)

$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)])

PYTHON_INC=$(shell python -m pybind11 --includes)
PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags)
PYTHON_LIB+=-lpython3

CXX = CC
CXXFLAGS = -g -O3 -std=c++0x
CXXFLAGS += -D_SINGLE_PRECISION
CXXFLAGS += -mp=gpu -gpu=cc80,cuda11.0
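# -mp=gpu enables OpenMP target offload in nvhpc; -gpu=cc80,cuda11.0 targets compute capability 8.0 (the A100s on Polaris) against CUDA 11.0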

CXXFLAGS += $(PYTHON_INC)
CXXFLAGS += -I../../src -D_USE_GPU -D_GPU_OPENMP

CUDA_CXX = $(CXX)
CUDA_CXXFLAGS = $(CXXFLAGS)

CPP = cpp -P -traditional
CPPFLAGS =

LD = $(CXX)
LIB = ../../src/libgpu.so $(PYTHON_LIB)
LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran

BINROOT=./
EX=vecadd
SHELL=/bin/sh

# -- subset of src files with cuda kernels
CUDA_SRC =
CUDA_OBJ = $(CUDA_SRC:.cpp=.o)

SRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
INC = $(wildcard *.h)
OBJ = $(SRC:.cpp=.o)

#
# -- target : Dependencies
# -- Rule to create target

$(EX): $(OBJ) $(CUDA_OBJ)
	$(LD) -o $@ $(CXXFLAGS) $(OBJ) $(CUDA_OBJ) $(LIB)

####################################################################

$(OBJ): %.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $<

$(CUDA_OBJ): %.o: %.cpp
	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@

#
# -- Remove *.o and *~ from the directory
clean:
	rm -f *.o *~
#
# -- Remove *.o, *~, and executable from the directory
realclean:
	rm -f *.o *~ ./$(EX)

#
# -- Simple dependencies

$(OBJ) : $(INC)
$(CUDA_OBJ) : $(INC)
130 changes: 130 additions & 0 deletions gpu/mini-apps/openmp/c++/main.cpp
@@ -0,0 +1,130 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cassert>
#include "mpi.h"

#include <omp.h>

#include "pm.h"

#define _N 1024
#define _LOCAL_SIZE 64

#ifdef _SINGLE_PRECISION
typedef float real_t;
#else
typedef double real_t;
#endif

using namespace PM_NS;

// ----------------------------------------------------------------

void _vecadd(real_t * a, real_t * b, real_t * c, int N)
{
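// a, b, and c are device pointers (allocated with pm->dev_malloc below), so is_device_ptr
// tells the runtime to use them directly rather than mapping host buffers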

#pragma omp target teams distribute parallel for is_device_ptr(a, b, c)
for(int i=0; i<N; ++i) {
c[i] = a[i] + b[i];
}

}

// ----------------------------------------------------------------

int main( int argc, char* argv[] )
{
MPI_Init(&argc, &argv);

int me,nranks;
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &me);

const int N = _N;

class PM * pm = new PM();

real_t * a = (real_t*) malloc(N*sizeof(real_t));
real_t * b = (real_t*) malloc(N*sizeof(real_t));
real_t * c = (real_t*) malloc(N*sizeof(real_t));

// Initialize host
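// a[i] + b[i] = sin^2(i) + cos^2(i) = 1, so every element of c should come back as 1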
for(int i=0; i<N; ++i) {
a[i] = sin(i)*sin(i);
b[i] = cos(i)*cos(i);
c[i] = -1.0;
}

int num_devices = pm->dev_num_devices();

if(me == 0) {
printf("# of devices= %i\n",num_devices);
pm->dev_properties(num_devices);
}

// Device ID

int device_id = me % num_devices;
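// (GPU selection is presumably handled by set_affinity_gpu_polaris.sh via CUDA_VISIBLE_DEVICES rather than in code)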
for(int i=0; i<nranks; ++i) {
if(i == me) {
printf("Rank %i running on GPU %i!\n",me,device_id);
}
MPI_Barrier(MPI_COMM_WORLD);
}

#ifdef _SINGLE_PRECISION
if(me == 0) printf("Using single-precision\n\n");
#else
if(me == 0) printf("Using double-precision\n\n");
#endif

// Create device buffers and transfer data to device

real_t * d_a = (real_t *) pm->dev_malloc(N*sizeof(real_t));
real_t * d_b = (real_t *) pm->dev_malloc(N*sizeof(real_t));
real_t * d_c = (real_t *) pm->dev_malloc(N*sizeof(real_t));

pm->dev_push(d_a, a, N);
pm->dev_push(d_b, b, N);
pm->dev_push(d_c, c, N);

// Execute kernel

_vecadd(d_a, d_b, d_c, N);

// Transfer data from device

pm->dev_pull(d_c, c, N);

// Check result on host

double diff = 0;
for(int i=0; i<N; ++i) diff += (double) c[i];
diff = diff/(double) N - 1.0;

double diffsq = diff * diff;

double sum = 0.0;
MPI_Reduce(&diffsq, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); // sum squared errors across ranks

if(me == 0) {
if(sum < 1e-6) printf("\nResult is CORRECT!! :)\n");
else printf("\nResult is WRONG!! :(\n");
}

// Clean up

free(a);
free(b);
free(c);

pm->dev_free(d_a);
pm->dev_free(d_b);
pm->dev_free(d_c);

delete pm;

MPI_Finalize();
}
8 changes: 8 additions & 0 deletions gpu/mini-apps/openmp/c++/set_affinity_gpu_polaris.sh
@@ -0,0 +1,8 @@
#!/bin/bash
num_gpus=4
# need to assign GPUs in reverse order due to topology
# See Polaris Device Affinity Information https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview/index.html
gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus}))
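# e.g. with num_gpus=4, local ranks 0,1,2,3 map to GPUs 3,2,1,0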
export CUDA_VISIBLE_DEVICES=$gpu
#echo "RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}"
exec "$@"
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/c++/submit.sh
@@ -0,0 +1,20 @@
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -l walltime=0:30:00
#PBS -q debug
#PBS -A Catalyst
#PBS -l filesystems=home:grand:eagle

#cd ${PBS_O_WORKDIR}

# MPI example w/ 8 MPI ranks per node spread evenly across cores
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=8
NDEPTH=8
NTHREADS=1

NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"

mpiexec -n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth ./vecadd
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/python/main.py
@@ -0,0 +1,20 @@
from mrh.my_pyscf.gpu import libgpu

import pyscf # -- importing pyscf here contaminates a path, preventing an OpenMP runtime that supports GPUs from being picked up
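# (presumably the pyscf import loads a CPU-only OpenMP runtime ahead of NVHPC's libnvomp;
#  running under LD_DEBUG=libs would show which runtime is actually picked up)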
from gpu4pyscf import patch_pyscf

from pyscf import gto, scf, tools, mcscf, lib
from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
from pyscf.mcscf import avas

gpu = libgpu.libgpu_create_device()

num_gpus = libgpu.libgpu_get_num_devices(gpu)
print("num_gpus= ", num_gpus)

libgpu.libgpu_dev_properties(gpu, num_gpus)

gpu_id = 0
libgpu.libgpu_set_device(gpu, gpu_id)

libgpu.libgpu_destroy_device(gpu)
37 changes: 37 additions & 0 deletions gpu/mini-apps/openmp/python/submit_polaris.sh
@@ -0,0 +1,37 @@
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -l walltime=0:30:00
#PBS -q debug
#PBS -A Catalyst
#PBS -l filesystems=home:grand:eagle

cd /lus/grand/projects/LASSCF_gpudev/knight/soft/mrh/gpu/mini-apps/openmp/python

# MPI example w/ 1 MPI rank per node and 32 OpenMP threads per rank
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=1
NTHREADS=32
NDEPTH=${NTHREADS}

NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"

#MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} "
MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth "

OMP_ARGS=" "
#OMP_ARGS="--env OMP_NUM_THREADS=${NTHREADS} "
OMP_ARGS=" --env OMP_NUM_THREADS=${NTHREADS} --env OMP_PROC_BIND=spread --env OMP_PLACES=threads "
#OMP_ARGS+=" --env OMP_WAIT_POLICY=ACTIVE "

INPUT="main.py"

#export CUDA_VISIBLE_DEVICES=0

EXE="python ${INPUT} "

#python -m cProfile -o out.prof ${INPUT}
#{ time ${EXE} ;} 2>&1 | tee profile.txt
{ time mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} ;} 2>&1 | tee profile.txt
#nsys profile --stats=true -t cuda,nvtx mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} 2>&1 | tee profile.txt
13 changes: 11 additions & 2 deletions gpu/src/arch/polaris-nvhpc-openmp
@@ -2,6 +2,13 @@ INSTALL = ../../my_pyscf/gpu
 
 PYTHON_INC=$(shell python -m pybind11 --includes)
 
+PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev)
+
+$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)])
+
+PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags)
+PYTHON_LIB+=-lpython3
+
 PATH_TO_NVCC = $(shell which nvcc)
 PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev)
 
@@ -27,9 +34,11 @@ CXXFLAGS += -D_CUDA_NVTX
 LD = $(CXX)
 LDFLAGS = $(GPU_FLAGS) -fPIC -shared
 LIB = -lstdc++
+LIB += $(PYTHON_LIB)
 LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran
 LIB += -L$(PATH_TO_NVHPC)/math_libs/lib64 -lcublas -lcublasLt
 LIB += -L$(PATH_TO_NVHPC)/cuda/lib64 -lnvToolsExt
 #LIB += -lnvomp
-#LIB += -L$(PATH_TO_NVHPC)/compilers/lib
-#LIB += -lacchost -laccdevaux -laccdevice -lcudadevice -latomic -lnvhpcatm -lnvf -lnvomp -lnvcpumath-avx2 -lnvc -lnvcpumath -lcudart -lcuda
+
+LIB += -L$(PATH_TO_NVHPC)/compilers/lib
+LIB += -lacchost -laccdevaux -laccdevice -lcudadevice -latomic -lnvhpcatm -lnvf -lnvomp -lnvcpumath-avx2 -lnvc -lnvcpumath -lcudart -lcuda
2 changes: 1 addition & 1 deletion gpu/src/device_openmp.cpp
@@ -94,7 +94,7 @@ void Device::init_get_jk(py::array_t<double> _eri1, py::array_t<double> _dmtril,
   nvtxRangePushA("Create handle");
 #endif
   cublasCreate(&handle);
-  _OMP_CHECK_ERRORS();
+  _CUDA_CHECK_ERRORS();
 #ifdef _CUDA_NVTX
   nvtxRangePop();
 #endif