-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OpenMPTarget backend now enabled; time to debug correctness
- Loading branch information
Showing
13 changed files
with
352 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Use NVIDIA compilers w/ ALCF provided OpenMPI | ||
# | ||
# module load nvhpc-nompi | ||
# | ||
# Builds the executable named by EX from every *.cpp in this directory and | ||
# links it against ../../src/libgpu.so, the Python libraries and LAPACK/BLAS. | ||
# NOTE(review): recipe lines appear without a leading tab in this listing; | ||
# make requires one -- confirm against the real file. | ||
# | ||
# Definition of MACROS | ||
|
||
# PATH_TO_NVHPC is the nvcc location with its last three path components | ||
# removed (the rev | cut -f 4- | rev pipeline drops them). | ||
PATH_TO_NVCC = $(shell which nvcc) | ||
PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev) | ||
|
||
$(info PATH_TO_NVHPC= [${PATH_TO_NVHPC}]) | ||
|
||
# NOTE(review): CUDA is defined here but not referenced by any rule below. | ||
CUDA = ${PATH_TO_NVHPC}/cuda | ||
|
||
# Directory that holds the resolved (symlink-free) python binary. | ||
PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev) | ||
|
||
$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)]) | ||
|
||
# Include flags come from pybind11; link flags come from the python3-config | ||
# found next to the python binary, with -lpython3 appended. | ||
PYTHON_INC=$(shell python -m pybind11 --includes) | ||
PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags) | ||
PYTHON_LIB+=-lpython3 | ||
|
||
# Compiler and flags. _SINGLE_PRECISION selects float for real_t in the | ||
# mini-app source. | ||
# NOTE(review): -mp=gpu -gpu=cc80,cuda11.0 presumably enables OpenMP offload | ||
# for one specific GPU architecture / CUDA version -- confirm for the target. | ||
CXX = CC | ||
CXXFLAGS = -g -O3 -std=c++0x | ||
CXXFLAGS += -D_SINGLE_PRECISION | ||
CXXFLAGS += -mp=gpu -gpu=cc80,cuda11.0 | ||
|
||
CXXFLAGS += $(PYTHON_INC) | ||
CXXFLAGS += -I../../src -D_USE_GPU -D_GPU_OPENMP | ||
|
||
# The "CUDA" compiler and flags are simply aliases of the C++ ones. | ||
CUDA_CXX = $(CXX) | ||
CUDA_CXXFLAGS = $(CXXFLAGS) | ||
|
||
# NOTE(review): CPP and CPPFLAGS are not used by any rule below. | ||
CPP = cpp -P -traditional | ||
CPPFLAGS = | ||
|
||
# Link with the C++ compiler driver. | ||
# NOTE(review): the LAPACK path is an absolute path inside one user's home | ||
# directory; other users will need to override it. | ||
LD = $(CXX) | ||
LIB = ../../src/libgpu.so $(PYTHON_LIB) | ||
LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran | ||
|
||
BINROOT=./ | ||
EX=vecadd | ||
SHELL=/bin/sh | ||
|
||
# -- subset of src files with cuda kernels | ||
# CUDA_SRC is empty, so CUDA_OBJ is empty as well and every *.cpp file is | ||
# built through the $(OBJ) rule. | ||
CUDA_SRC = | ||
CUDA_OBJ = $(CUDA_SRC:.cpp=.o) | ||
|
||
SRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp)) | ||
INC = $(wildcard *.h) | ||
OBJ = $(SRC:.cpp=.o) | ||
|
||
# | ||
# -- target : Dependencies | ||
# -- Rule to create target | ||
|
||
# Link step: the first target in the file, hence the default goal. | ||
$(EX): $(OBJ) $(CUDA_OBJ) | ||
$(LD) -o $@ $(CXXFLAGS) $(OBJ) $(CUDA_OBJ) $(LIB) | ||
|
||
#################################################################### | ||
|
||
# Static pattern rule: compile each regular source file to an object. | ||
$(OBJ): %.o: %.cpp | ||
$(CXX) $(CXXFLAGS) -c $< | ||
|
||
# Static pattern rule for sources listed in CUDA_SRC (none at present). | ||
$(CUDA_OBJ): %.o: %.cpp | ||
$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@ | ||
|
||
# | ||
# -- Remove *.o and *~ from the directory | ||
clean: | ||
rm -f *.o *~ | ||
# | ||
# -- Remove *.o, *~, and executable from the directory | ||
realclean: | ||
rm -f *.o *~ ./$(EX) | ||
|
||
# | ||
# -- Simple dependencies | ||
# Every object depends on every header in this directory, so touching any | ||
# *.h rebuilds all objects. | ||
|
||
$(OBJ) : $(INC) | ||
$(CUDA_OBJ) : $(INC) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <math.h> | ||
#include <iostream> | ||
#include <cassert> | ||
#include "mpi.h" | ||
|
||
#include <omp.h> | ||
|
||
#include "pm.h" | ||
|
||
#define _N 1024 | ||
#define _LOCAL_SIZE 64 | ||
|
||
#ifdef _SINGLE_PRECISION | ||
typedef float real_t; | ||
#else | ||
typedef double real_t; | ||
#endif | ||
|
||
using namespace PM_NS; | ||
|
||
// ---------------------------------------------------------------- | ||
|
||
// Element-wise vector addition offloaded with OpenMP target:
// c[k] = a[k] + b[k] for every k in [0, N).
// a, b and c are listed in is_device_ptr, so they are handed to the
// target region as device pointers rather than being mapped.
void _vecadd(real_t * a, real_t * b, real_t * c, int N)
{
#pragma omp target teams distribute parallel for is_device_ptr(a, b, c)
  for(int k = 0; k < N; k++) {
    const real_t lhs = a[k];
    const real_t rhs = b[k];
    c[k] = lhs + rhs;
  }
}
|
||
// ---------------------------------------------------------------- | ||
|
||
int main( int argc, char* argv[] ) | ||
{ | ||
MPI_Init(&argc, &argv); | ||
|
||
int me,nranks; | ||
MPI_Comm_size(MPI_COMM_WORLD, &nranks); | ||
MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||
|
||
const int N = _N; | ||
|
||
class PM * pm = new PM(); | ||
|
||
real_t * a = (real_t*) malloc(N*sizeof(real_t)); | ||
real_t * b = (real_t*) malloc(N*sizeof(real_t)); | ||
real_t * c = (real_t*) malloc(N*sizeof(real_t)); | ||
|
||
// Initialize host | ||
for(int i=0; i<N; ++i) { | ||
a[i] = sin(i)*sin(i); | ||
b[i] = cos(i)*cos(i); | ||
c[i] = -1.0; | ||
} | ||
|
||
int num_devices = pm->dev_num_devices(); | ||
|
||
if(me == 0) { | ||
printf("# of devices= %i\n",num_devices); | ||
pm->dev_properties(num_devices); | ||
} | ||
|
||
// Device ID | ||
|
||
int device_id = me % num_devices; | ||
for(int i=0; i<nranks; ++i) { | ||
if(i == me) { | ||
printf("Rank %i running on GPU %i!\n",me,device_id); | ||
} | ||
MPI_Barrier(MPI_COMM_WORLD); | ||
} | ||
|
||
#ifdef _SINGLE_PRECISION | ||
if(me == 0) printf("Using single-precision\n\n"); | ||
#else | ||
if(me == 0) printf("Using double-precision\n\n"); | ||
#endif | ||
|
||
// Create device buffers and transfer data to device | ||
|
||
real_t * d_a = (real_t *) pm->dev_malloc(N*sizeof(real_t)); | ||
real_t * d_b = (real_t *) pm->dev_malloc(N*sizeof(real_t)); | ||
real_t * d_c = (real_t *) pm->dev_malloc(N*sizeof(real_t)); | ||
|
||
pm->dev_push(d_a, a, N); | ||
pm->dev_push(d_b, b, N); | ||
pm->dev_push(d_c, c, N); | ||
|
||
// Execute kernel | ||
|
||
_vecadd(d_a, d_b, d_c, N); | ||
|
||
// Transfer data from device | ||
|
||
pm->dev_pull(d_c, c, N); | ||
|
||
//Check result on host | ||
|
||
double diff = 0; | ||
for(int i=0; i<N; ++i) diff += (double) c[i]; | ||
diff = diff/(double) N - 1.0; | ||
|
||
double diffsq = diff * diff; | ||
|
||
int sum; | ||
MPI_Reduce(&diffsq, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); | ||
|
||
if(me == 0) { | ||
if(sum < 1e-6) printf("\nResult is CORRECT!! :)\n"); | ||
else printf("\nResult is WRONG!! :(\n"); | ||
} | ||
|
||
// Clean up | ||
|
||
free(a); | ||
free(b); | ||
free(c); | ||
|
||
pm->dev_free(d_a); | ||
pm->dev_free(d_b); | ||
pm->dev_free(d_c); | ||
|
||
delete pm; | ||
|
||
MPI_Finalize(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
# Launcher wrapper: derives one GPU index from PMI_LOCAL_RANK, exports it as | ||
# CUDA_VISIBLE_DEVICES, then replaces this shell with the wrapped command. | ||
# Assumes the launcher sets PMI_LOCAL_RANK -- TODO confirm; the arithmetic | ||
# below has no default for it. | ||
num_gpus=4 | ||
# need to assign GPUs in reverse order due to topology | ||
# See Polaris Device Affinity Information https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview/index.html | ||
# Local ranks 0,1,2,3 map to GPUs 3,2,1,0; higher local ranks wrap around. | ||
gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus})) | ||
export CUDA_VISIBLE_DEVICES=$gpu | ||
#echo "RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}" | ||
exec "$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash -l | ||
#PBS -l select=1:system=polaris | ||
#PBS -l place=scatter | ||
#PBS -l walltime=0:30:00 | ||
#PBS -q debug | ||
#PBS -A Catalyst | ||
#PBS -l filesystems=home:grand:eagle | ||
|
||
# PBS job script that launches ./vecadd under mpiexec. | ||
# The cd below is commented out, so the job runs from whatever directory | ||
# the batch system starts it in. | ||
#cd ${PBS_O_WORKDIR} | ||
|
||
# MPI example; NRANKS_PER_NODE ranks per node spread evenly across cores | ||
# NNODES is the number of lines in the PBS node file. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
NRANKS_PER_NODE=8 | ||
NDEPTH=8 | ||
# NTHREADS is only echoed below; it is not passed to mpiexec. | ||
NTHREADS=1 | ||
|
||
NTOTRANKS=$(( NNODES * NRANKS_PER_NODE )) | ||
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}" | ||
|
||
mpiexec -n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth ./vecadd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Minimal exercise of the libgpu Python bindings: create a device handle, | ||
# print the device count and properties, select GPU 0, destroy the handle. | ||
# libgpu is imported before pyscf on purpose; see the comment on the pyscf | ||
# import below. | ||
from mrh.my_pyscf.gpu import libgpu | ||
|
||
import pyscf # -- this is contaminating a path preventing an OpenMP runtime that supports GPUs from being picked up | ||
from gpu4pyscf import patch_pyscf | ||
|
||
# NOTE(review): none of the names imported below are referenced in this | ||
# script; presumably kept for their import side effects -- confirm. | ||
from pyscf import gto, scf, tools, mcscf, lib | ||
from mrh.my_pyscf.mcscf.lasscf_async import LASSCF | ||
from pyscf.mcscf import avas | ||
|
||
# Handle passed to every libgpu call below. | ||
gpu = libgpu.libgpu_create_device() | ||
|
||
num_gpus = libgpu.libgpu_get_num_devices(gpu) | ||
print("num_gpus= ", num_gpus) | ||
|
||
libgpu.libgpu_dev_properties(gpu, num_gpus) | ||
|
||
# The device index is hard-coded to 0. | ||
gpu_id = 0 | ||
libgpu.libgpu_set_device(gpu, gpu_id) | ||
|
||
libgpu.libgpu_destroy_device(gpu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash -l | ||
#PBS -l select=1:system=polaris | ||
#PBS -l place=scatter | ||
#PBS -l walltime=0:30:00 | ||
#PBS -q debug | ||
#PBS -A Catalyst | ||
#PBS -l filesystems=home:grand:eagle | ||
|
||
# PBS job script that runs main.py under mpiexec and tees the timed output | ||
# to profile.txt. | ||
# NOTE(review): the working directory is an absolute, user-specific path. | ||
cd /lus/grand/projects/LASSCF_gpudev/knight/soft/mrh/gpu/mini-apps/openmp/python | ||
|
||
# MPI example; NRANKS_PER_NODE ranks per node spread evenly across cores | ||
# NNODES is the number of lines in the PBS node file. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
NRANKS_PER_NODE=1 | ||
NTHREADS=32 | ||
# One core of depth per OpenMP thread. | ||
NDEPTH=${NTHREADS} | ||
|
||
NTOTRANKS=$(( NNODES * NRANKS_PER_NODE )) | ||
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}" | ||
|
||
#MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} " | ||
MPI_ARGS="-n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth " | ||
|
||
# The blank OMP_ARGS assignment is immediately overwritten by the one with | ||
# thread count, binding and places settings. | ||
OMP_ARGS=" " | ||
#OMP_ARGS="--env OMP_NUM_THREADS=${NTHREADS} " | ||
OMP_ARGS=" --env OMP_NUM_THREADS=${NTHREADS} --env OMP_PROC_BIND=spread --env OMP_PLACES=threads " | ||
#OMP_ARGS+=" --env OMP_WAIT_POLICY=ACTIVE " | ||
|
||
INPUT="main.py" | ||
|
||
#export CUDA_VISIBLE_DEVICES=0 | ||
|
||
EXE="python ${INPUT} " | ||
|
||
# Only the timed mpiexec launch is active; the other launch variants below | ||
# are commented out. | ||
#python -m cProfile -o out.prof ${INPUT} | ||
#{ time ${EXE} ;} 2>&1 | tee profile.txt | ||
{ time mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} ;} 2>&1 | tee profile.txt | ||
#nsys profile --stats=true -t cuda,nvtx mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} 2>&1 | tee profile.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.