Regular update #40

Merged · Oct 3, 2023 · 57 commits

Commits
34f5866
lassi excitations solver lassi kernel equiv test
MatthewRHermes Sep 1, 2023
f49ff04
bugfix lassi excitation solver get_active_h
MatthewRHermes Sep 1, 2023
27db98d
lassi excitations solver unittest update
MatthewRHermes Sep 1, 2023
c3b4337
lassi excitations self-consistency improvements
MatthewRHermes Sep 1, 2023
578eafb
deactivate local-minimum checking in unittest
MatthewRHermes Sep 1, 2023
e0af54f
PySCF compatibility check
MatthewRHermes Sep 3, 2023
2b42efb
lassi docstrings and vrvsolver code cleanup
MatthewRHermes Sep 4, 2023
6e0c9a7
lassi vrvsolver warn behavior and unittest tweak
MatthewRHermes Sep 4, 2023
36dc6c8
cleanup lassi excitations solver
MatthewRHermes Sep 6, 2023
edbc299
lassi excitation solver docstrings
MatthewRHermes Sep 6, 2023
76e205e
lassi excitation solver api tweak + docstring
MatthewRHermes Sep 6, 2023
83d3b8f
lassi excitation solver multiref safety commit
MatthewRHermes Sep 6, 2023
535fdbc
LASSI excitation solver multiref testing
MatthewRHermes Sep 6, 2023
8701caf
indexing bugfixes in lassi excitations sort_ci0
MatthewRHermes Sep 8, 2023
ae139b3
lassi excitation solver debugging safety commit
MatthewRHermes Sep 8, 2023
de2af7e
1st successful J<0 alfefe LASSI (11,10) nmax=4
MatthewRHermes Sep 11, 2023
00b36fc
lassi excitations vrvsolver solve_e0 function
MatthewRHermes Sep 12, 2023
60753d9
Merge branch 'dev' into vrvsolver_scenergy
MatthewRHermes Sep 12, 2023
878a617
lassi excitation solver debug fiddle
MatthewRHermes Sep 12, 2023
d6e0d5b
lassi excitations vrvsolver cleanup
MatthewRHermes Sep 12, 2023
a6df4cf
lassi excitation safety commit
MatthewRHermes Sep 12, 2023
f32a464
LASSI excitation solver timer logs
MatthewRHermes Sep 12, 2023
4c3975a
lassi excitations code cleanup
MatthewRHermes Sep 13, 2023
49296ae
lasscf productstate lassi excitations code cleanup
MatthewRHermes Sep 13, 2023
7b225e3
lassi excitation solver inner state-averaging
MatthewRHermes Sep 14, 2023
07bc66b
lassi excitation solver inner sa debugging
MatthewRHermes Sep 14, 2023
ba89944
lassi excitation solver safety commit
MatthewRHermes Sep 14, 2023
c50edd2
update polymer inputs to use C 2pz
cjknight Sep 15, 2023
86bd34c
missing rt header? and some optimizations
cjknight Sep 15, 2023
dce230f
lassi excitations solver safety commit
MatthewRHermes Sep 15, 2023
e281232
remove e0_p member of vrvsolver and init in kernel
MatthewRHermes Sep 15, 2023
981bfef
PySCF(-forge) compatibility check & update
MatthewRHermes Sep 18, 2023
b6d511b
lasci productstate solver more diagnostics
MatthewRHermes Sep 18, 2023
30e793a
small cleanup
cjknight Sep 19, 2023
dc39f8c
add pull_get_jk() function
cjknight Sep 19, 2023
d68de20
device copy of nset and nao
cjknight Sep 19, 2023
8d65590
cleanup
cjknight Sep 19, 2023
47c6e27
cleanup & sync OpenMPTarget backend
cjknight Sep 19, 2023
e29373b
cleanup fdrv & sync OpenMPTarget backend
cjknight Sep 20, 2023
d2d26fa
debug log in lassi excitations solver
MatthewRHermes Sep 20, 2023
abeebfe
LASSI class fns use LASSI dict instead of LASSCF
MatthewRHermes Sep 21, 2023
1a735e3
LASSI spin_shuffle_ci fn
MatthewRHermes Sep 21, 2023
0c40425
Create my_pyscf/lassi/lassis.py
MatthewRHermes Sep 22, 2023
17f5068
LASSIS syntax unittests
MatthewRHermes Sep 22, 2023
807e049
silence lassi excitationsolver warning
MatthewRHermes Sep 25, 2023
4261a60
Create tests/fci/test_csf.py
MatthewRHermes Sep 26, 2023
7791a05
lasscf_rdm microiteration divergence catching
MatthewRHermes Sep 27, 2023
0ecaf91
lasscf_rdm converge fragment-fragment interactions
MatthewRHermes Sep 27, 2023
843ec6c
Memory usage guardrail csf pspace
MatthewRHermes Sep 28, 2023
99c9412
default max_memory in pspace fn
MatthewRHermes Sep 28, 2023
cd87aeb
csf_solver memory reforms
MatthewRHermes Sep 28, 2023
4a181f5
LASSI productstate serialfrag option
MatthewRHermes Sep 29, 2023
b3185fe
PySCF compatibility check
MatthewRHermes Oct 2, 2023
96c0822
more progress on openmptarget backend; not functional yet
cjknight Oct 3, 2023
fea405d
nvhpc/openmp arch file
cjknight Oct 3, 2023
3456905
Merge branch 'dev' of https://github.com/MatthewRHermes/mrh into gpudev
cjknight Oct 3, 2023
d3e9989
OpenMPTarget backend now enabled; time to debug correctness
cjknight Oct 3, 2023
4 changes: 2 additions & 2 deletions examples/gpu/polymer_async/1_6-31g_inp.py
@@ -14,8 +14,8 @@
 mf=mf.density_fit()
 mf.run()
 
-ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
-#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
+#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
+ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
 
 las=LASSCF(mf, list((2,)*nfrags),list((2,)*nfrags))
 
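Note: this change (and the matching ones in the other input files below) swaps the AVAS guess from the full carbon 2p shell to the 2pz AOs only. For these planar conjugated polymers the 2pz functions carry the π system, so the projected active space shrinks to roughly a third of the 2p size. A minimal sketch of the comparison, assuming the converged density-fitted mf object from the input above (illustrative only, not part of the PR):

from pyscf.mcscf import avas

ncas_2p, nelecas_2p, mo_2p = avas.kernel(mf, ['C 2p'])      # sigma + pi guess
ncas_2pz, nelecas_2pz, mo_2pz = avas.kernel(mf, ['C 2pz'])  # pi-only guess
print(ncas_2p, ncas_2pz)  # about three orbitals per carbon 2p shell vs. one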
8 changes: 5 additions & 3 deletions examples/gpu/polymer_async/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -9,7 +11,6 @@
 #lib.logger.TIMER_LEVEL=lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
@@ -26,15 +27,16 @@
 basis='6-31g'
 outputfile='1_6-31g_out_gpu.log'
 mol=gto.M(use_gpu=gpu, atom=generator(nfrags),basis=basis,verbose=5,output=outputfile)
+#mol.max_memory = 8000
 
 print("\nCalling scf.RHF(mol) ; mol.use_gpu= ", mol.use_gpu)
 mf=scf.RHF(mol)
 mf=mf.density_fit()
 mf.run()
 
 print("\nCalling avas.kernel w/ mf.mol.use_gpu= ", mf.mol.use_gpu)
-ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
-#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
+#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
+ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
 
 print("\nStarting the LASSCF calculation with use_gpu= ", gpu)
 las=LASSCF(mf, list((2,)*nfrags),list((2,)*nfrags), use_gpu=gpu)
7 changes: 4 additions & 3 deletions examples/gpu/polymer_async/1_6-31g_inp_gpu_simple.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -6,7 +8,6 @@
 from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
 from pyscf.mcscf import avas
 
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_init()
 
 lib.logger.TIMER_LEVEL=lib.logger.INFO
@@ -19,8 +20,8 @@
 mf=mf.density_fit()
 mf.run()
 
-ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
-#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
+#ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
+ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
 
 las=LASSCF(mf, list((2,)*nfrags),list((2,)*nfrags), use_gpu=gpu)
 
6 changes: 3 additions & 3 deletions examples/gpu/polymer_async/submit_polaris.sh
@@ -38,6 +38,6 @@ EXE="python ${INPUT} "
 #mpiexec ${MPI_ARGS} ${OMP_ARGS} /home/knight/repos/GettingStarted/Examples/Polaris/affinity_omp/hello_affinity
 
 #python -m cProfile -o out.prof ${INPUT}
-#time ${EXE} | tee profile.txt
-time mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} | tee profile.txt
-#nsys profile --stats=true -t cuda,nvtx mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} | tee profile.txt
+#{ time ${EXE} ;} 2>&1 | tee profile.txt
+{ time mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} ;} 2>&1 | tee profile.txt
+#nsys profile --stats=true -t cuda,nvtx mpiexec ${MPI_ARGS} ${OMP_ARGS} ${EXE} 2>&1 | tee profile.txt
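Note: the bash time keyword writes its report to stderr, so the old form time ... | tee profile.txt sent only the program's stdout through tee; wrapping the pipeline head in { ... ;} and redirecting 2>&1 before the pipe lets tee capture the timing lines in profile.txt as well.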
2 changes: 1 addition & 1 deletion examples/gpu/polymer_sync/1_6-31g_inp.py
@@ -12,7 +12,7 @@
 mf=mf.density_fit()
 mf.run()
 
-ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
+ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
 
 las=LASSCF(mf, list((2,)*nfrags),list((2,)*nfrags))
 
5 changes: 3 additions & 2 deletions examples/gpu/polymer_sync/1_6-31g_inp_gpu.py
@@ -1,3 +1,5 @@
+from mrh.my_pyscf.gpu import libgpu
+
 import pyscf
 from gpu4pyscf import patch_pyscf
 
@@ -14,7 +16,6 @@
 lib.logger.TIME_LEVEL = lib.logger.INFO
 
 # -- this should all be inside of LASSCF() constructor
-from mrh.my_pyscf.gpu import libgpu
 gpu = libgpu.libgpu_create_device()
 
 num_gpus = libgpu.libgpu_get_num_devices(gpu)
@@ -40,7 +41,7 @@
 mf.run()
 
 print("\nCalling avas.kernel")
-ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2p'])
+ncas,nelecas,guess_mo_coeff = avas.kernel(mf, ['C 2pz'])
 
 print("\nStarting the LASSCF calculation with use_gpu= ", gpu)
 las=LASSCF(mf, list((2,)*nfrags),list((2,)*nfrags), use_gpu=gpu)
10 changes: 7 additions & 3 deletions gpu/gpu4pyscf/df/df_jk.py
@@ -140,11 +140,12 @@ def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
         naux, nao_pair = eri1.shape
         if count == 0:
             libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, count)
-        # print("count= ", count, "nao= ", nao, " naux= ", naux, " nao_pair= ", nao_pair, " blksize= ", blksize, " nset= ", nset, " eri1= ", eri1.shape, " dmtril= ", dmtril.shape, " dms= ", numpy.shape(dms), " vj= ", vj_tmp.shape, " vk= ", vk_tmp.shape)
+        # print("count= ", count, "nao= ", nao, " naux= ", naux, " nao_pair= ", nao_pair, " blksize= ", blksize, " nset= ", nset, " eri1= ", eri1.shape, " dmtril= ", dmtril.shape, " dms= ", numpy.shape(dms))
+        # print("vj= ", vj_tmp.shape, " vk= ", vk_tmp.shape)
 
         if gpu:
 
-            libgpu.libgpu_compute_get_jk(gpu, naux, nao, nset, eri1, dmtril, dms, vj, vk, count)
+            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, count)
 
         else:
 
@@ -213,6 +214,9 @@ def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
 
     t1 = log.timer_debug1('jk', *t1)
 
+    if gpu:
+        libgpu.libgpu_pull_get_jk(gpu, vj, vk)
+
     if with_j: vj = lib.unpack_tril(vj, 1).reshape(dm_shape)
     if with_k: vk = vk.reshape(dm_shape)
     # print("vj.shape= ", vj.shape)
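Note: together with the pull_get_jk() and device-side nset/nao commits listed above, these df_jk.py changes give the GPU path a three-phase shape: set up device buffers once on the first ERI block, accumulate J/K on the device for every block, and copy vj/vk back to the host a single time after the loop. A schematic of that control flow, with the CPU fallback and loop bookkeeping elided (a sketch following the calls visible in the diff, not a verbatim excerpt):

# Sketch: per-cycle offload pattern of the patched get_jk()
for count, eri1 in enumerate(dfobj.loop(blksize)):  # host-side 3-center ERI blocks
    naux, nao_pair = eri1.shape
    if count == 0:
        # one-time device setup; nset and nao are cached on the device from here on
        libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, count)
    libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, count)
# single device-to-host transfer after all blocks are processed
libgpu.libgpu_pull_get_jk(gpu, vj, vk)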
80 changes: 80 additions & 0 deletions gpu/mini-apps/openmp/c++/Makefile
@@ -0,0 +1,80 @@
# Use NVIDIA compilers w/ ALCF provided OpenMPI
#
# module load nvhpc-nompi
#
# Definition of MACROS

PATH_TO_NVCC = $(shell which nvcc)
PATH_TO_NVHPC = $(shell echo ${PATH_TO_NVCC} | rev | cut -d '/' -f 4- | rev)

$(info PATH_TO_NVHPC= [${PATH_TO_NVHPC}])

CUDA = ${PATH_TO_NVHPC}/cuda

PATH_TO_PYTHON=$(shell readlink -f `which python` | rev | cut -d '/' -f 2- | rev)

$(info PATH_TO_PYTHON= [$(PATH_TO_PYTHON)])

PYTHON_INC=$(shell python -m pybind11 --includes)
PYTHON_LIB=$(shell $(PATH_TO_PYTHON)/python3-config --ldflags)
PYTHON_LIB+=-lpython3

CXX = CC
CXXFLAGS = -g -O3 -std=c++0x
CXXFLAGS += -D_SINGLE_PRECISION
CXXFLAGS += -mp=gpu -gpu=cc80,cuda11.0

CXXFLAGS += $(PYTHON_INC)
CXXFLAGS += -I../../src -D_USE_GPU -D_GPU_OPENMP

CUDA_CXX = $(CXX)
CUDA_CXXFLAGS = $(CXXFLAGS)

CPP = cpp -P -traditional
CPPFLAGS =

LD = $(CXX)
LIB = ../../src/libgpu.so $(PYTHON_LIB)
LIB += -L/home/knight/soft/polaris/lapack/lib -llapack -lrefblas -lgfortran

BINROOT=./
EX=vecadd
SHELL=/bin/sh

# -- subset of src files with cuda kernels
CUDA_SRC =
CUDA_OBJ = $(CUDA_SRC:.cpp=.o)

SRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
INC = $(wildcard *.h)
OBJ = $(SRC:.cpp=.o)

#
# -- target : Dependencies
# -- Rule to create target

$(EX): $(OBJ) $(CUDA_OBJ)
	$(LD) -o $@ $(CXXFLAGS) $(OBJ) $(CUDA_OBJ) $(LIB)

####################################################################

$(OBJ): %.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $<

$(CUDA_OBJ): %.o: %.cpp
	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@

#
# -- Remove *.o and *~ from the directory
clean:
	rm -f *.o *~
#
# -- Remove *.o, *~, and executable from the directory
realclean:
	rm -f *.o *~ ./$(EX)

#
# -- Simple dependencies

$(OBJ) : $(INC)
$(CUDA_OBJ) : $(INC)
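Note: -mp=gpu enables NVHPC's OpenMP target offload, and -gpu=cc80,cuda11.0 targets compute capability 8.0 (the A100s on Polaris) against the CUDA 11.0 toolchain. -D_SINGLE_PRECISION selects the float branch of the real_t typedef in main.cpp below.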
130 changes: 130 additions & 0 deletions gpu/mini-apps/openmp/c++/main.cpp
@@ -0,0 +1,130 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cassert>
#include "mpi.h"

#include <omp.h>

#include "pm.h"

#define _N 1024
#define _LOCAL_SIZE 64

#ifdef _SINGLE_PRECISION
typedef float real_t;
#else
typedef double real_t;
#endif

using namespace PM_NS;

// ----------------------------------------------------------------

void _vecadd(real_t * a, real_t * b, real_t * c, int N)
{

#pragma omp target teams distribute parallel for is_device_ptr(a, b, c)
  for(int i=0; i<N; ++i) {
    c[i] = a[i] + b[i];
  }

}

// ----------------------------------------------------------------

int main( int argc, char* argv[] )
{
  MPI_Init(&argc, &argv);

  int me,nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);

  const int N = _N;

  class PM * pm = new PM();

  real_t * a = (real_t*) malloc(N*sizeof(real_t));
  real_t * b = (real_t*) malloc(N*sizeof(real_t));
  real_t * c = (real_t*) malloc(N*sizeof(real_t));

  // Initialize host
  for(int i=0; i<N; ++i) {
    a[i] = sin(i)*sin(i);
    b[i] = cos(i)*cos(i);
    c[i] = -1.0;
  }

  int num_devices = pm->dev_num_devices();

  if(me == 0) {
    printf("# of devices= %i\n",num_devices);
    pm->dev_properties(num_devices);
  }

  // Device ID

  int device_id = me % num_devices;
  for(int i=0; i<nranks; ++i) {
    if(i == me) {
      printf("Rank %i running on GPU %i!\n",me,device_id);
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

#ifdef _SINGLE_PRECISION
  if(me == 0) printf("Using single-precision\n\n");
#else
  if(me == 0) printf("Using double-precision\n\n");
#endif

  // Create device buffers and transfer data to device

  real_t * d_a = (real_t *) pm->dev_malloc(N*sizeof(real_t));
  real_t * d_b = (real_t *) pm->dev_malloc(N*sizeof(real_t));
  real_t * d_c = (real_t *) pm->dev_malloc(N*sizeof(real_t));

  pm->dev_push(d_a, a, N);
  pm->dev_push(d_b, b, N);
  pm->dev_push(d_c, c, N);

  // Execute kernel

  _vecadd(d_a, d_b, d_c, N);

  // Transfer data from device

  pm->dev_pull(d_c, c, N);

  // Check result on host: a[i] + b[i] == 1 for every i, so the mean of c should be 1

  double diff = 0;
  for(int i=0; i<N; ++i) diff += (double) c[i];
  diff = diff/(double) N - 1.0;

  double diffsq = diff * diff;

  // accumulate the squared error across ranks in double precision
  double sum;
  MPI_Reduce(&diffsq, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  if(me == 0) {
    if(sum < 1e-6) printf("\nResult is CORRECT!! :)\n");
    else printf("\nResult is WRONG!! :(\n");
  }

  // Clean up

  free(a);
  free(b);
  free(c);

  pm->dev_free(d_a);
  pm->dev_free(d_b);
  pm->dev_free(d_c);

  delete pm;

  MPI_Finalize();
}
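Note: the buffers handed to _vecadd are allocated with pm->dev_malloc and are therefore already device-resident; the is_device_ptr(a, b, c) clause tells the OpenMP runtime to use them as device addresses directly rather than mapping host memory at launch.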
8 changes: 8 additions & 0 deletions gpu/mini-apps/openmp/c++/set_affinity_gpu_polaris.sh
@@ -0,0 +1,8 @@
#!/bin/bash
num_gpus=4
# need to assign GPUs in reverse order due to topology
# See Polaris Device Affinity Information https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview/index.html
gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus}))
export CUDA_VISIBLE_DEVICES=$gpu
#echo "RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}"
exec "$@"
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/c++/submit.sh
@@ -0,0 +1,20 @@
#!/bin/bash -l
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -l walltime=0:30:00
#PBS -q debug
#PBS -A Catalyst
#PBS -l filesystems=home:grand:eagle

#cd ${PBS_O_WORKDIR}

# MPI example w/ 8 MPI ranks per node spread evenly across cores
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS_PER_NODE=8
NDEPTH=8
NTHREADS=1

NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"

mpiexec -n ${NTOTRANKS} --ppn ${NRANKS_PER_NODE} --depth=${NDEPTH} --cpu-bind depth ./vecadd
20 changes: 20 additions & 0 deletions gpu/mini-apps/openmp/python/main.py
@@ -0,0 +1,20 @@
from mrh.my_pyscf.gpu import libgpu

import pyscf # -- importing pyscf here contaminates a path, preventing an OpenMP runtime that supports GPUs from being picked up
from gpu4pyscf import patch_pyscf

from pyscf import gto, scf, tools, mcscf, lib
from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
from pyscf.mcscf import avas

gpu = libgpu.libgpu_create_device()

num_gpus = libgpu.libgpu_get_num_devices(gpu)
print("num_gpus= ", num_gpus)

libgpu.libgpu_dev_properties(gpu, num_gpus)

gpu_id = 0
libgpu.libgpu_set_device(gpu, gpu_id)

libgpu.libgpu_destroy_device(gpu)