Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{2023.06}[system] cuTENSOR v2.0.1.2 with CUDA/12.1.1 #367

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,10 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuDNN_host_injections.sh -c 12.1.1 -d 8.9.2.26
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \
-t /tmp/temp \
--accept-cuda-eula
else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
fi
Expand Down
72 changes: 24 additions & 48 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,45 +105,46 @@

end


local function eessi_cuda_enabled_load_hook(t)
local function eessi_cuda_and_libraries_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for NESSI
local packagesList = { ["CUDA"] = true, ["cuDNN"] = true, ["cuTENSOR"] = true }
-- If we try to load any of the modules in packagesList, we check if the
-- full package was installed on the host in host_injections.
-- This is required for end users to build additional software that depends
-- on the package. If the full SDK isn't present, refuse
-- to load the module and print an informative message on how to set up GPU support for NESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the CUDA software should be installed
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then
-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where NESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
-- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
-- otherwise, refuse to load the requested module and print error message
local checkGpu = mt:haveProperty(simpleName,"arch","gpu")
local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK")
if checkGpu and (overrideGpuCheck == nil) then
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or ""
local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system. You can "
advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
Expand Down Expand Up @@ -174,38 +175,13 @@
end
end

local function eessi_cudnn_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load cuDNN itself, check if the full cuDNN package was installed on the host in host_injections.
-- This is required for end users to build additional cuDNN dependent software. If the full SDK isn't present, refuse
-- to load the cuDNN module and print an informative message on how to set up GPU support for NESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'cuDNN' then
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the cuDNN software should be installed
local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudnnDirExists = isDir(cudnnEasyBuildDir)
if not cudnnDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where NESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
end

-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA and cuDNN hooks if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack
-- Only apply CUDA and libraries hook if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or library module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cudnn_enabled_load_hook(t)
eessi_cuda_and_libraries_enabled_load_hook(t)
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
# comment to trigger rebuild
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
- cuTENSOR-2.0.1.2-CUDA-12.1.1.eb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,5 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451;
options:
from-pr: 19451
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
- OSU-Micro-Benchmarks-7.2-gompi-2023a-CUDA-12.1.1.eb
- ABySS-2.3.7-foss-2023a.eb
Loading
Loading