Skip to content

Commit

Permalink
{2023.06}[system] cuDNN/8.9.2.26-CUDA-12.1.1
Browse files Browse the repository at this point in the history
  • Loading branch information
truib committed May 17, 2024
1 parent 158ad00 commit 6664591
Show file tree
Hide file tree
Showing 6 changed files with 321 additions and 5 deletions.
5 changes: 3 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ pr_diff=$(ls [0-9]*.diff | head -1)
# for now, this just reinstalls all scripts. Not the most elegant, but it works
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}

# Install full CUDA SDK in host_injections
# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
Expand All @@ -234,8 +234,9 @@ fi

if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 --accept-cuda-eula
else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
Expand Down
29 changes: 27 additions & 2 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,38 @@
end
end
-- Load hook that refuses to load a cuDNN module when the full cuDNN package is missing on the host.
--
-- The full package is expected under host_injections, at the path obtained by replacing
-- the versions component of $EESSI_SOFTWARE_PATH with host_injections. It is required for end users
-- to build additional cuDNN dependent software. If its easybuild subdirectory is not present,
-- LmodError is raised with a message on how to set up GPU support for EESSI.
--
-- t: table describing the module being loaded; only t.modFullName is read here
local function eessi_cudnn_enabled_load_hook(t)
    -- name of the module without its version, i.e. everything before the first slash
    local simpleName = string.match(t.modFullName, "(.-)/")
    -- this hook only concerns the cuDNN module itself
    if simpleName ~= 'cuDNN' then
        return
    end

    local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
    -- get the full host_injections path
    local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
    -- build final path where the cuDNN software should be installed
    local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
    if isDir(cudnnEasyBuildDir) then
        return
    end

    local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
    advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where EESSI "
    advice = advice .. "can find it.\\n"
    advice = advice .. refer_to_docs
    LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
    -- Modules from outside the EESSI prefix (e.g. a CUDA or cuDNN module from a local software stack)
    -- are left alone, which avoids getting an Lmod Error when loading them
    if not from_eessi_prefix(t) then
        return
    end
    -- Apply the CUDA hook first, then the cuDNN hook
    eessi_cuda_enabled_load_hook(t)
    eessi_cudnn_enabled_load_hook(t)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
79 changes: 79 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,64 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")



def post_sanitycheck_cudnn(self, *args, **kwargs):
    """
    Remove files from cuDNN installation that we are not allowed to ship,
    and replace them with a symlink to a corresponding installation under host_injections.

    The files that may be kept are 'LICENSE' plus every dot-prefixed word (e.g. a file extension)
    found on the '2. Distribution' line of the LICENSE file in the installation directory.
    A file is kept when either the part of its name before the first dot, or its first
    dot-separated extension, is in that allowlist. Existing symlinks are left untouched.

    :param self: object providing ``name``, ``installdir`` and ``log`` (``info``/``debug``)
    :raises EasyBuildError: if ``self.name`` is not 'cuDNN', or if replacing 'versions' with
                            'host_injections' leaves a file path unchanged
    """
    if self.name != 'cuDNN':
        raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")

    print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

    allowlist = ['LICENSE']

    # read cuDNN LICENSE, construct allowlist based on section 2. Distribution
    # that specifies list of files that can be shipped
    license_path = os.path.join(self.installdir, 'LICENSE')
    search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
    with open(license_path) as infile:
        for line in infile:
            # match and slice the same (stripped) text, so leading whitespace cannot shift the offset
            stripped = line.strip()
            if stripped.startswith(search_string):
                # remove search string, split into words, remove trailing
                # dots '.' and only retain words starting with a dot '.'
                distributable = stripped[len(search_string):]
                for word in distributable.split():
                    if word.startswith('.'):
                        entry = word.rstrip('.')
                        # a bare '.' or '...' would yield an empty entry that matches every dotfile
                        if entry:
                            allowlist.append(entry)

    allowlist = sorted(set(allowlist))
    self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

    # iterate over all files in the cuDNN installation directory
    for dir_path, _, files in os.walk(self.installdir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(full_path):
                continue

            # check if the current file is part of the allowlist
            name_parts = filename.split('.')
            basename = name_parts[0]
            # None when the file name has no dot; None is never in the allowlist
            extension = '.' + name_parts[1] if len(name_parts) > 1 else None

            if basename in allowlist:
                self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
            elif extension in allowlist:
                self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
            else:
                self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                               filename, full_path)
                # if it is not in the allowlist, delete the file and create a symlink to host_injections;
                # only the first 'versions' is replaced, so a later path component or file name
                # that happens to contain 'versions' is left intact
                host_inj_path = full_path.replace('versions', 'host_injections', 1)
                # make sure source and target of symlink are not the same
                if full_path == host_inj_path:
                    raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                         "are using this hook for a NESSI installation?",
                                         full_path, host_inj_path)
                remove_file(full_path)
                symlink(host_inj_path, full_path)


def inject_gpu_property(ec):
"""
Add 'gpu' property, via modluafooter easyconfig parameter
Expand All @@ -656,6 +714,26 @@ def inject_gpu_property(ec):
ec[key] = '\n'.join([ec_dict[key], value])
else:
ec[key] = value

# Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property
if 'cuDNN' in [dep[0] for dep in ec_dict['dependencies']]:
    ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version")
    key = 'modluafooter'
    value = 'add_property("arch","gpu")'
    cudnn_version = 0
    # Iterate over a snapshot of the list: entries are removed from ec_dict['dependencies'] inside
    # the loop, and removing from a list while iterating over it skips the element after each removal
    for dep in list(ec_dict['dependencies']):
        # Make cuDNN a build dependency only (rpathing saves us from link errors)
        if 'cuDNN' in dep[0]:
            cudnn_version = dep[1]
            ec_dict['dependencies'].remove(dep)
            if dep not in ec_dict['builddependencies']:
                ec_dict['builddependencies'].append(dep)
    value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version])
    # Append to an existing modluafooter (unless already present there), otherwise set it
    if key in ec_dict:
        if value not in ec_dict[key]:
            ec[key] = '\n'.join([ec_dict[key], value])
    else:
        ec[key] = value
return ec


Expand Down Expand Up @@ -709,4 +787,5 @@ def inject_gpu_property(ec):

# Maps a software name to the post_sanitycheck_* function defined for it in this file.
# NOTE(review): presumably looked up by easyconfig name after the sanity check step - confirm at the call site.
POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cudnn,
}
2 changes: 1 addition & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh link_nvidia_host_libraries.sh
install_cuda_host_injections.sh install_cudnn_host_injections.sh link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
210 changes: 210 additions & 0 deletions scripts/gpu_support/nvidia/install_cudnn_host_injections.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env bash

# This script can be used to install cuDNN under the `.../host_injections` directory.
# This provides the parts of the cuDNN installation that cannot be redistributed as
# part of EESSI due to license limitations. While GPU-based software from EESSI will
# _run_ without these, installation of additional software requires the cuDNN
# installation(s) under `host_injections` to be present.
#
# The `host_injections` directory is a variant symlink that by default points to
# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see
# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the
# installation to be successful, this directory needs to be writeable by the user
# executing this script.

# Initialise our bash functions
# (presumably utils.sh provides helpers such as fatal_error and echo_green used further down)
# Every expansion is quoted so that a script location containing spaces still resolves correctly
TOPDIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
source "${TOPDIR}"/../../utils.sh

# Function to display help message
show_help() {
    # One printf call emits every help line, each followed by a newline
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "Options:" \
        " --help Display this help message" \
        " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to be used" \
        " when installing cuDNN (must" \
        " have a corresponding easyconfig in the" \
        " EasyBuild release)" \
        " -d, --cudnn-version CUDNN_VERSION Specify a version of cuDNN to install (must" \
        " have a corresponding easyconfig in the" \
        " EasyBuild release)" \
        " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" \
        " storage during the cuDNN install" \
        " (must have >10GB available)"
}

# Initialize variables
cuda_version=""
cudnn_version=""

# Stop with an error and the usage text when option $1 was given without a value ($2 is empty)
require_option_value() {
    if [ -z "$2" ]; then
        echo "Error: Argument required for $1"
        show_help
        exit 1
    fi
}

# Parse command-line options
while [[ $# -gt 0 ]]; do
    case "$1" in
        --help)
            show_help
            exit 0
            ;;
        -c|--cuda-version)
            require_option_value "$1" "$2"
            cuda_version="$2"
            shift 2
            ;;
        -d|--cudnn-version)
            require_option_value "$1" "$2"
            cudnn_version="$2"
            shift 2
            ;;
        -t|--temp-dir)
            require_option_value "$1" "$2"
            CUDA_TEMP_DIR="$2"
            shift 2
            ;;
        *)
            show_help
            fatal_error "Error: Unknown option: $1"
            ;;
    esac
done

# Make sure EESSI is initialised
check_eessi_initialised

# Make sure the CUDA version supplied is a semantic version
is_semantic_version() {
    # Succeeds (0) when $1 consists of exactly three dot-separated groups of digits, fails (1) otherwise
    local candidate="$1"
    local semver_pattern='^[0-9]+\.[0-9]+\.[0-9]+$'

    # the status of the [[ ... ]] test is the function's return status
    [[ "${candidate}" =~ ${semver_pattern} ]]
}
# Stop with the usage text and an explanation unless the CUDA version has three numeric components
is_semantic_version "$cuda_version" || {
    show_help
    error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n"
    error+="command line option. This script is intended for use with EESSI so the 'correct'\n"
    error+="version to provide is probably one of those available under\n"
    error+="$EESSI_SOFTWARE_PATH/software/cuDNN\n"
    fatal_error "${error}"
}

# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
cudnn_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}

# Only install cuDNN if specified version is not found.
# (existence of easybuild subdir implies a successful install)
# The candidate directories are collected in an array: a glob placed directly inside `[ -d ... ]`
# makes the test fail with a syntax error as soon as it matches more than one installation.
# When a cuDNN version was requested, only that exact installation counts as "found".
if [[ -n "${cudnn_version}" ]]; then
    existing_installs=( "${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version}/easybuild" )
else
    existing_installs=( "${cudnn_install_parent}"/software/cuDNN/*-CUDA-"${cuda_version}"/easybuild )
fi

if [ -d "${existing_installs[0]}" ]; then
    echo_green "cuDNN software found! No need to install cuDNN again."
else
    # The easyconfig name is built from the cuDNN version, so it has to be known before installing
    if [[ -z "${cudnn_version}" ]]; then
        show_help
        fatal_error "You must provide the cuDNN version to install (e.g., 8.9.2.26) via -d/--cudnn-version"
    fi

    # We need to be able write to the installation space so let's make sure we can
    if ! create_directory_structure "${cudnn_install_parent}"/software/cuDNN ; then
        fatal_error "No write permissions to directory ${cudnn_install_parent}/software/cuDNN"
    fi

    # we need a directory we can use for temporary storage
    if [[ -z "${CUDA_TEMP_DIR}" ]]; then
        tmpdir=$(mktemp -d)
    else
        tmpdir="${CUDA_TEMP_DIR}"/temp
        if ! mkdir "$tmpdir" ; then
            fatal_error "Could not create directory ${tmpdir}"
        fi
    fi

    # Sizes below are compared against the output of `df --output=avail`
    required_space_in_tmpdir=50000
    # Let's see if we have sources and build locations defined if not, we use the temporary space
    if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
        export EASYBUILD_BUILDPATH=${tmpdir}/build
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi
    if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
        export EASYBUILD_SOURCEPATH=${tmpdir}/sources
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi

    # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
    # need to do a space check before we proceed
    avail_space=$(df --output=avail "${cudnn_install_parent}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < 5000000 )); then
        fatal_error "Need at least 5GB disk space to install cuDNN under ${cudnn_install_parent}, exiting now..."
    fi
    avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < required_space_in_tmpdir )); then
        error="Need at least ${required_space_in_tmpdir} KB of disk space under ${tmpdir}.\n"
        error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check. "
        error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
        error="${error}to reduce this requirement. Exiting now..."
        fatal_error "${error}"
    fi

    if ! command -v "eb" &>/dev/null; then
        echo_yellow "Attempting to load an EasyBuild module to do actual install"
        # There are some scenarios where this may fail
        if ! module load EasyBuild; then
            error="'eb' command not found in your environment and\n"
            error="${error} module load EasyBuild\n"
            error="${error}failed for some reason.\n"
            error="${error}Please re-run this script with the 'eb' command available."
            fatal_error "${error}"
        fi
    fi

    cudnn_easyconfig="cuDNN-${cudnn_version}-CUDA-${cuda_version}.eb"

    # Check the easyconfig file is available in the release
    # (eb search always returns 0, so we need a grep to ensure a usable exit code)
    # The search patterns are quoted so the shell hands them to eb untouched
    if ! eb --search "^${cudnn_easyconfig}" | grep cuDNN > /dev/null 2>&1; then
        eb_version=$(eb --version)
        available_cudnn_easyconfigs=$(eb --search "^cuDNN-.*\.eb" | grep cuDNN)

        error="The easyconfig ${cudnn_easyconfig} was not found in EasyBuild version:\n"
        error="${error} ${eb_version}\n"
        error="${error}You either need to give a different version of cuDNN/CUDA to install _or_ \n"
        error="${error}use a different version of EasyBuild for the installation.\n"
        error="${error}\nThe versions of cuDNN available with the current eb command are:\n"
        error="${error}${available_cudnn_easyconfigs}"
        fatal_error "${error}"
    fi

    # We need the --rebuild option, as the cuDNN module may or may not be on the
    # `MODULEPATH` yet. Even if it is, we still want to redo this installation
    # since it will provide the symlinked targets for the parts of the cuDNN
    # installation in the `.../versions/...` prefix
    # We install the module in our `tmpdir` since we do not need the modulefile,
    # we only care about providing the targets for the symlinks.
    extra_args="--rebuild --installpath-modules=${tmpdir}"

    # We don't want hooks used in this install, we need a vanilla cuDNN installation
    touch "$tmpdir"/none.py
    # shellcheck disable=SC2086  # Intended splitting of extra_args
    if ! eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cudnn_install_parent}"/ "${cudnn_easyconfig}"; then
        # keep a copy of the log in the current directory, where the user can find it
        eb_last_log=$(unset EB_VERBOSE; eb --last-log)
        cp -a "${eb_last_log}" .
        fatal_error "cuDNN installation failed, please check EasyBuild logs $(basename "${eb_last_log}")..."
    else
        echo_green "cuDNN installation at ${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version} succeeded!"
    fi
    # clean up tmpdir
    rm -rf "${tmpdir}"
fi

0 comments on commit 6664591

Please sign in to comment.