Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA 12.1.1, CUDA samples, and CUDA related hooks and lmodrc changes #434

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
fcc7ddb
Also recreated lmodrc when it was changed in a PR
Dec 19, 2023
2b09d1c
Modified lmodrc to add CUDA support. It now checks if you load the CU…
Dec 19, 2023
62e70ba
Adapt created_lmodrc.py for the new domain
Dec 19, 2023
045c099
Add post_sanitycheck hook for CUDA in order to only ship the files we…
Dec 19, 2023
4a4c6e7
Add (the redistributable part of) CUDA to the softare stack
Dec 19, 2023
0346b22
Add CUDA-Samples to the build list
Dec 19, 2023
5ec4c3b
Merge remote-tracking branch 'upstream/2023.06-software.eessi.io' int…
ocaisa Dec 20, 2023
5905e72
Tweak GPU support implementation
ocaisa Dec 20, 2023
73618a0
Add missing quotes on errors
ocaisa Dec 20, 2023
46727cb
Merge branch '2023.06-software.eessi.io' into cuda_cuda_samples_eessi_io
Dec 20, 2023
039921b
Merge branch 'cuda_cuda_samples_eessi_io' into cuda_cuda_samples_eess…
casparvl Dec 20, 2023
a4e8de7
Merge pull request #1 from ocaisa/cuda_cuda_samples_eessi_io
casparvl Dec 20, 2023
32925fe
Error messages now refer to the scripts that need to be run to instal…
Dec 20, 2023
94a2bfe
Merge branch 'cuda_cuda_samples_eessi_io' of github.com:casparvl/soft…
Dec 20, 2023
a33a0cd
make install_scripts a bit more verbose
boegel Dec 20, 2023
c7b380d
use separate easystack file for CUDA + control order in which easysta…
boegel Dec 20, 2023
f506566
copy EasyBuild log file in case CUDA installation failed in install_c…
boegel Dec 20, 2023
e3ddacc
add additional optional options required for handling NVIDIA support …
boegel Dec 20, 2023
16ddf7f
fix typo when passing --host-injections to container script
boegel Dec 20, 2023
35d6084
correctly pass --nv to singularity command
boegel Dec 20, 2023
fd97667
use quotes when adding --nv
boegel Dec 20, 2023
1917146
comment out running of link_nvidia_host_libraries.sh script, since it…
boegel Dec 20, 2023
f80f0fc
clean up post_sanitycheck_cuda hook and inject_gpu_property function …
boegel Dec 20, 2023
2d37842
remove empty line in eessi-2023.06-eb-4.8.2-2023a.yml
boegel Dec 20, 2023
f007c40
use easyconfigs PR 19451 for installing CUDA-Samples v12.1
boegel Dec 20, 2023
70fa0f9
Ship the scripts, and keep them in a single location
ocaisa Dec 20, 2023
db0c141
Update create_lmodrc.py
ocaisa Dec 21, 2023
293b107
Update create_tarball.sh
ocaisa Dec 21, 2023
73476b2
Only copy scripts if the contents differ
ocaisa Dec 21, 2023
a333a74
Remove temporary test directory
ocaisa Dec 21, 2023
43c73c0
Get rid of copy/paste unfriendly '.'
ocaisa Dec 21, 2023
3ec3df8
Update create_tarball.sh
ocaisa Dec 21, 2023
42e3404
always append to list of files to include in tarball, to avoid overwr…
boegel Dec 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ done

echo ">> Creating/updating Lmod cache..."
export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua"
if [ ! -f $LMOD_RC ]; then
lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?)
if [ ! -f $LMOD_RC ] || [ ${lmodrc_changed} == '0' ]; then
boegel marked this conversation as resolved.
Show resolved Hide resolved
python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH}
check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC"
fi
Expand Down
76 changes: 76 additions & 0 deletions create_lmodrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,81 @@
}
"""

GPU_LMOD_RC ="""require("strict")
local hook = require("Hook")
local open = io.open

-- Return the complete contents of the file at `path` as a string,
-- or nil when the file cannot be opened.
local function read_file(path)
    -- "rb": open read-only, in binary mode
    local handle = open(path, "rb")
    if not handle then
        return nil
    end
    -- "*a" (short for "*all") reads the whole file in one go
    local data = handle:read("*a")
    handle:close()
    return data
end

-- Lmod "load" hook guarding CUDA-related modules.
--
-- 1. When the CUDA module itself is loaded, refuse (via LmodError) unless the
--    matching "easybuild" directory exists under host_injections.
-- 2. When a module carrying the ("arch", "gpu") property is loaded, refuse unless
--    a libcuda.so is found (under host_injections or in a Singularity container),
--    and refuse if the recorded driver CUDA version is older than the version
--    requested through the EESSICUDAVERSION environment variable.
--
-- @param t table passed by Lmod to load hooks; only t.modFullName is used here
--          (presumably of the form "name/version" -- confirm against Lmod docs)
local function cuda_enabled_load_hook(t)
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    -- module name without the version part; fall back to the full name when it contains no "/"
    local simpleName = string.match(t.modFullName, "(.-)/") or t.modFullName
    -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
    -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
    -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
    if simpleName == 'CUDA' then
        -- get the full host_injections path
        local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
        -- build final path where the CUDA software should be installed
        local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
        local cudaDirExists = isDir(cudaEasyBuildDir)
        if not cudaDirExists then
            local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
            advice = advice .. "due to licencing. In order to be able to use the CUDA module, please follow the "
            advice = advice .. "instructions available under https://www.eessi.io/docs/gpu/ \\n"
            LmodError("\\nYou requested to load ", simpleName, " ", advice)
        end
    end
    -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
    -- otherwise, refuse to load the requested module and print error message
    local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
    if haveGpu then
        local arch = os.getenv("EESSI_CPU_FAMILY") or ""
        local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
        local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
        local cudaDriverExists = isFile(cudaDriverFile)
        local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
        if not (cudaDriverExists or singularityCudaExists) then
            local advice = "which relies on the CUDA runtime environment and driver libraries. "
            advice = advice .. "In order to be able to use the module, please follow the instructions "
            advice = advice .. "available under https://www.eessi.io/docs/gpu/ \\n"
            LmodError("\\nYou requested to load ", simpleName, " ", advice)
        else
            -- CUDA driver exists, now we check its version to see if an update is needed
            if cudaDriverExists then
                local cudaVersion = read_file(cudaVersionFile)
                local cudaVersion_req = os.getenv("EESSICUDAVERSION")
                -- driver CUDA versions don't give a patch version for CUDA, so only major.minor is compared;
                -- the 'or ""' guards keep string.match from raising when the file or the variable is missing
                local major, minor = string.match(cudaVersion or "", "(%d+)%.(%d+)")
                local major_req, minor_req = string.match(cudaVersion_req or "", "(%d+)%.(%d+)")
                -- string.match returns strings: convert to numbers, otherwise "<" compares
                -- lexicographically and e.g. "9" < "12" evaluates to false
                major, minor = tonumber(major), tonumber(minor)
                major_req, minor_req = tonumber(major_req), tonumber(minor_req)
                -- the version check is skipped when either version could not be parsed
                if major and minor and major_req and minor_req then
                    local driver_libs_need_update = major < major_req
                        or (major == major_req and minor < minor_req)
                    if driver_libs_need_update then
                        local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
                        advice = advice .. "Please update your CUDA driver libraries and then follow the instructions "
                        advice = advice .. "under https://www.eessi.io/docs/gpu/ to let EESSI know about the update.\\n"
                        LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
                    end
                end
            end
        end
    end
end

hook.register("load", cuda_enabled_load_hook)
"""

def error(msg):
sys.stderr.write("ERROR: %s\n" % msg)
Expand All @@ -36,6 +111,7 @@ def error(msg):
'dot_lmod': DOT_LMOD,
'prefix': prefix,
}
lmodrc_txt += '\n' + GPU_LMOD_RC
try:
os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True)
with open(lmodrc_path, 'w') as fp:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ easyconfigs:
- Boost-1.82.0-GCC-12.3.0.eb
- netCDF-4.9.2-gompi-2023a.eb
- FFmpeg-6.0-GCCcore-12.3.0.eb
- CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb
# use an easyconfig that only installs a subset of the CUDA samples,
# to circumvent problem with nvcc linking to glibc of host OS;
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189
options:
from-pr: 19189
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ easyconfigs:
- Nextflow-23.10.0.eb:
options:
from-pr: 19172
- CUDA-12.1.1.eb:
casparvl marked this conversation as resolved.
Show resolved Hide resolved
options:
include-easyblocks-from-pr: 3045
accept-eula-for: CUDA
88 changes: 88 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def parse_hook(ec, *args, **kwargs):
if ec.name in PARSE_HOOKS:
PARSE_HOOKS[ec.name](ec, eprefix)

# inject the GPU property (if required)
ec = inject_gpu_property(ec)


def post_ready_hook(self, *args, **kwargs):
"""
Expand Down Expand Up @@ -247,6 +250,12 @@ def pre_configure_hook(self, *args, **kwargs):
PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs)


def post_sanitycheck_hook(self, *args, **kwargs):
    """Main post-sanity-check hook: dispatch to the custom function registered for this software name, if any."""
    custom_hook = POST_SANITYCHECK_HOOKS.get(self.name)
    if custom_hook is None:
        # nothing registered for this software name
        return
    custom_hook(self, *args, **kwargs)


def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs):
"""
Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC
Expand Down Expand Up @@ -393,6 +402,81 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && "


def post_sanitycheck_cuda(self, *args, **kwargs):
    """
    Delete CUDA files we are not allowed to ship and replace them with a symlink
    to a possible installation under host_injections.

    The names of files that may be kept are collected from the part of
    <installdir>/EULA.txt between the "2.6. Attachment A" and "2.7. Attachment B"
    headings. Every regular (non-symlink) file under self.installdir whose name
    (up to the first ".") is not in that list is removed and replaced by a symlink
    to the same path with "versions" replaced by "host_injections".

    :param self: object providing the 'installdir' attribute (the CUDA installation directory)
    :raises EasyBuildError: if the derived whitelist looks wrong, if the symlink location and
                            its target are identical, or if creating a symlink fails
    """
    print_msg("Replacing CUDA stuff we cannot ship with symlinks...")
    # read CUDA EULA, keeping only the lines between the two attachment headings
    eula_path = os.path.join(self.installdir, "EULA.txt")
    tmp_buffer = []
    with open(eula_path) as infile:
        copy = False
        for line in infile:
            if line.strip() == "2.6. Attachment A":
                copy = True
                continue
            elif line.strip() == "2.7. Attachment B":
                copy = False
                continue
            elif copy:
                tmp_buffer.append(line)
    # create whitelist without file extensions, they're not really needed and they only complicate things
    whitelist = ['EULA', 'README']
    file_extensions = [".so", ".a", ".h", ".bc"]
    for tmp in tmp_buffer:
        for word in tmp.split():
            if any(ext in word for ext in file_extensions):
                whitelist.append(word.split(".")[0])
    whitelist = list(set(whitelist))
    # Do some quick checks for things we should or shouldn't have in the list
    if "nvcc" in whitelist:
        raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist)
    if "libcudart" not in whitelist:
        raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist)
    # iterate over all files in the CUDA path
    for root, dirs, files in os.walk(self.installdir):
        for filename in files:
            source = os.path.join(root, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(source):
                continue
            # check if the current file is part of the whitelist
            basename = filename.split(".")[0]
            if basename in whitelist:
                continue
            # not in the whitelist: delete the file and create a symlink to host_injections
            target = source.replace("versions", "host_injections")
            # Make sure source and target are not the same
            if source == target:
                raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you are "
                                     "using this hook for an EESSI installation?" % (source, target))
            os.remove(source)
            # os.symlink does not require the link target to exist (a dangling link is fine),
            # so there is no need to shell out to 'ln -s' with hand-quoted paths
            try:
                os.symlink(target, source)
            except OSError as err:
                raise EasyBuildError("Failed to create symbolic link %s -> %s: %s" % (source, target, err))


def inject_gpu_property(ec):
    """
    Tag easyconfigs that depend on CUDA with the Lmod ("arch", "gpu") property.

    If a dependency named exactly "CUDA" is present, every dependency whose name
    contains "CUDA" is moved from 'dependencies' to 'builddependencies', and
    'modluafooter' gets an add_property("arch","gpu") line plus a setenv of
    EESSICUDAVERSION to the version of the last such dependency.

    :param ec: easyconfig-like object providing asdict(), log.info() and item assignment
    :return: the same ec object that was passed in
    """
    ec_dict = ec.asdict()
    # Check if CUDA is in the dependencies, if so add the GPU Lmod tag
    if "CUDA" in [dep[0] for dep in ec_dict["dependencies"]]:
        ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version")
        key = "modluafooter"
        value = 'add_property("arch","gpu")'
        cuda_version = 0
        # Iterate over a snapshot: entries are removed from the live list inside the loop, and
        # removing while iterating the same list would skip the entry that follows each removal.
        for dep in list(ec_dict["dependencies"]):
            # Make CUDA a build dependency only (rpathing saves us from link errors)
            # NOTE(review): this is a substring match, so names such as "CUDA-Samples" are moved too
            # and can determine cuda_version -- confirm whether an exact match on "CUDA" is intended
            if "CUDA" in dep[0]:
                cuda_version = dep[1]
                ec_dict["dependencies"].remove(dep)
                if dep not in ec_dict["builddependencies"]:
                    ec_dict["builddependencies"].append(dep)
        value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])
        if key in ec_dict:
            # only append when the footer does not already contain these lines
            if value not in ec_dict[key]:
                ec[key] = "\n".join([ec_dict[key], value])
        else:
            ec[key] = value
    return ec


PARSE_HOOKS = {
'CGAL': parse_hook_cgal_toolchainopts_precise,
'fontconfig': parse_hook_fontconfig_add_fonts,
Expand Down Expand Up @@ -424,3 +508,7 @@ def pre_single_extension_isoband(ext, *args, **kwargs):
'isoband': pre_single_extension_isoband,
'testthat': pre_single_extension_testthat,
}

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
}
Loading