From a333a741bb75a68f6d29cb718472af70e3b5c912 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 21 Dec 2023 12:19:19 +0100 Subject: [PATCH] Remove temporary test directory --- .../nvidia/install_cuda_host_injections.sh | 211 ------------------ .../nvidia/link_nvidia_host_libraries.sh | 136 ----------- temp/scripts/utils.sh | 144 ------------ 3 files changed, 491 deletions(-) delete mode 100755 temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh delete mode 100755 temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh delete mode 100644 temp/scripts/utils.sh diff --git a/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh deleted file mode 100755 index a9310d817a..0000000000 --- a/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env bash - -# This script can be used to install CUDA under the `.../host_injections` directory. -# This provides the parts of the CUDA installation that cannot be redistributed as -# part of EESSI due to license limitations. While GPU-based software from EESSI will -# _run_ without these, installation of additional CUDA software requires the CUDA -# installation(s) under `host_injections` to be present. -# -# The `host_injections` directory is a variant symlink that by default points to -# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see -# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the -# installation to be successful, this directory needs to be writeable by the user -# executing this script. - -# Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# Function to display help message -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --help Display this help message" - echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" - echo " CUDA, see the EULA at" - echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" - echo " storage during the CUDA install" - echo " (must have >10GB available)" -} - -# Initialize variables -install_cuda_version="" -eula_accepted=0 - -# Parse command-line options -while [[ $# -gt 0 ]]; do - case "$1" in - --help) - show_help - exit 0 - ;; - -c|--cuda-version) - if [ -n "$2" ]; then - install_cuda_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - --accept-cuda-eula) - eula_accepted=1 - shift 1 - ;; - -t|--temp-dir) - if [ -n "$2" ]; then - CUDA_TEMP_DIR="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - *) - show_help - fatal_error "Error: Unknown option: $1" - ;; - esac -done - -# Make sure EESSI is initialised -check_eessi_initialised - -# Make sure the CUDA version supplied is a semantic version -is_semantic_version() { - local version=$1 - local regex='^[0-9]+\.[0-9]+\.[0-9]+$' - - if [[ $version =~ $regex ]]; then - return 0 # Return success (0) if it's a semantic version - else - return 1 # Return failure (1) if it's not a semantic version - fi -} -if ! is_semantic_version "$install_cuda_version"; then - show_help - error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" - error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably one of those available under\n" - error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" - fatal_error "${error}" -fi - -# Make sure they have accepted the CUDA EULA -if [ "$eula_accepted" -ne 1 ]; then - show_help - error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" - fatal_error "${error}" -fi - -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) -cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} - -# Only install CUDA if specified version is not found. -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! mkdir "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" - fi - - if ! command -v "eb" &>/dev/null; then - echo_yellow "Attempting to load an EasyBuild module to do actual install" - module load EasyBuild - # There are some scenarios where this may fail - if [ $? -ne 0 ]; then - error="'eb' command not found in your environment and\n" - error="${error} module load EasyBuild\n" - error="${error}failed for some reason.\n" - error="${error}Please re-run this script with the 'eb' command available." - fatal_error "${error}" - fi - fi - - cuda_easyconfig="CUDA-${install_cuda_version}.eb" - - # Check the easyconfig file is available in the release - # (eb search always returns 0, so we need a grep to ensure a usable exit code) - eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 - # Check the exit code - if [ $? -ne 0 ]; then - eb_version=$(eb --version) - available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA) - - error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" - error="${error} ${eb_version}\n" - error="${error}You either need to give a different version of CUDA to install _or_ \n" - error="${error}use a different version of EasyBuild for the installation.\n" - error="${error}\nThe versions of available with the current eb command are:\n" - error="${error}${available_cuda_easyconfigs}" - fatal_error "${error}" - fi - - # We need the --rebuild option, as the CUDA module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the CUDA - # installation in the `.../versions/...` prefix - # We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. - extra_args="--rebuild --installpath-modules=${tmpdir}" - - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" - ret=$? - if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." - else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi diff --git a/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh deleted file mode 100755 index e6ff110797..0000000000 --- a/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash - -# This script links host libraries related to GPU drivers to a location where -# they can be found by the EESSI linker - -# Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# We rely on ldconfig to give us the location of the libraries on the host -command_name="ldconfig" -# We cannot use a version of ldconfig that's being shipped under CVMFS -exclude_prefix="/cvmfs" - -found_paths=() -# Always attempt to use /sbin/ldconfig -if [ -x "/sbin/$command_name" ]; then - found_paths+=("/sbin/$command_name") -fi -IFS=':' read -ra path_dirs <<< "$PATH" -for dir in "${path_dirs[@]}"; do - if [ "$dir" = "/sbin" ]; then - continue # we've already checked for $command_name in /sbin, don't need to do it twice - fi - if [[ ! "$dir" =~ ^$exclude_prefix ]]; then - if [ -x "$dir/$command_name" ]; then - found_paths+=("$dir/$command_name") - fi - fi -done - -if [ ${#found_paths[@]} -gt 0 ]; then - echo "Found $command_name in the following locations:" - printf -- "- %s\n" "${found_paths[@]}" - echo "Using first version" - host_ldconfig=${found_paths[0]} -else - error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." - fatal_error "$error" -fi - -# Make sure EESSI is initialised (doesn't matter what version) -check_eessi_initialised - -# Find the CUDA version of the host CUDA drivers -# (making sure that this can still work inside prefix environment inside a container) -export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH -nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_command > /dev/null; then - host_driver_version=$($nvidia_smi_command | tail -n1) - # If the first worked, this should work too - host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') -else - error="Failed to successfully execute\n $nvidia_smi_command\n" - fatal_error "$error" -fi - -# Let's make sure the driver libraries are not already in place -link_drivers=1 - -host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" -host_injection_driver_dir="${host_injections_nvidia_dir}/host" -host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" -if [ -e "$host_injection_driver_version_file" ]; then - if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then - echo_green "The host CUDA driver libraries have already been linked!" - link_drivers=0 - else - # There's something there but it is out of date - echo_yellow "Cleaning out outdated symlinks" - rm $host_injection_driver_dir/* - if [ $? -ne 0 ]; then - error="Unable to remove files under '$host_injection_driver_dir'." - fatal_error "$error" - fi - fi -fi - -drivers_linked=0 -if [ "$link_drivers" -eq 1 ]; then - if ! create_directory_structure "${host_injection_driver_dir}" ; then - fatal_error "No write permissions to directory ${host_injection_driver_dir}" - fi - cd ${host_injection_driver_dir} - # Need a small temporary space to hold a couple of files - temp_dir=$(mktemp -d) - - # Gather libraries on the host (_must_ be host ldconfig) - $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt - # Allow for the fact that we may be in a container so the CUDA libs might be in there - ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null - - # Leverage singularity to find the full list of libraries we should be linking to - echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" - curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf - - # Make symlinks to all the interesting libraries - grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} - - # Inject driver and CUDA versions into dir - echo $host_driver_version > driver_version.txt - echo $host_cuda_version > cuda_version.txt - drivers_linked=1 - - # Remove the temporary directory when done - rm -r "$temp_dir" -fi - -# Make latest symlink for NVIDIA drivers -cd $host_injections_nvidia_dir -symlink="latest" -if [ -L "$symlink" ]; then - # Unless the drivers have been installed, leave the symlink alone - if [ "$drivers_linked" -eq 1 ]; then - ln -sf host latest - fi -else - # No link exists yet - ln -s host latest -fi - -# Make sure the libraries can be found by the EESSI linker -host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} -if [ -L "$host_injection_linker_dir/lib" ]; then - target_path=$(readlink -f "$host_injection_linker_dir/lib") - if [ "$target_path" != "$$host_injections_nvidia_dir/latest" ]; then - cd $host_injection_linker_dir - ln -sf $host_injections_nvidia_dir/latest lib - fi -else - create_directory_structure $host_injection_linker_dir - cd $host_injection_linker_dir - ln -s $host_injections_nvidia_dir/latest lib -fi - -echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" diff --git a/temp/scripts/utils.sh b/temp/scripts/utils.sh deleted file mode 100644 index b2be3f6221..0000000000 --- a/temp/scripts/utils.sh +++ /dev/null @@ -1,144 +0,0 @@ -function echo_green() { - echo -e "\e[32m$1\e[0m" -} - -function echo_red() { - echo -e "\e[31m$1\e[0m" -} - -function echo_yellow() { - echo -e "\e[33m$1\e[0m" -} - -ANY_ERROR_EXITCODE=1 -function fatal_error() { - echo_red "ERROR: $1" >&2 - if [[ $# -gt 1 ]]; then - exit "$2" - else - exit "${ANY_ERROR_EXITCODE}" - fi -} - -function check_exit_code { - ec=$1 - ok_msg=$2 - fail_msg=$3 - - if [[ $ec -eq 0 ]]; then - echo_green "${ok_msg}" - else - fatal_error "${fail_msg}" - fi -} - -function check_eessi_initialised() { - if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "EESSI has not been initialised!" - else - return 0 - fi -} - -function check_in_prefix_shell() { - # Make sure EPREFIX is defined - if [[ -z "${EPREFIX}" ]]; then - fatal_error "This script cannot be used without having first defined EPREFIX" - fi - if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then - fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" - fi -} - -function create_directory_structure() { - # Ensure we are given a single path argument - if [ $# -ne 1 ]; then - echo_red "Function requires a single (relative or absolute) path argument" >&2 - return $ANY_ERROR_EXITCODE - fi - dir_structure="$1" - - # Attempt to create the directory structure - error_message=$(mkdir -p "$dir_structure" 2>&1) - return_code=$? - # If it fails be explicit about the error - if [ ${return_code} -ne 0 ]; then - real_dir=$(realpath -m "$dir_structure") - echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 - else - # If we're creating it, our use case is that we want to be able to write there - # (this is a check in case the directory already existed) - if [ ! -w "${dir_structure}" ]; then - real_dir=$(realpath -m "$dir_structure") - echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" - return_code=$ANY_ERROR_EXITCODE - fi - fi - - return $return_code -} - -function get_path_for_tool { - tool_name=$1 - tool_envvar_name=$2 - - which_out=$(which "${tool_name}" 2>&1) - exit_code=$? - if [[ ${exit_code} -eq 0 ]]; then - echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 - echo "${which_out}" - return 0 - fi - if [[ -z "${tool_envvar_name}" ]]; then - msg="no env var holding the full path to tool '${tool_name}' provided" - echo "${msg}" >&2 - return 1 - else - tool_envvar_value=${!tool_envvar_name} - if [[ -x "${tool_envvar_value}" ]]; then - msg="INFO: found tool ${tool_envvar_value} via env var ${tool_envvar_name}" - echo "${msg}" >&2 - echo "${tool_envvar_value}" - return 0 - else - msg="ERROR: tool '${tool_name}' not in PATH\n" - msg+="ERROR: tool '${tool_envvar_value}' via '${tool_envvar_name}' not in PATH" - echo "${msg}" >&2 - echo "" - return 2 - fi - fi -} - -function get_host_from_url { - url=$1 - re="(http|https)://([^/:]+)" - if [[ $url =~ $re ]]; then - echo "${BASH_REMATCH[2]}" - return 0 - else - echo "" - return 1 - fi -} - -function get_port_from_url { - url=$1 - re="(http|https)://[^:]+:([0-9]+)" - if [[ $url =~ $re ]]; then - echo "${BASH_REMATCH[2]}" - return 0 - else - echo "" - return 1 - fi -} - -function get_ipv4_address { - hname=$1 - hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) - # TODO try other methods if the one above does not work --> tool that verifies - # what method can be used? - echo "${hipv4}" - return 0 -}