Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make link_nvidia_host_libraries.sh script a bit more robust, in case target of host_injections directory is a non-existing directory #437

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 40 additions & 35 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -203,42 +203,47 @@ ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh

# use PR patch file to determine in which easystack files stuff was added
for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do

echo -e "Processing easystack file ${easystack_file}...\n\n"

# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# load EasyBuild module (will be installed if it's not available yet)
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}

${EB} --show-config

echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..."

if [ -f ${easystack_file} ]; then
echo_green "Feeding easystack file ${easystack_file} to EasyBuild..."

${EB} --easystack ${TOPDIR}/${easystack_file} --robot
ec=$?

# copy EasyBuild log file if EasyBuild exited with an error
if [ ${ec} -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
# copy to current working directory
cp -a ${eb_last_log} .
echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}"
# copy to build logs dir (with context added)
copy_build_log "${eb_last_log}" "${build_logs_dir}"
# Collect the easystack files touched by the PR from the patch file:
# take the '+++' lines, keep the path field, strip a leading '<letter>/' prefix,
# keep only easystacks/*.yml paths, and drop any that mention 'known-issues' or 'missing'.
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')
# The expansion must be quoted here: unquoted, a list of several easystack files expands to
# multiple words and '[' fails with an error instead of evaluating the emptiness test.
if [ -z "${changed_easystacks}" ]; then
# ensure the bot reports success, as there was nothing to be built here
echo "No missing installations, party time!"
else
# deliberately unquoted: word splitting yields one easystack path per iteration
for easystack_file in ${changed_easystacks}; do

echo -e "Processing easystack file ${easystack_file}...\n\n"

# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# load EasyBuild module (will be installed if it's not available yet)
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}

${EB} --show-config

echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..."

if [ -f ${easystack_file} ]; then
echo_green "Feeding easystack file ${easystack_file} to EasyBuild..."

${EB} --easystack ${TOPDIR}/${easystack_file} --robot
# remember the exit code of EasyBuild before any other command overwrites $?
ec=$?

# copy EasyBuild log file if EasyBuild exited with an error
if [ ${ec} -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
# copy to current working directory
cp -a ${eb_last_log} .
echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}"
# copy to build logs dir (with context added)
copy_build_log "${eb_last_log}" "${build_logs_dir}"
fi

$TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file}
else
fatal_error "Easystack file ${easystack_file} not found!"
fi

$TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file}
else
fatal_error "Easystack file ${easystack_file} not found!"
fi

done

done
fi

### add packages here

Expand Down
16 changes: 12 additions & 4 deletions scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH
nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader"
if $nvidia_smi_command > /dev/null; then
host_driver_version=$($nvidia_smi_command | tail -n1)
echo_green "Found NVIDIA GPU driver version ${host_driver_version}"
# If the first worked, this should work too
host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}')
echo_green "Found host CUDA version ${host_cuda_version}"
else
error="Failed to successfully execute\n $nvidia_smi_command\n"
fatal_error "$error"
Expand All @@ -58,12 +60,18 @@ fi
# Let's make sure the driver libraries are not already in place
link_drivers=1

# first make sure that target of host_injections variant symlink is an existing directory;
# 'realpath -m' is used so the path is resolved even when (part of) it does not exist yet
host_injections_target=$(realpath -m "${EESSI_CVMFS_REPO}/host_injections")
# expansions are quoted so that an empty or whitespace-containing path cannot turn
# the directory test into a different (always false) expression
if [ ! -d "${host_injections_target}" ]; then
    create_directory_structure "${host_injections_target}"
fi

# Paths below the host_injections tree, built up step by step:
#   - per CPU family NVIDIA directory
#   - its 'host' subdirectory
#   - driver_version.txt in there, which is checked to decide whether the host
#     driver libraries were already linked for the current driver version
host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}"
host_injection_driver_dir="${host_injections_nvidia_dir}"/host
host_injection_driver_version_file="${host_injection_driver_dir}"/driver_version.txt
if [ -e "$host_injection_driver_version_file" ]; then
if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then
echo_green "The host CUDA driver libraries have already been linked!"
echo_green "The host GPU driver libraries (v${host_driver_version}) have already been linked! (based on ${host_injection_driver_version_file})"
link_drivers=0
else
# There's something there but it is out of date
Expand Down Expand Up @@ -91,8 +99,8 @@ if [ "$link_drivers" -eq 1 ]; then
ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null

# Leverage singularity to find the full list of libraries we should be linking to
echo_yellow "Downloading latest version of nvliblist.conf from Apptainer"
curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf
echo_yellow "Downloading latest version of nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf"
# --fail: exit non-zero on an HTTP error instead of saving the error page as nvliblist.conf
# --show-error: still print the reason for a failure although --silent hides the progress meter
# Without a usable nvliblist.conf no libraries can be selected for linking, so stop right away.
if ! curl --silent --show-error --fail --output "${temp_dir}/nvliblist.conf" https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf; then
    fatal_error "Failed to download nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf"
fi

# Make symlinks to all the interesting libraries:
#   1. select the entries of nvliblist.conf that end in '.so' (dot escaped so it only matches a literal dot)
#   2. for each entry, look up the matching paths collected in libs.txt
#   3. create a symlink in the current directory for every path found
# 'xargs -I {}' replaces the deprecated 'xargs -i' spelling.
grep '\.so$' "$temp_dir"/nvliblist.conf | xargs -I {} grep {} "$temp_dir"/libs.txt | xargs -I {} ln -s {}
Expand Down Expand Up @@ -133,4 +141,4 @@ else
ln -s $host_injections_nvidia_dir/latest lib
fi

echo_green "Host NVIDIA gpu drivers linked successfully for EESSI"
echo_green "Host NVIDIA GPU drivers linked successfully for EESSI"