Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add self hosted runner to get GPU testing on CSD3 #139

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/check_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Run the GPU test suite and record the outcome for the CI step that polls for it.
#
# Files written in the current directory:
#   output.test  - everything Julia printed (stdout and stderr)
#   results.test - "1" if Julia exited with status 0, "0" otherwise
#
# results.test is written last, so once it exists output.test is complete.

# Julia binary to use; override by exporting JULIA_BIN before submitting the job.
JULIA_BIN="${JULIA_BIN:-/home/js2430/rds/hpc-work/OceanBioME-runner/_work/_temp/julia-1.9.3/bin/julia}"

# stderr is captured too: Julia's logging macros and uncaught-exception traces are
# written there and would otherwise be missing from the report.
"$JULIA_BIN" -O0 --color=yes --project test/gpu_runtests.jl > "output.test" 2>&1; ec=$?
if [ "$ec" -eq 0 ]; then
    echo "1" > "results.test"
else
    echo "0" > "results.test"
fi
105 changes: 105 additions & 0 deletions .github/workflows/gpu-test.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/bin/bash
#!
#! Example SLURM job script for Wilkes3 (AMD EPYC 7763, ConnectX-6, A100)
#! Last updated: Fri 30 Jul 11:07:58 BST 2021
#!
#! Used here to run the test wrapper .github/workflows/check_tests.sh on one GPU.
#! Submit it from the repository root so that relative paths resolve.

#!#############################################################
#!#### Modify the options in this section as appropriate ######
#!#############################################################

#! sbatch directives begin here ###############################
#! Name of the job:
#SBATCH -J OceanBioME-testing
#! Which project should be charged (NB Wilkes2 projects end in '-GPU'):
#SBATCH -A taylor-sl3-gpu
#! How many whole nodes should be allocated?
#SBATCH --nodes=1
#! How many (MPI) tasks will there be in total?
#! Note probably this should not exceed the total number of GPUs in use.
#SBATCH --ntasks=1
#! Specify the number of GPUs per node (between 1 and 4; must be 4 if nodes>1).
#! Note that the job submission script will enforce no more than 32 cpus per GPU.
#SBATCH --gres=gpu:1
#! How much wallclock time will be required?
#SBATCH --time=02:00:00
#! What types of email messages do you wish to receive?
#SBATCH --mail-type=NONE
#! Uncomment this to prevent the job from being requeued (e.g. if
#! interrupted by node failure or system downtime):
##SBATCH --no-requeue

#! Do not change:
#SBATCH -p ampere

#! sbatch directives end here (put any additional directives above this line)

#! Notes:
#! Charging is determined by GPU number*walltime.

#! Number of nodes and tasks per node allocated by SLURM (do not change):
numnodes=$SLURM_JOB_NUM_NODES
numtasks=$SLURM_NTASKS
mpi_tasks_per_node=$(echo "$SLURM_TASKS_PER_NODE" | sed -e 's/^\([0-9][0-9]*\).*$/\1/')
#! ############################################################
#! Modify the settings below to specify the application's environment, location
#! and launch method:

#! Optionally modify the environment seen by the application
#! (note that SLURM reproduces the environment at submission irrespective of ~/.bashrc):
. /etc/profile.d/modules.sh                # Leave this line (enables the module command)
module purge                               # Removes all modules still loaded
module load rhel8/default-amp              # REQUIRED - loads the basic environment

#! Insert additional module load commands after this line if needed:

#! Command that launches the application, relative to the work directory below.
#! The wrapper lives next to this script in .github/workflows/, not in the
#! repository root, and is run through bash so it does not need the executable bit.
#! TODO - work out how to make the Julia version get passed to this script
application="bash .github/workflows/check_tests.sh"

#! Run options for the application:
options=""

#! Work directory (i.e. where the job will run):
workdir="$SLURM_SUBMIT_DIR"  # The value of SLURM_SUBMIT_DIR sets workdir to the directory
                             # in which sbatch is run.

#! Are you using OpenMP (NB this is unrelated to OpenMPI)? If so increase this
#! safe value to no more than 128:
export OMP_NUM_THREADS=1

#! Number of MPI tasks to be started by the application per node and in total (do not change):
np=$((numnodes * mpi_tasks_per_node))

#! Choose this for a pure shared-memory OpenMP parallel program on a single node:
#! (OMP_NUM_THREADS threads will be created):
CMD="$application $options"


###############################################################
### You should not have to change anything below this line ####
###############################################################

cd "$workdir" || exit 1
echo -e "Changed directory to $(pwd).\n"

JOBID=$SLURM_JOB_ID

echo -e "JobID: $JOBID\n======"
echo "Time: $(date)"
echo "Running on master node: $(hostname)"
echo "Current directory: $(pwd)"

if [ "$SLURM_JOB_NODELIST" ]; then
    #! Create a machine file:
    export NODEFILE=$(generate_pbs_nodefile)
    uniq "$NODEFILE" > "machine.file.$JOBID"
    echo -e "\nNodes allocated:\n================"
    #! Left unquoted on purpose so the node names are printed on a single line.
    echo $(sed -e 's/\..*$//g' "machine.file.$JOBID")
fi

echo -e "\nnumtasks=$numtasks, numnodes=$numnodes, mpi_tasks_per_node=$mpi_tasks_per_node (OMP_NUM_THREADS=$OMP_NUM_THREADS)"

echo -e "\nExecuting command:\n==================\n$CMD\n"

eval "$CMD"
status=$?

#! If the command never produced a verdict (e.g. it could not be launched), record a
#! failure so that whatever is polling for results.test does not wait forever.
if [ ! -f results.test ]; then
    [ -f output.test ] || echo "Command '$CMD' exited with status $status without producing any test output." > output.test
    echo "0" > results.test
fi

exit "$status"
36 changes: 36 additions & 0 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: CSD3 tests

# Submits the GPU test suite to SLURM from a self-hosted runner and waits for the
# verdict files (results.test / output.test) that the batch job writes.
on:
  #push:
  #  branches:
  #    - main
  pull_request:
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  DATADEPS_ALWAYS_ACCEPT: true
  JULIA_VERSION: "1.9.3"
  JULIA_MINOR_VERSION: "1.9"
  RDS_HOME: "/home/js2430/rds/hpc-work/OceanBioME-runner/_work"

jobs:
  test:
    name: Deploy CSD3 tests
    runs-on: self-hosted
    steps:
      - name: Download Julia
        run: |
          wget -N -P "$RDS_HOME/_temp" "https://julialang-s3.julialang.org/bin/linux/x64/$JULIA_MINOR_VERSION/julia-$JULIA_VERSION-linux-x86_64.tar.gz"
          tar xf "$RDS_HOME/_temp/julia-$JULIA_VERSION-linux-x86_64.tar.gz" -C "$RDS_HOME/_temp"
      - name: Clean repo
        # The head ref is chosen by the PR author, so it is quoted to stop the
        # shell from word-splitting or globbing it.
        run: |
          git checkout main
          git pull --force
          git checkout "$GITHUB_HEAD_REF"
          git pull --force
      - name: Run tests
        # Stale verdict files from an earlier run would make the polling script
        # return immediately, so they are removed before the job is submitted.
        run: |
          rm -f results.test output.test
          sbatch .github/workflows/gpu-test.sbatch
          "$RDS_HOME/_temp/julia-$JULIA_VERSION/bin/julia" -O0 --color=yes --project test/check_results.jl
      - name: Clean up
        if: always()
        # -f keeps this step from failing when the result files were never created.
        run: rm -rf "$RDS_HOME/_temp/julia-$JULIA_VERSION" results.test output.test # presumably this happens anyway
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
name: Tests

on:
push:
branches:
Expand Down
18 changes: 18 additions & 0 deletions test/check_results.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Wait for the verdict written by the batch test job, echo the captured test log,
# and fail this process unless the tests passed.

"""
    read_result(path)

Return the integer verdict stored in the file at `path`.

Return `nothing` when the file does not exist yet or is still empty (the writer has
created it but not written the verdict yet). Throw if the content is not an integer.
"""
function read_result(path)
    isfile(path) || return nothing
    content = strip(read(path, String))
    isempty(content) && return nothing
    return parse(Int, content)
end

"""
    wait_for_result(path; poll_interval = 10, timeout = 6 * 60 * 60)

Poll `path` every `poll_interval` seconds until it holds a verdict and return it.

Throw an error if no verdict appears within `timeout` seconds, so a batch job that
never starts or never finishes cannot stall the caller indefinitely.
"""
function wait_for_result(path; poll_interval = 10, timeout = 6 * 60 * 60)
    deadline = time() + timeout
    while true
        result = read_result(path)
        isnothing(result) || return result
        time() < deadline || error("Timed out after $timeout seconds waiting for $path")
        sleep(poll_interval)
    end
end

"""
    check_results(; results_file = "results.test", output_file = "output.test", kwargs...)

Wait for the verdict in `results_file`, print the contents of `output_file`, and throw
`ErrorException("Tests failed")` unless the verdict is `1`.

Remaining keyword arguments are forwarded to [`wait_for_result`](@ref).
"""
function check_results(; results_file = "results.test", output_file = "output.test",
                       kwargs...)
    result = wait_for_result(results_file; kwargs...)

    # Show whatever log is available before reporting the verdict; a missing log
    # must not hide the verdict itself.
    if isfile(output_file)
        println(read(output_file, String))
    else
        @warn "No test output found at $output_file"
    end

    # Fail closed: only an explicit `1` counts as success.
    result == 1 || error("Tests failed")

    return nothing
end

# Only run when executed as a script, so the functions above can be included elsewhere.
if abspath(PROGRAM_FILE) == @__FILE__
    check_results()
end
13 changes: 13 additions & 0 deletions test/gpu_runtests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Entry point for the GPU test run: prepares the project environment, selects the GPU
# architecture, and then runs the regular test suite.
using Pkg

Pkg.instantiate()

# `Pkg.add` accepts a single name or a vector of names; passing several names as
# separate positional arguments raises a `MethodError`.
Pkg.add(["Test", "CUDA", "DataDeps", "Documenter", "Statistics", "JLD2"])

Pkg.precompile()

using Oceananigans

# Defined before including the suite so the tests pick up the GPU architecture.
arch = GPU()

include("runtests.jl")
14 changes: 10 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
using OceanBioME, Documenter, Test
using OceanBioME, Documenter, Test, Oceananigans

# Fall back to the CPU unless a wrapper script already chose an architecture before
# including this file.
@isdefined(arch) || (arch = CPU())

include("test_utils.jl")
include("test_light.jl")
Expand All @@ -8,6 +12,8 @@ include("test_gasexchange.jl")
include("test_slatissima.jl")
include("test_sediments.jl")

@testset "Doctests" begin
doctest(OceanBioME)
end
# Doctests are only run when the suite targets the CPU.
if arch isa CPU
    @testset "Doctests" begin
        doctest(OceanBioME)
    end
end
15 changes: 7 additions & 8 deletions test/test_LOBSTER.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,13 @@ end

n_timesteps = 100

for arch in (CPU(), )
grid = RectilinearGrid(arch; size=(1, 1, 1), extent=(1, 1, 2))
for open_bottom = (false, true), sinking = (false, true), variable_redfield = (false, true), oxygen = (false, true), carbonates = (false, true)
if !(sinking && open_bottom) # no sinking is the same with and without open bottom
@info "Testing on $(typeof(arch)) with carbonates $(carbonates ? :✅ : :❌), oxygen $(oxygen ? :✅ : :❌), variable redfield $(variable_redfield ? :✅ : :❌), sinking $(sinking ? :✅ : :❌), open bottom $(open_bottom ? :✅ : :❌))"
@testset "$arch, $carbonates, $oxygen, $variable_redfield, $sinking, $open_bottom" begin
test_LOBSTER(grid, carbonates, oxygen, variable_redfield, sinking, open_bottom, n_timesteps)
end
grid = RectilinearGrid(arch; size=(1, 1, 1), extent=(1, 1, 2))

for open_bottom = (false, true), sinking = (false, true), variable_redfield = (false, true), oxygen = (false, true), carbonates = (false, true)
if !(sinking && open_bottom) # no sinking is the same with and without open bottom
@info "Testing on $(typeof(arch)) with carbonates $(carbonates ? :✅ : :❌), oxygen $(oxygen ? :✅ : :❌), variable redfield $(variable_redfield ? :✅ : :❌), sinking $(sinking ? :✅ : :❌), open bottom $(open_bottom ? :✅ : :❌))"
@testset "$arch, $carbonates, $oxygen, $variable_redfield, $sinking, $open_bottom" begin
test_LOBSTER(grid, carbonates, oxygen, variable_redfield, sinking, open_bottom, n_timesteps)
end
end
end
15 changes: 7 additions & 8 deletions test/test_NPZD.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,13 @@ function test_NPZD(grid, sinking, open_bottom)
return nothing
end

for arch in (CPU(), )
grid = RectilinearGrid(arch; size=(3, 3, 6), extent=(1, 1, 2))
for sinking = (false, true), open_bottom = (false, true)
if !(sinking && open_bottom) # no sinking is the same with and without open bottom
@info "Testing on $(typeof(arch)) with sinking $(sinking ? :✅ : :❌), open bottom $(open_bottom ? :✅ : :❌))"
@testset "$arch, $sinking, $open_bottom" begin
test_NPZD(grid, sinking, open_bottom)
end
grid = RectilinearGrid(arch; size=(3, 3, 6), extent=(1, 1, 2))

for sinking = (false, true), open_bottom = (false, true)
if !(sinking && open_bottom) # no sinking is the same with and without open bottom
@info "Testing on $(typeof(arch)) with sinking $(sinking ? :✅ : :❌), open bottom $(open_bottom ? :✅ : :❌))"
@testset "$arch, $sinking, $open_bottom" begin
test_NPZD(grid, sinking, open_bottom)
end
end
end
2 changes: 0 additions & 2 deletions test/test_light.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@ function test_two_band(grid, bgc, model_type)
return all(results_PAR .≈ reverse(expected_PAR))
end

archs = (CPU(), )

@testset "Light attenuaiton model" begin
for model in (NonhydrostaticModel, HydrostaticFreeSurfaceModel),
arch in archs,
grid in (RectilinearGrid(arch; size = (2, 2, 2), extent = (2, 2, 2)),
LatitudeLongitudeGrid(arch; size = (5, 5, 2), longitude = (-180, 180), latitude = (-85, 85), z = (-2, 0))),
bgc in (LOBSTER, NutrientPhytoplanktonZooplanktonDetritus) # this is now redundant since each model doesn't deal with the light separatly
Expand Down
47 changes: 23 additions & 24 deletions test/test_sediments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,31 +112,30 @@ display_name(::ImmersedBoundaryGrid) = "Immersed boundary grid"

bottom_height(x, y) = -1000 + 500 * exp(- (x^2 + y^2) / 250) # a perfect hill

# Grids the sediment models are exercised on: a rectilinear box, a latitude-longitude
# box, and the same latitude-longitude box with a bottom shaped by `bottom_height`.
# Built on `arch`, the architecture selected for the whole test suite.
grids = [RectilinearGrid(arch; size=(3, 3, 50), extent=(10, 10, 500)),
         LatitudeLongitudeGrid(arch; size = (3, 3, 16), latitude = (0, 10), longitude = (0, 10), z = (-500, 0)),
         ImmersedBoundaryGrid(
             LatitudeLongitudeGrid(arch; size = (3, 3, 16), latitude = (0, 10), longitude = (0, 10), z = (-500, 0)),
             GridFittedBottom(bottom_height))]

@testset "Sediment integration" begin
for architecture in (CPU(), )
grids = [RectilinearGrid(architecture; size=(3, 3, 50), extent=(10, 10, 500)),
LatitudeLongitudeGrid(architecture; size = (3, 3, 16), latitude = (0, 10), longitude = (0, 10), z = (-500, 0)),
ImmersedBoundaryGrid(
LatitudeLongitudeGrid(architecture; size = (3, 3, 16), latitude = (0, 10), longitude = (0, 10), z = (-500, 0)),
GridFittedBottom(bottom_height))]
for grid in grids
for timestepper in (:QuasiAdamsBashforth2, :RungeKutta3),
sediment_model in (InstantRemineralisation(; grid), SimpleMultiG(; grid)),
model in (NonhydrostaticModel, HydrostaticFreeSurfaceModel)
for biogeochemistry in (NutrientPhytoplanktonZooplanktonDetritus(; grid, sediment_model),
LOBSTER(; grid,
carbonates = ifelse(isa(sediment_model, SimpleMultiG), true, false),
oxygen = ifelse(isa(sediment_model, SimpleMultiG), true, false),
variable_redfield = ifelse(isa(sediment_model, SimpleMultiG), true, false),
sediment_model))
# get rid of incompatible combinations
run = ifelse((model == NonhydrostaticModel && (isa(grid, ImmersedBoundaryGrid) || isa(grid, LatitudeLongitudeGrid))) ||
(model == HydrostaticFreeSurfaceModel && timestepper == :RungeKutta3) ||
(isa(sediment_model, SimpleMultiG) && isa(biogeochemistry.underlying_biogeochemistry, NutrientPhytoplanktonZooplanktonDetritus)), false, true)
if run
@info "Testing sediment on $(typeof(architecture)) with $timestepper and $(display_name(sediment_model)) on $(display_name(biogeochemistry.underlying_biogeochemistry))"
@testset "$architecture, $timestepper, $(display_name(sediment_model)), $(display_name(biogeochemistry.underlying_biogeochemistry))" test_flat_sediment(grid, biogeochemistry, model; timestepper)
end
for grid in grids
for timestepper in (:QuasiAdamsBashforth2, :RungeKutta3),
sediment_model in (InstantRemineralisation(; grid), SimpleMultiG(; grid)),
model in (NonhydrostaticModel, HydrostaticFreeSurfaceModel)
for biogeochemistry in (NutrientPhytoplanktonZooplanktonDetritus(; grid, sediment_model),
LOBSTER(; grid,
carbonates = ifelse(isa(sediment_model, SimpleMultiG), true, false),
oxygen = ifelse(isa(sediment_model, SimpleMultiG), true, false),
variable_redfield = ifelse(isa(sediment_model, SimpleMultiG), true, false),
sediment_model))
# get rid of incompatible combinations
run = ifelse((model == NonhydrostaticModel && (isa(grid, ImmersedBoundaryGrid) || isa(grid, LatitudeLongitudeGrid))) ||
(model == HydrostaticFreeSurfaceModel && timestepper == :RungeKutta3) ||
(isa(sediment_model, SimpleMultiG) && isa(biogeochemistry.underlying_biogeochemistry, NutrientPhytoplanktonZooplanktonDetritus)), false, true)
if run
@info "Testing sediment on $(typeof(architecture)) with $timestepper and $(display_name(sediment_model)) on $(display_name(biogeochemistry.underlying_biogeochemistry))"
@testset "$architecture, $timestepper, $(display_name(sediment_model)), $(display_name(biogeochemistry.underlying_biogeochemistry))" test_flat_sediment(grid, biogeochemistry, model; timestepper)
end
end
end
Expand Down
1 change: 0 additions & 1 deletion test/test_slatissima.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ function intercept_tendencies!(model, intercepted_tendencies)
end

@testset "SLatissima particle setup and conservations" begin
arch = CPU()
grid = RectilinearGrid(arch; size=(1, 1, 1), extent=(1, 1, 1))

# Initial properties
Expand Down
8 changes: 3 additions & 5 deletions test/test_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ function test_negative_zeroing(arch)
end

@testset "Test Utils" begin
for arch in (CPU(), )
@test test_column_diffusion_timescale(arch)
@test test_negative_scaling(arch)
@test test_negative_zeroing(arch)
end
@test test_column_diffusion_timescale(arch)
@test test_negative_scaling(arch)
@test test_negative_zeroing(arch)
end