From 257b4dc68e3eed133cd29e3824d6a5bb2e2624d7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 10:51:11 -0400 Subject: [PATCH] try 2 node run --- only_for_me/narval/finetune.py | 2 +- only_for_me/narval/finetune.sh | 9 ++++++++- only_for_me/narval/narval.md | 7 ++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index d77caf82..cde0e6ca 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -61,7 +61,7 @@ os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='gpu', devices=2, - num_nodes=1, + num_nodes=2, strategy='ddp', precision='16-mixed', max_epochs=max_epochs, diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 30c680d8..84506204 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,11 +1,18 @@ #!/bin/bash #SBATCH --mem=32G -#SBATCH --nodes=1 +#SBATCH --nodes=2 #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=2 #SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 +#### SBATCH --mem=32G +#### SBATCH --nodes=1 +#### SBATCH --time=0:20:0 +#### SBATCH --tasks-per-node=2 +#### SBATCH --cpus-per-task=12 +#### SBATCH --gres=gpu:a100:2 + #### #### SBATCH --mem=16G #### SBATCH --nodes=1 diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index d6700aa7..b1fa5eea 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -8,6 +8,7 @@ https://prashp.gitlab.io/post/compute-canada-tut/ https://docs.alliancecan.ca/wiki/Python ssh walml@narval.alliancecan.ca +ssh-copy-id to avoid password in future module purge module avail @@ -51,8 +52,12 @@ and my own cloned repos pip install --no-deps -e galaxy-datasets pip install --no-deps -e zoobot +Run training + +sbatch only_for_me/narval/finetune.sh + +Works with simple images on multi-GPU, single node -Multi-node notes https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_2.html# https://pytorch.org/docs/stable/elastic/run.html#environment-variables