Skip to content

Commit

Permalink
maxvit 4 gpu 6 worker ssd
Browse files: browse the repository at this point in the history
  • Loading branch information
mwalmsley committed Nov 5, 2023
1 parent d380ef0 commit 0834d7f
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
4 changes: 2 additions & 2 deletions only_for_me/narval/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@
num_workers=6,
random_state=random_state,
learning_rate=1e-3,
# cache_dir=os.environ['SLURM_TMPDIR'] + '/cache'
cache_dir='/tmp/cache'
cache_dir=os.environ['SLURM_TMPDIR'] + '/cache'
# cache_dir='/tmp/cache'
# /tmp for ramdisk (400GB total, vs 4TB total for nvme)
)

Expand Down
7 changes: 4 additions & 3 deletions only_for_me/narval/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ nvidia-smi

PYTHON=/home/walml/envs/zoobot39_dev/bin/python

# mkdir $SLURM_TMPDIR/cache
mkdir /tmp/cache
mkdir $SLURM_TMPDIR/cache
# mkdir /tmp/cache

export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication.
# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable.
Expand All @@ -21,6 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t
REPO_DIR=/project/def-bovy/walml/zoobot/
srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \
--save-dir $REPO_DIR/only_for_me/narval/debug_models \
--batch-size 512 \
--batch-size 128 \
--architecture maxvit_tiny_224 \
--color --wandb --mixed-precision
# srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py

0 comments on commit 0834d7f

Please sign in to comment.