From ae9c3b4a417a7c7beb29f0a5b02999d64c796684 Mon Sep 17 00:00:00 2001
From: epwalsh
Date: Wed, 10 Apr 2024 07:21:51 -0700
Subject: [PATCH] update

---
 configs/mcli/mitchish70.yaml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index 2bac84089..769605ed3 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -1,6 +1,6 @@
 name: olmo-70b
-image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
-#image: public.ecr.aws/z0f8p3z5/olmo:pytorch2.2.1_cu121-python3.11-ubuntu20.04
+# image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
+image: public.ecr.aws/z0f8p3z5/olmo:pytorch2.2.1_cu121-python3.11-ubuntu20.04
 scheduling:
   priority: auto
   # preemptible: true # means it can be retried
@@ -71,7 +71,6 @@ compute:
     - inst-ht0xx-r15z3-workers
     - inst-entnk-r15z3-workers
     - inst-hvw6t-r15z3-workers
-    - inst-3to96-r15z3-workers
     - inst-4ki3x-r15z3-workers
     - inst-aixwt-r15z3-workers
     - inst-pbivr-r15z3-workers
@@ -125,6 +124,7 @@ compute:
     - inst-97xv1-r15z3-workers
     - inst-vaqst-r15z3-workers
     - inst-i6mnk-r15z3-workers
+    - inst-xtbwa-r15z3-workers
     # Bad nodes:
     # - inst-zgb86-r15z3-workers
     # - inst-hdlqg-r15z3-workers
@@ -132,6 +132,7 @@ compute:
     # - inst-bw20d-r15z3-workers
     # - inst-4zdz3-r15z3-workers
     # - inst-zlnho-r15z3-workers
+    # - inst-3to96-r15z3-workers
 integrations:
   - integration_type: git_repo
     git_repo: allenai/OLMo
@@ -204,3 +205,5 @@ command: |-
 # --device_train_microbatch_size=2 \
 # gpus: 896
 # --global_train_batch_size=1792 \
+# gpus: 600 # (75 nodes)
+# --global_train_batch_size=1800 \