From 2ea9ba1329fcefa1dcf91b0cbf2085ac98530fbb Mon Sep 17 00:00:00 2001 From: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Date: Thu, 30 May 2024 09:40:43 -0700 Subject: [PATCH] added train_samples keyword for compliance check (#645) * added train_samples keyword for compliance check * added cache clear logging --- large_language_model/megatron-lm/megatron/training.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/large_language_model/megatron-lm/megatron/training.py b/large_language_model/megatron-lm/megatron/training.py index 6bbd8f407..b4f5e1956 100755 --- a/large_language_model/megatron-lm/megatron/training.py +++ b/large_language_model/megatron-lm/megatron/training.py @@ -95,6 +95,9 @@ def pretrain(train_valid_test_dataset_provider, args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. """ + # The reference implementation does not clear the cache currently + # but the submissions are required to do so + mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True) mllogger.start(key=mllogger.constants.INIT_START, sync=False) # Initalize and get arguments, timers, and Tensorboard writer. @@ -232,6 +235,9 @@ def pretrain(train_valid_test_dataset_provider, mllogger.event(key="trained_samples", value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length, sync=False) + mllogger.event(key="train_samples", + value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length, + sync=False) mllogger.end(key=mllogger.constants.BLOCK_STOP, metadata={'first_epoch_num': 0}, sync=False) @@ -813,6 +819,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, mllogger.event(key="trained_samples", value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length, sync=False) + mllogger.event(key="train_samples", + value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length, + sync=False) if not saved_checkpoint: save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler)