diff --git a/large_language_model/megatron-lm/megatron/training.py b/large_language_model/megatron-lm/megatron/training.py
index 6bbd8f407..b4f5e1956 100755
--- a/large_language_model/megatron-lm/megatron/training.py
+++ b/large_language_model/megatron-lm/megatron/training.py
@@ -95,6 +95,9 @@ def pretrain(train_valid_test_dataset_provider,
         args_defaults: a dictionary from argument-name to argument-value. It
             to set already parse arguments.
     """
+    # The reference implementation does not clear the cache currently
+    # but the submissions are required to do so
+    mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)
     mllogger.start(key=mllogger.constants.INIT_START, sync=False)

     # Initalize and get arguments, timers, and Tensorboard writer.
@@ -232,6 +235,9 @@ def pretrain(train_valid_test_dataset_provider,
         mllogger.event(key="trained_samples",
                        value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
                        sync=False)
+        mllogger.event(key="train_samples",
+                       value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
+                       sync=False)
         mllogger.end(key=mllogger.constants.BLOCK_STOP,
                      metadata={'first_epoch_num': 0},
                      sync=False)
@@ -813,6 +819,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                 mllogger.event(key="trained_samples",
                                value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
                                sync=False)
+                mllogger.event(key="train_samples",
+                               value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
+                               sync=False)
                 if not saved_checkpoint:
                     save_checkpoint_and_time(iteration, model, optimizer,
                                              opt_param_scheduler)
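
For context, the `mllogger` used in this patch is Megatron's wrapper around the MLPerf logging library. The sketch below shows roughly what the newly added calls emit, assuming the upstream `mlperf_logging.mllog` API directly; the `sync` keyword, the `args.*` values, and the log file name are specific to the Megatron wrapper or are hypothetical placeholders, not part of this patch.

```python
# Minimal sketch of the MLPerf log events touched by this patch, written
# against the upstream mlperf_logging package. Values are illustrative only.
from mlperf_logging import mllog

mllog.config(filename="mlperf_llm.log")  # hypothetical log file name
mllogger = mllog.get_mllogger()

# Submissions must record that caches were cleared before INIT_START.
mllogger.event(key=mllog.constants.CACHE_CLEAR, value=True)
mllogger.start(key=mllog.constants.INIT_START)

# Both "trained_samples" and "train_samples" report the same quantity:
# consumed sequences (minus externally accounted LR steps) times sequence length.
consumed_train_samples = 1536  # hypothetical values for illustration
ext_lr_steps = 0
seq_length = 2048
trained = (consumed_train_samples - ext_lr_steps) * seq_length
mllogger.event(key="trained_samples", value=trained)
mllogger.event(key="train_samples", value=trained)

mllogger.end(key=mllog.constants.BLOCK_STOP, metadata={"first_epoch_num": 0})
```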