Try to fix inconsistent training loss after resuming from an old checkpoint (huggingface#25872)

* fix inconsistent loss after resume (huggingface#25340)

* fix typo

* clean code

* reformatted code

* adjust code according to comments

* adjust check_dataloader_randomsampler location

* return sampler only

* handle sampler is None

* Update src/transformers/trainer_pt_utils.py

thanks @amyeroberts

Co-authored-by: amyeroberts <[email protected]>

---------

Co-authored-by: amyeroberts <[email protected]>
dumpmemory and amyeroberts authored Sep 7, 2023
1 parent c5e66a4 commit fb7d246
Showing 2 changed files with 21 additions and 3 deletions.
17 changes: 14 additions & 3 deletions src/transformers/trainer.py
@@ -65,7 +65,7 @@
 from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
 from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
 from .optimization import Adafactor, get_scheduler
-from .pytorch_utils import ALL_LAYERNORM_LAYERS
+from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_less_than_1_11
 from .tokenization_utils_base import PreTrainedTokenizerBase
 from .trainer_callback import (
     CallbackHandler,
@@ -85,6 +85,7 @@
     distributed_broadcast_scalars,
     distributed_concat,
     find_batch_size,
+    get_dataloader_sampler,
     get_model_param_count,
     get_module_class_from_name,
     get_parameter_names,
@@ -219,6 +220,7 @@
 if TYPE_CHECKING:
     import optuna
 
+
 logger = logging.get_logger(__name__)
 
 
@@ -1808,8 +1810,17 @@ def _inner_training_loop(
         # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
         if not args.ignore_data_skip:
             for epoch in range(epochs_trained):
-                for _ in train_dataloader:
-                    break
+                sampler = get_dataloader_sampler(train_dataloader)
+                is_random_sampler = isinstance(sampler, RandomSampler)
+                if is_torch_less_than_1_11 or not is_random_sampler:
+                    # We just need to begin an iteration to create the randomization of the sampler.
+                    for _ in train_dataloader:
+                        break
+                else:
+                    # Otherwise we need to call the whooooole sampler cause there is some random operation added
+                    # AT THE VERY END!
+                    sampler = sampler if sampler is not None else []
+                    _ = list(sampler)
 
         total_batched_samples = 0
         for epoch in range(epochs_trained, num_train_epochs):
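For context on the new branch above: on torch >= 1.11, RandomSampler draws from its generator both while yielding indices and once more at the very end, so merely starting an iteration does not leave the RNG where a completed epoch would. Below is a minimal, self-contained sketch of that idea in plain PyTorch (not the Trainer's actual resume path; it assumes torch >= 1.11 sampler behavior):

import torch
from torch.utils.data import DataLoader, RandomSampler

# Sketch: reproduce epoch 1's shuffle order after a simulated resume by fully
# exhausting the RandomSampler once for the skipped epoch 0.
dataset = list(range(10))
generator = torch.Generator()
generator.manual_seed(42)
sampler = RandomSampler(dataset, generator=generator)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

epoch_0 = [batch.tolist() for batch in dataloader]  # order seen in epoch 0
epoch_1 = [batch.tolist() for batch in dataloader]  # order seen in epoch 1

# "Resume": reset the seed, then skip epoch 0 by consuming the whole sampler,
# which replays every random draw of that epoch, including the final one.
generator.manual_seed(42)
_ = list(sampler)
resumed_epoch_1 = [batch.tolist() for batch in dataloader]
assert resumed_epoch_1 == epoch_1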
7 changes: 7 additions & 0 deletions src/transformers/trainer_pt_utils.py
@@ -55,6 +55,13 @@
 logger = logging.get_logger(__name__)
 
 
+def get_dataloader_sampler(dataloader):
+    if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
+        return get_dataloader_sampler(dataloader.batch_sampler)
+    elif hasattr(dataloader, "sampler"):
+        return dataloader.sampler
+
+
 def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
     if isinstance(tensor_or_array, torch.Tensor):
         if hasattr(torch, "atleast_1d"):
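A quick illustration of what get_dataloader_sampler returns (hypothetical usage, assuming a transformers version that includes this commit): when a DataLoader is built around an explicit batch_sampler, its own .sampler attribute typically falls back to a default sampler, so the helper recurses through batch_sampler to reach the sampler that actually controls the ordering.

from torch.utils.data import BatchSampler, DataLoader, RandomSampler

from transformers.trainer_pt_utils import get_dataloader_sampler  # helper added in this commit

dataset = list(range(8))
batch_sampler = BatchSampler(RandomSampler(dataset), batch_size=4, drop_last=False)
dataloader = DataLoader(dataset, batch_sampler=batch_sampler)

# dataloader.sampler is only a default placeholder here; the helper unwraps the
# batch_sampler to reach the RandomSampler that drives the shuffling.
sampler = get_dataloader_sampler(dataloader)
assert isinstance(sampler, RandomSampler)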
