From 57036a42aab60903cd6f54a0ab1a6535d9c9a6cb Mon Sep 17 00:00:00 2001 From: Phuc Nguyen Date: Sun, 3 Mar 2024 13:26:21 +0000 Subject: [PATCH 01/73] require unit test --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 672b1880..6dcd1228 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,11 @@ pre-commit install pre-commit run --config .pre-commit-config.yaml --all-files ``` +*As a part of making sure we aren't slowed down as the codebase grows, we will not merge a PR if the features it introduces do not have test coverage.* + +We have extensions built on top of Nanotron, with their tests located in the `/examples` folder. Since VSCode defaults to discovering tests only in the `/tests` folder, please run tests from both `/examples` and `/tests` to ensure your PR does not break these extensions. + + Features we would like to add: - [ ] Support `torch.compile` - [ ] Support `torch.distributed.rpc` From 242f387af28c6916415f4cd0a63c1bdc1adbb3f6 Mon Sep 17 00:00:00 2001 From: kerem Date: Sun, 3 Mar 2024 21:03:10 +0300 Subject: [PATCH 02/73] Add get_global_rank method to ParallelContext class --- src/nanotron/parallel/context.py | 24 +++++++++++++++++++ ..._parameters_accumulate_gradient_in_fp32.py | 12 +++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/nanotron/parallel/context.py b/src/nanotron/parallel/context.py index 22597154..40992909 100644 --- a/src/nanotron/parallel/context.py +++ b/src/nanotron/parallel/context.py @@ -127,3 +127,27 @@ def destroy(self): dist.barrier() dist.destroy_process_group() + + def get_global_rank( + self, + expert_parallel_rank: int, + pipeline_parallel_rank: int, + data_parallel_rank: int, + tensor_parallel_rank: int, + ) -> int: + """ + Get the global rank based on the specified ranks in different parallel groups. + + :param expert_parallel_rank: int, Rank in the expert parallel group. + :param pipeline_parallel_rank: int, Rank in the pipeline parallel group. + :param data_parallel_rank: int, Rank in the data parallel group. + :param tensor_parallel_rank: int, Rank in the tensor parallel group. + + :return: int, The global rank. 
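+
+        Example (illustrative): get_global_rank(0, 1, 0, 2) simply reads world_rank_matrix[0, 1, 0, 2].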
+ """ + return self.world_rank_matrix[ + expert_parallel_rank, + pipeline_parallel_rank, + data_parallel_rank, + tensor_parallel_rank, + ] \ No newline at end of file diff --git a/tests/test_parameters_accumulate_gradient_in_fp32.py b/tests/test_parameters_accumulate_gradient_in_fp32.py index 68c2dd88..d1ecc26c 100644 --- a/tests/test_parameters_accumulate_gradient_in_fp32.py +++ b/tests/test_parameters_accumulate_gradient_in_fp32.py @@ -343,12 +343,12 @@ def _test_tied_weights_sync_with_grad_accum_in_fp32( ( target, ( - parallel_context.world_rank_matrix[ - dist.get_rank(parallel_context.expert_pg), - get_pp_rank_of(target, module=mdl), - dist.get_rank(parallel_context.dp_pg), - dist.get_rank(parallel_context.tp_pg), - ], + parallel_context.get_global_rank( + expert_parallel_rank=dist.get_rank(parallel_context.expert_pg), + pipeline_parallel_rank=get_pp_rank_of(target, module=mdl), + data_parallel_rank=dist.get_rank(parallel_context.dp_pg), + tensor_parallel_rank=dist.get_rank(parallel_context.tp_pg), + ), ), ) for target in [ From 8dfdbf96d3a3cc1b20866eac75a355cb4134df6f Mon Sep 17 00:00:00 2001 From: kerem Date: Mon, 4 Mar 2024 13:26:33 +0300 Subject: [PATCH 03/73] modify world_rank_matrix in trainer.py --- src/nanotron/trainer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 4d9130b6..392aeefa 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -216,9 +216,9 @@ def __init__( self.consumed_train_samples = 0 # Setup tensorboard write and log writers on output rank - self.logger_ranks = self.parallel_context.world_rank_matrix[ + self.logger_ranks = self.parallel_context.get_global_rank( 0, self.unwrapped_model.output_pp_rank, 0, 0 - ].flatten() + ).flatten() self.loggerwriter = self.setup_log_writers() # Log where each module is instantiated @@ -781,12 +781,12 @@ def mark_tied_parameters( ( target, ( - parallel_context.world_rank_matrix[ - dist.get_rank(parallel_context.expert_pg), - get_pp_rank_of(target, module=model), - dist.get_rank(parallel_context.dp_pg), - dist.get_rank(parallel_context.tp_pg), - ], + parallel_context.get_global_rank( + expert_parallel_rank=dist.get_rank(parallel_context.expert_pg), + pipeline_parallel_rank=get_pp_rank_of(target, module=model), + data_parallel_rank=dist.get_rank(parallel_context.dp_pg), + tensor_parallel_rank=dist.get_rank(parallel_context.tp_pg), + ), ), ) for target in embeddings_lm_head_tied_names From ee5ce492436ba25d0aced4dfea61c74dba852e99 Mon Sep 17 00:00:00 2001 From: kerem Date: Wed, 6 Mar 2024 14:08:17 +0300 Subject: [PATCH 04/73] Add tests to validate return type of get_global_rank --- tests/test_distributed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 0101c7d4..aceb13e1 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -25,6 +25,9 @@ def _test_init_parallel_context(parallel_context: ParallelContext): assert isinstance(parallel_context.world_rank_matrix, np.ndarray) assert isinstance(parallel_context.world_ranks_to_pg, dict) + global_rank = parallel_context.get_global_rank(0, *ranks3d) + assert isinstance(global_rank, int) + parallel_context.destroy() assert dist.is_initialized() is False @@ -39,4 +42,4 @@ def _test_init_parallel_context(parallel_context: ParallelContext): ) @rerun_if_address_is_in_use() def test_init_parallel_context(tp: int, dp: int, pp: int): - init_distributed(tp=tp, dp=dp, 
pp=pp)(_test_init_parallel_context)() + init_distributed(tp=tp, dp=dp, pp=pp)(_test_init_parallel_context)() \ No newline at end of file From 3740022d3152d14bc4c623a0324791094af5e8e3 Mon Sep 17 00:00:00 2001 From: kerem Date: Thu, 7 Mar 2024 00:47:01 +0300 Subject: [PATCH 05/73] Add/Update tests to validate return type of get_global_rank --- tests/test_distributed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_distributed.py b/tests/test_distributed.py index c5bbb06b..2a12609c 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -25,9 +25,11 @@ def _test_init_parallel_context(parallel_context: ParallelContext): assert isinstance(parallel_context.world_rank_matrix, np.ndarray) assert isinstance(parallel_context.world_ranks_to_pg, dict) - global_rank = parallel_context.get_global_rank(0, *ranks3d) + global_rank = parallel_context.get_global_rank(*ranks3d) assert isinstance(global_rank, int) + assert global_rank == parallel_context.world_rank_matrix[ranks3d] + parallel_context.destroy() assert dist.is_initialized() is False From 4cbada7162c19b33439a4b4d21b482c448148817 Mon Sep 17 00:00:00 2001 From: kerem Date: Thu, 7 Mar 2024 14:47:44 +0300 Subject: [PATCH 06/73] Update test for get_global_rank --- src/nanotron/serialize/weights.py | 2 ++ tests/test_distributed.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nanotron/serialize/weights.py b/src/nanotron/serialize/weights.py index 7751f8d1..08a22f13 100644 --- a/src/nanotron/serialize/weights.py +++ b/src/nanotron/serialize/weights.py @@ -69,6 +69,8 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde else: base_name = name + is_expert_sharded = False # Let's test the solution to the potential bug by defining is_expert_sharded before using it. 
+ if param.is_sharded: sharded_info: ShardedInfo = param.get_sharded_info() group = parallel_context.world_ranks_to_pg[sharded_info.global_ranks] diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 2a12609c..2203bffe 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -25,10 +25,11 @@ def _test_init_parallel_context(parallel_context: ParallelContext): assert isinstance(parallel_context.world_rank_matrix, np.ndarray) assert isinstance(parallel_context.world_ranks_to_pg, dict) - global_rank = parallel_context.get_global_rank(*ranks3d) + local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank)) + global_rank = parallel_context.get_global_rank(*local_rank) assert isinstance(global_rank, int) - assert global_rank == parallel_context.world_rank_matrix[ranks3d] + assert global_rank == parallel_context.world_rank_matrix[local_rank] parallel_context.destroy() assert dist.is_initialized() is False From d0d001301fc2a93b8cd22ac87e4f7e72ac79e3ed Mon Sep 17 00:00:00 2001 From: kerem Date: Thu, 7 Mar 2024 17:39:56 +0300 Subject: [PATCH 07/73] Update type test for get_global_rank --- src/nanotron/parallel/context.py | 4 ++-- tests/test_distributed.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nanotron/parallel/context.py b/src/nanotron/parallel/context.py index 04d40940..8bc9d2e1 100644 --- a/src/nanotron/parallel/context.py +++ b/src/nanotron/parallel/context.py @@ -127,7 +127,7 @@ def set_device(self): def get_local_ranks(self, world_rank: int) -> Tuple[int, int, int]: # return coordinates in world_rank_matrix without expert_parallel_rank - return tuple(i.item() for i in np.where(self.world_rank_matrix == world_rank)) + return tuple(i.item() for i in np.where(self.world_rank_matrix == world_rank))[-3:] def destroy(self): if not dist.is_initialized(): @@ -142,7 +142,7 @@ def get_global_rank( pipeline_parallel_rank: int, data_parallel_rank: int, tensor_parallel_rank: int, - ) -> int: + ) -> np.int32: """ Get the global rank based on the specified ranks in different parallel groups. diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 2203bffe..916addef 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -27,7 +27,7 @@ def _test_init_parallel_context(parallel_context: ParallelContext): local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank)) global_rank = parallel_context.get_global_rank(*local_rank) - assert isinstance(global_rank, int) + assert isinstance(global_rank, np.int32) assert global_rank == parallel_context.world_rank_matrix[local_rank] From 9b7ab4b53a6564c324fe9cbc240f7e966047b524 Mon Sep 17 00:00:00 2001 From: kerem Date: Mon, 11 Mar 2024 13:30:18 +0300 Subject: [PATCH 08/73] Update get_global_rank for precise type check --- src/nanotron/parallel/context.py | 4 ++-- src/nanotron/serialize/weights.py | 2 -- tests/test_distributed.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/nanotron/parallel/context.py b/src/nanotron/parallel/context.py index 8bc9d2e1..509754e4 100644 --- a/src/nanotron/parallel/context.py +++ b/src/nanotron/parallel/context.py @@ -142,7 +142,7 @@ def get_global_rank( pipeline_parallel_rank: int, data_parallel_rank: int, tensor_parallel_rank: int, - ) -> np.int32: + ) -> np.int64: """ Get the global rank based on the specified ranks in different parallel groups. 
@@ -151,7 +151,7 @@ def get_global_rank( :param data_parallel_rank: int, Rank in the data parallel group. :param tensor_parallel_rank: int, Rank in the tensor parallel group. - :return: int, The global rank. + :return: numpy.int64, The global rank. """ return self.world_rank_matrix[ expert_parallel_rank, diff --git a/src/nanotron/serialize/weights.py b/src/nanotron/serialize/weights.py index 08a22f13..713becc7 100644 --- a/src/nanotron/serialize/weights.py +++ b/src/nanotron/serialize/weights.py @@ -68,8 +68,6 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde continue else: base_name = name - - is_expert_sharded = False # Let's test the solution to the potential bug by defining is_expert_sharded before using it. if param.is_sharded: sharded_info: ShardedInfo = param.get_sharded_info() diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 916addef..b084e9d9 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -27,7 +27,7 @@ def _test_init_parallel_context(parallel_context: ParallelContext): local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank)) global_rank = parallel_context.get_global_rank(*local_rank) - assert isinstance(global_rank, np.int32) + assert isinstance(global_rank, np.int64), f"The type of global_rank is {type(global_rank)}" assert global_rank == parallel_context.world_rank_matrix[local_rank] From 5717127c775f305ec5b4ab3130b10e243ab6fcb5 Mon Sep 17 00:00:00 2001 From: kerem Date: Mon, 11 Mar 2024 17:11:39 +0300 Subject: [PATCH 09/73] Bug fix attempt --- src/nanotron/serialize/weights.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nanotron/serialize/weights.py b/src/nanotron/serialize/weights.py index 713becc7..c87ca77a 100644 --- a/src/nanotron/serialize/weights.py +++ b/src/nanotron/serialize/weights.py @@ -69,6 +69,8 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde else: base_name = name + is_expert_sharded = False # Let's test the solution to the potential bug by defining is_expert_sharded before using it. 
+ if param.is_sharded: sharded_info: ShardedInfo = param.get_sharded_info() group = parallel_context.world_ranks_to_pg[sharded_info.global_ranks] From 07afee925a06d75a1e39170697149ee77a78f15e Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 10:01:47 +0100 Subject: [PATCH 10/73] Initial commit --- LICENSE | 60 ++ docs/nanoset.md | 144 +++++ examples/nanoset_llama_7b.py | 95 +++ run_train_nanoset.py | 113 ++++ src/nanotron/config/config.py | 15 +- src/nanotron/data/Makefile | 9 + src/nanotron/data/__init__.py | 0 src/nanotron/data/dataloader_builder.py | 174 ++++++ src/nanotron/data/dataset_builder.py | 219 +++++++ src/nanotron/data/helpers.cpp | 765 ++++++++++++++++++++++++ src/nanotron/data/indexed_dataset.py | 642 ++++++++++++++++++++ src/nanotron/data/nanoset.py | 509 ++++++++++++++++ src/nanotron/data/nanoset_configs.py | 50 ++ src/nanotron/data/utils.py | 79 +++ src/nanotron/dataloader.py | 14 +- src/nanotron/trainer.py | 17 +- 16 files changed, 2895 insertions(+), 10 deletions(-) create mode 100644 docs/nanoset.md create mode 100644 examples/nanoset_llama_7b.py create mode 100644 run_train_nanoset.py create mode 100644 src/nanotron/data/Makefile create mode 100644 src/nanotron/data/__init__.py create mode 100644 src/nanotron/data/dataloader_builder.py create mode 100644 src/nanotron/data/dataset_builder.py create mode 100644 src/nanotron/data/helpers.cpp create mode 100644 src/nanotron/data/indexed_dataset.py create mode 100644 src/nanotron/data/nanoset.py create mode 100644 src/nanotron/data/nanoset_configs.py create mode 100644 src/nanotron/data/utils.py diff --git a/LICENSE b/LICENSE index 261eeb9e..199c83ea 100644 --- a/LICENSE +++ b/LICENSE @@ -199,3 +199,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +This repository also contains code from Facebook (from their Fairseq project) +and NVIDIA (from their Megatron project). Files from these organizations +have notices at the top of each file. Below are licenses used in those files, as indicated. + +------------- LICENSE FOR Facebook Fairseq code -------------- + +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------- LICENSE FOR NVIDIA Megatron code -------------- + +The following applies to all files unless otherwise noted: + +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-- diff --git a/docs/nanoset.md b/docs/nanoset.md new file mode 100644 index 00000000..8e46b50a --- /dev/null +++ b/docs/nanoset.md @@ -0,0 +1,144 @@ +# Data Pipeline + +## Data pre-processing + +Data preprocessing is built around the following classes: + +1. `MMapIndexedDatasetBuilder` +2. `MMapIndexedDataset` + +At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details. + +#### MMapIndexedDatasetBuilder + +The `MMapIndexedDatasetBuilder` is capable of building and merging `MMapIndexedDataset` instances. + +#### MMapIndexedDataset + +The `MMapIndexedDataset` class is the lowest-level data interface. Internally, an `MMapIndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata. + +The index file stores dataset-level metadata first: +- The index header, for backward compatibility +- The index version, for backward compatibility +- A numeric code corresponding to the data type used to write data to the data file +- The number of sequences in the dataset +- The number of documents in the dataset + +The index file stores document-level and sequence-level metadata second: +- In order, the number of elements per sequence +- In order, the byte offset (pointer) per sequence +- In order, the consecutive sequence index range `[...)` per document +- In order, the mode per sequence (in the multimodal case) + +## Data loading: construction + +Building the data loaders is a distributed-aware process built around the following classes: + +1. `NanosetConfig` +2. `NanosetBuilder` +3. `MMapIndexedDataset` +3. `Nanoset` + +See the class docstrings for more details. + +#### NanosetConfig + +The `NanosetConfig` class parameterizes the `NanosetBuilder` and in turn the `Nanoset`. + +Different training/inference regimes will require different extensions e.g. the `NanosetConfig` + +#### NanosetBuilder + +The `NanosetBuilder` class builds the highest-level data interfaces. 
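+
+A minimal construction sketch, mirroring `run_train_nanoset.py` from this patch (the numeric values and the dataset path are placeholders):
+
+```
+from nanotron.data.nanoset import NanosetConfig
+from nanotron.data.dataset_builder import NanosetBuilder
+from nanotron.data.utils import compute_datasets_num_samples
+
+# Number of samples needed per split, derived from the training schedule
+split_num_samples = compute_datasets_num_samples(
+    train_iters=1000, eval_interval=100, eval_iters=10, global_batch_size=512
+)
+
+config = NanosetConfig(
+    random_seed=1234,
+    sequence_length=2048,
+    data_path="path/to/dataset_prefix",  # prefix of the .bin/.idx pair
+    split="949,50,1",                    # train/valid/test split weights
+    split_num_samples=split_num_samples,
+)
+
+train_dataset, valid_dataset, test_dataset = NanosetBuilder(config).build()
+```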
+ +**NB:** All ranks should attempt to build the dataset via the `NanosetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `NanosetConfig`. + +#### MMapIndexedDataset + +The `MMapIndexedDataset` class is the lowest-level data interface. + +The `MMapIndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces. + + +#### Nanoset + +The `Nanoset` abstract class is a high-level data interface. It is built upon the `MMapIndexedDataset`. + +Different training/inference regimes will require different extensions e.g. the `Nanoset` + +## Data loading: implementation + +### Nanoset + +The `Nanoset` is parameterized by the following variables: the underlying `MMapIndexedDataset` instance `indexed_dataset`, the split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing), the number of samples `N`, the sequence length `S`, and the random seed `R`. + +The `Nanoset` creates three index mappings to facilitate lookup: (1) the document index, (2) the sample index, and (3) the shuffle index. + +1. The document index _Do_idx_ is a 1-D array mapping from _i_ to document index of length `E * |indexed_indices|` where `E` corresponds to the minimum number of epochs such that `E * |indexed_indices| >= N`. The document index is shuffled according to `R`. + + ``` + Given: + + N = 15 + indexed_indices = [5, 6, 7, 8, 9] + E = 3 + + Then, for example: + + Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9] + ``` + +2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. + + ``` + Given: + + S = 1024 + + Then, for example: + + Sa_idx[0] = (0, 0) + Sa_idx[1] = (0, 1024) => Do_idx[0] has length greater than S + Sa_idx[2] = (1, 512) => Do_idx[0] has length 1536 + Sa_idx[3] = (2, 0) => Do_idx[1] has length 1536 + Sa_idx[4] = (5, 300) => Do_idx[2:5] are shorter documents relative to Do_idx[0:2] + Sa_idx[5] = (6, 24) => Do_idx[5] has length 1300 + ``` + +3. The shuffle index _Sh_idx_ is a 1-D array mapping from _k_ to _j_ of length `N`. The shuffle index is shuffled according to `R`. + + ``` + Given + + N = 10 + + Then, for example: + + Sh_idx = [4, 0, 2, 6, 1, 9, 5, 8, 7, 3] + ``` + +To query the `Nanoset` for the _k_-th sample we do the following + +- Use the shuffle index to get the index _j_ into the sample index. + + ``` + j = Sh_idx[k] + ``` +- Use the sample index to get the left and right sample-bounding indices into the document index and the starting token offset for each document. + + ``` + i, offset = Sa_idx[j] + i_next, offset_next = Sa_idx[j + 1] + ``` +- Use the document index to retrieve `S` tokens from consecutive (in the document index) documents. + + ``` + sample = [] + sample += indexed_dataset[Do_idx[i]][offset:] + if i != i_next: + sample += indexed_dataset[Do_idx[i + 1:i_next]] + sample += indexed_dataset[Do_idx[i_next]][:offset_next] + ``` + +To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `Nanoset.__init__` function. 
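+
+Putting the three indices together, here is a compact, illustrative version of the query above (it reuses the `Do_idx`, `Sa_idx`, `Sh_idx`, and `indexed_dataset` names from this section; the actual implementation lives in `nanoset.py`):
+
+```
+import numpy as np
+
+def query_sample(k, Do_idx, Sa_idx, Sh_idx, indexed_dataset):
+    j = Sh_idx[k]                        # shuffle index: k -> j
+    i, offset = Sa_idx[j]                # left bound: slot in Do_idx and token offset
+    i_next, offset_next = Sa_idx[j + 1]  # right bound
+    if i == i_next:
+        # The whole sample comes from a single document
+        return np.asarray(indexed_dataset[Do_idx[i]][offset:offset_next])
+    parts = [indexed_dataset[Do_idx[i]][offset:]]                # tail of the first document
+    parts += [indexed_dataset[d] for d in Do_idx[i + 1:i_next]]  # middle documents in full
+    parts.append(indexed_dataset[Do_idx[i_next]][:offset_next])  # head of the last document
+    return np.concatenate(parts)
+```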
+ diff --git a/examples/nanoset_llama_7b.py b/examples/nanoset_llama_7b.py new file mode 100644 index 00000000..bde3c6d2 --- /dev/null +++ b/examples/nanoset_llama_7b.py @@ -0,0 +1,95 @@ +""" +Benchmarking script for the Llama-2-7b model +""" + +import os + +from nanotron.config import ( + CheckpointsArgs, + Config, + DataArgs, + GeneralArgs, + LlamaConfig, + LoggingArgs, + LRSchedulerArgs, + ModelArgs, + OptimizerArgs, + ParallelismArgs, + NanosetDatasetsArgs, + RandomInit, + TokenizerArgs, + TokensArgs, +) +from nanotron.logging import human_format + +# Config for a llama model with 6.74M parameters +model_config = LlamaConfig() + +num_params = human_format( + model_config.vocab_size * model_config.hidden_size * 2 + + model_config.num_hidden_layers + * ( + 3 * model_config.hidden_size * model_config.intermediate_size + + 4 * model_config.hidden_size * model_config.hidden_size + ) +).replace(".", "p") + +print(f"Model has {num_params} parameters") + +seed = 42 + +learning_rate = LRSchedulerArgs( + learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5 +) + +optimizer = OptimizerArgs( + zero_stage=0, + weight_decay=0.01, + clip_grad=1.0, + accumulate_grad_in_fp32=True, + adam_eps=1e-08, + adam_beta1=0.9, + adam_beta2=0.95, + torch_adam_is_fused=True, + learning_rate_scheduler=learning_rate, +) + +parallelism = ParallelismArgs( + dp=1, + pp=1, + tp=4, + pp_engine="1f1b", + tp_mode="REDUCE_SCATTER", + tp_linear_async_communication=True, +) + +tokens = TokensArgs(sequence_length=8192, train_steps=5, micro_batch_size=1, batch_accumulation_per_replica=8) + +dataset = NanosetDatasetsArgs(data_path="XXXX", split='949,50,1') + +checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints" +os.makedirs(checkpoints_path, exist_ok=True) + +config = Config( + general=GeneralArgs(project="bench", run="llama", seed=seed), + checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=1000), + parallelism=parallelism, + model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config), + tokenizer=TokenizerArgs("meta-llama/Llama-2-7b-hf"), + optimizer=optimizer, + logging=LoggingArgs(), + tokens=tokens, + data=DataArgs(dataset=dataset, seed=seed), + profiler=None, +) + +if __name__ == "__main__": + dir = os.path.dirname(__file__) + + # Save config as YAML file + config.save_as_yaml(f"{dir}/config_llama.yaml") + + # Launch training + os.system("export CUDA_DEVICE_MAX_CONNECTIONS=1") + gpus = config.parallelism.dp * config.parallelism.pp * config.parallelism.tp + os.system(f"torchrun --nproc_per_node={gpus} run_train.py --config-file {dir}/config_llama.yaml") diff --git a/run_train_nanoset.py b/run_train_nanoset.py new file mode 100644 index 00000000..df69a92c --- /dev/null +++ b/run_train_nanoset.py @@ -0,0 +1,113 @@ +""" +Nanotron training script. 
+ +Usage: +``` +export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations +torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml +``` +""" +import argparse + +from nanotron import logging + +from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks +from nanotron.trainer import DistributedTrainer + +from nanotron.data.nanoset import NanosetConfig +from nanotron.data.dataset_builder import NanosetBuilder +from nanotron.data.dataloader_builder import build_nanoset_dataloader +from nanotron.data.utils import compute_datasets_num_samples + +logger = logging.get_logger(__name__) + +def get_dataloaders(trainer: DistributedTrainer): + """Returns train, valid and test dataloaders""" + + # First, we need to know which ranks to feed the dataloader to + input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) + + # Create Nanoset config + # TODO Test this calculus multinode + split_num_samples = compute_datasets_num_samples(train_iters=trainer.config.tokens.train_steps, + eval_interval=trainer.config.tokens.val_check_interval, + eval_iters=trainer.config.tokens.val_steps, + global_batch_size=trainer.global_batch_size) + + default_split_num_samples = trainer.consumed_train_samples + + assert split_num_samples == default_split_num_samples + + # break + + nanoset_config = NanosetConfig( + random_seed=trainer.config.data.seed, + sequence_length=trainer.sequence_length, + data_path=trainer.config.data.dataset.data_path, + split=trainer.config.data.dataset.split, + split_num_samples=split_num_samples + ) + + # Build Nanoset datasets + train_dataset, valid_dataset, test_dataset = NanosetBuilder(nanoset_config).build(nanoset_config) + + # Prepare train, valid and test dataloaders + train_dataloader = build_nanoset_dataloader( + train_dataset, + trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=trainer.micro_batch_size, + consumed_train_samples=trainer.consumed_train_samples, + dataloader_num_workers=trainer.config.data.num_loading_workers, + seed_worker=trainer.config.data.seed, + dataloader_drop_last=True, + ) + + valid_dataloader = build_nanoset_dataloader( + valid_dataset, + trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=trainer.micro_batch_size, + dataloader_num_workers=trainer.config.data.num_loading_workers, + seed_worker=trainer.config.data.seed, + dataloader_drop_last=True, + ) + + test_dataloader = build_nanoset_dataloader( + test_dataset, + trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=trainer.micro_batch_size, + dataloader_num_workers=trainer.config.data.num_loading_workers, + seed_worker=trainer.config.data.seed, + dataloader_drop_last=True, + ) + + return train_dataloader, valid_dataloader, test_dataloader + + + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + config_file = args.config_file + + # Load trainer and data + trainer = DistributedTrainer(config_file) + + train_dataloader, valid_dataloader, test_dataloader = get_dataloaders(trainer) + + # Train + trainer.train(train_dataloader) 
diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index ec48f8bc..23bb7201 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -94,14 +94,22 @@ def __post_init__(self): if self.hf_dataset_splits is None: self.hf_dataset_splits = "train" +@dataclass +class NanosetDatasetsArgs: + data_path: str + split: Optional[str] = None + + def __post_init__(self): + if self.split is None: + self.split = "949,50,1" @dataclass class DataArgs: """Arguments related to the data and data files processing""" - dataset: Optional[PretrainDatasetsArgs] + dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs] seed: Optional[int] - num_loading_workers: Optional[int] = 1 + num_loading_workers: Optional[int] = 0 def __post_init__(self): if self.seed is None: @@ -136,6 +144,7 @@ class GeneralArgs: Args: project: Name of the project (a project gather several runs in common tensorboard/hub-folders) + entity: Weights and bias entity name (optional) run: Name of the run step: Global step (updated when we save the checkpoint) consumed_train_samples: Number of samples consumed during training (should be actually just step*batch_size) @@ -143,6 +152,7 @@ class GeneralArgs: """ project: str + entity: Optional[str] = None run: Optional[str] = None seed: Optional[int] = None step: Optional[int] = None @@ -210,6 +220,7 @@ class TokensArgs: batch_accumulation_per_replica: int val_check_interval: Optional[int] = -1 + val_steps: Optional[int] = 10 limit_val_batches: Optional[int] = 0 limit_test_batches: Optional[int] = 0 diff --git a/src/nanotron/data/Makefile b/src/nanotron/data/Makefile new file mode 100644 index 00000000..8f9db768 --- /dev/null +++ b/src/nanotron/data/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/src/nanotron/data/__init__.py b/src/nanotron/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py new file mode 100644 index 00000000..0236ce76 --- /dev/null +++ b/src/nanotron/data/dataloader_builder.py @@ -0,0 +1,174 @@ +from nanotron import logging +from nanotron.logging import log_rank + +import nanotron.distributed as dist +from torch.utils.data import DataLoader, Sampler +from torch.utils.data.distributed import DistributedSampler +from nanotron.dataloader import SkipBatchSampler, EmptyInfiniteDataset, get_dataloader_worker_init # QUESTION: Move them here? + +from dataclasses import dataclass +import torch +import numpy as np +from typing import Dict, List, Union, Optional +from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from nanotron.parallel import ParallelContext + +logger = logging.get_logger(__name__) + +def build_nanoset_dataloader( + dataset, + sequence_length: int, + parallel_context: ParallelContext, + input_pp_rank: int, + output_pp_rank: int, + micro_batch_size: int, + dataloader_num_workers: int, + seed_worker: int, + consumed_train_samples: int = 0, + dataloader_drop_last: bool = True, + dataloader_pin_memory: bool = True, + use_loop_to_round_batch_size: bool = False, +) -> DataLoader: + + # Case of ranks not requiring data. 
We give them a dummy dataset, then the collator will do his job + if not dist.get_rank(parallel_context.pp_pg) in [ + input_pp_rank, + output_pp_rank, + ]: + dataset_length = len(dataset) + dataset = EmptyInfiniteDataset(length=dataset_length) + # No need to spawn a lot of workers, we can just use main + dataloader_num_workers = 0 + + data_collator = NanosetDataCollatorForCLM( + sequence_length=sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=parallel_context, + ) + + # Compute size and rank of dataloader workers + dp_ranks_size = parallel_context.dp_pg.size() + dp_rank = parallel_context.dp_pg.rank() + + sampler = get_sampler( + dataset=dataset, + dl_ranks_size=dp_ranks_size, + dl_rank=dp_rank, + seed=seed_worker, + use_loop_to_round_batch_size=use_loop_to_round_batch_size, + micro_batch_size=micro_batch_size, + drop_last=dataloader_drop_last, + consumed_train_samples=consumed_train_samples, + ) + + return DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + collate_fn=data_collator, + drop_last=dataloader_drop_last, + num_workers=dataloader_num_workers, + pin_memory=dataloader_pin_memory, + worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank), + # TODO @thomasw21: I'm not sure but this doesn't seem to work at all. + # pin_memory_device="cuda", + ) + +@dataclass +class NanosetDataCollatorForCLM: + """ + Data collator used for causal language modeling with Nanoset. + + - sequence_length: Sequence legth of the model + - input_pp_rank: Discards last input id token + - output_pp_rank: Discards first label id token + - other pp ranks: Don't have data. Instead, we use `TensorPointer` to point to the rank having the data. + """ + + sequence_length: int + input_pp_rank: int + output_pp_rank: int + parallel_context: ParallelContext + + def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + + # Process the case when current rank doesn't require data. We return `TensorPointer` that points to ranks having the data. + current_pp_rank = dist.get_rank(self.parallel_context.pp_pg) + + if current_pp_rank not in [ + self.input_pp_rank, + self.output_pp_rank, + ]: + # assert all(len(example) == 0 for example in examples) + + return { + "input_ids": TensorPointer(group_rank=self.input_pp_rank), + "input_mask": TensorPointer(group_rank=self.input_pp_rank), + "label_ids": TensorPointer(group_rank=self.output_pp_rank), + "label_mask": TensorPointer(group_rank=self.output_pp_rank), + } + + # NOTE: Nanoset datasets key is "text" + assert all(list(example.keys()) == ["text"] for example in examples) + + # TODO @nouamanetazi: Is it better to have examples as np.array or torch.Tensor? 
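+        # Stack each example's "text" tokens into a (batch_size, sequence_length + 1) NumPy array;
+        # the cast to torch.Tensor only happens at the very end of this method.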
+ input_ids = np.vstack([examples[i]["text"] for i in range(len(examples))]) # (b, s) + batch_size, expanded_input_length = input_ids.shape + + result: Dict[str, Union[np.ndarray, TensorPointer]] = {} + + result["input_ids"] = TensorPointer(group_rank=self.input_pp_rank) + result["input_mask"] = TensorPointer(group_rank=self.input_pp_rank) + result["label_ids"] = TensorPointer(group_rank=self.output_pp_rank) + result["label_mask"] = TensorPointer(group_rank=self.output_pp_rank) + + assert ( + expanded_input_length == self.sequence_length + 1 + ), f"Samples should be of length {self.sequence_length + 1} (seq_len+1), but got {expanded_input_length}" + + # Process inputs: last token is the label + if current_pp_rank == self.input_pp_rank: + result["input_ids"] = input_ids[:, :-1] + result["input_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) + + # Process labels: shift them to the left + if current_pp_rank == self.output_pp_rank: + result["label_ids"] = input_ids[:, 1:] + result["label_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) + + if isinstance(result["input_ids"], torch.Tensor) and result["input_ids"].shape[-1] != self.sequence_length: + raise ValueError( + f"`labels` are incorrectly preprocessed. `labels` length is {result['input_ids'].shape[-1]}, but should be" + f" {self.sequence_length}." + ) + if isinstance(result["label_ids"], torch.Tensor) and result["label_ids"].shape[-1] != self.sequence_length: + raise ValueError( + f"`labels` are incorrectly preprocessed. `labels` length is {result['label_ids'].shape[-1]}, but should be" + f" {self.sequence_length}." + ) + + # Cast np.array to torch.Tensor + result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} + return result + +def get_sampler( + dl_ranks_size: int, + dl_rank: int, + dataset, + seed: int, + consumed_train_samples: int, + drop_last: Optional[bool] = True, +) -> Optional[Sampler]: + """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank""" + + # NOTE: Removed DistributedSamplerWithLoop to support valid and test samplers. 
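+    # DistributedSampler shards the dataset across data-parallel ranks; when resuming,
+    # the SkipBatchSampler wrapper below skips over the samples that were already consumed.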
+ + sampler = DistributedSampler( + dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last + ) + + if consumed_train_samples > 0: + sampler = SkipBatchSampler(sampler, skip_batches=consumed_train_samples, dp_size=dl_ranks_size) + + return sampler diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py new file mode 100644 index 00000000..c317509b --- /dev/null +++ b/src/nanotron/data/dataset_builder.py @@ -0,0 +1,219 @@ +from nanotron import logging +import math +from typing import Any, List, Optional, Tuple, Type, Union + +import numpy +import torch + +from nanotron.data.nanoset import NanosetConfig, Nanoset +from nanotron.data.indexed_dataset import MMapIndexedDataset +from nanotron.data.utils import Split, normalize + +logger = logging.getLogger(__name__) + +DistributedDataset = Union[Nanoset, MMapIndexedDataset] + +class NanosetBuilder(object): + """Builder class for the Nanoset classes + + Args: + + config (NanosetConfig): The config object which informs dataset creation + """ + + def __init__( + self, config: NanosetConfig, + ): + self.config = config + self.sizes = config.split_sizes + self.cls = Nanoset # NOTE: keep it like that to support BlendedNanoset in the future + + def build(self) -> List[Nanoset]: + """Build all dataset splits according to the provided data_path(s) + + This method is distributed-aware and must be called on all ranks. + + The dataset splits returned can vary according to the config. Supply config.data_path and + config.split to build BlendedNanoset and/or Nanoset splits from the same + distribution. Supply config.data_path_per_split to build BlendedNanoset and/or Nanoset + splits from separate distributions. + + Returns: + List[Union[BlendedNanoset, Nanoset]]: A list of either + Nanoset or BlendedNanoset per split + """ + return self._build_blended_dataset_splits() + + def _build_blended_dataset_splits( + self, + ) -> List[Nanoset]: + """Build all dataset splits according to the provided data_path(s) + + See the NanosetBuilder.build alias for more information. + + Returns: + List[Optional[Union[BlendedNanoset, Nanoset]]]: A list of either + Nanoset or BlendedNanoset per split + """ + + data_path = getattr(self.config, "data_path") + split = getattr(self.config, "split_vector") + + # NOTE: For including blended datasets (BlendedNanoset) + # NOTE: Refer to Megatron to check how to work with multiple data_paths datasets. For now we don't support it + # NOTE: https://github.com/TJ-Solergibert/Megatron-debug/blob/c5eda947d9728d21b03d77b7db56cb71513d5636/megatron/core/datasets/blended_megatron_dataset_builder.py#L81 + data_path = [data_path] + if len(data_path) == 1: + return self._build_nanoset_dataset_splits(data_path[0], split, self.sizes) + + def _build_nanoset_dataset_splits( + self, path_prefix: str, split: List[float], sizes: List[int], + ) -> List[Nanoset]: + """Build each Nanoset split from a single MMapIndexedDataset + + Args: + path_prefix (str): The MMapIndexedDataset .bin and .idx file prefix + + split (List[float]): The dataset split ratios (must sum to 1.00) + + sizes (List[int]): The number of total samples to draw from each split + + Returns: + List[Nanoset]: The Nanoset per split. 
Always returns Nanosets because we build them in each and every rank + """ + + indexed_dataset = self._build_generic_dataset( + MMapIndexedDataset, path_prefix, False + ) + + split_idx_bounds = _get_split_indices( + split, indexed_dataset.sequence_lengths.shape[0] + ) + + split_indices = [ + numpy.arange( + start=split_idx_bounds[i], + stop=split_idx_bounds[i + 1], + step=1, + dtype=numpy.int32, + ) + for i, _ in enumerate(Split) + ] + + nanoset_datasets = [] + for i, _split in enumerate(Split): + if split[i] == 0.0: + nanoset_datasets.append(None) + else: + nanoset_datasets.append( + self._build_generic_dataset( + self.cls, indexed_dataset, split_indices[i], sizes[i], _split, self.config + ) + ) + + return nanoset_datasets + + def _build_generic_dataset( + self, cls: Type[DistributedDataset], *args: Any, + ) -> Optional[DistributedDataset]: + """Build the DistributedDataset + + Args: + cls (Type[DistributedDataset]): The DistributedDataset class to be built + + args (Tuple[Any]): The positional arguments used to build the provided + DistributedDataset class + + Raises: + Exception: When the dataset constructor raises an OSError + + Returns: + DistributedDataset: The DistributedDataset instantion + """ + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + + dataset = None + + # First, build on rank 0 + if rank == 0: + try: + dataset = cls(*args) + except OSError as err: + log = ( + f"Failed to write dataset materials to the data cache directory. " + + f"Please supply a directory to which you have write access via " + + f"the path_to_cache attribute in NanosetConfig and " + + f"retry. Refer to the preserved traceback above for more information." + ) + raise Exception(log) from err + + torch.distributed.barrier() + + if rank != 0: + dataset = cls(*args) + + return dataset + + return cls(*args) + + +def _get_split_indices(split: List[float], num_elements: int) -> List[int]: + """Determine the document index bounds per split + + Args: + split (List[float]): The dataset split ratios (must sum to 1.00) + + num_elements (int): The number of elements, e.g. sequences or documents, available for + the split + + Returns: + List[int]: The indices for all three splits e.g. [0, 900, 990, 1000] for a 1000-document + set and a [90.0, 9.0, 1.0] split + """ + split_indices = [0] + for split_pct in split: + split_indices.append(split_indices[-1] + int(round(split_pct * float(num_elements)))) + split_indices[1:] = list( + map(lambda _: _ - (split_indices[-1] - num_elements), split_indices[1:]) + ) + + assert len(split_indices) == len(split) + 1 + assert split_indices[-1] == num_elements + + return split_indices + +# NOTE: Keep for BlendedNanoset +def _get_prefixes_weights_and_sizes_for_blend( + data_path: List[str], target_num_samples_per_split: List[int] +) -> Tuple[List[str], List[float], List[List[int]]]: + """Determine the contribution of the Nanoset splits to the BlendedNanoset splits + + Args: + data_path (List[str]): e.g. ["30", "path/to/dataset_1_prefix", "70", + "path/to/dataset_2_prefix"] + + target_num_samples_per_split (List[int]): The number of samples to target for each + BlendedNanoset split + + Returns: + Tuple[List[str], List[float], List[List[int]]]: The prefix strings e.g. + ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], the normalized weights e.g. 
+ [0.3, 0.7], and the number of samples to request per Nanoset per split + """ + weights, prefixes = zip( + *[(float(data_path[i]), data_path[i + 1].strip()) for i in range(0, len(data_path), 2)] + ) + + weights = normalize(weights) + + # NOTE: Use 0.5% target margin to ensure we satiate the network + sizes_per_dataset = [ + [ + int(math.ceil(target_num_samples * weight * 1.005)) + for target_num_samples in target_num_samples_per_split + ] + for weight in weights + ] + + return prefixes, weights, sizes_per_dataset \ No newline at end of file diff --git a/src/nanotron/data/helpers.cpp b/src/nanotron/data/helpers.cpp new file mode 100644 index 00000000..4e1b3dbc --- /dev/null +++ b/src/nanotron/data/helpers.cpp @@ -0,0 +1,765 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + +void build_blending_indices(py::array_t &dataset_index, + py::array_t &dataset_sample_index, + const py::array_t &weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) + { + std::cout << "> building indices for blended datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) + { + current_samples[i] = 0; + } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) + { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) + { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) + { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. + current_samples[max_error_index] += 1; + } + + // print info + if (verbose) + { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) + { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx(const py::array_t &sizes_, + const py::array_t &doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. 
It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int32_t *sample_idx = new int32_t[2 * (num_samples + 1)]; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) + { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) + { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) + { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } + else + { + // Otherwise, start from the begining of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) + { + int32_t *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(int32_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937 &rand32_gen) +{ + /* Training sample length. */ + if (short_seq_ratio == 0) + { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) + { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. 
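+    // e.g. short_seq_prob = 0.1 -> short_seq_ratio = 10, so get_target_sample_len()
+    // draws a shorter target length for roughly one in ten samples.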
+ int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) + { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl + << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. 
+ seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) + { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) + { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." 
<< endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) + { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + // Detect documents with long sentences. 
+ bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Populate the map. + if (second) + { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. 
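+    // Note (clarifying comment): the return below hands the `maps` buffer to
+    // Python without copying; the capsule above is the deleter that pybind11
+    // invokes once the returned (num_samples x 4) array is garbage collected,
+    // so no explicit free is needed on the C++ side.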
+ const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); +} diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py new file mode 100644 index 00000000..ce56f544 --- /dev/null +++ b/src/nanotron/data/indexed_dataset.py @@ -0,0 +1,642 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# Essentially re-written in entirety + +import logging +import os +import shutil +import struct +import time +from enum import Enum +from functools import lru_cache +from itertools import accumulate +from types import TracebackType +from typing import List, Optional, Tuple, Type, Union + +import numpy +import torch + +from nanotron.logging import log_rank + +logger = logging.getLogger(__name__) + +_INDEX_HEADER = b"MMIDIDX\x00\x00" + +# TODO @tj-solergibert: Refractor. Delete multimodal references. 
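+
+# A minimal usage sketch of this module (illustrative only: the file names and
+# token ids are hypothetical, and tokenization is assumed to happen elsewhere):
+#
+#   builder = MMapIndexedDatasetBuilder("corpus.bin", dtype=numpy.uint16)
+#   for token_ids in [[5, 6, 7], [8, 9]]:        # one document per list
+#       builder.add_item(torch.IntTensor(token_ids))
+#       builder.end_document()
+#   builder.finalize("corpus.idx")
+#
+#   dataset = MMapIndexedDataset("corpus")       # memory-maps corpus.idx / corpus.bin
+#   first_doc = dataset[0]                       # numpy array of token ids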
+ +class DType(Enum): + """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices + """ + + uint8 = 1 + int8 = 2 + int16 = 3 + int32 = 4 + int64 = 5 + float64 = 6 + float32 = 7 + uint16 = 8 + + @classmethod + def code_from_dtype(cls, value: Type[numpy.number]) -> int: + """Get the code from the dtype + + Args: + value (Type[numpy.number]): The dtype + + Returns: + int: The code + """ + return cls[value.__name__].value + + @classmethod + def dtype_from_code(cls, value: int) -> Type[numpy.number]: + """Get the dtype from the code + + Args: + value (int): The code + + Returns: + Type[numpy.number]: The dtype + """ + return getattr(numpy, cls(value).name) + + @staticmethod + def size(key: Union[int, Type[numpy.number]]) -> int: + """Get the size of the dtype/code in bytes + + Args: + key (Union[int, Type[numpy.number]]): The dtype or code + + Raises: + ValueError: If the key is neither dtype nor integer code + + Returns: + int: The size of the dtype/code in in bytes + """ + if isinstance(key, int): + return DType.dtype_from_code(key)().itemsize + elif numpy.number in key.__mro__: + return key().itemsize + else: + raise ValueError + + @staticmethod + def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: + """Get the dtype to use for an index of a certain cardinality + + Args: + cardinality (Optional[int]): The number of elements to be indexed + + Returns: + Type[numpy.number]: The dtype to use for the index + """ + if cardinality is not None and cardinality < 65500: + return numpy.uint16 + else: + return numpy.int32 + + +class _IndexWriter(object): + """Object class to write the index (.idx) file + + Args: + idx_path (str): The path to the index file + + dtype (Type[numpy.number]): The dtype of the index file + """ + + def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: + self.idx_path = idx_path + self.dtype = dtype + + def __enter__(self) -> "_IndexWriter": + """Enter the context introduced by the 'with' keyword + + Returns: + _IndexWriter: The instance + """ + self.idx_writer = open(self.idx_path, "wb") + # fixed, vestigial practice + self.idx_writer.write(_INDEX_HEADER) + # fixed, vestigial practice + self.idx_writer.write(struct.pack(" Optional[bool]: + """Exit the context introduced by the 'with' keyword + + Args: + exc_type (Optional[Type[BaseException]]): Exception type + + exc_val (Optional[BaseException]): Exception value + + exc_tb (Optional[TracebackType]): Exception traceback object + + Returns: + Optional[bool]: Whether to silence the exception + """ + self.idx_writer.close() + + def write( + self, + sequence_lengths: List[int], + sequence_modes: Optional[List[int]], + document_indices: List[int], + ) -> None: + """Write the index (.idx) file + + Args: + sequence_lengths (List[int]): The length of each sequence + + sequence_modes (Optional[List[int]]): The mode of each sequences + + document_indices (List[int]): The seqyebce indices demarcating the end of each document + """ + sequence_pointers = self._sequence_pointers(sequence_lengths) + + # the number of sequences in the dataset + sequence_count = len(sequence_lengths) + self.idx_writer.write(struct.pack(" List[int]: + """Build the sequence pointers per the sequence lengths and dtype size + + Args: + sequence_lengths (List[int]): The length of each sequence + + Returns: + List[int]: The pointer to the beginning of each sequence + """ + itemsize = DType.size(self.dtype) + curr_ptr = 0 + list_ptr = [] + for length in sequence_lengths: + list_ptr.append(curr_ptr) + curr_ptr += 
length * itemsize + return list_ptr + + +class _IndexReader(object): + """Object class to read the index (.idx) file + + Args: + idx_path (str): The path to the index file + + multimodal (bool): Whether the dataset is multimodal + """ + + def __init__(self, idx_path: str, multimodal: bool) -> None: + + log_rank(f"Load the {type(self).__name__} from {idx_path}", logger=logger, level=logging.INFO, rank=0) + + with open(idx_path, "rb") as stream: + header = stream.read(9) + assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}" + + version = struct.unpack(" time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank(f"\tExtract the sequence pointers", logger=logger, level=logging.INFO, rank=0) + t_beg = time.time() + self.sequence_pointers = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.sequence_count, + offset=offset + self.sequence_lengths.nbytes, + ) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank(f"\tExtract the document indices", logger=logger, level=logging.INFO, rank=0) + t_beg = time.time() + self.document_indices = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.document_count, + offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes, + ) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + self.sequence_modes = None + if multimodal: + log_rank(f"\tExtract the sequence modes", logger=logger, level=logging.INFO, rank=0) + t_beg = time.time() + self.sequence_modes = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int8, + count=self.sequence_count, + offset=offset + + self.sequence_lengths.nbytes + + self.sequence_pointers.nbytes + + self.document_indices.nbytes, + ) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + assert self.sequence_lengths.shape[0] == len(self) + assert self.sequence_lengths.shape[0] == self.sequence_count + assert self.sequence_lengths.shape[0] == self.document_indices[-1] + + log_rank(f"> total number of sequences: {len(self)}", logger=logger, level=logging.INFO, rank=0) + log_rank( + f"> total number of documents: {self.document_indices.shape[0] - 1}", + logger=logger, + level=logging.INFO, + rank=0 + ) + + def __del__(self) -> None: + """Clean up the object + """ + self.bin_buffer_mmap._mmap.close() + del self.bin_buffer_mmap + + def __len__(self) -> int: + """Return the length of the dataset + + Returns: + int: The length of the dataset + """ + return self.sequence_count + + @lru_cache(maxsize=8) + def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: + """Return the pointer, length, and mode at the index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The pointer, length and mode at + the index + """ + return ( + self.sequence_pointers[idx], + self.sequence_lengths[idx], + self.sequence_modes[idx] if self.sequence_modes is not None else None, + ) + + +class MMapIndexedDataset(torch.utils.data.Dataset): + """The low-level interface dataset class + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False. 
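+
+    Instances are indexable and picklable: dataset[i] returns the i-th sequence as a
+    numpy array of token ids (plus its mode when multimodal=True), slices return a
+    list of such arrays, and pickling keeps only (path_prefix, multimodal) so the
+    memory map is re-opened after unpickling, e.g. inside DataLoader workers.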
+ """ + + def __init__(self, path_prefix: str, multimodal: bool = False) -> None: + super().__init__() + self.path_prefix = None + self.multimodal = None + + self.index = None + self.bin_buffer = None + self.bin_buffer_mmap = None + + self.initialize(path_prefix, multimodal) + + def initialize(self, path_prefix: str, multimodal: bool) -> None: + """Initialize the dataset + + This method is called by MMapIndexedDataset.__init__ during object creation and by + MMapIndexedDataset.__setstate__ during un-puckling + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool): Whether the dataset is multimodal + """ + self.path_prefix = path_prefix + self.multimodal = multimodal + self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal) + self.bin_buffer_mmap = numpy.memmap(get_bin_path(self.path_prefix), mode="r", order="C") + self.bin_buffer = memoryview(self.bin_buffer_mmap) + + def __getstate__(self) -> Tuple[str, bool]: + """Get the state during pickling + + Returns: + Tuple[str, bool]: The state tuple + """ + return self.path_prefix, self.multimodal + + def __setstate__(self, state: Tuple[str, bool]) -> None: + """Set the state during un-pickling + + Args: + state (Tuple[str, bool]): The state tuple + """ + path_prefix, multimodal = state + self.initialize(path_prefix, multimodal) + + def __del__(self) -> None: + """Clean up the object + """ + if self.bin_buffer_mmap is not None: + self.bin_buffer_mmap._mmap.close() + del self.bin_buffer_mmap + del self.index + + def __len__(self) -> int: + """Return the length of the dataset i.e. the number of sequences in the index + + Returns: + int: The length of the dataset + """ + return len(self.index) + + def __getitem__( + self, idx: Union[int, numpy.integer, slice] + ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: + """Return from the dataset + + Args: + idx (Union[int, numpy.integer, slice]): The index or index slice into the dataset + + Raises: + ValueError: When the index slice is non-contiguous + + TypeError: When the index is of an unexpected type + + Returns: + Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and + modes at the index or index slice + """ + # TODO: Check this if else + if isinstance(idx, (int, numpy.integer)): + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + sequence = numpy.frombuffer( + self.bin_buffer, + dtype=self.index.dtype, + count=sequence_length, + offset=sequence_pointer, + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sequence_lengths = self.index.sequence_lengths[idx] + sequence_modes = self.index.sequence_modes[idx] if self.multimodal else None + sequence_offsets = list(accumulate(sequence_lengths)) + sequences = numpy.split( + numpy.frombuffer( + self.bin_buffer, + dtype=self.index.dtype, + count=sum(sequence_lengths), + offset=self.index.sequence_pointers[start], + ), + sequence_offsets[:-1], + ) + return (sequences, sequence_modes) if sequence_modes is not None else sequences + else: + raise TypeError("Unexpected type received for idx: {}".format(type(idx))) + + def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: + """Retrieve a single item from the dataset with the option to only + return a portion of the item. 
+ + get(idx) is the same as [idx] but get() does not support slicing. + """ + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + if length is None: + length = sequence_length - offset + sequence_pointer += offset * DType.size(self.index.dtype) + sequence = numpy.frombuffer( + self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + + @property + def sequence_lengths(self) -> numpy.ndarray: + """Get the sequence lengths + + Returns: + numpy.ndarray: The sequence lengths + """ + return self.index.sequence_lengths + + @property + def document_indices(self) -> numpy.ndarray: + """Get the document indices + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def get_document_indices(self) -> numpy.ndarray: + """Get the document indices + + This method is slated for deprecation. + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def set_document_indices(self, document_indices: numpy.ndarray) -> None: + """Set the document indices + + This method is slated for deprecation. + + Args: + document_indices (numpy.ndarray): The document indices + """ + self.index.document_indices = document_indices + + @property + def sequence_modes(self) -> numpy.ndarray: + """Get the sequence modes + + Returns: + numpy.ndarray: The sequence modes + """ + return self.index.sequence_modes + + @staticmethod + def exists(path_prefix: str) -> bool: + """Return whether the MMapIndexedDataset exists on disk at the prefix + + Args: + path_prefix (str): The prefix to the index (.idx) and data (.bin) files + + Returns: + bool: Whether the MMapIndexedDataset exists on disk at the prefix + """ + return os.path.exists(get_idx_path(path_prefix)) and os.path.exists( + get_bin_path(path_prefix) + ) + + +class MMapIndexedDatasetBuilder(object): + """Builder class for the MMapIndexedDataset class + + Args: + bin_path (str): The path to the data (.bin) file + + dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32. + + multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False. + """ + + def __init__( + self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False + ) -> None: + self.data_file = open(bin_path, "wb") + self.dtype = dtype + self.multimodal = multimodal + + self.sequence_lengths = [] + self.document_indices = [0] + self.sequence_modes = [] if self.multimodal else None + + def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None: + """Add a single item to the dataset + + Args: + tensor (torch.Tensor): The item to add to the data file + + mode (int, optional): The mode for the item. Defaults to 0. + """ + np_array = numpy.array(tensor.numpy(), dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.append(np_array.size) + if self.multimodal: + self.sequence_modes.append(mode) + + def add_document( + self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None + ) -> None: + """Add an entire document to the dataset + + Args: + tensor (torch.Tensor): The document to add + lengths (List[int]): The lengths of each item in the document + modes (Optional[List[int]], optional): The modes for each item in the document. + Defaults to None. 
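+
+        Example (illustrative): a document holding two items with token ids
+        [1, 2, 3] and [4, 5] can be added as
+        add_document(torch.IntTensor([1, 2, 3, 4, 5]), lengths=[3, 2])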
+ """ + np_array = numpy.array(tensor, dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.extend(lengths) + self.document_indices.append(len(self.sequence_lengths)) + if self.multimodal: + self.sequence_modes.extend(modes if modes is not None else [0] * lengths) + + def end_document(self) -> None: + """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item + """ + self.document_indices.append(len(self.sequence_lengths)) + + def add_index(self, path_prefix: str) -> None: + """Add an entire MMapIndexedDataset to the dataset + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + """ + # Concatenate index + index = _IndexReader(get_idx_path(path_prefix), multimodal=self.multimodal) + assert index.dtype == self.dtype + + offset = len(self.sequence_lengths) + self.sequence_lengths.extend(index.sequence_lengths) + self.document_indices.extend((offset + index.document_indices)[1:]) + + if self.multimodal: + self.sequence_modes.extend(index.sequence_modes) + + # Concatenate data + with open(get_bin_path(path_prefix), "rb") as f: + shutil.copyfileobj(f, self.data_file) + + def finalize(self, idx_path: str) -> None: + """Clean up and write the index (.idx) file + + Args: + idx_path (str): The path to the index file + """ + self.data_file.close() + with _IndexWriter(idx_path, self.dtype) as writer: + writer.write(self.sequence_lengths, self.sequence_modes, self.document_indices) + + +def get_idx_path(path_prefix: str) -> str: + """Get the path to the index file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the index file + """ + return path_prefix + ".idx" + + +def get_bin_path(path_prefix: str) -> str: + """Get the path to the data file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the data file + """ + return path_prefix + ".bin" diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py new file mode 100644 index 00000000..205c4815 --- /dev/null +++ b/src/nanotron/data/nanoset.py @@ -0,0 +1,509 @@ +from nanotron import logging +from nanotron.logging import log_rank + +import hashlib +import json +import os +import time +from collections import OrderedDict +from typing import Dict, List, Tuple + +import torch +import numpy + +from nanotron.data.nanoset_configs import NanosetConfig +from nanotron.data.indexed_dataset import MMapIndexedDataset +from nanotron.data.utils import Split + +logger = logging.getLogger(__name__) + +class Nanoset(torch.data.utils.Dataset): + """The base Nanoset dataset + + Args: + indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the + Nanoset + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (int): The number of samples to draw from the indexed dataset + + index_split (Split): The indexed_indices Split (train, valid, test) + + config (NanosetConfig): The Nanoset-specific container for all config sourced parameters + """ + + def __init__( + self, + indexed_dataset: MMapIndexedDataset, + indexed_indices: numpy.ndarray, + num_samples: int, + index_split: Split, + config: NanosetConfig, + ) -> None: + + assert indexed_indices.size > 0 + assert num_samples > 0 + assert self.is_multimodal() == indexed_dataset.multimodal + assert self.is_split_by_sequence() != self.is_split_by_document() + + self.indexed_dataset = indexed_dataset + self.indexed_indices = indexed_indices + self.num_samples = num_samples + self.index_split = 
index_split + self.config = config + + self.unique_identifiers = OrderedDict() + self.unique_identifiers["class"] = type(self).__name__ + self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix + self.unique_identifiers["num_samples"] = self.num_samples + self.unique_identifiers["index_split"] = self.index_split.name + for attr in self._key_config_attributes(): + self.unique_identifiers[attr] = getattr(self.config, attr) + + self.unique_description = json.dumps(self.unique_identifiers, indent=4) + self.unique_description_hash = hashlib.md5( + self.unique_description.encode("utf-8") + ).hexdigest() + + self._finalize() + + def _finalize(self) -> None: + """Abstract method implementation + + Load or build/cache the document, sample, and shuffle indices + """ + assert isinstance(self.config, NanosetConfig) + + ( + self.document_index, + self.sample_index, + self.shuffle_index, + ) = self._build_document_sample_shuffle_indices() + + def __len__(self) -> int: + """Abstract method implementation + + Returns: + int: The length of the dataset + """ + return self.sample_index.shape[0] - 1 + + def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: + """Abstract method implementation + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, numpy.ndarray]: The text ids and (optionally) the document ids wrapped in a + dictionary + """ + text, _ = self._query_document_sample_shuffle_indices(idx) + + return {"text": text} + + # TODO @tj-solergibert: Delete this funcs if not used? + @staticmethod + def is_multimodal() -> bool: + """Abstract method implementation + + Returns: + bool: False + """ + return False + + # TODO @tj-solergibert: Delete this funcs if not used? + @staticmethod + def is_split_by_sequence() -> bool: + """Abstract method implementation + + Returns: + bool: True + """ + return True + + # TODO @tj-solergibert: Delete this funcs if not used? + @classmethod + def is_split_by_document(cls) -> bool: + """Return whether the dataset is split by document + + For example, the BERT train/valid/test split is document aware + + Returns: + bool: The negation of cls.is_split_by_sequence + """ + return not cls.is_split_by_sequence() + + # TODO @tj-solergibert: Delete this funcs if not used? + @staticmethod + def _key_config_attributes() -> List[str]: + """Return all config attributes which contribute to uniquely identifying the dataset. + + These attributes will be used to build a uniquely identifying string and MD5 hash which + will be used to cache/load the dataset from run to run. 
+ + Returns: + List[str]: The key config attributes + """ + return ["split", "random_seed", "sequence_length"] + + def _query_document_sample_shuffle_indices( + self, idx: int + ) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Get the text (token ids) and document ids for a given index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids + """ + # Do the shuffle mapping + idx = self.shuffle_index[idx] + + # Get the beginning and end documents and offsets + doc_index_beg, doc_index_beg_offset = self.sample_index[idx] + doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] + + document_ids = [] + sample_parts = [] + + # Sample spans a single document + if doc_index_beg == doc_index_end: + # Add the document id + document_ids.append(self.document_index[doc_index_beg]) + + # Add the entire sample + sample_parts.append( + self.indexed_dataset.get( + self.document_index[doc_index_beg], + offset=doc_index_beg_offset, + length=doc_index_end_offset - doc_index_beg_offset + 1, + ) + ) + + # Sample spans multiple documents + else: + for i in range(doc_index_beg, doc_index_end + 1): + # Add the document id + document_ids.append(self.document_index[i]) + + # Add the sample part + offset = 0 if i > doc_index_beg else doc_index_beg_offset + length = None if i < doc_index_end else doc_index_end_offset + 1 + sample_parts.append( + self.indexed_dataset.get(self.document_index[i], offset=offset, length=length) + ) + + return ( + numpy.array(numpy.concatenate(sample_parts), dtype=numpy.int64), + numpy.array(document_ids, dtype=numpy.int64), + ) + + def _build_document_sample_shuffle_indices( + self, + ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: + """Build the document index, the sample index, and the shuffle index + + The document index: + -- 1-D + -- An ordered array of document ids + + The sample index: + -- 2-D + -- The document indices and offsets which mark the start of every sample + + The shuffle index: + -- 1-D + -- A random permutation of index range of the sample index + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the + shuffle index + + TODO: Explain the 80% threshold + """ + path_to_cache = getattr(self.config, "path_to_cache") + if path_to_cache is None: + path_to_cache = os.path.join( + self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices" + ) + + get_path_to = lambda suffix: os.path.join( + path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}" + ) + path_to_description = get_path_to("description.txt") + path_to_document_index = get_path_to("document_index.npy") + path_to_sample_index = get_path_to("sample_index.npy") + path_to_shuffle_index = get_path_to("shuffle_index.npy") + cache_hit = all( + map( + os.path.isfile, + [ + path_to_description, + path_to_document_index, + path_to_sample_index, + path_to_shuffle_index, + ], + ) + ) + + num_tokens_per_epoch = _get_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) + + sequence_length = getattr(self.config, "sequence_length") + + num_epochs = _get_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples) + + if not cache_hit and torch.distributed.get_rank() == 0: + log_rank( + f"Build and save the {type(self).__name__} {self.index_split.name} indices", + logger=logger, + level=logging.INFO, + rank=0 + ) + + if num_epochs == 1: + separate_final_epoch = False + else: + # Get the number of samples for the last epoch + 
num_samples_sans_final_epoch = ( + (num_epochs - 1) * num_tokens_per_epoch - 1 + ) // sequence_length + num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch + num_samples_per_epoch = (num_tokens_per_epoch - 1) // sequence_length + + # num_samples_from_final_epoch should be non-negative + assert num_samples_from_final_epoch >= 0 + + # num_samples_from_final_epoch should not exceed max value + assert num_samples_from_final_epoch <= num_samples_per_epoch + 1 + + # Separate the final epoch if it falls below the threshold + threshold = 0.80 + separate_final_epoch = num_samples_from_final_epoch < int( + threshold * num_samples_per_epoch + ) + + log_rank( + f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}", + logger=logger, + level=logging.DEBUG, + rank=0 + ) + log_rank(f"> threshold: {threshold}", logger=logger, level=logging.DEBUG, rank=0) + log_rank( + f"> num_samples_per_epoch: {num_samples_per_epoch}", logger=logger, level=logging.DEBUG, rank=0 + ) + + log_rank( + f"> separate_final_epoch: {separate_final_epoch}", logger=logger, level=logging.DEBUG, rank=0 + ) + + numpy_random_state = numpy.random.RandomState(getattr(self.config, "random_seed")) + + os.makedirs(path_to_cache, exist_ok=True) + + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + + # Build the document index + log_rank( + f"\tBuild and save the document index to {os.path.basename(path_to_document_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = time.time() + document_index = _build_document_index( + self.indexed_indices, num_epochs, numpy_random_state, separate_final_epoch + ) + numpy.save(path_to_document_index, document_index, allow_pickle=True) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + # Build the sample index + log_rank( + f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = time.time() + from nanotron.data import helpers + + assert document_index.dtype == numpy.int32 + assert self.indexed_dataset.sequence_lengths.dtype == numpy.int32 + sample_index = helpers.build_sample_idx( + self.indexed_dataset.sequence_lengths, + document_index, + sequence_length, + num_epochs, + num_tokens_per_epoch, + ) + numpy.save(path_to_sample_index, sample_index, allow_pickle=True) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + # Build the shuffle index + log_rank( + f"\tBuild and save the shuffle index to {os.path.basename(path_to_shuffle_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = time.time() + if separate_final_epoch: + shuffle_index = _build_shuffle_index( + num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state + ) + else: + shuffle_index = _build_shuffle_index( + sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state + ) + numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True) + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank( + f"Load the {type(self).__name__} {self.index_split.name} indices", + logger=logger, level=logging.INFO, rank=0 + ) + + log_rank( + f"\tLoad the document index from {os.path.basename(path_to_document_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = 
time.time() + document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank( + f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = time.time() + sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank( + f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}", + logger=logger, level=logging.INFO, rank=0 + ) + t_beg = time.time() + shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + + log_rank( + f"> total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0 + ) + log_rank(f"> total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0) + + return document_index, sample_index, shuffle_index + + +def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: + """Calculate the number of tokens in a single epoch + + Args: + indexed_dataset (MMapIndexedDataset): The underlying MMapIndexedDataset + + indices (numpy.ndarray): The subset of indices into the underlying MMapIndexedDataset + + Returns: + int: The number of tokens in a single epoch + """ + return numpy.sum(indexed_dataset.sequence_lengths[indices]) + + +def _get_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int) -> int: + """Calculate the number of epochs + + Args: + num_tokens_per_epoch (int): The number of tokens in a single epoch + + seq_length (int): The sequence length in tokens + + num_samples (int): The total number of samples + + Returns: + int: The number of epochs + """ + num_epochs = 0 + num_tokens = 0 + while True: + num_epochs += 1 + num_tokens += num_tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. 
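+        # Illustrative check with hypothetical numbers: for seq_length = 4 and
+        # num_tokens = 13, samples cover tokens [0..4], [4..8] and [8..12]
+        # (each sample reuses its last token as the next sample's first), so
+        # (13 - 1) // 4 = 3 complete samples are available so far.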
+ if ((num_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_document_index( + documents: numpy.ndarray, + num_epochs: int, + numpy_random_state: numpy.random.RandomState, + separate_final_epoch: bool, +) -> numpy.ndarray: + """Build an array with length = num epochs * num documents + + Args: + documents (numpy.ndarray): the subset of exposed document indices + + num_epochs (int): The number of epochs + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle + + Returns: + numpy.ndarray: The document index + + TODO: Explain separate_final_epoch + """ + if not separate_final_epoch or num_epochs == 1: + document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1] + document_index[:] = documents + document_index = document_index.reshape(-1) + document_index = document_index.astype(numpy.int32) + numpy_random_state.shuffle(document_index) + return document_index + + doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False) + doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False) + return numpy.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_shuffle_index( + num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState +) -> numpy.ndarray: + """Build the range [0, size) and shuffle + + Args: + num_samples (int): The size of the first shuffle range [0, num_samples) + + total_size (int): The size of the entire index. If larger than 'num_samples', it defines + + the second shuffle range [num_samples, total_size) + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + Returns: + numpy.ndarray: The shuffle index + + TODO: Explain [0, num_samples) [num_samples, total_size) split + """ + dtype_ = numpy.uint32 + if total_size >= (numpy.iinfo(numpy.uint32).max - 1): + dtype_ = numpy.int64 + + shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_last) + + return numpy.concatenate((shuffle_idx_first, shuffle_idx_last)) + diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py new file mode 100644 index 00000000..791d8d1a --- /dev/null +++ b/src/nanotron/data/nanoset_configs.py @@ -0,0 +1,50 @@ +from dataclasses import dataclass, field +from typing import List, Optional + +from nanotron.data.utils import parse_and_normalize_split + + +@dataclass +class NanosetConfig: + """Configuration object for Nanoset datasets + + Attributes: + + random_seed (int): The seed for all RNG during dataset creation. + + sequence_length (int): The sequence length. + + data_path (str): Path to the .bin and .idx file without the extension + + split (Optional[str]): The split string, a comma separated weighting for the dataset splits + when drawing samples from a single distribution. Not to be used with 'blend_per_split'. + Defaults to None. + + split_vector Optional[List[float]]): The split string, parsed and normalized post- + initialization. Not to be passed to the constructor. + + split_num_samples (list[int]): List containing the number of samples per split + + path_to_cache (str): Where all re-useable dataset indices are to be cached. 
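+
+    Example (hypothetical values, for illustration only):
+
+        NanosetConfig(random_seed=1234, sequence_length=2048,
+                      split_num_samples=[10000, 1000, 100],
+                      data_path="path/to/corpus", split="949,50,1")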
+ """ + + random_seed: int + + sequence_length: int + + split_num_samples: list[int] + + data_path: Optional[List[str]] = None + + split: Optional[str] = None + + split_vector: Optional[List[float]] = field(init=False, default=None) + + # TODO We add the cache because it need its somewhere, take a look later + path_to_cache: str = None + + def __post_init__(self): + """Python dataclass method that is used to modify attributes after initialization. See + https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + self.split_vector = parse_and_normalize_split(self.split) \ No newline at end of file diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py new file mode 100644 index 00000000..1454350d --- /dev/null +++ b/src/nanotron/data/utils.py @@ -0,0 +1,79 @@ +from nanotron import logging +from nanotron.logging import log_rank + +from enum import Enum +from typing import List + +import numpy + +logger = logging.getLogger(__name__) + + +class Split(Enum): + train = 0 + valid = 1 + test = 2 + + +def compile_helpers(): + """Compile C++ helper functions at runtime. Make sure this is invoked on a single process. + """ + import os + import subprocess + + command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] + if subprocess.run(command).returncode != 0: + import sys + + log_rank("Failed to compile the C++ dataset helper functions", logger=logger, level=logging.ERROR, rank=0) + sys.exit(1) + +def normalize(weights: List[float]) -> List[float]: + """Do non-exponentiated normalization + + Args: + weights (List[float]): The weights + + Returns: + List[float]: The normalized weights + """ + w = numpy.array(weights, dtype=numpy.float64) + w_sum = numpy.sum(w) + w = (w / w_sum).tolist() + return w + +def parse_and_normalize_split(split: str) -> List[float]: + """Parse the dataset split ratios from a string + + Args: + split (str): The train valid test split string e.g. "99,1,0" + + Returns: + List[float]: The trian valid test split ratios e.g. [99.0, 1.0, 0.0] + """ + split = list(map(float, re.findall(r"[.0-9]+", split))) + split = split + [0.0 for _ in range(len(Split) - len(split))] + + assert len(split) == len(Split) + assert all(map(lambda _: _ >= 0.0, split)) + + split = normalize(split) + + return split + +def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_batch_size): + + train_samples = train_iters * global_batch_size + eval_iters = (train_iters // eval_interval + 1) * eval_iters + test_iters = eval_iters + + datasets_num_samples = [train_samples, + eval_iters * global_batch_size, + test_iters * global_batch_size] + + log_rank(" > Datasets target sizes (minimum size):", logger=logger, level=logging.INFO, rank=0) + log_rank(" Train: {}".format(datasets_num_samples[0]), logger=logger, level=logging.INFO, rank=0) + log_rank(" Validation: {}".format(datasets_num_samples[1]), logger=logger, level=logging.INFO, rank=0) + log_rank(" Test: {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0) + + return datasets_num_samples \ No newline at end of file diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index d8c7885a..c62a2519 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -239,7 +239,7 @@ def data_generator() -> Generator[Dict[str, Union[torch.Tensor, TensorPointer]], return data_generator - +# QUESTION: Move it to nanotron.data.dataloader_builder? 
# Adapted from https://github.com/huggingface/accelerate/blob/a73898027a211c3f6dc4460351b0ec246aa824aa/src/accelerate/data_loader.py#L781C1-L824C28 class SkipBatchSampler(BatchSampler): """ @@ -347,10 +347,10 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni ]: assert all(len(example) == 0 for example in examples) return { - "input_ids": TensorPointer(self.input_pp_rank), - "input_mask": TensorPointer(self.input_pp_rank), - "label_ids": TensorPointer(self.output_pp_rank), - "label_mask": TensorPointer(self.output_pp_rank), + "input_ids": TensorPointer(group_rank=self.input_pp_rank), + "input_mask": TensorPointer(group_rank=self.input_pp_rank), + "label_ids": TensorPointer(group_rank=self.output_pp_rank), + "label_mask": TensorPointer(group_rank=self.output_pp_rank), } # Make sure we load only what's necessary, ie we only load a `input_ids` column. @@ -515,7 +515,7 @@ def get_train_dataloader( # pin_memory_device="cuda", ) - +# QUESTION: Move it to nanotron.data.dataloader_builder? def get_dataloader_worker_init(dp_rank: int): """Creates random states for each worker in order to get different state in each workers""" @@ -526,7 +526,7 @@ def dataloader_worker_init(worker_id): return dataloader_worker_init - +# QUESTION: Move it to nanotron.data.dataloader_builder? class EmptyInfiniteDataset: """Hack as removing all columns from a datasets.Dataset makes the number of rows 0.""" diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index eb6a5b4a..6a2a2f56 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -29,6 +29,7 @@ ParallelismArgs, RandomInit, get_config_from_file, + NanosetDatasetsArgs ) from nanotron.dataloader import sanity_check_dataloader from nanotron.helpers import ( @@ -149,6 +150,19 @@ def __init__( if os.environ.get("NANOTRON_BENCHMARK", "0") == "1": log_throughput(self.config, self.parallel_context) + ######################################## + ## Compile Nanoset helpers + ######################################## + + # Only if Using Nanoset Datasets + if isinstance(self.config.data.dataset, NanosetDatasetsArgs): + if dist.get_rank() == 0: + log_rank("Compiling dataset index builder ...", logger=logger, level=logging.INFO, rank=0) + from nanotron.data.utils import compile_helpers + + compile_helpers() + log_rank("Done with dataset index builder.", logger=logger, level=logging.INFO, rank=0) + ######################################## ## Setting up our model, optimizers, schedulers, etc. 
######################################## @@ -476,7 +490,8 @@ def train_step_logs( { **{log_item.tag: log_item.scalar_value for log_item in log_entries}, "iteration_step": self.iteration_step, - } + }, + step=self.iteration_step-1 ) self.loggerwriter.add_scalars_from_list(log_entries, self.iteration_step) From 27a1c99aea48d88ace045a298c7fc816a23dbb82 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 10:07:08 +0100 Subject: [PATCH 11/73] added megatron references to nanoset --- src/nanotron/data/nanoset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 205c4815..2c1c8203 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -18,6 +18,8 @@ logger = logging.getLogger(__name__) class Nanoset(torch.data.utils.Dataset): + """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py""" + """The base Nanoset dataset Args: From 576de9a83e170c1052800744af32d050822511ee Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 09:27:20 +0000 Subject: [PATCH 12/73] fixed wrong check --- run_train_nanoset.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/run_train_nanoset.py b/run_train_nanoset.py index df69a92c..63a9a7fc 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -28,18 +28,11 @@ def get_dataloaders(trainer: DistributedTrainer): input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) # Create Nanoset config - # TODO Test this calculus multinode split_num_samples = compute_datasets_num_samples(train_iters=trainer.config.tokens.train_steps, eval_interval=trainer.config.tokens.val_check_interval, eval_iters=trainer.config.tokens.val_steps, global_batch_size=trainer.global_batch_size) - default_split_num_samples = trainer.consumed_train_samples - - assert split_num_samples == default_split_num_samples - - # break - nanoset_config = NanosetConfig( random_seed=trainer.config.data.seed, sequence_length=trainer.sequence_length, From e6bebc3e76e1ef4c885b2484a457f7403f95cb55 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 09:36:58 +0000 Subject: [PATCH 13/73] fixed getLogger --- src/nanotron/data/dataset_builder.py | 2 +- src/nanotron/data/indexed_dataset.py | 2 +- src/nanotron/data/nanoset.py | 2 +- src/nanotron/data/utils.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index c317509b..b795f287 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -9,7 +9,7 @@ from nanotron.data.indexed_dataset import MMapIndexedDataset from nanotron.data.utils import Split, normalize -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) DistributedDataset = Union[Nanoset, MMapIndexedDataset] diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py index ce56f544..cf4441b3 100644 --- a/src/nanotron/data/indexed_dataset.py +++ b/src/nanotron/data/indexed_dataset.py @@ -21,7 +21,7 @@ from nanotron.logging import log_rank -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) _INDEX_HEADER = b"MMIDIDX\x00\x00" diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 2c1c8203..55386391 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -15,7 +15,7 @@ from nanotron.data.indexed_dataset import MMapIndexedDataset from 
nanotron.data.utils import Split -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) class Nanoset(torch.data.utils.Dataset): """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py""" diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index 1454350d..ea3befd7 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -6,7 +6,7 @@ import numpy -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) class Split(Enum): From d59d6f82fb98553221e1afd2e6ca21a4b71dca3a Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 09:41:03 +0000 Subject: [PATCH 14/73] fixed logging + torch.data.utils --- src/nanotron/data/indexed_dataset.py | 2 +- src/nanotron/data/nanoset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py index cf4441b3..938447f2 100644 --- a/src/nanotron/data/indexed_dataset.py +++ b/src/nanotron/data/indexed_dataset.py @@ -5,7 +5,6 @@ # Essentially re-written in entirety -import logging import os import shutil import struct @@ -19,6 +18,7 @@ import numpy import torch +from nanotron import logging from nanotron.logging import log_rank logger = logging.get_logger(__name__) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 55386391..59fe82a5 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -17,7 +17,7 @@ logger = logging.get_logger(__name__) -class Nanoset(torch.data.utils.Dataset): +class Nanoset(torch.utils.data.Dataset): """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py""" """The base Nanoset dataset From 7ba817cacbcb488486ba204242c9a1cf935cf47f Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 09:48:16 +0000 Subject: [PATCH 15/73] fixed bugs --- run_train_nanoset.py | 2 +- src/nanotron/data/dataset_builder.py | 2 +- src/nanotron/data/utils.py | 1 + src/nanotron/trainer.py | 16 ++++++++-------- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/run_train_nanoset.py b/run_train_nanoset.py index 63a9a7fc..298de82c 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -42,7 +42,7 @@ def get_dataloaders(trainer: DistributedTrainer): ) # Build Nanoset datasets - train_dataset, valid_dataset, test_dataset = NanosetBuilder(nanoset_config).build(nanoset_config) + train_dataset, valid_dataset, test_dataset = NanosetBuilder(nanoset_config).build() # Prepare train, valid and test dataloaders train_dataloader = build_nanoset_dataloader( diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index b795f287..2a7fc7ff 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -25,7 +25,7 @@ def __init__( self, config: NanosetConfig, ): self.config = config - self.sizes = config.split_sizes + self.sizes = config.split_num_samples self.cls = Nanoset # NOTE: keep it like that to support BlendedNanoset in the future def build(self) -> List[Nanoset]: diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index ea3befd7..9357cd51 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -3,6 +3,7 @@ from enum import Enum from typing import List +import re import numpy diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 6a2a2f56..3ee91f68 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py 
@@ -154,14 +154,14 @@ def __init__( ## Compile Nanoset helpers ######################################## - # Only if Using Nanoset Datasets - if isinstance(self.config.data.dataset, NanosetDatasetsArgs): - if dist.get_rank() == 0: - log_rank("Compiling dataset index builder ...", logger=logger, level=logging.INFO, rank=0) - from nanotron.data.utils import compile_helpers - - compile_helpers() - log_rank("Done with dataset index builder.", logger=logger, level=logging.INFO, rank=0) + # Only if Using Nanoset Datasets + if isinstance(self.config.data.dataset, NanosetDatasetsArgs): + if dist.get_rank() == 0: + log_rank("Compiling dataset index builder ...", logger=logger, level=logging.INFO, rank=0) + from nanotron.data.utils import compile_helpers + + compile_helpers() + log_rank("Done with dataset index builder.", logger=logger, level=logging.INFO, rank=0) ######################################## ## Setting up our model, optimizers, schedulers, etc. From dab6b6cd20ba76792382cf4dfe08aae7afb17114 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 10:21:55 +0000 Subject: [PATCH 16/73] Removed DistributedSamplerWithLoop args --- src/nanotron/data/dataloader_builder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index 0236ce76..1d63d640 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -27,7 +27,6 @@ def build_nanoset_dataloader( consumed_train_samples: int = 0, dataloader_drop_last: bool = True, dataloader_pin_memory: bool = True, - use_loop_to_round_batch_size: bool = False, ) -> DataLoader: # Case of ranks not requiring data. We give them a dummy dataset, then the collator will do his job @@ -56,8 +55,6 @@ def build_nanoset_dataloader( dl_ranks_size=dp_ranks_size, dl_rank=dp_rank, seed=seed_worker, - use_loop_to_round_batch_size=use_loop_to_round_batch_size, - micro_batch_size=micro_batch_size, drop_last=dataloader_drop_last, consumed_train_samples=consumed_train_samples, ) From c73dbe74c10167959bb5677e8c8d14fb5c6250de Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 12:00:30 +0100 Subject: [PATCH 17/73] fixed nanoset config generator --- examples/nanoset_llama_7b.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nanoset_llama_7b.py b/examples/nanoset_llama_7b.py index bde3c6d2..f1d5508f 100644 --- a/examples/nanoset_llama_7b.py +++ b/examples/nanoset_llama_7b.py @@ -65,7 +65,7 @@ tokens = TokensArgs(sequence_length=8192, train_steps=5, micro_batch_size=1, batch_accumulation_per_replica=8) -dataset = NanosetDatasetsArgs(data_path="XXXX", split='949,50,1') +dataset = NanosetDatasetsArgs(data_path="PATH_TO_DATASET", split='949,50,1') checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints" os.makedirs(checkpoints_path, exist_ok=True) @@ -87,7 +87,7 @@ dir = os.path.dirname(__file__) # Save config as YAML file - config.save_as_yaml(f"{dir}/config_llama.yaml") + config.save_as_yaml(f"{dir}/config_nanoset_llama_7b.yaml") # Launch training os.system("export CUDA_DEVICE_MAX_CONNECTIONS=1") From 04d797c4181fac2e0544a82444b851b893c82bc3 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 12:15:40 +0100 Subject: [PATCH 18/73] Added preprocessing scripts --- tools/merge_datasets.py | 101 +++++++++ tools/preprocess_data.py | 453 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 554 insertions(+) create mode 100644 tools/merge_datasets.py create mode 
100644 tools/preprocess_data.py diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py new file mode 100644 index 00000000..e08a6144 --- /dev/null +++ b/tools/merge_datasets.py @@ -0,0 +1,101 @@ +import os +import sys +import json +import argparse + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) + +from megatron.core.datasets.indexed_dataset import ( + MMapIndexedDataset, + MMapIndexedDatasetBuilder, + get_bin_path, + get_idx_path, +) + +from nanotron.data.indexed_dataset import ( + MMapIndexedDataset, + MMapIndexedDatasetBuilder, + get_bin_path, + get_idx_path, +) + + + +def get_args(): + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + + group = parser.add_argument_group(title="miscellaneous") + group.add_argument( + "--multimodal", + action="store_true", + help="Whether the datasets are assumed to be multimodal" + ) + + args = parser.parse_args() + + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" + + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" + + return args + + +def main(): + args = get_args() + + prefixes = set() + for basename in os.listdir(args.input): + prefix, ext = os.path.splitext(basename) + + if prefix in prefixes: + continue + + if not os.path.isfile(os.path.join(args.input, basename)): + continue + + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" + + prefixes.add(prefix) + + builder = None + for prefix in sorted(prefixes): + if builder is None: + dataset = MMapIndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) + builder = MMapIndexedDatasetBuilder( + get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal + ) + del dataset + + builder.add_index(os.path.join(args.input, prefix)) + + builder.finalize(get_idx_path(args.output_prefix)) + + +if __name__ == '__main__': + + main() \ No newline at end of file diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py new file mode 100644 index 00000000..f4cb0d6f --- /dev/null +++ b/tools/preprocess_data.py @@ -0,0 +1,453 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Processing large data for pretraining.""" +import argparse +import math +import json +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +import time +import gzip +import glob +import torch +import numpy as np +import multiprocessing +try: + import nltk + nltk_available = True +except ImportError: + nltk_available = False + +from nanotron.data import indexed_dataset + +# Question: Leave only Llama2Tokenizer? +def build_tokenizer(args): + """Initialize tokenizer.""" + if args.rank == 0: + print('> building {} tokenizer ...'.format(args.tokenizer_type), + flush=True) + + # Select and instantiate the tokenizer. 
+ if args.tokenizer_type == 'BertWordPieceLowerCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=True, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'BertWordPieceCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=False, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPT2BPETokenizer': + assert args.vocab_file is not None + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPTSentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'NullTokenizer': + assert args.vocab_size is not None + tokenizer = _NullTokenizer(args.vocab_size) + else: + raise NotImplementedError('{} tokenizer is not ' + 'implemented.'.format(args.tokenizer_type)) + + # Add vocab size (if not already set from a checkpoint). + if getattr(args, "padded_vocab_size", None) is None: + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, + args) + + return tokenizer + + + +# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer +class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): + + _period_context_fmt = r""" + \S* # some word material + %(SentEndChars)s # a potential sentence ending + \s* # <-- THIS is what I changed + (?=(?P + %(NonWord)s # either other punctuation + | + (?P\S+) # <-- Normally you would have \s+ here + ))""" + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + if self.args.split_sentences: + if not nltk_available: + print("NLTK is not available to split sentences.") + exit() + if os.environ.get("NLTK_DATA"): + library = os.path.join(os.environ.get("NLTK_DATA"), "tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"file:{library}" + else: + library = os.path.join("tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"nltk:{library}" + splitter = nltk.load(url) + if self.args.keep_newlines: + # this prevents punkt from eating newlines after sentences + Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( + train_text = splitter._params, + lang_vars = CustomLanguageVars()) + else: + Encoder.splitter = splitter + + else: + Encoder.splitter = IdentitySplitter() + + def split(self, json_line): + data = json.loads(json_line) + output = {} + for key in self.args.json_keys: + text = data[key] + max_len = 1000000 + tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)] + output[key] = [tokens for partial in tokens_list for tokens in partial] + return json.dumps(output), len(json_line) + + def encode(self, json_line): + data = json.loads(json_line) + ids = {} + lens = {} + for key in self.args.json_keys: + text = data[key] + if isinstance(text, list): 
+ sentences = text + else: + sentences = [text] + doc_ids = [] + sentence_lens = [] + for sentence in sentences: + sentence_ids = Encoder.tokenizer.tokenize(sentence) + if len(sentence_ids) > 0: + doc_ids.extend(sentence_ids) + sentence_lens.append(len(sentence_ids)) + if len(doc_ids) > 0 and self.args.append_eod: + doc_ids.append(Encoder.tokenizer.eod) + sentence_lens[-1] += 1 + ids[key] = doc_ids + lens[key] = sentence_lens + return ids, lens, len(json_line) + + +class Partition(object): + def __init__(self, args, workers): + self.args = args + self.workers = workers + + def print_processing_stats(self, count, proc_start, total_bytes_processed): + if count % self.args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed/elapsed/1024/1024 + print(f"Processed {count} documents", + f"({count/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr) + + def split_sentences(self, file_name): + input_file_name, output_file_name = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + fout = open(output_file_name, 'w') + + encoder = Encoder(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + split_docs = pool.imap(encoder.split, fin, 32) + + proc_start = time.time() + total_bytes_processed = 0 + for i, (doc, bytes_processed) in enumerate(split_docs, start=1): + total_bytes_processed += bytes_processed + fout.write(doc + "\n") + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + fout.close() + + + def process_json_file(self, file_name): + input_file_name, output_prefix = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + + startup_start = time.time() + encoder = Encoder(self.args) + tokenizer = build_tokenizer(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, 32) + + level = "document" + if self.args.split_sentences: + level = "sentence" + + output_bin_files = {} + output_idx_files = {} + builders = {} + + for key in self.args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, + key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, + key, level) + builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + print("Time to startup:", startup_end - startup_start) + for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + for key in doc.keys(): + builders[key].add_document(doc[key], sentence_lens[key]) + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + builders[key].finalize(output_idx_files[key]) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title='input data') + group.add_argument('--input', type=str, required=True, + help='Path to input JSON') + group.add_argument('--json-keys', nargs='+', default=['text'], + help='space separate listed of keys to extract from json') + group.add_argument('--split-sentences', action='store_true', + help='Split documents into sentences.') + group.add_argument('--keep-newlines', action='store_true', + help='Keep newlines between sentences when splitting.') + + group = 
parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, required=True, + choices=['BertWordPieceLowerCase','BertWordPieceCase', + 'GPT2BPETokenizer', 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', + 'NullTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='YTTM tokenizer model.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--vocab-size', default=786, + help='size of vocab for use with NullTokenizer') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + group.add_argument('--append-eod', action='store_true', + help='Append an token to the end of a document.') + group.add_argument('--lang', type=str, default='english', + help='Language to use for NLTK-powered sentence splitting.') + group = parser.add_argument_group(title='output data') + group.add_argument('--output-prefix', type=str, required=True, + help='Path to binary output file without suffix') + + group = parser.add_argument_group(title='runtime') + group.add_argument('--workers', type=int, required=True, + help=('Number of worker processes to launch.' + 'A good default for fast pre-processing ' + 'is: (workers * partitions) = available CPU cores.')) + group.add_argument('--partitions', type=int, default=1, + help='Number of file partitions') + group.add_argument('--log-interval', type=int, default=1000, + help='Interval between progress updates') + group.add_argument('--keep-sequential-samples', action='store_true', + help='Ensure ordering of samples in .jsonl files is ' + 'preserved when using partitions>1.') + args = parser.parse_args() + args.keep_empty = False + + if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: + print("Are you sure you don't want to split sentences?") + + # some default/dummy values for the tokenizer + args.rank = 1 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + + return args + + +def get_file_name(args, file_id): + file_name, extension = os.path.splitext(args.input) + input_file_name = file_name + "_" + str(file_id) + extension + sentence_split_file = file_name + "_ss_" + str(file_id) + extension + output_prefix = args.output_prefix + "_" + str(file_id) + file_names = { + 'partition': input_file_name, + 'sentence_split': sentence_split_file, + 'output_prefix': output_prefix} + return file_names + + +def check_files_exist(in_ss_out_names, key, num_partitions): + for i in range(num_partitions): + if not os.path.exists(in_ss_out_names[i][key]): + return False + return True + + +def main(): + args = get_args() + + if args.split_sentences: + if nltk_available: + nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA")) + else: + raise Exception( + "nltk library required for sentence splitting is not available.") + + in_ss_out_names = [] + if args.partitions == 1: + file_name, extension = os.path.splitext(args.input) + sentence_split_file = file_name + "_ss" + extension + file_names = { + 'partition': args.input, + 'sentence_split': sentence_split_file, + 'output_prefix': args.output_prefix} + in_ss_out_names.append(file_names) + else: + in_file_names = glob.glob(args.input) + + # Count total number of lines across .jsonl files + if args.keep_sequential_samples: + total_sample_count = 0 + for filename in in_file_names: + with 
open(filename, "r") as fin: + for fc, _ in enumerate(fin): + pass + total_sample_count += (fc + 1) + partition_size = math.ceil(total_sample_count / args.partitions) + + # create .jsonl parition files + for idx in range(args.partitions): + in_ss_out_name = get_file_name(args, idx) + in_ss_out_names.append(in_ss_out_name) + + # check to see if paritions were already created + partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + if not partitions_present and not split_sentences_present: + # populate .jsonl partition files from parent files + partitioned_input_files = [] + for idx in range(args.partitions): + partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w') + partitioned_input_files.append(partitioned_input_file) + + index = 0 + if args.keep_sequential_samples: line_count = 0 + for in_file_name in in_file_names: + # support for gzip files + if in_file_name.endswith(".gz"): + fin = gzip.open(in_file_name, 'rt') + else: + fin = open(in_file_name, 'r', encoding='utf-8') + + for line in fin: + partitioned_input_files[index].write(line) + if args.keep_sequential_samples: + line_count += 1 + if line_count % partition_size == 0: + index += 1 + else: + index = (index + 1)%args.partitions + + fin.close() + + for idx in range(args.partitions): + partitioned_input_files[idx].close() + + assert args.workers % args.partitions == 0 + partition = Partition(args, args.workers//args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + # split sentences in partition files + if args.split_sentences and not split_sentences_present: + processes = [] + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.split_sentences, + args=((name['partition'], name['sentence_split']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + + # encode partition files in parallel + processes = [] + input_key = 'sentence_split' if args.split_sentences else 'partition' + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.process_json_file, + args=((name[input_key], name['output_prefix']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + # merge bin/idx partitions + level = "document" + if args.split_sentences: + level = "sentence" + + output_bin_files = {} + output_idx_files = {} + builders = {} + tokenizer = build_tokenizer(args) + + for key in args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, + key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, + key, level) + builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) + + for name in in_ss_out_names: + parition_output_prefix = name['output_prefix'] + full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, + key, level) + builders[key].add_index(full_partition_output_prefix) + builders[key].finalize(output_idx_files[key]) + + +if __name__ == '__main__': + + main() From 31ff34ca06f6261f43bdae94ef47f421c37fa4ae Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 12:33:58 +0100 
Subject: [PATCH 19/73] added nanoset docs --- docs/nanoset.md | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/docs/nanoset.md b/docs/nanoset.md index 8e46b50a..7f3f67c1 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -2,13 +2,36 @@ ## Data pre-processing +The training data requires preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example: + +
+{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
+{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
+
+
+The name of the `text` field of the json can be changed by using the `--json-key` flag in preprocess_data.py The other metadata are optional and are not used in training.
+
+The loose json is then processed into a binary format for training. To convert the json into mmap format use [`preprocess_data.py`](/tools/preprocess_data.py). An example script to prepare data for Llama2 training is:
+
+
+python tools/preprocess_data.py \
+       --input my_corpus.json \
+       --output-prefix my-llama2-dataset \
+       --tokenizer-type Llama2Tokenizer \
+       --tokenizer-model /models/Llama-2-7b-chat-hf/tokenizer.model \
+       --append-eod \
+       --workers 6
+
+ +The output will be two files named, in this case, `my-llama2-dataset_text_document.bin` and `my-llama2-dataset_text_document.idx`. The `--data-path` specified later in training is the full path and new filename, but without the file extension. + +---- + Data preprocessing is built around the following classes: 1. `MMapIndexedDatasetBuilder` 2. `MMapIndexedDataset` -At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details. - #### MMapIndexedDatasetBuilder The `MMapIndexedDatasetBuilder` is capable of building and merging `MMapIndexedDataset` instances. From da59185fd94458a4db09bf2b8fcec53d7191548c Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 12:57:28 +0100 Subject: [PATCH 20/73] deleted preprocessing files --- tools/merge_datasets.py | 101 --------- tools/preprocess_data.py | 453 --------------------------------------- 2 files changed, 554 deletions(-) delete mode 100644 tools/merge_datasets.py delete mode 100644 tools/preprocess_data.py diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py deleted file mode 100644 index e08a6144..00000000 --- a/tools/merge_datasets.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import sys -import json -import argparse - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) - -from megatron.core.datasets.indexed_dataset import ( - MMapIndexedDataset, - MMapIndexedDatasetBuilder, - get_bin_path, - get_idx_path, -) - -from nanotron.data.indexed_dataset import ( - MMapIndexedDataset, - MMapIndexedDatasetBuilder, - get_bin_path, - get_idx_path, -) - - - -def get_args(): - parser = argparse.ArgumentParser() - - group = parser.add_argument_group(title="input data") - group.add_argument( - "--input", - type=str, - required=True, - help="Path to directory containing all document files to merge", - ) - - group = parser.add_argument_group(title="output data") - group.add_argument( - "--output-prefix", - type=str, - required=True, - help="Path to binary output file without suffix", - ) - - group = parser.add_argument_group(title="miscellaneous") - group.add_argument( - "--multimodal", - action="store_true", - help="Whether the datasets are assumed to be multimodal" - ) - - args = parser.parse_args() - - assert os.path.isdir( - args.input - ), f"ERROR: {args.input} is not a directory or does not exist" - - assert os.path.isdir( - os.path.dirname(args.output_prefix) - ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" - - return args - - -def main(): - args = get_args() - - prefixes = set() - for basename in os.listdir(args.input): - prefix, ext = os.path.splitext(basename) - - if prefix in prefixes: - continue - - if not os.path.isfile(os.path.join(args.input, basename)): - continue - - ext_pair = ".bin" if ext == ".idx" else ".idx" - assert os.path.isfile( - os.path.join(args.input, prefix) + ext_pair - ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" - - prefixes.add(prefix) - - builder = None - for prefix in sorted(prefixes): - if builder is None: - dataset = MMapIndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) - builder = MMapIndexedDatasetBuilder( - get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal - ) - del dataset - - builder.add_index(os.path.join(args.input, prefix)) - - builder.finalize(get_idx_path(args.output_prefix)) - - -if __name__ == '__main__': - - main() \ No newline at 
end of file diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py deleted file mode 100644 index f4cb0d6f..00000000 --- a/tools/preprocess_data.py +++ /dev/null @@ -1,453 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Processing large data for pretraining.""" -import argparse -import math -import json -import os -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir))) -import time -import gzip -import glob -import torch -import numpy as np -import multiprocessing -try: - import nltk - nltk_available = True -except ImportError: - nltk_available = False - -from nanotron.data import indexed_dataset - -# Question: Leave only Llama2Tokenizer? -def build_tokenizer(args): - """Initialize tokenizer.""" - if args.rank == 0: - print('> building {} tokenizer ...'.format(args.tokenizer_type), - flush=True) - - # Select and instantiate the tokenizer. - if args.tokenizer_type == 'BertWordPieceLowerCase': - assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'BertWordPieceCase': - assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'GPT2BPETokenizer': - assert args.vocab_file is not None - assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - elif args.tokenizer_type == 'SentencePieceTokenizer': - assert args.tokenizer_model is not None - tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'GPTSentencePieceTokenizer': - assert args.tokenizer_model is not None - tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) - elif args.tokenizer_type == 'Llama2Tokenizer': - assert args.tokenizer_model is not None - tokenizer = _Llama2Tokenizer(args.tokenizer_model) - elif args.tokenizer_type == 'NullTokenizer': - assert args.vocab_size is not None - tokenizer = _NullTokenizer(args.vocab_size) - else: - raise NotImplementedError('{} tokenizer is not ' - 'implemented.'.format(args.tokenizer_type)) - - # Add vocab size (if not already set from a checkpoint). 
- if getattr(args, "padded_vocab_size", None) is None: - args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, - args) - - return tokenizer - - - -# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer -class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): - - _period_context_fmt = r""" - \S* # some word material - %(SentEndChars)s # a potential sentence ending - \s* # <-- THIS is what I changed - (?=(?P - %(NonWord)s # either other punctuation - | - (?P\S+) # <-- Normally you would have \s+ here - ))""" - -class IdentitySplitter(object): - def tokenize(self, *text): - return text - - -class Encoder(object): - def __init__(self, args): - self.args = args - - def initializer(self): - # Use Encoder class as a container for global data - Encoder.tokenizer = build_tokenizer(self.args) - if self.args.split_sentences: - if not nltk_available: - print("NLTK is not available to split sentences.") - exit() - if os.environ.get("NLTK_DATA"): - library = os.path.join(os.environ.get("NLTK_DATA"), "tokenizers", "punkt", f"{self.args.lang}.pickle") - url = f"file:{library}" - else: - library = os.path.join("tokenizers", "punkt", f"{self.args.lang}.pickle") - url = f"nltk:{library}" - splitter = nltk.load(url) - if self.args.keep_newlines: - # this prevents punkt from eating newlines after sentences - Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( - train_text = splitter._params, - lang_vars = CustomLanguageVars()) - else: - Encoder.splitter = splitter - - else: - Encoder.splitter = IdentitySplitter() - - def split(self, json_line): - data = json.loads(json_line) - output = {} - for key in self.args.json_keys: - text = data[key] - max_len = 1000000 - tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)] - output[key] = [tokens for partial in tokens_list for tokens in partial] - return json.dumps(output), len(json_line) - - def encode(self, json_line): - data = json.loads(json_line) - ids = {} - lens = {} - for key in self.args.json_keys: - text = data[key] - if isinstance(text, list): - sentences = text - else: - sentences = [text] - doc_ids = [] - sentence_lens = [] - for sentence in sentences: - sentence_ids = Encoder.tokenizer.tokenize(sentence) - if len(sentence_ids) > 0: - doc_ids.extend(sentence_ids) - sentence_lens.append(len(sentence_ids)) - if len(doc_ids) > 0 and self.args.append_eod: - doc_ids.append(Encoder.tokenizer.eod) - sentence_lens[-1] += 1 - ids[key] = doc_ids - lens[key] = sentence_lens - return ids, lens, len(json_line) - - -class Partition(object): - def __init__(self, args, workers): - self.args = args - self.workers = workers - - def print_processing_stats(self, count, proc_start, total_bytes_processed): - if count % self.args.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed/elapsed/1024/1024 - print(f"Processed {count} documents", - f"({count/elapsed} docs/s, {mbs} MB/s).", - file=sys.stderr) - - def split_sentences(self, file_name): - input_file_name, output_file_name = file_name - print("Opening", input_file_name) - fin = open(input_file_name, 'r', encoding='utf-8') - fout = open(output_file_name, 'w') - - encoder = Encoder(self.args) - pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) - split_docs = pool.imap(encoder.split, fin, 32) - - proc_start = time.time() - total_bytes_processed = 0 - for i, (doc, bytes_processed) in enumerate(split_docs, start=1): - 
total_bytes_processed += bytes_processed - fout.write(doc + "\n") - self.print_processing_stats(i, proc_start, total_bytes_processed) - - fin.close() - fout.close() - - - def process_json_file(self, file_name): - input_file_name, output_prefix = file_name - print("Opening", input_file_name) - fin = open(input_file_name, 'r', encoding='utf-8') - - startup_start = time.time() - encoder = Encoder(self.args) - tokenizer = build_tokenizer(self.args) - pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, 32) - - level = "document" - if self.args.split_sentences: - level = "sentence" - - output_bin_files = {} - output_idx_files = {} - builders = {} - - for key in self.args.json_keys: - output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, - key, level) - output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, - key, level) - builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_files[key], - dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), - ) - - startup_end = time.time() - proc_start = time.time() - total_bytes_processed = 0 - print("Time to startup:", startup_end - startup_start) - for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - for key in doc.keys(): - builders[key].add_document(doc[key], sentence_lens[key]) - self.print_processing_stats(i, proc_start, total_bytes_processed) - - fin.close() - builders[key].finalize(output_idx_files[key]) - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title='input data') - group.add_argument('--input', type=str, required=True, - help='Path to input JSON') - group.add_argument('--json-keys', nargs='+', default=['text'], - help='space separate listed of keys to extract from json') - group.add_argument('--split-sentences', action='store_true', - help='Split documents into sentences.') - group.add_argument('--keep-newlines', action='store_true', - help='Keep newlines between sentences when splitting.') - - group = parser.add_argument_group(title='tokenizer') - group.add_argument('--tokenizer-type', type=str, required=True, - choices=['BertWordPieceLowerCase','BertWordPieceCase', - 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', - 'NullTokenizer'], - help='What type of tokenizer to use.') - group.add_argument('--tokenizer-model', type=str, default=None, - help='YTTM tokenizer model.') - group.add_argument('--vocab-file', type=str, default=None, - help='Path to the vocab file') - group.add_argument('--vocab-size', default=786, - help='size of vocab for use with NullTokenizer') - group.add_argument('--merge-file', type=str, default=None, - help='Path to the BPE merge file (if necessary).') - group.add_argument('--append-eod', action='store_true', - help='Append an token to the end of a document.') - group.add_argument('--lang', type=str, default='english', - help='Language to use for NLTK-powered sentence splitting.') - group = parser.add_argument_group(title='output data') - group.add_argument('--output-prefix', type=str, required=True, - help='Path to binary output file without suffix') - - group = parser.add_argument_group(title='runtime') - group.add_argument('--workers', type=int, required=True, - help=('Number of worker processes to launch.' 
- 'A good default for fast pre-processing ' - 'is: (workers * partitions) = available CPU cores.')) - group.add_argument('--partitions', type=int, default=1, - help='Number of file partitions') - group.add_argument('--log-interval', type=int, default=1000, - help='Interval between progress updates') - group.add_argument('--keep-sequential-samples', action='store_true', - help='Ensure ordering of samples in .jsonl files is ' - 'preserved when using partitions>1.') - args = parser.parse_args() - args.keep_empty = False - - if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: - print("Are you sure you don't want to split sentences?") - - # some default/dummy values for the tokenizer - args.rank = 1 - args.make_vocab_size_divisible_by = 128 - args.tensor_model_parallel_size = 1 - args.vocab_extra_ids = 0 - - return args - - -def get_file_name(args, file_id): - file_name, extension = os.path.splitext(args.input) - input_file_name = file_name + "_" + str(file_id) + extension - sentence_split_file = file_name + "_ss_" + str(file_id) + extension - output_prefix = args.output_prefix + "_" + str(file_id) - file_names = { - 'partition': input_file_name, - 'sentence_split': sentence_split_file, - 'output_prefix': output_prefix} - return file_names - - -def check_files_exist(in_ss_out_names, key, num_partitions): - for i in range(num_partitions): - if not os.path.exists(in_ss_out_names[i][key]): - return False - return True - - -def main(): - args = get_args() - - if args.split_sentences: - if nltk_available: - nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA")) - else: - raise Exception( - "nltk library required for sentence splitting is not available.") - - in_ss_out_names = [] - if args.partitions == 1: - file_name, extension = os.path.splitext(args.input) - sentence_split_file = file_name + "_ss" + extension - file_names = { - 'partition': args.input, - 'sentence_split': sentence_split_file, - 'output_prefix': args.output_prefix} - in_ss_out_names.append(file_names) - else: - in_file_names = glob.glob(args.input) - - # Count total number of lines across .jsonl files - if args.keep_sequential_samples: - total_sample_count = 0 - for filename in in_file_names: - with open(filename, "r") as fin: - for fc, _ in enumerate(fin): - pass - total_sample_count += (fc + 1) - partition_size = math.ceil(total_sample_count / args.partitions) - - # create .jsonl parition files - for idx in range(args.partitions): - in_ss_out_name = get_file_name(args, idx) - in_ss_out_names.append(in_ss_out_name) - - # check to see if paritions were already created - partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions) - - # check to see if paritions with split sentences already created - split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) - - if not partitions_present and not split_sentences_present: - # populate .jsonl partition files from parent files - partitioned_input_files = [] - for idx in range(args.partitions): - partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w') - partitioned_input_files.append(partitioned_input_file) - - index = 0 - if args.keep_sequential_samples: line_count = 0 - for in_file_name in in_file_names: - # support for gzip files - if in_file_name.endswith(".gz"): - fin = gzip.open(in_file_name, 'rt') - else: - fin = open(in_file_name, 'r', encoding='utf-8') - - for line in fin: - partitioned_input_files[index].write(line) - if args.keep_sequential_samples: - 
line_count += 1 - if line_count % partition_size == 0: - index += 1 - else: - index = (index + 1)%args.partitions - - fin.close() - - for idx in range(args.partitions): - partitioned_input_files[idx].close() - - assert args.workers % args.partitions == 0 - partition = Partition(args, args.workers//args.partitions) - - # check to see if paritions with split sentences already created - split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) - - # split sentences in partition files - if args.split_sentences and not split_sentences_present: - processes = [] - for name in in_ss_out_names: - p = multiprocessing.Process(target=partition.split_sentences, - args=((name['partition'], name['sentence_split']),)) - p.start() - processes.append(p) - - for p in processes: - p.join() - - if args.partitions == 1: - return - - - # encode partition files in parallel - processes = [] - input_key = 'sentence_split' if args.split_sentences else 'partition' - for name in in_ss_out_names: - p = multiprocessing.Process(target=partition.process_json_file, - args=((name[input_key], name['output_prefix']),)) - p.start() - processes.append(p) - - for p in processes: - p.join() - - if args.partitions == 1: - return - - # merge bin/idx partitions - level = "document" - if args.split_sentences: - level = "sentence" - - output_bin_files = {} - output_idx_files = {} - builders = {} - tokenizer = build_tokenizer(args) - - for key in args.json_keys: - output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, - key, level) - output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, - key, level) - builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_files[key], - dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), - ) - - for name in in_ss_out_names: - parition_output_prefix = name['output_prefix'] - full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, - key, level) - builders[key].add_index(full_partition_output_prefix) - builders[key].finalize(output_idx_files[key]) - - -if __name__ == '__main__': - - main() From e19dd44cc579e7e94ade44d0c2b4409c283ad2de Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 13 Mar 2024 13:09:42 +0100 Subject: [PATCH 21/73] Resolving pre-commit-hook changes --- LICENSE | 2 +- docs/nanoset.md | 11 +-- examples/nanoset_llama_7b.py | 4 +- run_train_nanoset.py | 29 +++--- src/nanotron/config/config.py | 2 + src/nanotron/data/dataloader_builder.py | 74 ++++++++-------- src/nanotron/data/dataset_builder.py | 71 +++++++-------- src/nanotron/data/helpers.cpp | 10 +-- src/nanotron/data/indexed_dataset.py | 45 ++++------ src/nanotron/data/nanoset.py | 112 ++++++++++++------------ src/nanotron/data/nanoset_configs.py | 8 +- src/nanotron/data/utils.py | 28 +++--- src/nanotron/dataloader.py | 8 +- src/nanotron/trainer.py | 8 +- src/nanotron/utils.py | 6 +- 15 files changed, 204 insertions(+), 214 deletions(-) diff --git a/LICENSE b/LICENSE index 199c83ea..208d5bf8 100644 --- a/LICENSE +++ b/LICENSE @@ -201,7 +201,7 @@ limitations under the License. This repository also contains code from Facebook (from their Fairseq project) -and NVIDIA (from their Megatron project). Files from these organizations +and NVIDIA (from their Megatron project). Files from these organizations have notices at the top of each file. Below are licenses used in those files, as indicated. 
------------- LICENSE FOR Facebook Fairseq code -------------- diff --git a/docs/nanoset.md b/docs/nanoset.md index 7f3f67c1..78de4d87 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -2,6 +2,8 @@ ## Data pre-processing +For data pre-processing, refer to [Megatron-LM project](https://github.com/NVIDIA/Megatron-LM/tree/main?tab=readme-ov-file#data-preprocessing), as Nanoset requires the same kind of preprocessing. The main file used is [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py) + The training data requires preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example:
@@ -9,9 +11,9 @@ The training data requires preprocessing. First, place your training data in a l
 {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
 
-The name of the `text` field of the json can be changed by using the `--json-key` flag in preprocess_data.py The other metadata are optional and are not used in training.
+The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py). The other metadata are optional and are not used in training.
-The loose json is then processed into a binary format for training. To convert the json into mmap format use [`preprocess_data.py`](/tools/preprocess_data.py). An example script to prepare data for Llama2 training is:
+The loose json is then processed into a binary format for training. To convert the json into mmap format use [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py). An example script to prepare data for Llama2 training is:
 python tools/preprocess_data.py \
@@ -66,7 +68,7 @@ See the class docstrings for more details.
 
 #### NanosetConfig
 
-The `NanosetConfig` class parameterizes the `NanosetBuilder` and in turn the `Nanoset`.
+The `NanosetConfig` class parametrizes the `NanosetBuilder` and in turn the `Nanoset`.
 
 Different training/inference regimes will require different extensions e.g. the `NanosetConfig`
 
@@ -111,7 +113,7 @@ The `Nanoset` creates three index mappings to facilitate lookup: (1) the documen
     Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9]
     ```
 
-2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. 
+2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample.
 
     ```
     Given:
@@ -164,4 +166,3 @@ To query the `Nanoset` for the _k_-th sample we do the following
     ```
 
 To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `Nanoset.__init__` function.
-
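For orientation, since the hunk above only touches whitespace in the index-mapping section: the three-level lookup the doc describes (shuffle index to sample index to document index) can be sketched as below. This is an illustrative sketch only, not code from this patch; the array names follow the doc, while `documents` (the token storage) and the exact slicing bounds are assumptions.

```
import numpy as np

def get_kth_sample(k, Sh_idx, Sa_idx, Do_idx, documents):
    """Illustrative lookup of the k-th sample's tokens (not the real implementation)."""
    j = Sh_idx[k]                          # shuffle index -> row of the sample index
    doc_beg, offset_beg = Sa_idx[j]        # left bound: (position in Do_idx, token offset)
    doc_end, offset_end = Sa_idx[j + 1]    # right bound of the same sample
    parts = []
    for pos in range(doc_beg, doc_end + 1):
        doc = documents[Do_idx[pos]]       # document index -> underlying document
        start = offset_beg if pos == doc_beg else 0
        stop = offset_end + 1 if pos == doc_end else len(doc)
        parts.append(doc[start:stop])
    return np.concatenate(parts)
```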
diff --git a/examples/nanoset_llama_7b.py b/examples/nanoset_llama_7b.py
index f1d5508f..a7e229b7 100644
--- a/examples/nanoset_llama_7b.py
+++ b/examples/nanoset_llama_7b.py
@@ -13,9 +13,9 @@
     LoggingArgs,
     LRSchedulerArgs,
     ModelArgs,
+    NanosetDatasetsArgs,
     OptimizerArgs,
     ParallelismArgs,
-    NanosetDatasetsArgs,
     RandomInit,
     TokenizerArgs,
     TokensArgs,
@@ -65,7 +65,7 @@
 
 tokens = TokensArgs(sequence_length=8192, train_steps=5, micro_batch_size=1, batch_accumulation_per_replica=8)
 
-dataset = NanosetDatasetsArgs(data_path="PATH_TO_DATASET", split='949,50,1')
+dataset = NanosetDatasetsArgs(data_path="PATH_TO_DATASET", split="949,50,1")
 
 checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
 os.makedirs(checkpoints_path, exist_ok=True)
diff --git a/run_train_nanoset.py b/run_train_nanoset.py
index 298de82c..13bb1b05 100644
--- a/run_train_nanoset.py
+++ b/run_train_nanoset.py
@@ -10,17 +10,16 @@
 import argparse
 
 from nanotron import logging
-
-from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
-from nanotron.trainer import DistributedTrainer
-
-from nanotron.data.nanoset import NanosetConfig
-from nanotron.data.dataset_builder import NanosetBuilder
 from nanotron.data.dataloader_builder import build_nanoset_dataloader
+from nanotron.data.dataset_builder import NanosetBuilder
+from nanotron.data.nanoset import NanosetConfig
 from nanotron.data.utils import compute_datasets_num_samples
+from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
+from nanotron.trainer import DistributedTrainer
 
 logger = logging.get_logger(__name__)
 
+
 def get_dataloaders(trainer: DistributedTrainer):
     """Returns train, valid and test dataloaders"""
 
@@ -28,17 +27,19 @@ def get_dataloaders(trainer: DistributedTrainer):
     input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model)
 
     # Create Nanoset config
-    split_num_samples = compute_datasets_num_samples(train_iters=trainer.config.tokens.train_steps,
-                                               eval_interval=trainer.config.tokens.val_check_interval,
-                                               eval_iters=trainer.config.tokens.val_steps,
-                                               global_batch_size=trainer.global_batch_size)
-    
+    split_num_samples = compute_datasets_num_samples(
+        train_iters=trainer.config.tokens.train_steps,
+        eval_interval=trainer.config.tokens.val_check_interval,
+        eval_iters=trainer.config.tokens.val_steps,
+        global_batch_size=trainer.global_batch_size,
+    )
+
     nanoset_config = NanosetConfig(
         random_seed=trainer.config.data.seed,
         sequence_length=trainer.sequence_length,
         data_path=trainer.config.data.dataset.data_path,
         split=trainer.config.data.dataset.split,
-        split_num_samples=split_num_samples
+        split_num_samples=split_num_samples,
     )
 
     # Build Nanoset datasets
@@ -85,8 +86,6 @@ def get_dataloaders(trainer: DistributedTrainer):
     return train_dataloader, valid_dataloader, test_dataloader
 
 
-
-
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
@@ -99,7 +98,7 @@ def get_args():
 
     # Load trainer and data
     trainer = DistributedTrainer(config_file)
-    
+
     train_dataloader, valid_dataloader, test_dataloader = get_dataloaders(trainer)
 
     # Train
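`compute_datasets_num_samples` itself is not shown in this patch series, so its behaviour can only be guessed from the call site above: it presumably turns the training schedule into per-split sample budgets. The body below is a hypothetical sketch of that intent, not the actual implementation.

```
def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_batch_size):
    """Hypothetical sketch: derive [train, valid, test] sample counts from the schedule."""
    # Assumed logic: each training step consumes one global batch, and validation
    # runs eval_iters batches every eval_interval training steps.
    train_samples = train_iters * global_batch_size
    eval_runs = train_iters // eval_interval if eval_interval else 0
    valid_samples = eval_runs * eval_iters * global_batch_size
    test_samples = eval_iters * global_batch_size
    return [train_samples, valid_samples, test_samples]
```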
diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 23bb7201..991f9c9d 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -94,6 +94,7 @@ def __post_init__(self):
         if self.hf_dataset_splits is None:
             self.hf_dataset_splits = "train"
 
+
 @dataclass
 class NanosetDatasetsArgs:
     data_path: str
@@ -103,6 +104,7 @@ def __post_init__(self):
         if self.split is None:
             self.split = "949,50,1"
 
+
 @dataclass
 class DataArgs:
     """Arguments related to the data and data files processing"""
diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py
index 1d63d640..221f5448 100644
--- a/src/nanotron/data/dataloader_builder.py
+++ b/src/nanotron/data/dataloader_builder.py
@@ -1,44 +1,46 @@
-from nanotron import logging
-from nanotron.logging import log_rank
+# QUESTION: Move them here?
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
 
-import nanotron.distributed as dist
+import numpy as np
+import torch
 from torch.utils.data import DataLoader, Sampler
 from torch.utils.data.distributed import DistributedSampler
-from nanotron.dataloader import SkipBatchSampler, EmptyInfiniteDataset, get_dataloader_worker_init # QUESTION: Move them here?
 
-from dataclasses import dataclass
-import torch
-import numpy as np
-from typing import Dict, List, Union, Optional
-from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
+import nanotron.distributed as dist
+from nanotron import logging
+from nanotron.dataloader import (
+    EmptyInfiniteDataset,
+    SkipBatchSampler,
+    get_dataloader_worker_init,
+)
 from nanotron.parallel import ParallelContext
+from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
 
 logger = logging.get_logger(__name__)
 
+
 def build_nanoset_dataloader(
-        dataset,
-        sequence_length: int,
-        parallel_context: ParallelContext,
-        input_pp_rank: int,
-        output_pp_rank: int,
-        micro_batch_size: int,
-        dataloader_num_workers: int,
-        seed_worker: int,
-        consumed_train_samples: int = 0,
-        dataloader_drop_last: bool = True,
-        dataloader_pin_memory: bool = True,
+    dataset,
+    sequence_length: int,
+    parallel_context: ParallelContext,
+    input_pp_rank: int,
+    output_pp_rank: int,
+    micro_batch_size: int,
+    dataloader_num_workers: int,
+    seed_worker: int,
+    consumed_train_samples: int = 0,
+    dataloader_drop_last: bool = True,
+    dataloader_pin_memory: bool = True,
 ) -> DataLoader:
 
     # Case of ranks not requiring data. We give them a dummy dataset, then the collator will do his job
-    if not dist.get_rank(parallel_context.pp_pg) in [
-        input_pp_rank,
-        output_pp_rank,
-    ]:
+    if dist.get_rank(parallel_context.pp_pg) not in [input_pp_rank, output_pp_rank]:
         dataset_length = len(dataset)
         dataset = EmptyInfiniteDataset(length=dataset_length)
         # No need to spawn a lot of workers, we can just use main
         dataloader_num_workers = 0
-    
+
     data_collator = NanosetDataCollatorForCLM(
         sequence_length=sequence_length,
         input_pp_rank=input_pp_rank,
@@ -49,7 +51,7 @@ def build_nanoset_dataloader(
     # Compute size and rank of dataloader workers
     dp_ranks_size = parallel_context.dp_pg.size()
     dp_rank = parallel_context.dp_pg.rank()
-    
+
     sampler = get_sampler(
         dataset=dataset,
         dl_ranks_size=dp_ranks_size,
@@ -72,12 +74,13 @@ def build_nanoset_dataloader(
         # pin_memory_device="cuda",
     )
 
+
 @dataclass
 class NanosetDataCollatorForCLM:
     """
     Data collator used for causal language modeling with Nanoset.
 
-    - sequence_length: Sequence legth of the model
+    - sequence_length: Sequence length of the model
     - input_pp_rank: Discards last input id token
     - output_pp_rank: Discards first label id token
     - other pp ranks: Don't have data. Instead, we use `TensorPointer` to point to the rank having the data.
@@ -89,16 +92,16 @@ class NanosetDataCollatorForCLM:
     parallel_context: ParallelContext
 
     def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Union[torch.Tensor, TensorPointer]]:
-    
+
         # Process the case when current rank doesn't require data. We return `TensorPointer` that points to ranks having the data.
         current_pp_rank = dist.get_rank(self.parallel_context.pp_pg)
-    
+
         if current_pp_rank not in [
             self.input_pp_rank,
             self.output_pp_rank,
         ]:
             # assert all(len(example) == 0 for example in examples)
-            
+
             return {
                 "input_ids": TensorPointer(group_rank=self.input_pp_rank),
                 "input_mask": TensorPointer(group_rank=self.input_pp_rank),
@@ -149,6 +152,7 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni
         result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()}
         return result
 
+
 def get_sampler(
     dl_ranks_size: int,
     dl_rank: int,
@@ -158,12 +162,10 @@ def get_sampler(
     drop_last: Optional[bool] = True,
 ) -> Optional[Sampler]:
     """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank"""
-    
-    # NOTE: Removed DistributedSamplerWithLoop to support valid and test samplers. 
-    
-    sampler = DistributedSampler(
-            dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last
-    )
+
+    # NOTE: Removed DistributedSamplerWithLoop to support valid and test samplers.
+
+    sampler = DistributedSampler(dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last)
 
     if consumed_train_samples > 0:
         sampler = SkipBatchSampler(sampler, skip_batches=consumed_train_samples, dp_size=dl_ranks_size)
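The collator docstring above ("discards last input id token" / "discards first label id token") is the usual causal-LM shift. Leaving aside the `TensorPointer` plumbing for pipeline parallelism, the core transformation amounts to the following sketch (illustrative only, with made-up shapes):

```
import numpy as np

sequence_length = 8
# A micro-batch of token ids of length sequence_length + 1, as yielded by the dataset.
batch = np.arange(2 * (sequence_length + 1)).reshape(2, sequence_length + 1)

input_ids = batch[:, :-1]   # what the input_pp_rank consumes (last token dropped)
label_ids = batch[:, 1:]    # what the output_pp_rank consumes (first token dropped)
label_mask = np.ones_like(label_ids, dtype=bool)

assert input_ids.shape == label_ids.shape == (2, sequence_length)
```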
diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py
index 2a7fc7ff..3e8fd985 100644
--- a/src/nanotron/data/dataset_builder.py
+++ b/src/nanotron/data/dataset_builder.py
@@ -1,18 +1,19 @@
-from nanotron import logging
 import math
 from typing import Any, List, Optional, Tuple, Type, Union
 
 import numpy
 import torch
 
-from nanotron.data.nanoset import NanosetConfig, Nanoset
+from nanotron import logging
 from nanotron.data.indexed_dataset import MMapIndexedDataset
+from nanotron.data.nanoset import Nanoset, NanosetConfig
 from nanotron.data.utils import Split, normalize
 
 logger = logging.get_logger(__name__)
 
 DistributedDataset = Union[Nanoset, MMapIndexedDataset]
 
+
 class NanosetBuilder(object):
     """Builder class for the Nanoset classes
 
@@ -22,17 +23,18 @@ class NanosetBuilder(object):
     """
 
     def __init__(
-        self, config: NanosetConfig,
+        self,
+        config: NanosetConfig,
     ):
         self.config = config
-        self.sizes = config.split_num_samples 
-        self.cls = Nanoset # NOTE: keep it like that to support BlendedNanoset in the future
+        self.sizes = config.split_num_samples
+        self.cls = Nanoset  # NOTE: keep it like that to support BlendedNanoset in the future
 
     def build(self) -> List[Nanoset]:
         """Build all dataset splits according to the provided data_path(s)
-        
+
         This method is distributed-aware and must be called on all ranks.
-        
+
         The dataset splits returned can vary according to the config. Supply config.data_path and
         config.split to build BlendedNanoset and/or Nanoset splits from the same
         distribution. Supply config.data_path_per_split to build BlendedNanoset and/or Nanoset
@@ -48,7 +50,7 @@ def _build_blended_dataset_splits(
         self,
     ) -> List[Nanoset]:
         """Build all dataset splits according to the provided data_path(s)
-        
+
         See the NanosetBuilder.build alias for more information.
 
         Returns:
@@ -67,7 +69,10 @@ def _build_blended_dataset_splits(
             return self._build_nanoset_dataset_splits(data_path[0], split, self.sizes)
 
     def _build_nanoset_dataset_splits(
-        self, path_prefix: str, split: List[float], sizes: List[int],
+        self,
+        path_prefix: str,
+        split: List[float],
+        sizes: List[int],
     ) -> List[Nanoset]:
         """Build each Nanoset split from a single MMapIndexedDataset
 
@@ -79,17 +84,13 @@ def _build_nanoset_dataset_splits(
             sizes (List[int]): The number of total samples to draw from each split
 
         Returns:
-            List[Nanoset]: The Nanoset per split. Always returns Nanosets because we build them in each and every rank 
+            List[Nanoset]: The Nanoset per split. Always returns Nanosets because we build them in each and every rank
         """
-        
-        indexed_dataset = self._build_generic_dataset(
-            MMapIndexedDataset, path_prefix, False
-        )
-        
-        split_idx_bounds = _get_split_indices(
-            split, indexed_dataset.sequence_lengths.shape[0]
-        )
-        
+
+        indexed_dataset = self._build_generic_dataset(MMapIndexedDataset, path_prefix, False)
+
+        split_idx_bounds = _get_split_indices(split, indexed_dataset.sequence_lengths.shape[0])
+
         split_indices = [
             numpy.arange(
                 start=split_idx_bounds[i],
@@ -114,7 +115,9 @@ def _build_nanoset_dataset_splits(
         return nanoset_datasets
 
     def _build_generic_dataset(
-        self, cls: Type[DistributedDataset], *args: Any,
+        self,
+        cls: Type[DistributedDataset],
+        *args: Any,
     ) -> Optional[DistributedDataset]:
         """Build the DistributedDataset
 
@@ -141,10 +144,10 @@ def _build_generic_dataset(
                     dataset = cls(*args)
                 except OSError as err:
                     log = (
-                        f"Failed to write dataset materials to the data cache directory. "
-                        + f"Please supply a directory to which you have write access via "
-                        + f"the path_to_cache attribute in NanosetConfig and "
-                        + f"retry. Refer to the preserved traceback above for more information."
+                        "Failed to write dataset materials to the data cache directory. "
+                        + "Please supply a directory to which you have write access via "
+                        + "the path_to_cache attribute in NanosetConfig and "
+                        + "retry. Refer to the preserved traceback above for more information."
                     )
                     raise Exception(log) from err
 
@@ -174,23 +177,22 @@ def _get_split_indices(split: List[float], num_elements: int) -> List[int]:
     split_indices = [0]
     for split_pct in split:
         split_indices.append(split_indices[-1] + int(round(split_pct * float(num_elements))))
-    split_indices[1:] = list(
-        map(lambda _: _ - (split_indices[-1] - num_elements), split_indices[1:])
-    )
+    split_indices[1:] = [_ - (split_indices[-1] - num_elements) for _ in split_indices[1:]]
 
     assert len(split_indices) == len(split) + 1
     assert split_indices[-1] == num_elements
 
     return split_indices
 
+
 # NOTE: Keep for BlendedNanoset
 def _get_prefixes_weights_and_sizes_for_blend(
     data_path: List[str], target_num_samples_per_split: List[int]
 ) -> Tuple[List[str], List[float], List[List[int]]]:
     """Determine the contribution of the Nanoset splits to the BlendedNanoset splits
-    
+
     Args:
-        data_path (List[str]): e.g. ["30", "path/to/dataset_1_prefix", "70", 
+        data_path (List[str]): e.g. ["30", "path/to/dataset_1_prefix", "70",
         "path/to/dataset_2_prefix"]
 
         target_num_samples_per_split (List[int]): The number of samples to target for each
@@ -201,19 +203,14 @@ def _get_prefixes_weights_and_sizes_for_blend(
         ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], the normalized weights e.g.
         [0.3, 0.7], and the number of samples to request per Nanoset per split
     """
-    weights, prefixes = zip(
-        *[(float(data_path[i]), data_path[i + 1].strip()) for i in range(0, len(data_path), 2)]
-    )
+    weights, prefixes = zip(*[(float(data_path[i]), data_path[i + 1].strip()) for i in range(0, len(data_path), 2)])
 
     weights = normalize(weights)
 
     # NOTE: Use 0.5% target margin to ensure we satiate the network
     sizes_per_dataset = [
-        [
-            int(math.ceil(target_num_samples * weight * 1.005))
-            for target_num_samples in target_num_samples_per_split
-        ]
+        [int(math.ceil(target_num_samples * weight * 1.005)) for target_num_samples in target_num_samples_per_split]
         for weight in weights
     ]
 
-    return prefixes, weights, sizes_per_dataset
\ No newline at end of file
+    return prefixes, weights, sizes_per_dataset
diff --git a/src/nanotron/data/helpers.cpp b/src/nanotron/data/helpers.cpp
index 4e1b3dbc..9c5b4407 100644
--- a/src/nanotron/data/helpers.cpp
+++ b/src/nanotron/data/helpers.cpp
@@ -23,7 +23,7 @@ void build_blending_indices(py::array_t &dataset_index,
                             const int64_t size, const bool verbose)
 {
   /* Given multiple datasets and a weighting array, build samples
-   such that it follows those wieghts.*/
+   such that it follows those weights.*/
 
   if (verbose)
   {
@@ -112,7 +112,7 @@ py::array build_sample_idx(const py::array_t &sizes_,
   int64_t sample_index = 0;
   // Index into doc_idx.
   int64_t doc_idx_index = 0;
-  // Begining offset for each document.
+  // Beginning offset for each document.
   int32_t doc_offset = 0;
   // Start with first document and no offset.
   sample_idx[2 * sample_index] = doc_idx_index;
@@ -141,7 +141,7 @@ py::array build_sample_idx(const py::array_t &sizes_,
       }
       else
       {
-        // Otherwise, start from the begining of the next document.
+        // Otherwise, start from the beginning of the next document.
         ++doc_idx_index;
         doc_offset = 0;
       }
@@ -289,7 +289,7 @@ py::array build_mapping_impl(const py::array_t &docs_,
         const auto sent_index_first = docs[doc];
         const auto sent_index_last = docs[doc + 1];
 
-        // At the begining of the document previous index is the
+        // At the beginning of the document previous index is the
         // start index.
         auto prev_start_index = sent_index_first;
 
@@ -581,7 +581,7 @@ py::array build_blocks_mapping_impl(const py::array_t &docs_,
         const auto sent_index_last = docs[doc + 1];
         const auto target_seq_len = max_seq_length - titles_sizes[doc];
 
-        // At the begining of the document previous index is the
+        // At the beginning of the document previous index is the
         // start index.
         auto prev_start_index = sent_index_first;
 
diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py
index 938447f2..10845c6d 100644
--- a/src/nanotron/data/indexed_dataset.py
+++ b/src/nanotron/data/indexed_dataset.py
@@ -27,9 +27,9 @@
 
 # TODO @tj-solergibert: Refractor. Delete multimodal references.
 
+
 class DType(Enum):
-    """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices
-    """
+    """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices"""
 
     uint8 = 1
     int8 = 2
@@ -190,7 +190,7 @@ def write(
         # the mode per sequence
         if sequence_modes is not None:
             sequence_modes = numpy.array(sequence_modes, dtype=numpy.int8)
-            self.idx_writer.write(sequence_modes.tobytes(order='C'))
+            self.idx_writer.write(sequence_modes.tobytes(order="C"))
             del sequence_modes
 
     def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
@@ -243,7 +243,7 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
         self.bin_buffer_mmap = numpy.memmap(idx_path, mode="r", order="C")
         self.bin_buffer = memoryview(self.bin_buffer_mmap)
 
-        log_rank(f"\tExtract the sequence lengths", logger=logger, level=logging.INFO, rank=0)
+        log_rank("\tExtract the sequence lengths", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
         self.sequence_lengths = numpy.frombuffer(
             self.bin_buffer, dtype=numpy.int32, count=self.sequence_count, offset=offset
@@ -251,7 +251,7 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        log_rank(f"\tExtract the sequence pointers", logger=logger, level=logging.INFO, rank=0)
+        log_rank("\tExtract the sequence pointers", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
         self.sequence_pointers = numpy.frombuffer(
             self.bin_buffer,
@@ -262,7 +262,7 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        log_rank(f"\tExtract the document indices", logger=logger, level=logging.INFO, rank=0)
+        log_rank("\tExtract the document indices", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
         self.document_indices = numpy.frombuffer(
             self.bin_buffer,
@@ -275,7 +275,7 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
 
         self.sequence_modes = None
         if multimodal:
-            log_rank(f"\tExtract the sequence modes", logger=logger, level=logging.INFO, rank=0)
+            log_rank("\tExtract the sequence modes", logger=logger, level=logging.INFO, rank=0)
             t_beg = time.time()
             self.sequence_modes = numpy.frombuffer(
                 self.bin_buffer,
@@ -296,14 +296,13 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
         log_rank(f"> total number of sequences: {len(self)}", logger=logger, level=logging.INFO, rank=0)
         log_rank(
             f"> total number of documents: {self.document_indices.shape[0] - 1}",
-            logger=logger, 
-            level=logging.INFO, 
-            rank=0
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
 
     def __del__(self) -> None:
-        """Clean up the object
-        """
+        """Clean up the object"""
         self.bin_buffer_mmap._mmap.close()
         del self.bin_buffer_mmap
 
@@ -388,8 +387,7 @@ def __setstate__(self, state: Tuple[str, bool]) -> None:
         self.initialize(path_prefix, multimodal)
 
     def __del__(self) -> None:
-        """Clean up the object
-        """
+        """Clean up the object"""
         if self.bin_buffer_mmap is not None:
             self.bin_buffer_mmap._mmap.close()
         del self.bin_buffer_mmap
@@ -460,9 +458,7 @@ def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.
         if length is None:
             length = sequence_length - offset
         sequence_pointer += offset * DType.size(self.index.dtype)
-        sequence = numpy.frombuffer(
-            self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer
-        )
+        sequence = numpy.frombuffer(self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer)
         return (sequence, sequence_mode) if sequence_mode is not None else sequence
 
     @property
@@ -522,9 +518,7 @@ def exists(path_prefix: str) -> bool:
         Returns:
             bool: Whether the MMapIndexedDataset exists on disk at the prefix
         """
-        return os.path.exists(get_idx_path(path_prefix)) and os.path.exists(
-            get_bin_path(path_prefix)
-        )
+        return os.path.exists(get_idx_path(path_prefix)) and os.path.exists(get_bin_path(path_prefix))
 
 
 class MMapIndexedDatasetBuilder(object):
@@ -538,9 +532,7 @@ class MMapIndexedDatasetBuilder(object):
         multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False.
     """
 
-    def __init__(
-        self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False
-    ) -> None:
+    def __init__(self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False) -> None:
         self.data_file = open(bin_path, "wb")
         self.dtype = dtype
         self.multimodal = multimodal
@@ -563,9 +555,7 @@ def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
         if self.multimodal:
             self.sequence_modes.append(mode)
 
-    def add_document(
-        self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None
-    ) -> None:
+    def add_document(self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None) -> None:
         """Add an entire document to the dataset
 
         Args:
@@ -582,8 +572,7 @@ def add_document(
             self.sequence_modes.extend(modes if modes is not None else [0] * lengths)
 
     def end_document(self) -> None:
-        """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item
-        """
+        """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item"""
         self.document_indices.append(len(self.sequence_lengths))
 
     def add_index(self, path_prefix: str) -> None:
diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py
index 59fe82a5..ffdd322e 100644
--- a/src/nanotron/data/nanoset.py
+++ b/src/nanotron/data/nanoset.py
@@ -1,6 +1,3 @@
-from nanotron import logging
-from nanotron.logging import log_rank
-
 import hashlib
 import json
 import os
@@ -8,18 +5,21 @@
 from collections import OrderedDict
 from typing import Dict, List, Tuple
 
-import torch
 import numpy
+import torch
 
-from nanotron.data.nanoset_configs import NanosetConfig
+from nanotron import logging
 from nanotron.data.indexed_dataset import MMapIndexedDataset
+from nanotron.data.nanoset_configs import NanosetConfig
 from nanotron.data.utils import Split
+from nanotron.logging import log_rank
 
 logger = logging.get_logger(__name__)
 
+
 class Nanoset(torch.utils.data.Dataset):
     """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py"""
-    
+
     """The base Nanoset dataset
 
     Args:
@@ -43,7 +43,7 @@ def __init__(
         index_split: Split,
         config: NanosetConfig,
     ) -> None:
-        
+
         assert indexed_indices.size > 0
         assert num_samples > 0
         assert self.is_multimodal() == indexed_dataset.multimodal
@@ -64,15 +64,13 @@ def __init__(
             self.unique_identifiers[attr] = getattr(self.config, attr)
 
         self.unique_description = json.dumps(self.unique_identifiers, indent=4)
-        self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
-        ).hexdigest()
+        self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest()
 
         self._finalize()
 
     def _finalize(self) -> None:
         """Abstract method implementation
-        
+
         Load or build/cache the document, sample, and shuffle indices
         """
         assert isinstance(self.config, NanosetConfig)
@@ -102,7 +100,7 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
             dictionary
         """
         text, _ = self._query_document_sample_shuffle_indices(idx)
-        
+
         return {"text": text}
 
     # TODO @tj-solergibert: Delete this funcs if not used?
@@ -124,7 +122,7 @@ def is_split_by_sequence() -> bool:
             bool: True
         """
         return True
-    
+
     # TODO @tj-solergibert: Delete this funcs if not used?
     @classmethod
     def is_split_by_document(cls) -> bool:
@@ -150,9 +148,7 @@ def _key_config_attributes() -> List[str]:
         """
         return ["split", "random_seed", "sequence_length"]
 
-    def _query_document_sample_shuffle_indices(
-        self, idx: int
-    ) -> Tuple[numpy.ndarray, numpy.ndarray]:
+    def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
         """Get the text (token ids) and document ids for a given index
 
         Args:
@@ -194,9 +190,7 @@ def _query_document_sample_shuffle_indices(
                 # Add the sample part
                 offset = 0 if i > doc_index_beg else doc_index_beg_offset
                 length = None if i < doc_index_end else doc_index_end_offset + 1
-                sample_parts.append(
-                    self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)
-                )
+                sample_parts.append(self.indexed_dataset.get(self.document_index[i], offset=offset, length=length))
 
         return (
             numpy.array(numpy.concatenate(sample_parts), dtype=numpy.int64),
@@ -207,7 +201,7 @@ def _build_document_sample_shuffle_indices(
         self,
     ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]:
         """Build the document index, the sample index, and the shuffle index
-        
+
         The document index:
             -- 1-D
             -- An ordered array of document ids
@@ -228,13 +222,10 @@ def _build_document_sample_shuffle_indices(
         """
         path_to_cache = getattr(self.config, "path_to_cache")
         if path_to_cache is None:
-            path_to_cache = os.path.join(
-                self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices"
-            )
+            path_to_cache = os.path.join(self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices")
 
-        get_path_to = lambda suffix: os.path.join(
-            path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}"
-        )
+        def get_path_to(suffix):
+            return os.path.join(path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}")
         path_to_description = get_path_to("description.txt")
         path_to_document_index = get_path_to("document_index.npy")
         path_to_sample_index = get_path_to("sample_index.npy")
@@ -260,18 +251,16 @@ def _build_document_sample_shuffle_indices(
         if not cache_hit and torch.distributed.get_rank() == 0:
             log_rank(
                 f"Build and save the {type(self).__name__} {self.index_split.name} indices",
-                logger=logger, 
-                level=logging.INFO, 
-                rank=0
+                logger=logger,
+                level=logging.INFO,
+                rank=0,
             )
 
             if num_epochs == 1:
                 separate_final_epoch = False
             else:
                 # Get the number of samples for the last epoch
-                num_samples_sans_final_epoch = (
-                    (num_epochs - 1) * num_tokens_per_epoch - 1
-                ) // sequence_length
+                num_samples_sans_final_epoch = ((num_epochs - 1) * num_tokens_per_epoch - 1) // sequence_length
                 num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch
                 num_samples_per_epoch = (num_tokens_per_epoch - 1) // sequence_length
 
@@ -283,24 +272,20 @@ def _build_document_sample_shuffle_indices(
 
                 # Separate the final epoch if it falls below the threshold
                 threshold = 0.80
-                separate_final_epoch = num_samples_from_final_epoch < int(
-                    threshold * num_samples_per_epoch
-                )
+                separate_final_epoch = num_samples_from_final_epoch < int(threshold * num_samples_per_epoch)
 
                 log_rank(
                     f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}",
-                    logger=logger, 
-                    level=logging.DEBUG, 
-                    rank=0
+                    logger=logger,
+                    level=logging.DEBUG,
+                    rank=0,
                 )
                 log_rank(f"> threshold: {threshold}", logger=logger, level=logging.DEBUG, rank=0)
                 log_rank(
                     f"> num_samples_per_epoch: {num_samples_per_epoch}", logger=logger, level=logging.DEBUG, rank=0
                 )
 
-            log_rank(
-                f"> separate_final_epoch: {separate_final_epoch}", logger=logger, level=logging.DEBUG, rank=0
-            )
+            log_rank(f"> separate_final_epoch: {separate_final_epoch}", logger=logger, level=logging.DEBUG, rank=0)
 
             numpy_random_state = numpy.random.RandomState(getattr(self.config, "random_seed"))
 
@@ -313,7 +298,9 @@ def _build_document_sample_shuffle_indices(
             # Build the document index
             log_rank(
                 f"\tBuild and save the document index to {os.path.basename(path_to_document_index)}",
-                logger=logger, level=logging.INFO, rank=0
+                logger=logger,
+                level=logging.INFO,
+                rank=0,
             )
             t_beg = time.time()
             document_index = _build_document_index(
@@ -325,8 +312,10 @@ def _build_document_sample_shuffle_indices(
 
             # Build the sample index
             log_rank(
-                f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}", 
-                logger=logger, level=logging.INFO, rank=0
+                f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}",
+                logger=logger,
+                level=logging.INFO,
+                rank=0,
             )
             t_beg = time.time()
             from nanotron.data import helpers
@@ -347,7 +336,9 @@ def _build_document_sample_shuffle_indices(
             # Build the shuffle index
             log_rank(
                 f"\tBuild and save the shuffle index to {os.path.basename(path_to_shuffle_index)}",
-                logger=logger, level=logging.INFO, rank=0
+                logger=logger,
+                level=logging.INFO,
+                rank=0,
             )
             t_beg = time.time()
             if separate_final_epoch:
@@ -363,40 +354,46 @@ def _build_document_sample_shuffle_indices(
             log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
-            f"Load the {type(self).__name__} {self.index_split.name} indices", 
-            logger=logger, level=logging.INFO, rank=0
+            f"Load the {type(self).__name__} {self.index_split.name} indices",
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
 
         log_rank(
             f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
-            logger=logger, level=logging.INFO, rank=0
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
         t_beg = time.time()
-        document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r')
+        document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
             f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
-            logger=logger, level=logging.INFO, rank=0
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
         t_beg = time.time()
-        sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
+        sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
             f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
-            logger=logger, level=logging.INFO, rank=0
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
         t_beg = time.time()
-        shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r')
+        shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        log_rank(
-            f"> total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0
-        )
+        log_rank(f"> total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0)
         log_rank(f"> total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0)
 
         return document_index, sample_index, shuffle_index
@@ -470,7 +467,7 @@ def _build_document_index(
         document_index = document_index.astype(numpy.int32)
         numpy_random_state.shuffle(document_index)
         return document_index
-    
+
     doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False)
     doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False)
     return numpy.concatenate((doc_idx_first, doc_idx_last))
@@ -508,4 +505,3 @@ def _build_shuffle_index(
     numpy_random_state.shuffle(shuffle_idx_last)
 
     return numpy.concatenate((shuffle_idx_first, shuffle_idx_last))
-
diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py
index 791d8d1a..31b10948 100644
--- a/src/nanotron/data/nanoset_configs.py
+++ b/src/nanotron/data/nanoset_configs.py
@@ -7,9 +7,9 @@
 @dataclass
 class NanosetConfig:
     """Configuration object for Nanoset datasets
-    
+
     Attributes:
-    
+
         random_seed (int): The seed for all RNG during dataset creation.
 
         sequence_length (int): The sequence length.
@@ -31,7 +31,7 @@ class NanosetConfig:
     random_seed: int
 
     sequence_length: int
-    
+
     split_num_samples: list[int]
 
     data_path: Optional[List[str]] = None
@@ -47,4 +47,4 @@ def __post_init__(self):
         """Python dataclass method that is used to modify attributes after initialization. See
         https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
         """
-        self.split_vector = parse_and_normalize_split(self.split)
\ No newline at end of file
+        self.split_vector = parse_and_normalize_split(self.split)
diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py
index 9357cd51..5401e42c 100644
--- a/src/nanotron/data/utils.py
+++ b/src/nanotron/data/utils.py
@@ -1,12 +1,12 @@
-from nanotron import logging
-from nanotron.logging import log_rank
-
+import re
 from enum import Enum
 from typing import List
-import re
 
 import numpy
 
+from nanotron import logging
+from nanotron.logging import log_rank
+
 logger = logging.get_logger(__name__)
 
 
@@ -17,8 +17,7 @@ class Split(Enum):
 
 
 def compile_helpers():
-    """Compile C++ helper functions at runtime. Make sure this is invoked on a single process.
-    """
+    """Compile C++ helper functions at runtime. Make sure this is invoked on a single process."""
     import os
     import subprocess
 
@@ -29,6 +28,7 @@ def compile_helpers():
         log_rank("Failed to compile the C++ dataset helper functions", logger=logger, level=logging.ERROR, rank=0)
         sys.exit(1)
 
+
 def normalize(weights: List[float]) -> List[float]:
     """Do non-exponentiated normalization
 
@@ -43,6 +43,7 @@ def normalize(weights: List[float]) -> List[float]:
     w = (w / w_sum).tolist()
     return w
 
+
 def parse_and_normalize_split(split: str) -> List[float]:
     """Parse the dataset split ratios from a string
 
@@ -56,25 +57,24 @@ def parse_and_normalize_split(split: str) -> List[float]:
     split = split + [0.0 for _ in range(len(Split) - len(split))]
 
     assert len(split) == len(Split)
-    assert all(map(lambda _: _ >= 0.0, split))
+    assert all((_ >= 0.0 for _ in split))
 
     split = normalize(split)
 
     return split
 
+
 def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_batch_size):
-    
+
     train_samples = train_iters * global_batch_size
     eval_iters = (train_iters // eval_interval + 1) * eval_iters
     test_iters = eval_iters
 
-    datasets_num_samples = [train_samples,
-                            eval_iters * global_batch_size,
-                            test_iters * global_batch_size]
-    
+    datasets_num_samples = [train_samples, eval_iters * global_batch_size, test_iters * global_batch_size]
+
     log_rank(" > Datasets target sizes (minimum size):", logger=logger, level=logging.INFO, rank=0)
     log_rank("    Train:      {}".format(datasets_num_samples[0]), logger=logger, level=logging.INFO, rank=0)
     log_rank("    Validation: {}".format(datasets_num_samples[1]), logger=logger, level=logging.INFO, rank=0)
     log_rank("    Test:       {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0)
-    
-    return datasets_num_samples
\ No newline at end of file
+
+    return datasets_num_samples
diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py
index c62a2519..d1853286 100644
--- a/src/nanotron/dataloader.py
+++ b/src/nanotron/dataloader.py
@@ -19,6 +19,9 @@
 )
 
 try:
+    from transformers import PreTrainedTokenizerBase
+    from transformers.trainer_pt_utils import DistributedSamplerWithLoop
+
     import datasets
     from datasets import (
         Dataset,
@@ -29,8 +32,6 @@
         concatenate_datasets,
         load_dataset,
     )
-    from transformers import PreTrainedTokenizerBase
-    from transformers.trainer_pt_utils import DistributedSamplerWithLoop
 except ImportError:
     warnings.warn("Datasets and/or Transformers not installed, you'll be unable to use the dataloader.")
 
@@ -239,6 +240,7 @@ def data_generator() -> Generator[Dict[str, Union[torch.Tensor, TensorPointer]],
 
     return data_generator
 
+
 # QUESTION: Move it to nanotron.data.dataloader_builder?
 # Adapted from https://github.com/huggingface/accelerate/blob/a73898027a211c3f6dc4460351b0ec246aa824aa/src/accelerate/data_loader.py#L781C1-L824C28
 class SkipBatchSampler(BatchSampler):
@@ -515,6 +517,7 @@ def get_train_dataloader(
         # pin_memory_device="cuda",
     )
 
+
 # QUESTION: Move it to nanotron.data.dataloader_builder?
 def get_dataloader_worker_init(dp_rank: int):
     """Creates random states for each worker in order to get different state in each workers"""
@@ -526,6 +529,7 @@ def dataloader_worker_init(worker_id):
 
     return dataloader_worker_init
 
+
 # QUESTION: Move it to nanotron.data.dataloader_builder?
 class EmptyInfiniteDataset:
     """Hack as removing all columns from a datasets.Dataset makes the number of rows 0."""
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index 3ee91f68..ae1f16d1 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -26,10 +26,10 @@
 from nanotron.config import (
     Config,
     ExistingCheckpointInit,
+    NanosetDatasetsArgs,
     ParallelismArgs,
     RandomInit,
     get_config_from_file,
-    NanosetDatasetsArgs
 )
 from nanotron.dataloader import sanity_check_dataloader
 from nanotron.helpers import (
@@ -153,7 +153,7 @@ def __init__(
         ########################################
         ## Compile Nanoset helpers
         ########################################
-            
+
         # Only if Using Nanoset Datasets
         if isinstance(self.config.data.dataset, NanosetDatasetsArgs):
             if dist.get_rank() == 0:
@@ -162,7 +162,7 @@ def __init__(
 
                 compile_helpers()
                 log_rank("Done with dataset index builder.", logger=logger, level=logging.INFO, rank=0)
-        
+
         ########################################
         ## Setting up our model, optimizers, schedulers, etc.
         ########################################
@@ -491,7 +491,7 @@ def train_step_logs(
                         **{log_item.tag: log_item.scalar_value for log_item in log_entries},
                         "iteration_step": self.iteration_step,
                     },
-                    step=self.iteration_step-1
+                    step=self.iteration_step - 1,
                 )
 
             self.loggerwriter.add_scalars_from_list(log_entries, self.iteration_step)
diff --git a/src/nanotron/utils.py b/src/nanotron/utils.py
index 14fe1ca8..acee80cb 100644
--- a/src/nanotron/utils.py
+++ b/src/nanotron/utils.py
@@ -1,11 +1,10 @@
 import functools
 import inspect
-import math
 import os
 import random
 import socket
 from contextlib import ExitStack, contextmanager
-from typing import Callable, ContextManager, List, Optional
+from typing import ContextManager, List, Optional
 
 import torch
 from packaging import version
@@ -52,7 +51,7 @@ def main_rank_first(group: dist.ProcessGroup):
 @contextmanager
 def local_ranks_zero_first(group: Optional[dist.ProcessGroup] = None):
     """Context manager that executes the code in the context with all the local rank zero of the group going first.
-    Usefull to run only once per node first (e.g. to create local files, etc)
+    Useful to run only once per node first (e.g. to create local files, etc)
     """
     is_main = int(os.environ.get("LOCAL_RANK", 0)) == 0
     if is_main:
@@ -123,6 +122,7 @@ def get_untyped_storage(tensor: torch.Tensor) -> torch.UntypedStorage:
     else:
         return tensor.storage().untyped()
 
+
 def tensor_from_untyped_storage(untyped_storage: torch.UntypedStorage, dtype: torch.dtype):
     # TODO @thomasw21: Figure out what's the best Pytorch way of building a tensor from a storage.
     device = untyped_storage.device

From c167492881802e7354cf132deff5619b4bfa2bfa Mon Sep 17 00:00:00 2001
From: kerem 
Date: Thu, 14 Mar 2024 13:01:03 +0300
Subject: [PATCH 22/73] Improve rank functions and tests

- Enhance docstring and type hints in get_local_ranks for clarity
- Simplify parameter names in get_global_rank for readability
- Update tests for get_global_rank
- Attempt to fix a bug related to get_local_ranks
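
For reviewers, a minimal sketch of the intended round trip between the two
helpers. It assumes parallel_context is an already-initialized ParallelContext
(e.g. inside a test launched with init_distributed) and that dist is nanotron's
distributed wrapper; the snippet is illustrative only and not part of the patch:

    world_rank = dist.get_rank()

    # get_local_ranks maps a world rank to (ep_rank, pp_rank, dp_rank, tp_rank)
    ep_rank, pp_rank, dp_rank, tp_rank = parallel_context.get_local_ranks(world_rank=world_rank)

    # get_global_rank indexes world_rank_matrix[ep, pp, dp, tp], so the round
    # trip recovers the original world rank (as a numpy.int64)
    assert parallel_context.get_global_rank(
        ep_rank=ep_rank, pp_rank=pp_rank, dp_rank=dp_rank, tp_rank=tp_rank
    ) == world_rank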

---
 src/nanotron/parallel/context.py              | 44 +++++++++++--------
 src/nanotron/parallel/tied_parameters.py      |  2 +-
 src/nanotron/serialize/weights.py             |  3 +-
 src/nanotron/trainer.py                       | 13 +++---
 tests/test_distributed.py                     |  2 +-
 ..._parameters_accumulate_gradient_in_fp32.py |  8 ++--
 6 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/src/nanotron/parallel/context.py b/src/nanotron/parallel/context.py
index 509754e4..4578d979 100644
--- a/src/nanotron/parallel/context.py
+++ b/src/nanotron/parallel/context.py
@@ -1,5 +1,5 @@
 import os
-from typing import Literal, Tuple
+from typing import Literal, Tuple, Annotated
 
 import numpy as np
 import torch
@@ -125,9 +125,22 @@ def set_device(self):
         device_id = local_rank
         torch.cuda.set_device(torch.cuda.device(device_id))
 
-    def get_local_ranks(self, world_rank: int) -> Tuple[int, int, int]:
-        # return coordinates in world_rank_matrix without expert_parallel_rank
-        return tuple(i.item() for i in np.where(self.world_rank_matrix == world_rank))[-3:]
+    def get_local_ranks(self, world_rank: int) -> Tuple[Annotated[int, "ep_rank"], 
+                                                        Annotated[int, "pp_rank"], 
+                                                        Annotated[int, "dp_rank"], 
+                                                        Annotated[int, "tp_rank"]]:
+        """
+        Get the local ranks (expert_parallel_rank, pipeline_parallel_rank, 
+        data_parallel_rank, tensor_parallel_rank) corresponding to a given world rank.
+
+        :param world_rank: int, The global rank to find local ranks for.
+
+        :return: A tuple containing local ranks 
+                (expert_parallel_rank, pipeline_parallel_rank, 
+                data_parallel_rank, tensor_parallel_rank).
+        """
+
+        return tuple(i.item() for i in np.where(self.world_rank_matrix == world_rank))
 
     def destroy(self):
         if not dist.is_initialized():
@@ -138,24 +151,19 @@ def destroy(self):
 
     def get_global_rank(
         self,
-        expert_parallel_rank: int,
-        pipeline_parallel_rank: int,
-        data_parallel_rank: int,
-        tensor_parallel_rank: int,
+        ep_rank: int,
+        pp_rank: int,
+        dp_rank: int,
+        tp_rank: int,
     ) -> np.int64:
         """
         Get the global rank based on the specified ranks in different parallel groups.
 
-        :param expert_parallel_rank: int, Rank in the expert parallel group.
-        :param pipeline_parallel_rank: int, Rank in the pipeline parallel group.
-        :param data_parallel_rank: int, Rank in the data parallel group.
-        :param tensor_parallel_rank: int, Rank in the tensor parallel group.
+        :param ep_rank: int, Rank in the expert parallel group.
+        :param pp_rank: int, Rank in the pipeline parallel group.
+        :param dp_rank: int, Rank in the data parallel group.
+        :param tp_rank: int, Rank in the tensor parallel group.
 
         :return: numpy.int64, The global rank.
         """
-        return self.world_rank_matrix[
-            expert_parallel_rank,
-            pipeline_parallel_rank,
-            data_parallel_rank,
-            tensor_parallel_rank,
-        ]
\ No newline at end of file
+        return self.world_rank_matrix[ep_rank, pp_rank, dp_rank, tp_rank]
\ No newline at end of file
diff --git a/src/nanotron/parallel/tied_parameters.py b/src/nanotron/parallel/tied_parameters.py
index 95020d31..6aa0cbf5 100644
--- a/src/nanotron/parallel/tied_parameters.py
+++ b/src/nanotron/parallel/tied_parameters.py
@@ -50,7 +50,7 @@ def tie_parameters(
     dp_ranks = tuple(
         sorted(
             {
-                parallel_context.get_local_ranks(world_rank=global_rank)[1]
+                parallel_context.get_local_ranks(world_rank=global_rank)[2]
                 for _, global_ranks in ties
                 for global_rank in global_ranks
             }
diff --git a/src/nanotron/serialize/weights.py b/src/nanotron/serialize/weights.py
index c87ca77a..c857154f 100644
--- a/src/nanotron/serialize/weights.py
+++ b/src/nanotron/serialize/weights.py
@@ -68,8 +68,6 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde
                     continue
             else:
                 base_name = name
-            
-            is_expert_sharded = False # Let's test the solution to the potential bug by defining is_expert_sharded before using it.
 
             if param.is_sharded:
                 sharded_info: ShardedInfo = param.get_sharded_info()
@@ -87,6 +85,7 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde
 
             else:
                 exp_tp_pp_rank_and_size = None
+                is_expert_sharded = False
 
             path = get_path(
                 base_name,
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index f87bb9aa..bbd7306f 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -207,7 +207,10 @@ def __init__(
 
         # Setup tensorboard write and log writers on output rank
         self.logger_ranks = self.parallel_context.get_global_rank(
-            0, self.unwrapped_model.output_pp_rank, 0, 0
+            ep_rank=0,
+            pp_rank=self.unwrapped_model.output_pp_rank,
+            dp_rank=0,
+            tp_rank=0
         ).flatten()
         self.loggerwriter = self.setup_log_writers()
 
@@ -778,10 +781,10 @@ def mark_tied_parameters(
                 target,
                 (
                     parallel_context.get_global_rank(
-                        expert_parallel_rank=dist.get_rank(parallel_context.expert_pg),
-                        pipeline_parallel_rank=get_pp_rank_of(target, module=model),
-                        data_parallel_rank=dist.get_rank(parallel_context.dp_pg),
-                        tensor_parallel_rank=dist.get_rank(parallel_context.tp_pg),
+                        ep_rank=dist.get_rank(parallel_context.expert_pg),
+                        pp_rank=get_pp_rank_of(target, module=model),
+                        dp_rank=dist.get_rank(parallel_context.dp_pg),
+                        tp_rank=dist.get_rank(parallel_context.tp_pg),
                     ),
                 ),
             )
diff --git a/tests/test_distributed.py b/tests/test_distributed.py
index b084e9d9..7c0d2462 100644
--- a/tests/test_distributed.py
+++ b/tests/test_distributed.py
@@ -29,7 +29,7 @@ def _test_init_parallel_context(parallel_context: ParallelContext):
     global_rank = parallel_context.get_global_rank(*local_rank)
     assert isinstance(global_rank, np.int64), f"The type of global_rank is {type(global_rank)}"
 
-    assert global_rank == parallel_context.world_rank_matrix[local_rank]
+    assert global_rank == dist.get_rank()
 
     parallel_context.destroy()
     assert dist.is_initialized() is False
diff --git a/tests/test_parameters_accumulate_gradient_in_fp32.py b/tests/test_parameters_accumulate_gradient_in_fp32.py
index d1ecc26c..ba0debd6 100644
--- a/tests/test_parameters_accumulate_gradient_in_fp32.py
+++ b/tests/test_parameters_accumulate_gradient_in_fp32.py
@@ -344,10 +344,10 @@ def _test_tied_weights_sync_with_grad_accum_in_fp32(
                     target,
                     (
                         parallel_context.get_global_rank(
-                            expert_parallel_rank=dist.get_rank(parallel_context.expert_pg),
-                            pipeline_parallel_rank=get_pp_rank_of(target, module=mdl),
-                            data_parallel_rank=dist.get_rank(parallel_context.dp_pg),
-                            tensor_parallel_rank=dist.get_rank(parallel_context.tp_pg),
+                            ep_rank=dist.get_rank(parallel_context.expert_pg),
+                            pp_rank=get_pp_rank_of(target, module=mdl),
+                            dp_rank=dist.get_rank(parallel_context.dp_pg),
+                            tp_rank=dist.get_rank(parallel_context.tp_pg),
                         ),
                     ),
                 )

From d86deafc48ff48a678f53cc80839338d2c00c4a0 Mon Sep 17 00:00:00 2001
From: kerem 
Date: Thu, 14 Mar 2024 14:33:29 +0300
Subject: [PATCH 23/73] Handle conflicts

---
 src/nanotron/parallel/context.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/src/nanotron/parallel/context.py b/src/nanotron/parallel/context.py
index 4578d979..e04e26f5 100644
--- a/src/nanotron/parallel/context.py
+++ b/src/nanotron/parallel/context.py
@@ -125,21 +125,7 @@ def set_device(self):
         device_id = local_rank
         torch.cuda.set_device(torch.cuda.device(device_id))
 
-    def get_local_ranks(self, world_rank: int) -> Tuple[Annotated[int, "ep_rank"], 
-                                                        Annotated[int, "pp_rank"], 
-                                                        Annotated[int, "dp_rank"], 
-                                                        Annotated[int, "tp_rank"]]:
-        """
-        Get the local ranks (expert_parallel_rank, pipeline_parallel_rank, 
-        data_parallel_rank, tensor_parallel_rank) corresponding to a given world rank.
-
-        :param world_rank: int, The global rank to find local ranks for.
-
-        :return: A tuple containing local ranks 
-                (expert_parallel_rank, pipeline_parallel_rank, 
-                data_parallel_rank, tensor_parallel_rank).
-        """
-
+    def get_local_ranks(self, world_rank: int) -> Tuple[int, int, int]:
         return tuple(i.item() for i in np.where(self.world_rank_matrix == world_rank))
 
     def destroy(self):

From 48f94d5ad1116b823c1e56531cc2a70bcc0501ad Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sat, 16 Mar 2024 16:59:30 +0000
Subject: [PATCH 24/73] Add preprocessing tools

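hf_datasets_to_json.py dumps a Hugging Face dataset to a JSONL file with a
single "text" column, and preprocess_data.py tokenizes that JSONL into the
.bin/.idx pair consumed by MMapIndexedDataset. As a rough sketch of the core
loop (file names and the tokenizer are placeholders; partitioning,
multiprocessing, and error handling are omitted):

    import json

    from transformers import AutoTokenizer

    from nanotron.data import indexed_dataset

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    builder = indexed_dataset.MMapIndexedDatasetBuilder(
        "my_dataset_text_document.bin",
        dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
    )

    with open("my_dataset.json", "r", encoding="utf-8") as fin:
        for line in fin:
            ids = tokenizer.encode(json.loads(line)["text"])
            if ids:
                ids.append(tokenizer.eos_token_id)  # what --append-eod does
                builder.add_document(ids, [len(ids)])

    builder.finalize("my_dataset_text_document.idx")

The real tool additionally shards the input into --partitions JSONL files,
tokenizes them in parallel worker processes, and merges the per-partition
.bin/.idx pairs with MMapIndexedDatasetBuilder.add_index before the final
finalize. The small change to logging.py presumably lets log_rank be called
from these standalone scripts, where no torch.distributed process group has
been initialized.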
---
 src/nanotron/logging.py      |  16 ++-
 tools/hf_datasets_to_json.py |  34 +++++
 tools/preprocess_data.py     | 246 +++++++++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 tools/hf_datasets_to_json.py
 create mode 100644 tools/preprocess_data.py

diff --git a/src/nanotron/logging.py b/src/nanotron/logging.py
index 708393b5..ecd01dc0 100644
--- a/src/nanotron/logging.py
+++ b/src/nanotron/logging.py
@@ -217,12 +217,16 @@ def log_rank(
     **kwargs,
 ):
     """Log only if the current process is the rank specified."""
-    # Use default group is group is not provided
-    if group is None:
-        group = torch_dist.distributed_c10d._get_default_group()
-
-    # rank is None means everyone logs
-    if rank is None or dist.get_rank(group) == rank:
+    # Use default group if group is not provided
+    try:
+        if group is None:
+            group = torch_dist.distributed_c10d._get_default_group()
+
+        # rank is None means everyone logs
+        if rank is None or dist.get_rank(group) == rank:
+            logger.log(level, msg, **kwargs)
+            
+    except RuntimeError:
         logger.log(level, msg, **kwargs)
 
 
diff --git a/tools/hf_datasets_to_json.py b/tools/hf_datasets_to_json.py
new file mode 100644
index 00000000..b32e540a
--- /dev/null
+++ b/tools/hf_datasets_to_json.py
@@ -0,0 +1,34 @@
+import argparse
+from datasets import concatenate_datasets, load_dataset
+
+def main(args):
+    # Load all the Dataset
+    if args.split is None:
+        ds = load_dataset(args.dataset_path, num_proc=args.num_workers)
+        ds = concatenate_datasets([ds[splits] for splits in ds.keys()])
+    # Load a split of the Dataset
+    else:
+        ds = load_dataset(args.dataset_path, split = args.split, num_proc=args.num_workers)
+
+    ds = ds.select_columns(args.column_name)
+    
+    if args.column_name not in ["text"]:
+        print(f"Renaming {ds.column_names[0]} to 'text'")
+        ds = ds.rename_column(ds.column_names[0], "text")
+    
+    # Store dataset to json file
+    ds.to_json(path_or_buf=args.output_json, num_proc=args.num_workers)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset-path", type=str, required=True, help="Path to local stored dataset or repository on the HF hub")
+    parser.add_argument("--split", type=str, help="Which split of the data to process")
+    parser.add_argument("--column-name", type=str, required=True, help="Name of the column containing the data to process")
+    parser.add_argument("--output-json", type=str, required=True, help="Path to the json output file")
+    parser.add_argument("--num-workers", type=int, default=8, help="Number of processes to load the dataset and store the json file")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    _args = get_args()
+    main(_args)
\ No newline at end of file
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
new file mode 100644
index 00000000..656c5a22
--- /dev/null
+++ b/tools/preprocess_data.py
@@ -0,0 +1,246 @@
+import argparse
+import json
+import os
+import sys
+import time
+import glob
+import multiprocessing
+
+# TODO tj-solergibert: Keep this hack or not? Hack to be able to process data without installing nanotron
+sys.path.append(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__),os.path.pardir)), "src"))
+
+from transformers import AutoTokenizer
+from nanotron.data import indexed_dataset
+
+class Encoder(object):
+    def __init__(self, args):
+        self.args = args
+
+    def initializer(self):
+        # Use Encoder class as a container for global data
+        Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.pretrained_model_name_or_path)
+
+    def encode(self, json_line):
+        data = json.loads(json_line)
+        ids = {}
+        lens = {}
+        for key in self.args.json_keys:
+            text = data[key]
+            if isinstance(text, list):
+                sentences = text
+            else:
+                sentences = [text]
+            doc_ids = []
+            sentence_lens = []
+            for sentence in sentences:
+                sentence_ids = Encoder.tokenizer.encode(sentence)
+
+                if len(sentence_ids) > 0:
+                    doc_ids.extend(sentence_ids)
+                    sentence_lens.append(len(sentence_ids))
+
+            if len(doc_ids) > 0 and self.args.append_eod:
+                doc_ids.append(Encoder.tokenizer.eos_token_id)
+                sentence_lens[-1] += 1
+
+            ids[key] = doc_ids
+            lens[key] = sentence_lens
+        return ids, lens, len(json_line)
+
+
+class Partition(object):
+    def __init__(self, args, workers):
+        self.args = args
+        self.workers = workers
+
+    def print_processing_stats(self, count, proc_start, total_bytes_processed):
+        if count % self.args.log_interval == 0:
+            current = time.time()
+            elapsed = current - proc_start
+            mbs = total_bytes_processed/elapsed/1024/1024
+            print(f"Processed {count} documents",
+                  f"({count/elapsed} docs/s, {mbs} MB/s).",
+                  file=sys.stderr)
+
+    def process_json_file(self, file_name):
+        input_file_name, output_prefix = file_name
+        print("Opening", input_file_name)
+        fin = open(input_file_name, 'r', encoding='utf-8')
+
+        startup_start = time.time()
+        encoder = Encoder(self.args)
+        tokenizer = AutoTokenizer.from_pretrained(self.args.pretrained_model_name_or_path)
+        pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
+        encoded_docs = pool.imap(encoder.encode, fin, 32)
+
+        level = "document"
+        
+        output_bin_files = {}
+        output_idx_files = {}
+        builders = {}
+
+        for key in self.args.json_keys:
+            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix,
+                                                          key, level)
+            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix,
+                                                          key, level)
+            builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
+                output_bin_files[key],
+                dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
+            )
+
+        startup_end = time.time()
+        proc_start = time.time()
+        total_bytes_processed = 0
+        print("Time to startup:", startup_end - startup_start)
+        for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1):
+            total_bytes_processed += bytes_processed
+            for key in doc.keys():
+                builders[key].add_document(doc[key], sentence_lens[key])
+            self.print_processing_stats(i, proc_start, total_bytes_processed)
+
+        fin.close()
+        builders[key].finalize(output_idx_files[key])
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    group = parser.add_argument_group(title='input data')
+    group.add_argument('--input', type=str, required=True,
+                       help='Path to input JSON')
+    group.add_argument('--json-keys', nargs='+', default=['text'],
+                       help='space separate listed of keys to extract from json')
+
+    group = parser.add_argument_group(title='tokenizer')
+    group.add_argument('--pretrained-model-name-or-path', type=str, required=True,
+                       help='A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.')
+    group.add_argument('--append-eod', action='store_true',
+                       help='Append an <eod> token to the end of a document.')
+    
+    group = parser.add_argument_group(title='output data')
+    group.add_argument('--output-prefix', type=str, required=True,
+                       help='Path to binary output file without suffix')
+
+    group = parser.add_argument_group(title='runtime')
+    group.add_argument('--workers', type=int, required=True,
+                       help=('Number of worker processes to launch. '
+                             'A good default for fast pre-processing '
+                             'is: (workers * partitions) = available CPU cores.'))
+    group.add_argument('--partitions', type=int, default=1,
+                        help='Number of file partitions')
+    group.add_argument('--log-interval', type=int, default=1000,
+                       help='Interval between progress updates')
+    
+    args = parser.parse_args()
+
+    return args
+
+
+def get_file_name(args, file_id):
+    file_name, extension = os.path.splitext(args.input)
+    input_file_name = file_name + "_" + str(file_id) + extension
+    sentence_split_file = file_name + "_ss_" + str(file_id) + extension
+    output_prefix = args.output_prefix + "_" + str(file_id)
+    file_names = {
+        'partition': input_file_name,
+        'sentence_split': sentence_split_file,
+        'output_prefix': output_prefix}
+    return file_names
+
+
+def check_files_exist(in_ss_out_names, key, num_partitions):
+    for i in range(num_partitions):
+        if not os.path.exists(in_ss_out_names[i][key]):
+            return False
+    return True
+
+
+def main(args):
+
+    in_ss_out_names = []
+    if args.partitions == 1:
+        file_name, extension = os.path.splitext(args.input)
+        sentence_split_file = file_name + "_ss" + extension
+        file_names = {
+            'partition': args.input,
+            'sentence_split': sentence_split_file,
+            'output_prefix': args.output_prefix}
+        in_ss_out_names.append(file_names)
+    else:
+        in_file_names = glob.glob(args.input)
+
+        # Create .jsonl partition files
+        for idx in range(args.partitions):
+            in_ss_out_name = get_file_name(args, idx)
+            in_ss_out_names.append(in_ss_out_name)
+
+        # Check to see if partitions were already created
+        partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions)
+
+        if not partitions_present:
+            # Populate .jsonl partition files from parent files
+            partitioned_input_files = []
+            for idx in range(args.partitions):
+                partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w')
+                partitioned_input_files.append(partitioned_input_file)
+
+            index = 0
+            for in_file_name in in_file_names:
+                fin = open(in_file_name, 'r', encoding='utf-8')
+
+                for line in fin:
+                    partitioned_input_files[index].write(line)
+                    index = (index + 1)%args.partitions
+                fin.close()
+
+            for idx in range(args.partitions):
+                partitioned_input_files[idx].close()
+
+    assert args.workers % args.partitions == 0
+    partition = Partition(args, args.workers//args.partitions)
+    
+    # Encode partition files in parallel
+    processes = []
+    input_key = 'partition'
+    for name in in_ss_out_names:
+        p = multiprocessing.Process(target=partition.process_json_file,
+                                    args=((name[input_key], name['output_prefix']),))
+        p.start()
+        processes.append(p)
+
+    for p in processes:
+        p.join()
+
+    if args.partitions == 1:
+        return
+
+    # Merge bin/idx partitions
+    level = "document"
+
+    output_bin_files = {}
+    output_idx_files = {}
+    builders = {}
+
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
+
+    for key in args.json_keys:
+        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
+                                                      key, level)
+        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
+                                                      key, level)
+        builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
+            output_bin_files[key],
+            dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
+        )
+
+        for name in in_ss_out_names:
+            parition_output_prefix = name['output_prefix']
+            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix,
+                                                             key, level)
+            builders[key].add_index(full_partition_output_prefix)
+        builders[key].finalize(output_idx_files[key])
+
+
+if __name__ == '__main__':
+    _args = get_args()
+    main(_args)
\ No newline at end of file

From dd4ad58c354e87f39748692c3329206fb3865d58 Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 00:43:17 +0100
Subject: [PATCH 25/73] Apply pre-commit hook fixes

---
 src/nanotron/logging.py      |   4 +-
 tools/hf_datasets_to_json.py |  24 +++++--
 tools/preprocess_data.py     | 133 +++++++++++++++++------------------
 3 files changed, 85 insertions(+), 76 deletions(-)

diff --git a/src/nanotron/logging.py b/src/nanotron/logging.py
index ecd01dc0..17949830 100644
--- a/src/nanotron/logging.py
+++ b/src/nanotron/logging.py
@@ -225,8 +225,8 @@ def log_rank(
         # rank is None means everyone logs
         if rank is None or dist.get_rank(group) == rank:
             logger.log(level, msg, **kwargs)
-            
-    except RuntimeError:
+
+    except:
         logger.log(level, msg, **kwargs)
 
 
diff --git a/tools/hf_datasets_to_json.py b/tools/hf_datasets_to_json.py
index b32e540a..e8ac70b9 100644
--- a/tools/hf_datasets_to_json.py
+++ b/tools/hf_datasets_to_json.py
@@ -1,6 +1,8 @@
 import argparse
+
 from datasets import concatenate_datasets, load_dataset
 
+
 def main(args):
     # Load all the Dataset
     if args.split is None:
@@ -8,27 +10,35 @@ def main(args):
         ds = concatenate_datasets([ds[splits] for splits in ds.keys()])
     # Load a split of the Dataset
     else:
-        ds = load_dataset(args.dataset_path, split = args.split, num_proc=args.num_workers)
+        ds = load_dataset(args.dataset_path, split=args.split, num_proc=args.num_workers)
 
     ds = ds.select_columns(args.column_name)
-    
+
     if args.column_name not in ["text"]:
         print(f"Renaming {ds.column_names[0]} to 'text'")
         ds = ds.rename_column(ds.column_names[0], "text")
-    
+
     # Store dataset to json file
     ds.to_json(path_or_buf=args.output_json, num_proc=args.num_workers)
 
+
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset-path", type=str, required=True, help="Path to local stored dataset or repository on the HF hub")
+    parser.add_argument(
+        "--dataset-path", type=str, required=True, help="Path to local stored dataset or repository on the HF hub"
+    )
     parser.add_argument("--split", type=str, help="Which split of the data to process")
-    parser.add_argument("--column-name", type=str, required=True, help="Name of the column containing the data to process")
+    parser.add_argument(
+        "--column-name", type=str, required=True, help="Name of the column containing the data to process"
+    )
     parser.add_argument("--output-json", type=str, required=True, help="Path to the json output file")
-    parser.add_argument("--num-workers", type=int, default=8, help="Number of processes to load the dataset and store the json file")
+    parser.add_argument(
+        "--num-workers", type=int, default=8, help="Number of processes to load the dataset and store the json file"
+    )
+
     return parser.parse_args()
 
 
 if __name__ == "__main__":
     _args = get_args()
-    main(_args)
\ No newline at end of file
+    main(_args)
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 656c5a22..1e9d7a5a 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -1,16 +1,17 @@
 import argparse
+import glob
 import json
+import multiprocessing
 import os
 import sys
 import time
-import glob
-import multiprocessing
 
-# TODO tj-solergibert: Keep this hack or not? Hack to be able to process data without installing nanotron
-sys.path.append(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__),os.path.pardir)), "src"))
+# TODO tj-solergibert: Keep this hack or not? Hack to be able to process data without installing nanotron.
+sys.path.append(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)), "src"))
 
-from transformers import AutoTokenizer
 from nanotron.data import indexed_dataset
+from transformers import AutoTokenizer
+
 
 class Encoder(object):
     def __init__(self, args):
@@ -57,15 +58,13 @@ def print_processing_stats(self, count, proc_start, total_bytes_processed):
         if count % self.args.log_interval == 0:
             current = time.time()
             elapsed = current - proc_start
-            mbs = total_bytes_processed/elapsed/1024/1024
-            print(f"Processed {count} documents",
-                  f"({count/elapsed} docs/s, {mbs} MB/s).",
-                  file=sys.stderr)
+            mbs = total_bytes_processed / elapsed / 1024 / 1024
+            print(f"Processed {count} documents", f"({count/elapsed} docs/s, {mbs} MB/s).", file=sys.stderr)
 
     def process_json_file(self, file_name):
         input_file_name, output_prefix = file_name
         print("Opening", input_file_name)
-        fin = open(input_file_name, 'r', encoding='utf-8')
+        fin = open(input_file_name, "r", encoding="utf-8")
 
         startup_start = time.time()
         encoder = Encoder(self.args)
@@ -74,16 +73,14 @@ def process_json_file(self, file_name):
         encoded_docs = pool.imap(encoder.encode, fin, 32)
 
         level = "document"
-        
+
         output_bin_files = {}
         output_idx_files = {}
         builders = {}
 
         for key in self.args.json_keys:
-            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix,
-                                                          key, level)
-            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix,
-                                                          key, level)
+            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, key, level)
+            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, key, level)
             builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
                 output_bin_files[key],
                 dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
@@ -105,32 +102,38 @@ def process_json_file(self, file_name):
 
 def get_args():
     parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON')
-    group.add_argument('--json-keys', nargs='+', default=['text'],
-                       help='space separate listed of keys to extract from json')
-
-    group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--pretrained-model-name-or-path', type=str, required=True,
-                       help='A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.')
-    group.add_argument('--append-eod', action='store_true',
-                       help='Append an <eod> token to the end of a document.')
-    
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-
-    group = parser.add_argument_group(title='runtime')
-    group.add_argument('--workers', type=int, required=True,
-                       help=('Number of worker processes to launch.'
-                             'A good default for fast pre-processing '
-                             'is: (workers * partitions) = available CPU cores.'))
-    group.add_argument('--partitions', type=int, default=1,
-                        help='Number of file partitions')
-    group.add_argument('--log-interval', type=int, default=1000,
-                       help='Interval between progress updates')
-    
+    group = parser.add_argument_group(title="input data")
+    group.add_argument("--input", type=str, required=True, help="Path to input JSON")
+    group.add_argument(
+        "--json-keys", nargs="+", default=["text"], help="space separate listed of keys to extract from json"
+    )
+
+    group = parser.add_argument_group(title="tokenizer")
+    group.add_argument(
+        "--pretrained-model-name-or-path",
+        type=str,
+        required=True,
+        help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.",
+    )
+    group.add_argument("--append-eod", action="store_true", help="Append an  token to the end of a document.")
+
+    group = parser.add_argument_group(title="output data")
+    group.add_argument("--output-prefix", type=str, required=True, help="Path to binary output file without suffix")
+
+    group = parser.add_argument_group(title="runtime")
+    group.add_argument(
+        "--workers",
+        type=int,
+        required=True,
+        help=(
+            "Number of worker processes to launch."
+            "A good default for fast pre-processing "
+            "is: (workers * partitions) = available CPU cores."
+        ),
+    )
+    group.add_argument("--partitions", type=int, default=1, help="Number of file partitions")
+    group.add_argument("--log-interval", type=int, default=1000, help="Interval between progress updates")
+
     args = parser.parse_args()
 
     return args
@@ -141,10 +144,7 @@ def get_file_name(args, file_id):
     input_file_name = file_name + "_" + str(file_id) + extension
     sentence_split_file = file_name + "_ss_" + str(file_id) + extension
     output_prefix = args.output_prefix + "_" + str(file_id)
-    file_names = {
-        'partition': input_file_name,
-        'sentence_split': sentence_split_file,
-        'output_prefix': output_prefix}
+    file_names = {"partition": input_file_name, "sentence_split": sentence_split_file, "output_prefix": output_prefix}
     return file_names
 
 
@@ -162,49 +162,51 @@ def main(args):
         file_name, extension = os.path.splitext(args.input)
         sentence_split_file = file_name + "_ss" + extension
         file_names = {
-            'partition': args.input,
-            'sentence_split': sentence_split_file,
-            'output_prefix': args.output_prefix}
+            "partition": args.input,
+            "sentence_split": sentence_split_file,
+            "output_prefix": args.output_prefix,
+        }
         in_ss_out_names.append(file_names)
     else:
         in_file_names = glob.glob(args.input)
 
-        # Create .jsonl parition files
+        # Create .jsonl partition files
         for idx in range(args.partitions):
             in_ss_out_name = get_file_name(args, idx)
             in_ss_out_names.append(in_ss_out_name)
 
         # Check to see if partitions were already created
-        partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions)
+        partitions_present = check_files_exist(in_ss_out_names, "partition", args.partitions)
 
         if not partitions_present:
             # Populate .jsonl partition files from parent files
             partitioned_input_files = []
             for idx in range(args.partitions):
-                partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w')
+                partitioned_input_file = open(in_ss_out_names[idx]["partition"], "w")
                 partitioned_input_files.append(partitioned_input_file)
 
             index = 0
             for in_file_name in in_file_names:
-                fin = open(in_file_name, 'r', encoding='utf-8')
+                fin = open(in_file_name, "r", encoding="utf-8")
 
                 for line in fin:
                     partitioned_input_files[index].write(line)
-                    index = (index + 1)%args.partitions
+                    index = (index + 1) % args.partitions
                 fin.close()
 
             for idx in range(args.partitions):
                 partitioned_input_files[idx].close()
 
     assert args.workers % args.partitions == 0
-    partition = Partition(args, args.workers//args.partitions)
-    
+    partition = Partition(args, args.workers // args.partitions)
+
     # Encode partition files in parallel
     processes = []
-    input_key = 'partition'
+    input_key = "partition"
     for name in in_ss_out_names:
-        p = multiprocessing.Process(target=partition.process_json_file,
-                                    args=((name[input_key], name['output_prefix']),))
+        p = multiprocessing.Process(
+            target=partition.process_json_file, args=((name[input_key], name["output_prefix"]),)
+        )
         p.start()
         processes.append(p)
 
@@ -224,23 +226,20 @@ def main(args):
     tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
 
     for key in args.json_keys:
-        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
-                                                      key, level)
-        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
-                                                      key, level)
+        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level)
+        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level)
         builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
             output_bin_files[key],
             dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
         )
 
         for name in in_ss_out_names:
-            parition_output_prefix = name['output_prefix']
-            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix,
-                                                             key, level)
+            partition_output_prefix = name["output_prefix"]
+            full_partition_output_prefix = "{}_{}_{}".format(partition_output_prefix, key, level)
             builders[key].add_index(full_partition_output_prefix)
         builders[key].finalize(output_idx_files[key])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     _args = get_args()
-    main(_args)
\ No newline at end of file
+    main(_args)

From db52f3e409416245c8cf1070387ec86cbf16bb35 Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 00:46:34 +0100
Subject: [PATCH 26/73] Deleted multimodal references & cleaned
 indexed_dataset.py

---
 src/nanotron/data/dataset_builder.py |   2 +-
 src/nanotron/data/indexed_dataset.py | 109 +++++++--------------------
 src/nanotron/data/nanoset.py         |  40 +---------
 3 files changed, 30 insertions(+), 121 deletions(-)
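
With the multimodal code paths gone, the indexed-dataset API is text-only. A minimal
round-trip sketch, assuming the builder and reader signatures shown in this diff (the
path prefix, token ids and vocabulary size are placeholders):

    import torch
    from nanotron.data import indexed_dataset

    prefix = "toy_text_document"  # hypothetical path prefix, no extension
    builder = indexed_dataset.MMapIndexedDatasetBuilder(
        prefix + ".bin",
        dtype=indexed_dataset.DType.optimal_dtype(32_000),  # pick an int dtype that fits the vocab
    )
    for tokens in ([1, 2, 3, 4], [5, 6, 7]):  # two toy "documents"
        builder.add_item(torch.tensor(tokens))  # no `mode` argument any more
        builder.end_document()
    builder.finalize(prefix + ".idx")

    dataset = indexed_dataset.MMapIndexedDataset(prefix)  # no `multimodal` flag any more
    print(len(dataset), dataset[0])  # 2 sequences; dataset[0] is a numpy array of token ids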

diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py
index 3e8fd985..3f75ddbc 100644
--- a/src/nanotron/data/dataset_builder.py
+++ b/src/nanotron/data/dataset_builder.py
@@ -87,7 +87,7 @@ def _build_nanoset_dataset_splits(
             List[Nanoset]: The Nanoset per split. Always returns Nanosets because we build them in each and every rank
         """
 
-        indexed_dataset = self._build_generic_dataset(MMapIndexedDataset, path_prefix, False)
+        indexed_dataset = self._build_generic_dataset(MMapIndexedDataset, path_prefix)
 
         split_idx_bounds = _get_split_indices(split, indexed_dataset.sequence_lengths.shape[0])
 
diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py
index 10845c6d..566f41fc 100644
--- a/src/nanotron/data/indexed_dataset.py
+++ b/src/nanotron/data/indexed_dataset.py
@@ -25,8 +25,6 @@
 
 _INDEX_HEADER = b"MMIDIDX\x00\x00"
 
-# TODO @tj-solergibert: Refractor. Delete multimodal references.
-
 
 class DType(Enum):
     """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices"""
@@ -151,7 +149,6 @@ def __exit__(
     def write(
         self,
         sequence_lengths: List[int],
-        sequence_modes: Optional[List[int]],
         document_indices: List[int],
     ) -> None:
         """Write the index (.idx) file
@@ -159,8 +156,6 @@ def write(
         Args:
             sequence_lengths (List[int]): The length of each sequence
 
-            sequence_modes (Optional[List[int]]): The mode of each sequences
-
             document_indices (List[int]): The sequence indices demarcating the end of each document
         """
         sequence_pointers = self._sequence_pointers(sequence_lengths)
@@ -187,12 +182,6 @@ def write(
         document_indices = numpy.array(document_indices, dtype=numpy.int64)
         self.idx_writer.write(document_indices.tobytes(order="C"))
 
-        # the mode per sequence
-        if sequence_modes is not None:
-            sequence_modes = numpy.array(sequence_modes, dtype=numpy.int8)
-            self.idx_writer.write(sequence_modes.tobytes(order="C"))
-            del sequence_modes
-
     def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
         """Build the sequence pointers per the sequence lengths and dtype size
 
@@ -217,10 +206,9 @@ class _IndexReader(object):
     Args:
         idx_path (str): The path to the index file
 
-        multimodal (bool): Whether the dataset is multimodal
     """
 
-    def __init__(self, idx_path: str, multimodal: bool) -> None:
+    def __init__(self, idx_path: str) -> None:
 
         log_rank(f"Load the {type(self).__name__} from {idx_path}", logger=logger, level=logging.INFO, rank=0)
 
@@ -273,22 +261,6 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        self.sequence_modes = None
-        if multimodal:
-            log_rank("\tExtract the sequence modes", logger=logger, level=logging.INFO, rank=0)
-            t_beg = time.time()
-            self.sequence_modes = numpy.frombuffer(
-                self.bin_buffer,
-                dtype=numpy.int8,
-                count=self.sequence_count,
-                offset=offset
-                + self.sequence_lengths.nbytes
-                + self.sequence_pointers.nbytes
-                + self.document_indices.nbytes,
-            )
-            t_end = time.time()
-            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
-
         assert self.sequence_lengths.shape[0] == len(self)
         assert self.sequence_lengths.shape[0] == self.sequence_count
         assert self.sequence_lengths.shape[0] == self.document_indices[-1]
@@ -328,7 +300,6 @@ def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[nump
         return (
             self.sequence_pointers[idx],
             self.sequence_lengths[idx],
-            self.sequence_modes[idx] if self.sequence_modes is not None else None,
         )
 
 
@@ -338,21 +309,19 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
     Args:
         path_prefix (str): The index (.idx) and data (.bin) prefix
 
-        multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False.
     """
 
-    def __init__(self, path_prefix: str, multimodal: bool = False) -> None:
+    def __init__(self, path_prefix: str) -> None:
         super().__init__()
         self.path_prefix = None
-        self.multimodal = None
 
         self.index = None
         self.bin_buffer = None
         self.bin_buffer_mmap = None
 
-        self.initialize(path_prefix, multimodal)
+        self.initialize(path_prefix)
 
-    def initialize(self, path_prefix: str, multimodal: bool) -> None:
+    def initialize(self, path_prefix: str) -> None:
         """Initialize the dataset
 
         This method is called by MMapIndexedDataset.__init__ during object creation and by
@@ -361,30 +330,29 @@ def initialize(self, path_prefix: str, multimodal: bool) -> None:
         Args:
             path_prefix (str): The index (.idx) and data (.bin) prefix
 
-            multimodal (bool): Whether the dataset is multimodal
         """
         self.path_prefix = path_prefix
-        self.multimodal = multimodal
-        self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal)
+
+        self.index = _IndexReader(get_idx_path(self.path_prefix))
         self.bin_buffer_mmap = numpy.memmap(get_bin_path(self.path_prefix), mode="r", order="C")
         self.bin_buffer = memoryview(self.bin_buffer_mmap)
 
-    def __getstate__(self) -> Tuple[str, bool]:
+    def __getstate__(self) -> str:
         """Get the state during pickling
 
         Returns:
-            Tuple[str, bool]: The state tuple
+            str: The path prefix
         """
-        return self.path_prefix, self.multimodal
+        return self.path_prefix
 
-    def __setstate__(self, state: Tuple[str, bool]) -> None:
+    def __setstate__(self, path_prefix: str) -> None:
         """Set the state during un-pickling
 
         Args:
-            state (Tuple[str, bool]): The state tuple
+            path_prefix (str): The path prefix
+
         """
-        path_prefix, multimodal = state
-        self.initialize(path_prefix, multimodal)
+        self.initialize(path_prefix)
 
     def __del__(self) -> None:
         """Clean up the object"""
@@ -401,9 +369,7 @@ def __len__(self) -> int:
         """
         return len(self.index)
 
-    def __getitem__(
-        self, idx: Union[int, numpy.integer, slice]
-    ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]:
+    def __getitem__(self, idx: Union[int, numpy.integer, slice]) -> numpy.ndarray:
         """Return from the dataset
 
         Args:
@@ -415,25 +381,24 @@ def __getitem__(
             TypeError: When the index is of an unexpected type
 
         Returns:
-            Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and
-            modes at the index or index slice
+            numpy.ndarray: The sequence tokens at the index or index slice
         """
-        # TODO: Check this if else
+
         if isinstance(idx, (int, numpy.integer)):
-            sequence_pointer, sequence_length, sequence_mode = self.index[idx]
+            sequence_pointer, sequence_length = self.index[idx]
             sequence = numpy.frombuffer(
                 self.bin_buffer,
                 dtype=self.index.dtype,
                 count=sequence_length,
                 offset=sequence_pointer,
             )
-            return (sequence, sequence_mode) if sequence_mode is not None else sequence
+            return sequence
+
         elif isinstance(idx, slice):
             start, stop, step = idx.indices(len(self))
             if step != 1:
                 raise ValueError("Slices into indexed_dataset must be contiguous")
             sequence_lengths = self.index.sequence_lengths[idx]
-            sequence_modes = self.index.sequence_modes[idx] if self.multimodal else None
             sequence_offsets = list(accumulate(sequence_lengths))
             sequences = numpy.split(
                 numpy.frombuffer(
@@ -444,7 +409,7 @@ def __getitem__(
                 ),
                 sequence_offsets[:-1],
             )
-            return (sequences, sequence_modes) if sequence_modes is not None else sequences
+            return sequences
         else:
             raise TypeError("Unexpected type received for idx: {}".format(type(idx)))
 
@@ -454,12 +419,13 @@ def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.
 
         get(idx) is the same as [idx] but get() does not support slicing.
         """
-        sequence_pointer, sequence_length, sequence_mode = self.index[idx]
+
+        sequence_pointer, sequence_length = self.index[idx]
         if length is None:
             length = sequence_length - offset
         sequence_pointer += offset * DType.size(self.index.dtype)
         sequence = numpy.frombuffer(self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer)
-        return (sequence, sequence_mode) if sequence_mode is not None else sequence
+        return sequence
 
     @property
     def sequence_lengths(self) -> numpy.ndarray:
@@ -499,15 +465,6 @@ def set_document_indices(self, document_indices: numpy.ndarray) -> None:
         """
         self.index.document_indices = document_indices
 
-    @property
-    def sequence_modes(self) -> numpy.ndarray:
-        """Get the sequence modes
-
-        Returns:
-            numpy.ndarray: The sequence modes
-        """
-        return self.index.sequence_modes
-
     @staticmethod
     def exists(path_prefix: str) -> bool:
         """Return whether the MMapIndexedDataset exists on disk at the prefix
@@ -528,48 +485,37 @@ class MMapIndexedDatasetBuilder(object):
         bin_path (str): The path to the data (.bin) file
 
         dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32.
-
-        multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False.
     """
 
-    def __init__(self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False) -> None:
+    def __init__(self, bin_path: str, dtype: Type[numpy.number] = numpy.int32) -> None:
         self.data_file = open(bin_path, "wb")
         self.dtype = dtype
-        self.multimodal = multimodal
 
         self.sequence_lengths = []
         self.document_indices = [0]
-        self.sequence_modes = [] if self.multimodal else None
 
-    def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
+    def add_item(self, tensor: torch.Tensor) -> None:
         """Add a single item to the dataset
 
         Args:
             tensor (torch.Tensor): The item to add to the data file
 
-            mode (int, optional): The mode for the item. Defaults to 0.
         """
         np_array = numpy.array(tensor.numpy(), dtype=self.dtype)
         self.data_file.write(np_array.tobytes(order="C"))
         self.sequence_lengths.append(np_array.size)
-        if self.multimodal:
-            self.sequence_modes.append(mode)
 
-    def add_document(self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None) -> None:
+    def add_document(self, tensor: torch.Tensor, lengths: List[int]) -> None:
         """Add an entire document to the dataset
 
         Args:
             tensor (torch.Tensor): The document to add
             lengths (List[int]): The lengths of each item in the document
-            modes (Optional[List[int]], optional): The modes for each item in the document.
-            Defaults to None.
         """
         np_array = numpy.array(tensor, dtype=self.dtype)
         self.data_file.write(np_array.tobytes(order="C"))
         self.sequence_lengths.extend(lengths)
         self.document_indices.append(len(self.sequence_lengths))
-        if self.multimodal:
-            self.sequence_modes.extend(modes if modes is not None else [0] * lengths)
 
     def end_document(self) -> None:
         """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item"""
@@ -582,16 +528,13 @@ def add_index(self, path_prefix: str) -> None:
             path_prefix (str): The index (.idx) and data (.bin) prefix
         """
         # Concatenate index
-        index = _IndexReader(get_idx_path(path_prefix), multimodal=self.multimodal)
+        index = _IndexReader(get_idx_path(path_prefix))
         assert index.dtype == self.dtype
 
         offset = len(self.sequence_lengths)
         self.sequence_lengths.extend(index.sequence_lengths)
         self.document_indices.extend((offset + index.document_indices)[1:])
 
-        if self.multimodal:
-            self.sequence_modes.extend(index.sequence_modes)
-
         # Concatenate data
         with open(get_bin_path(path_prefix), "rb") as f:
             shutil.copyfileobj(f, self.data_file)
@@ -604,7 +547,7 @@ def finalize(self, idx_path: str) -> None:
         """
         self.data_file.close()
         with _IndexWriter(idx_path, self.dtype) as writer:
-            writer.write(self.sequence_lengths, self.sequence_modes, self.document_indices)
+            writer.write(self.sequence_lengths, self.document_indices)
 
 
 def get_idx_path(path_prefix: str) -> str:
diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py
index ffdd322e..9c4e747f 100644
--- a/src/nanotron/data/nanoset.py
+++ b/src/nanotron/data/nanoset.py
@@ -18,9 +18,9 @@
 
 
 class Nanoset(torch.utils.data.Dataset):
-    """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py"""
+    """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py
 
-    """The base Nanoset dataset
+    The base Nanoset dataset
 
     Args:
         indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the
@@ -46,8 +46,6 @@ def __init__(
 
         assert indexed_indices.size > 0
         assert num_samples > 0
-        assert self.is_multimodal() == indexed_dataset.multimodal
-        assert self.is_split_by_sequence() != self.is_split_by_document()
 
         self.indexed_dataset = indexed_dataset
         self.indexed_indices = indexed_indices
@@ -103,39 +101,6 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
 
         return {"text": text}
 
-    # TODO @tj-solergibert: Delete this funcs if not used?
-    @staticmethod
-    def is_multimodal() -> bool:
-        """Abstract method implementation
-
-        Returns:
-            bool: False
-        """
-        return False
-
-    # TODO @tj-solergibert: Delete this funcs if not used?
-    @staticmethod
-    def is_split_by_sequence() -> bool:
-        """Abstract method implementation
-
-        Returns:
-            bool: True
-        """
-        return True
-
-    # TODO @tj-solergibert: Delete this funcs if not used?
-    @classmethod
-    def is_split_by_document(cls) -> bool:
-        """Return whether the dataset is split by document
-
-        For example, the BERT train/valid/test split is document aware
-
-        Returns:
-            bool: The negation of cls.is_split_by_sequence
-        """
-        return not cls.is_split_by_sequence()
-
-    # TODO @tj-solergibert: Delete this funcs if not used?
     @staticmethod
     def _key_config_attributes() -> List[str]:
         """Return all config attributes which contribute to uniquely identifying the dataset.
@@ -226,6 +191,7 @@ def _build_document_sample_shuffle_indices(
 
         def get_path_to(suffix):
             return os.path.join(path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}")
+
         path_to_description = get_path_to("description.txt")
         path_to_document_index = get_path_to("document_index.npy")
         path_to_sample_index = get_path_to("sample_index.npy")

From 29d2ee7da2333551db18c0436b4b98437efb0975 Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 15:57:30 +0000
Subject: [PATCH 27/73] Added BlendedNanoset

---
 src/nanotron/config/config.py        |   5 +-
 src/nanotron/data/blended_nanoset.py | 174 +++++++++++++++++++++++++++
 src/nanotron/data/dataset_builder.py |  67 +++++++++--
 src/nanotron/data/nanoset_configs.py |   9 +-
 src/nanotron/data/utils.py           |   4 +-
 5 files changed, 241 insertions(+), 18 deletions(-)
 create mode 100644 src/nanotron/data/blended_nanoset.py
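
With this patch, `data_path` accepts either a single path prefix (a plain Nanoset per
split) or a mapping from prefixes to blending weights (a BlendedNanoset per split).
A hedged sketch of the two forms; the prefixes and weights are placeholders and the
surrounding config wiring is not shown in this diff:

    # Single dataset: NanosetBuilder returns one Nanoset per split.
    data_path = "path/to/dataset_prefix"

    # Blended datasets: NanosetBuilder returns one BlendedNanoset per split, which
    # draws samples from each underlying Nanoset in proportion to the normalized
    # weights (0.3 / 0.7 here), with the 0.5% oversampling margin applied per dataset.
    data_path = {
        "path/to/dataset_1_prefix": 0.3,
        "path/to/dataset_2_prefix": 0.7,
    }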

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 991f9c9d..6fe8783c 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -97,7 +97,7 @@ def __post_init__(self):
 
 @dataclass
 class NanosetDatasetsArgs:
-    data_path: str
+    data_path: Union[str, dict]
     split: Optional[str] = None
 
     def __post_init__(self):
@@ -426,3 +426,6 @@ def get_config_from_file(
             )
         config.model.model_config = model_config_class(**config.model.model_config)
     return config
+
+
+# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py
new file mode 100644
index 00000000..0cffab5f
--- /dev/null
+++ b/src/nanotron/data/blended_nanoset.py
@@ -0,0 +1,174 @@
+import hashlib
+import json
+import os
+import time
+from collections import OrderedDict
+from typing import Dict, List, Tuple, Union
+
+import numpy
+import torch
+
+from nanotron.data.nanoset_configs import NanosetConfig
+from nanotron.data.nanoset import Nanoset
+from nanotron.data.utils import normalize
+from nanotron import logging
+from nanotron.logging import log_rank
+
+logger = logging.get_logger(__name__)
+
+_VERBOSE = False
+
+
+class BlendedNanoset(torch.utils.data.Dataset):
+    """Conjugating class for a set of Nanoset instances
+
+    Args:
+        datasets (List[Nanoset]): The Nanoset instances to blend
+
+        weights (List[float]): The weights which determine the dataset blend ratios
+
+        size (int): The number of samples to draw from the blend
+
+        config (NanosetConfig): The config object which informs dataset creation
+
+    Raises:
+        RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization
+    """
+
+    def __init__(
+        self,
+        datasets: List[Nanoset],
+        weights: List[float],
+        size: int,
+        config: NanosetConfig,
+    ) -> None:
+        assert len(datasets) == len(weights)
+        assert numpy.isclose(sum(weights), 1.0)
+        assert all(map(lambda _: type(_) == type(datasets[0]), datasets))
+
+        # Alert user to unnecessary blending
+        if len(datasets) == 1:
+            log_rank(f"Building a BlendedNanoset for a single Nanoset", logger=logger, level=logging.WARNING, rank=0)
+
+        # Redundant normalization for bitwise identical comparison with Megatron-LM
+        weights = normalize(weights)
+
+        self.datasets = datasets
+        self.weights = weights
+        self.size = size
+        self.config = config
+
+        unique_identifiers = OrderedDict()
+        unique_identifiers["class"] = type(self).__name__
+        unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets]
+        unique_identifiers["weights"] = self.weights
+        unique_identifiers["size"] = self.size
+
+        self.unique_description = json.dumps(unique_identifiers, indent=4)
+        self.unique_description_hash = hashlib.md5(
+            self.unique_description.encode("utf-8")
+        ).hexdigest()
+
+        self.dataset_index, self.dataset_sample_index = self._build_indices()
+
+        # Check size
+        _ = self[self.size - 1]
+        try:
+            _ = self[self.size]
+            raise RuntimeError(f"{type(self).__name__} size is improperly bounded")
+        except IndexError:
+            log_rank(f"> {type(self).__name__} length: {len(self)}", logger=logger, level=logging.INFO, rank=0)
+
+    def __len__(self) -> int:
+        return self.size
+
+    def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
+        dataset_id = self.dataset_index[idx]
+        dataset_sample_id = self.dataset_sample_index[idx]
+        
+        return self.datasets[dataset_id][dataset_sample_id]
+        
+
+    def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
+        """Build and optionally cache the dataset index and the dataset sample index
+
+        The dataset index is a 1-D mapping which determines the dataset to query. The dataset
+        sample index is a 1-D mapping which determines the sample to request from the queried
+        dataset.
+
+        Returns:
+            Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
+        """
+        path_to_cache = getattr(self.config, "path_to_cache")
+
+        if path_to_cache:
+            get_path_to = lambda suffix: os.path.join(
+                path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}"
+            )
+            path_to_description = get_path_to("description.txt")
+            path_to_dataset_index = get_path_to("dataset_index.npy")
+            path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
+            cache_hit = all(
+                map(
+                    os.path.isfile,
+                    [path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
+                )
+            )
+        else:
+            cache_hit = False
+
+        if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0):
+            log_rank(f"Build and save the {type(self).__name__} indices", logger=logger, level=logging.INFO, rank=0)
+
+            # Build the dataset and dataset sample indexes
+            log_rank(f"\tBuild and save the dataset and dataset sample indexes", logger=logger, level=logging.INFO, rank=0)
+
+            t_beg = time.time()
+            from nanotron.data import helpers
+
+            dataset_index = numpy.zeros(self.size, dtype=numpy.int16)
+            dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64)
+            helpers.build_blending_indices(
+                dataset_index,
+                dataset_sample_index,
+                self.weights,
+                len(self.datasets),
+                self.size,
+                _VERBOSE,
+            )
+
+            if path_to_cache:
+                os.makedirs(path_to_cache, exist_ok=True)
+                # Write the description
+                with open(path_to_description, "wt") as writer:
+                    writer.write(self.unique_description)
+                # Save the indexes
+                numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True)
+                numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True)
+            else:
+                log_rank("Unable to save the indexes because path_to_cache is None", logger=logger, level=logging.WARNING, rank=0)
+
+            t_end = time.time()
+            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+
+            return dataset_index, dataset_sample_index
+
+        log_rank(f"Load the {type(self).__name__} indices", logger=logger, level=logging.INFO, rank=0)
+
+        log_rank(f"\tLoad the dataset index from {path_to_dataset_index}", logger=logger, level=logging.INFO, rank=0)
+        t_beg = time.time()
+        dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
+        t_end = time.time()
+        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+
+        log_rank(f"\tLoad the dataset sample index from {path_to_dataset_sample_index}", logger=logger, level=logging.INFO, rank=0)
+        t_beg = time.time()
+        dataset_sample_index = numpy.load(
+            path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
+        )
+        t_end = time.time()
+        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+
+        return dataset_index, dataset_sample_index
+    
+# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py
index 3f75ddbc..a9485c7e 100644
--- a/src/nanotron/data/dataset_builder.py
+++ b/src/nanotron/data/dataset_builder.py
@@ -6,7 +6,9 @@
 
 from nanotron import logging
 from nanotron.data.indexed_dataset import MMapIndexedDataset
-from nanotron.data.nanoset import Nanoset, NanosetConfig
+from nanotron.data.nanoset import Nanoset
+from nanotron.data.blended_nanoset import BlendedNanoset
+from nanotron.data.nanoset_configs import NanosetConfig
 from nanotron.data.utils import Split, normalize
 
 logger = logging.get_logger(__name__)
@@ -28,7 +30,8 @@ def __init__(
     ):
         self.config = config
         self.sizes = config.split_num_samples
-        self.cls = Nanoset  # NOTE: keep it like that to support BlendedNanoset in the future
+        # TODO: Quit cls as we are only supporting Nanoset datasets. Blended are mmapindex + nanoset + blended
+        #self.cls = Nanoset  # NOTE: keep it like that to support BlendedNanoset in the future
 
     def build(self) -> List[Nanoset]:
         """Build all dataset splits according to the provided data_path(s)
@@ -61,12 +64,53 @@ def _build_blended_dataset_splits(
         data_path = getattr(self.config, "data_path")
         split = getattr(self.config, "split_vector")
 
-        # NOTE: For including blended datasets (BlendedNanoset)
-        # NOTE: Refer to Megatron to check how to work with multiple data_paths datasets. For now we don't support it
-        # NOTE: https://github.com/TJ-Solergibert/Megatron-debug/blob/c5eda947d9728d21b03d77b7db56cb71513d5636/megatron/core/datasets/blended_megatron_dataset_builder.py#L81
-        data_path = [data_path]
-        if len(data_path) == 1:
+        # Single Nanoset
+        if isinstance(data_path, str):
             return self._build_nanoset_dataset_splits(data_path[0], split, self.sizes)
+        
+        # Blended Nanoset
+        prefix_per_dataset = list(data_path.keys())              
+        weight_per_dataset = normalize(list(data_path.values())) 
+        
+        # NOTE: Use 0.5% target margin to ensure we satiate the network
+        sizes_per_dataset = [
+            [int(math.ceil(target_num_samples * weight * 1.005)) for target_num_samples in self.sizes]
+            for weight in weight_per_dataset
+        ]
+        
+        nanoset_datasets = [[] for _ in range(len(Split))]
+
+        for i in range(len(prefix_per_dataset)):
+            nanoset_datasets_split = self._build_nanoset_dataset_splits(
+                prefix_per_dataset[i], split, sizes_per_dataset[i]
+            )
+            for j in range(len(nanoset_datasets_split)):
+                    nanoset_datasets[j].append(nanoset_datasets_split[j])
+        
+        # Sum over all contributing datasets, per split
+        size_per_split = list(map(sum, zip(*sizes_per_dataset)))
+
+        blended_nanosets = []
+
+        for i in range(len(nanoset_datasets)):
+            is_none = map(lambda _: _ is None, nanoset_datasets[i])
+
+            if split[i] == 0.0:
+                assert all(is_none)
+                blended_nanosets.append(None)
+            else:
+                assert all(is_none) or not any(is_none)
+                blended_nanosets.append(
+                    self._build_generic_dataset(
+                        BlendedNanoset,
+                        nanoset_datasets[i],
+                        weight_per_dataset,
+                        size_per_split[i],
+                        self.config,
+                    )
+                )
+
+        return blended_nanosets
 
     def _build_nanoset_dataset_splits(
         self,
@@ -108,7 +152,7 @@ def _build_nanoset_dataset_splits(
             else:
                 nanoset_datasets.append(
                     self._build_generic_dataset(
-                        self.cls, indexed_dataset, split_indices[i], sizes[i], _split, self.config
+                        Nanoset, indexed_dataset, split_indices[i], sizes[i], _split, self.config
                     )
                 )
 
@@ -186,9 +230,7 @@ def _get_split_indices(split: List[float], num_elements: int) -> List[int]:
 
 
 # NOTE: Keep for BlendedNanoset
-def _get_prefixes_weights_and_sizes_for_blend(
-    data_path: List[str], target_num_samples_per_split: List[int]
-) -> Tuple[List[str], List[float], List[List[int]]]:
+def _get_prefixes_weights_and_sizes_for_blend(prefixes, weights, splits) -> Tuple[List[str], List[float], List[List[int]]]:
     """Determine the contribution of the Nanoset splits to the BlendedNanoset splits
 
     Args:
@@ -203,7 +245,6 @@ def _get_prefixes_weights_and_sizes_for_blend(
         ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], the normalized weights e.g.
         [0.3, 0.7], and the number of samples to request per Nanoset per split
     """
-    weights, prefixes = zip(*[(float(data_path[i]), data_path[i + 1].strip()) for i in range(0, len(data_path), 2)])
 
     weights = normalize(weights)
 
@@ -214,3 +255,5 @@ def _get_prefixes_weights_and_sizes_for_blend(
     ]
 
     return prefixes, weights, sizes_per_dataset
+
+# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py
index 31b10948..677fbd81 100644
--- a/src/nanotron/data/nanoset_configs.py
+++ b/src/nanotron/data/nanoset_configs.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from nanotron.data.utils import parse_and_normalize_split
 
@@ -34,13 +34,12 @@ class NanosetConfig:
 
     split_num_samples: list[int]
 
-    data_path: Optional[List[str]] = None
+    data_path: Union[str, dict]
 
-    split: Optional[str] = None
+    split: str
 
     split_vector: Optional[List[float]] = field(init=False, default=None)
 
-    # TODO We add the cache because it need its somewhere, take a look later
     path_to_cache: str = None
 
     def __post_init__(self):
@@ -48,3 +47,5 @@ def __post_init__(self):
         https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
         """
         self.split_vector = parse_and_normalize_split(self.split)
+
+# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py
index 5401e42c..ea7cc768 100644
--- a/src/nanotron/data/utils.py
+++ b/src/nanotron/data/utils.py
@@ -67,8 +67,8 @@ def parse_and_normalize_split(split: str) -> List[float]:
 def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_batch_size):
 
     train_samples = train_iters * global_batch_size
-    eval_iters = (train_iters // eval_interval + 1) * eval_iters
     test_iters = eval_iters
+    eval_iters = (train_iters // eval_interval + 1) * eval_iters
 
     datasets_num_samples = [train_samples, eval_iters * global_batch_size, test_iters * global_batch_size]
 
@@ -78,3 +78,5 @@ def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_
     log_rank("    Test:       {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0)
 
     return datasets_num_samples
+
+# PRECOMMIT
\ No newline at end of file

From dac7205ea9b08d110daaed6dcdb86232621ef9fa Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 19:24:09 +0100
Subject: [PATCH 28/73] Clean up the Nanoset builder, BlendedNanoset and dataset configs

---
 examples/nanoset_llama_7b.py            | 95 -------------------------
 run_train_nanoset.py                    |  1 +
 src/nanotron/config/config.py           |  4 +-
 src/nanotron/data/blended_nanoset.py    | 52 +++++++-------
 src/nanotron/data/dataloader_builder.py |  7 --
 src/nanotron/data/dataset_builder.py    | 73 ++++++-------------
 src/nanotron/data/nanoset_configs.py    |  6 +-
 src/nanotron/data/utils.py              |  2 -
 8 files changed, 52 insertions(+), 188 deletions(-)
 delete mode 100644 examples/nanoset_llama_7b.py
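
After this cleanup, the dataset section of a training config carries the data path,
the split and the new cache directory. A minimal sketch, assuming the
NanosetDatasetsArgs fields shown in this patch and placeholder paths:

    from nanotron.config import NanosetDatasetsArgs

    dataset = NanosetDatasetsArgs(
        data_path="path/to/dataset_prefix",  # or {prefix: weight, ...} for a BlendedNanoset
        split="949,50,1",  # train/valid/test proportions
        path_to_cache="path/to/nanoset_cache",  # where the shuffle and blend indices are cached
    )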

diff --git a/examples/nanoset_llama_7b.py b/examples/nanoset_llama_7b.py
deleted file mode 100644
index a7e229b7..00000000
--- a/examples/nanoset_llama_7b.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-Benchmarking script for the Llama-2-7b model
-"""
-
-import os
-
-from nanotron.config import (
-    CheckpointsArgs,
-    Config,
-    DataArgs,
-    GeneralArgs,
-    LlamaConfig,
-    LoggingArgs,
-    LRSchedulerArgs,
-    ModelArgs,
-    NanosetDatasetsArgs,
-    OptimizerArgs,
-    ParallelismArgs,
-    RandomInit,
-    TokenizerArgs,
-    TokensArgs,
-)
-from nanotron.logging import human_format
-
-# Config for a llama model with 6.74M parameters
-model_config = LlamaConfig()
-
-num_params = human_format(
-    model_config.vocab_size * model_config.hidden_size * 2
-    + model_config.num_hidden_layers
-    * (
-        3 * model_config.hidden_size * model_config.intermediate_size
-        + 4 * model_config.hidden_size * model_config.hidden_size
-    )
-).replace(".", "p")
-
-print(f"Model has {num_params} parameters")
-
-seed = 42
-
-learning_rate = LRSchedulerArgs(
-    learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5
-)
-
-optimizer = OptimizerArgs(
-    zero_stage=0,
-    weight_decay=0.01,
-    clip_grad=1.0,
-    accumulate_grad_in_fp32=True,
-    adam_eps=1e-08,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    torch_adam_is_fused=True,
-    learning_rate_scheduler=learning_rate,
-)
-
-parallelism = ParallelismArgs(
-    dp=1,
-    pp=1,
-    tp=4,
-    pp_engine="1f1b",
-    tp_mode="REDUCE_SCATTER",
-    tp_linear_async_communication=True,
-)
-
-tokens = TokensArgs(sequence_length=8192, train_steps=5, micro_batch_size=1, batch_accumulation_per_replica=8)
-
-dataset = NanosetDatasetsArgs(data_path="PATH_TO_DATASET", split="949,50,1")
-
-checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
-os.makedirs(checkpoints_path, exist_ok=True)
-
-config = Config(
-    general=GeneralArgs(project="bench", run="llama", seed=seed),
-    checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=1000),
-    parallelism=parallelism,
-    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
-    tokenizer=TokenizerArgs("meta-llama/Llama-2-7b-hf"),
-    optimizer=optimizer,
-    logging=LoggingArgs(),
-    tokens=tokens,
-    data=DataArgs(dataset=dataset, seed=seed),
-    profiler=None,
-)
-
-if __name__ == "__main__":
-    dir = os.path.dirname(__file__)
-
-    # Save config as YAML file
-    config.save_as_yaml(f"{dir}/config_nanoset_llama_7b.yaml")
-
-    # Launch training
-    os.system("export CUDA_DEVICE_MAX_CONNECTIONS=1")
-    gpus = config.parallelism.dp * config.parallelism.pp * config.parallelism.tp
-    os.system(f"torchrun --nproc_per_node={gpus} run_train.py --config-file {dir}/config_llama.yaml")
diff --git a/run_train_nanoset.py b/run_train_nanoset.py
index 13bb1b05..5b122d61 100644
--- a/run_train_nanoset.py
+++ b/run_train_nanoset.py
@@ -40,6 +40,7 @@ def get_dataloaders(trainer: DistributedTrainer):
         data_path=trainer.config.data.dataset.data_path,
         split=trainer.config.data.dataset.split,
         split_num_samples=split_num_samples,
+        path_to_cache=trainer.config.data.dataset.path_to_cache,
     )
 
     # Build Nanoset datasets
diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 6fe8783c..66f6e805 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -99,6 +99,7 @@ def __post_init__(self):
 class NanosetDatasetsArgs:
     data_path: Union[str, dict]
     split: Optional[str] = None
+    path_to_cache: Optional[str] = None
 
     def __post_init__(self):
         if self.split is None:
@@ -426,6 +427,3 @@ def get_config_from_file(
             )
         config.model.model_config = model_config_class(**config.model.model_config)
     return config
-
-
-# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py
index 0cffab5f..68343867 100644
--- a/src/nanotron/data/blended_nanoset.py
+++ b/src/nanotron/data/blended_nanoset.py
@@ -8,16 +8,14 @@
 import numpy
 import torch
 
-from nanotron.data.nanoset_configs import NanosetConfig
+from nanotron import logging
 from nanotron.data.nanoset import Nanoset
+from nanotron.data.nanoset_configs import NanosetConfig
 from nanotron.data.utils import normalize
-from nanotron import logging
 from nanotron.logging import log_rank
 
 logger = logging.get_logger(__name__)
 
-_VERBOSE = False
-
 
 class BlendedNanoset(torch.utils.data.Dataset):
     """Conjugating class for a set of Nanoset instances
@@ -44,11 +42,11 @@ def __init__(
     ) -> None:
         assert len(datasets) == len(weights)
         assert numpy.isclose(sum(weights), 1.0)
-        assert all(map(lambda _: type(_) == type(datasets[0]), datasets))
+        assert all((type(_) == type(datasets[0]) for _ in datasets))
 
         # Alert user to unnecessary blending
         if len(datasets) == 1:
-            log_rank(f"Building a BlendedNanoset for a single Nanoset", logger=logger, level=logging.WARNING, rank=0)
+            log_rank("Building a BlendedNanoset for a single Nanoset", logger=logger, level=logging.WARNING, rank=0)
 
         # Redundant normalization for bitwise identical comparison with Megatron-LM
         weights = normalize(weights)
@@ -65,9 +63,7 @@ def __init__(
         unique_identifiers["size"] = self.size
 
         self.unique_description = json.dumps(unique_identifiers, indent=4)
-        self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
-        ).hexdigest()
+        self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest()
 
         self.dataset_index, self.dataset_sample_index = self._build_indices()
 
@@ -85,9 +81,8 @@ def __len__(self) -> int:
     def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
         dataset_id = self.dataset_index[idx]
         dataset_sample_id = self.dataset_sample_index[idx]
-        
+
         return self.datasets[dataset_id][dataset_sample_id]
-        
 
     def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
         """Build and optionally cache the dataset index and the dataset sample index
@@ -102,9 +97,10 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
         path_to_cache = getattr(self.config, "path_to_cache")
 
         if path_to_cache:
-            get_path_to = lambda suffix: os.path.join(
-                path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}"
-            )
+
+            def get_path_to(suffix):
+                return os.path.join(path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}")
+
             path_to_description = get_path_to("description.txt")
             path_to_dataset_index = get_path_to("dataset_index.npy")
             path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
@@ -121,7 +117,9 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
             log_rank(f"Build and save the {type(self).__name__} indices", logger=logger, level=logging.INFO, rank=0)
 
             # Build the dataset and dataset sample indexes
-            log_rank(f"\tBuild and save the dataset and dataset sample indexes", logger=logger, level=logging.INFO, rank=0)
+            log_rank(
+                "\tBuild and save the dataset and dataset sample indexes", logger=logger, level=logging.INFO, rank=0
+            )
 
             t_beg = time.time()
             from nanotron.data import helpers
@@ -134,7 +132,7 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
                 self.weights,
                 len(self.datasets),
                 self.size,
-                _VERBOSE,
+                False,
             )
 
             if path_to_cache:
@@ -146,7 +144,12 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
                 numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True)
                 numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True)
             else:
-                log_rank("Unable to save the indexes because path_to_cache is None", logger=logger, level=logging.WARNING, rank=0)
+                log_rank(
+                    "Unable to save the indexes because path_to_cache is None",
+                    logger=logger,
+                    level=logging.WARNING,
+                    rank=0,
+                )
 
             t_end = time.time()
             log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
@@ -157,18 +160,19 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
 
         log_rank(f"\tLoad the dataset index from {path_to_dataset_index}", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
-        dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
+        dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        log_rank(f"\tLoad the dataset sample index from {path_to_dataset_sample_index}", logger=logger, level=logging.INFO, rank=0)
-        t_beg = time.time()
-        dataset_sample_index = numpy.load(
-            path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
+        log_rank(
+            f"\tLoad the dataset sample index from {path_to_dataset_sample_index}",
+            logger=logger,
+            level=logging.INFO,
+            rank=0,
         )
+        t_beg = time.time()
+        dataset_sample_index = numpy.load(path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
         log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         return dataset_index, dataset_sample_index
-    
-# PRECOMMIT
\ No newline at end of file
diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py
index 221f5448..4c584047 100644
--- a/src/nanotron/data/dataloader_builder.py
+++ b/src/nanotron/data/dataloader_builder.py
@@ -1,4 +1,3 @@
-# QUESTION: Move them here?
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
@@ -70,8 +69,6 @@ def build_nanoset_dataloader(
         num_workers=dataloader_num_workers,
         pin_memory=dataloader_pin_memory,
         worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank),
-        # TODO @thomasw21: I'm not sure but this doesn't seem to work at all.
-        # pin_memory_device="cuda",
     )
 
 
@@ -100,8 +97,6 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni
             self.input_pp_rank,
             self.output_pp_rank,
         ]:
-            # assert all(len(example) == 0 for example in examples)
-
             return {
                 "input_ids": TensorPointer(group_rank=self.input_pp_rank),
                 "input_mask": TensorPointer(group_rank=self.input_pp_rank),
@@ -163,8 +158,6 @@ def get_sampler(
 ) -> Optional[Sampler]:
     """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank"""
 
-    # NOTE: Removed DistributedSamplerWithLoop to support valid and test samplers.
-
     sampler = DistributedSampler(dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last)
 
     if consumed_train_samples > 0:
diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py
index a9485c7e..d5c69bc4 100644
--- a/src/nanotron/data/dataset_builder.py
+++ b/src/nanotron/data/dataset_builder.py
@@ -1,19 +1,19 @@
 import math
-from typing import Any, List, Optional, Tuple, Type, Union
+from typing import Any, List, Type, Union
 
 import numpy
 import torch
 
 from nanotron import logging
+from nanotron.data.blended_nanoset import BlendedNanoset
 from nanotron.data.indexed_dataset import MMapIndexedDataset
 from nanotron.data.nanoset import Nanoset
-from nanotron.data.blended_nanoset import BlendedNanoset
 from nanotron.data.nanoset_configs import NanosetConfig
 from nanotron.data.utils import Split, normalize
 
 logger = logging.get_logger(__name__)
 
-DistributedDataset = Union[Nanoset, MMapIndexedDataset]
+DistributedDataset = Union[Nanoset, MMapIndexedDataset, BlendedNanoset]
 
 
 class NanosetBuilder(object):
@@ -29,9 +29,6 @@ def __init__(
         config: NanosetConfig,
     ):
         self.config = config
-        self.sizes = config.split_num_samples
-        # TODO: Quit cls as we are only supporting Nanoset datasets. Blended are mmapindex + nanoset + blended
-        #self.cls = Nanoset  # NOTE: keep it like that to support BlendedNanoset in the future
 
     def build(self) -> List[Nanoset]:
         """Build all dataset splits according to the provided data_path(s)
@@ -40,12 +37,10 @@ def build(self) -> List[Nanoset]:
 
         The dataset splits returned can vary according to the config. Supply config.data_path and
         config.split to build BlendedNanoset and/or Nanoset splits from the same
-        distribution. Supply config.data_path_per_split to build BlendedNanoset and/or Nanoset
-        splits from separate distributions.
+        distribution.
 
         Returns:
-            List[Union[BlendedNanoset, Nanoset]]: A list of either
-            Nanoset or BlendedNanoset per split
+            List[Union[BlendedNanoset, Nanoset]]: A list of either Nanoset or BlendedNanoset per split
         """
         return self._build_blended_dataset_splits()
 
@@ -57,8 +52,7 @@ def _build_blended_dataset_splits(
         See the NanosetBuilder.build alias for more information.
 
         Returns:
-            List[Optional[Union[BlendedNanoset, Nanoset]]]: A list of either
-            Nanoset or BlendedNanoset per split
+            List[Union[BlendedNanoset, Nanoset]]: A list of either Nanoset or BlendedNanoset per split
         """
 
         data_path = getattr(self.config, "data_path")
@@ -66,18 +60,21 @@ def _build_blended_dataset_splits(
 
         # Single Nanoset
         if isinstance(data_path, str):
-            return self._build_nanoset_dataset_splits(data_path[0], split, self.sizes)
-        
+            return self._build_nanoset_dataset_splits(data_path, split, self.config.split_num_samples)
+
         # Blended Nanoset
-        prefix_per_dataset = list(data_path.keys())              
-        weight_per_dataset = normalize(list(data_path.values())) 
-        
+        prefix_per_dataset = list(data_path.keys())
+        weight_per_dataset = normalize(list(data_path.values()))
+
         # NOTE: Use 0.5% target margin to ensure we satiate the network
         sizes_per_dataset = [
-            [int(math.ceil(target_num_samples * weight * 1.005)) for target_num_samples in self.sizes]
+            [
+                int(math.ceil(target_num_samples * weight * 1.005))
+                for target_num_samples in self.config.split_num_samples
+            ]
             for weight in weight_per_dataset
         ]
-        
+
         nanoset_datasets = [[] for _ in range(len(Split))]
 
         for i in range(len(prefix_per_dataset)):
@@ -85,15 +82,15 @@ def _build_blended_dataset_splits(
                 prefix_per_dataset[i], split, sizes_per_dataset[i]
             )
             for j in range(len(nanoset_datasets_split)):
-                    nanoset_datasets[j].append(nanoset_datasets_split[j])
-        
+                nanoset_datasets[j].append(nanoset_datasets_split[j])
+
         # Sum over all contributing datasets, per split
         size_per_split = list(map(sum, zip(*sizes_per_dataset)))
 
         blended_nanosets = []
 
         for i in range(len(nanoset_datasets)):
-            is_none = map(lambda _: _ is None, nanoset_datasets[i])
+            is_none = (_ is None for _ in nanoset_datasets[i])
 
             if split[i] == 0.0:
                 assert all(is_none)
@@ -162,7 +159,7 @@ def _build_generic_dataset(
         self,
         cls: Type[DistributedDataset],
         *args: Any,
-    ) -> Optional[DistributedDataset]:
+    ) -> DistributedDataset:
         """Build the DistributedDataset
 
         Args:
@@ -227,33 +224,3 @@ def _get_split_indices(split: List[float], num_elements: int) -> List[int]:
     assert split_indices[-1] == num_elements
 
     return split_indices
-
-
-# NOTE: Keep for BlendedNanoset
-def _get_prefixes_weights_and_sizes_for_blend(prefixes, weights, splits) -> Tuple[List[str], List[float], List[List[int]]]:
-    """Determine the contribution of the Nanoset splits to the BlendedNanoset splits
-
-    Args:
-        data_path (List[str]): e.g. ["30", "path/to/dataset_1_prefix", "70",
-        "path/to/dataset_2_prefix"]
-
-        target_num_samples_per_split (List[int]): The number of samples to target for each
-        BlendedNanoset split
-
-    Returns:
-        Tuple[List[str], List[float], List[List[int]]]: The prefix strings e.g.
-        ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], the normalized weights e.g.
-        [0.3, 0.7], and the number of samples to request per Nanoset per split
-    """
-
-    weights = normalize(weights)
-
-    # NOTE: Use 0.5% target margin to ensure we satiate the network
-    sizes_per_dataset = [
-        [int(math.ceil(target_num_samples * weight * 1.005)) for target_num_samples in target_num_samples_per_split]
-        for weight in weights
-    ]
-
-    return prefixes, weights, sizes_per_dataset
-
-# PRECOMMIT
\ No newline at end of file
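
To make the blending arithmetic in `_build_blended_dataset_splits` above concrete, here is an isolated sketch of the per-dataset size computation: normalize the weights, then over-request roughly 0.5% extra samples per split. It is not taken from the repository; `normalize` is re-implemented locally and the prefixes, weights, and split targets are invented.

```python
import math

def normalize(weights):
    total = sum(weights)
    return [w / total for w in weights]

split_num_samples = [90_000, 5_000, 5_000]          # hypothetical train/valid/test targets
data_path = {"dataset_a": 0.8, "dataset_b": 0.2}    # hypothetical prefixes and weights

weight_per_dataset = normalize(list(data_path.values()))
sizes_per_dataset = [
    [int(math.ceil(target_num_samples * weight * 1.005)) for target_num_samples in split_num_samples]
    for weight in weight_per_dataset
]
print(sizes_per_dataset)  # [[72360, 4020, 4020], [18090, 1005, 1005]]
```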
diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py
index 677fbd81..fac1cb2b 100644
--- a/src/nanotron/data/nanoset_configs.py
+++ b/src/nanotron/data/nanoset_configs.py
@@ -38,14 +38,12 @@ class NanosetConfig:
 
     split: str
 
-    split_vector: Optional[List[float]] = field(init=False, default=None)
-
     path_to_cache: str = None
 
+    split_vector: Optional[List[float]] = field(init=False, default=None)
+
     def __post_init__(self):
         """Python dataclass method that is used to modify attributes after initialization. See
         https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
         """
         self.split_vector = parse_and_normalize_split(self.split)
-
-# PRECOMMIT
\ No newline at end of file
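
`NanosetConfig.__post_init__` above derives `split_vector` from the `split` string via `parse_and_normalize_split`. Assuming that helper behaves like its Megatron-LM counterpart (turn comma-separated ratios into normalized fractions), the idea is roughly the following sketch; the function here is hypothetical and only approximates the real helper.

```python
def parse_and_normalize_split_sketch(split: str):
    # e.g. "949,50,1" -> train/valid/test fractions that sum to 1.0
    parts = [float(x) for x in split.split(",")]
    return [p / sum(parts) for p in parts]

print(parse_and_normalize_split_sketch("949,50,1"))  # [0.949, 0.05, 0.001]
```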
diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py
index ea7cc768..ce5d1e12 100644
--- a/src/nanotron/data/utils.py
+++ b/src/nanotron/data/utils.py
@@ -78,5 +78,3 @@ def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_
     log_rank("    Test:       {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0)
 
     return datasets_num_samples
-
-# PRECOMMIT
\ No newline at end of file

From 903761b8e8ec4701ffca2383c2f195392e4ded95 Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 20:03:33 +0100
Subject: [PATCH 29/73] Added merge_datasets script

---
 src/nanotron/config/config.py        |  2 +
 src/nanotron/data/blended_nanoset.py |  6 +--
 src/nanotron/data/indexed_dataset.py | 10 ++--
 src/nanotron/data/nanoset.py         | 16 +++---
 tools/merge_datasets.py              | 76 ++++++++++++++++++++++++++++
 tools/preprocess_data.py             | 14 ++---
 6 files changed, 99 insertions(+), 25 deletions(-)
 create mode 100644 tools/merge_datasets.py

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 66f6e805..d02fcddc 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -104,6 +104,8 @@ class NanosetDatasetsArgs:
     def __post_init__(self):
         if self.split is None:
             self.split = "949,50,1"
+        if not os.path.isdir(self.path_to_cache):
+            self.path_to_cache = None
 
 
 @dataclass
diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py
index 68343867..9467c00a 100644
--- a/src/nanotron/data/blended_nanoset.py
+++ b/src/nanotron/data/blended_nanoset.py
@@ -152,7 +152,7 @@ def get_path_to(suffix):
                 )
 
             t_end = time.time()
-            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+            log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
             return dataset_index, dataset_sample_index
 
@@ -162,7 +162,7 @@ def get_path_to(suffix):
         t_beg = time.time()
         dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
             f"\tLoad the dataset sample index from {path_to_dataset_sample_index}",
@@ -173,6 +173,6 @@ def get_path_to(suffix):
         t_beg = time.time()
         dataset_sample_index = numpy.load(path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         return dataset_index, dataset_sample_index
diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py
index 566f41fc..0fc0f60c 100644
--- a/src/nanotron/data/indexed_dataset.py
+++ b/src/nanotron/data/indexed_dataset.py
@@ -237,7 +237,7 @@ def __init__(self, idx_path: str) -> None:
             self.bin_buffer, dtype=numpy.int32, count=self.sequence_count, offset=offset
         )
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank("\tExtract the sequence pointers", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
@@ -248,7 +248,7 @@ def __init__(self, idx_path: str) -> None:
             offset=offset + self.sequence_lengths.nbytes,
         )
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank("\tExtract the document indices", logger=logger, level=logging.INFO, rank=0)
         t_beg = time.time()
@@ -259,15 +259,15 @@ def __init__(self, idx_path: str) -> None:
             offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes,
         )
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         assert self.sequence_lengths.shape[0] == len(self)
         assert self.sequence_lengths.shape[0] == self.sequence_count
         assert self.sequence_lengths.shape[0] == self.document_indices[-1]
 
-        log_rank(f"> total number of sequences: {len(self)}", logger=logger, level=logging.INFO, rank=0)
+        log_rank(f"> Total number of sequences: {len(self)}", logger=logger, level=logging.INFO, rank=0)
         log_rank(
-            f"> total number of documents: {self.document_indices.shape[0] - 1}",
+            f"> Total number of documents: {self.document_indices.shape[0] - 1}",
             logger=logger,
             level=logging.INFO,
             rank=0,
diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py
index 9c4e747f..d5276fbb 100644
--- a/src/nanotron/data/nanoset.py
+++ b/src/nanotron/data/nanoset.py
@@ -246,7 +246,7 @@ def get_path_to(suffix):
                     level=logging.DEBUG,
                     rank=0,
                 )
-                log_rank(f"> threshold: {threshold}", logger=logger, level=logging.DEBUG, rank=0)
+                log_rank(f"> Threshold: {threshold}", logger=logger, level=logging.DEBUG, rank=0)
                 log_rank(
                     f"> num_samples_per_epoch: {num_samples_per_epoch}", logger=logger, level=logging.DEBUG, rank=0
                 )
@@ -274,7 +274,7 @@ def get_path_to(suffix):
             )
             numpy.save(path_to_document_index, document_index, allow_pickle=True)
             t_end = time.time()
-            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+            log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
             # Build the sample index
             log_rank(
@@ -297,7 +297,7 @@ def get_path_to(suffix):
             )
             numpy.save(path_to_sample_index, sample_index, allow_pickle=True)
             t_end = time.time()
-            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+            log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
             # Build the shuffle index
             log_rank(
@@ -317,7 +317,7 @@ def get_path_to(suffix):
                 )
             numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True)
             t_end = time.time()
-            log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+            log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
             f"Load the {type(self).__name__} {self.index_split.name} indices",
@@ -346,7 +346,7 @@ def get_path_to(suffix):
         t_beg = time.time()
         sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
         log_rank(
             f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
@@ -357,10 +357,10 @@ def get_path_to(suffix):
         t_beg = time.time()
         shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode="r")
         t_end = time.time()
-        log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
+        log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0)
 
-        log_rank(f"> total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0)
-        log_rank(f"> total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0)
+        log_rank(f"> Total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0)
+        log_rank(f"> Total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0)
 
         return document_index, sample_index, shuffle_index
 
diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py
new file mode 100644
index 00000000..8e4f0b04
--- /dev/null
+++ b/tools/merge_datasets.py
@@ -0,0 +1,76 @@
+import argparse
+import os
+
+from nanotron.data.indexed_dataset import (
+    MMapIndexedDataset,
+    MMapIndexedDatasetBuilder,
+    get_bin_path,
+    get_idx_path,
+)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+
+    group = parser.add_argument_group(title="input data")
+    group.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Path to directory containing all document files to merge",
+    )
+
+    group = parser.add_argument_group(title="output data")
+    group.add_argument(
+        "--output-prefix",
+        type=str,
+        required=True,
+        help="Path to binary output file without suffix",
+    )
+
+    args = parser.parse_args()
+
+    assert os.path.isdir(args.input), f"ERROR: {args.input} is not a directory or does not exist"
+
+    assert os.path.isdir(
+        os.path.dirname(args.output_prefix)
+    ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist"
+
+    return args
+
+
+def main():
+    args = get_args()
+
+    prefixes = set()
+    for basename in os.listdir(args.input):
+        prefix, ext = os.path.splitext(basename)
+
+        if prefix in prefixes:
+            continue
+
+        if not os.path.isfile(os.path.join(args.input, basename)):
+            continue
+
+        ext_pair = ".bin" if ext == ".idx" else ".idx"
+        assert os.path.isfile(
+            os.path.join(args.input, prefix) + ext_pair
+        ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}"
+
+        prefixes.add(prefix)
+
+    builder = None
+    for prefix in sorted(prefixes):
+        if builder is None:
+            dataset = MMapIndexedDataset(os.path.join(args.input, prefix))
+            builder = MMapIndexedDatasetBuilder(get_bin_path(args.output_prefix), dtype=dataset.index.dtype)
+            del dataset
+
+        builder.add_index(os.path.join(args.input, prefix))
+
+    builder.finalize(get_idx_path(args.output_prefix))
+
+
+if __name__ == "__main__":
+
+    main()
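
For reference, the merge flow of the new script could also be driven programmatically with the classes it imports. This is a hypothetical sketch rather than part of the patch; the shard prefixes and output prefix are invented, and it assumes the same `.bin`/`.idx` pairs that `tools/preprocess_data.py` produces.

```python
from nanotron.data.indexed_dataset import (
    MMapIndexedDataset,
    MMapIndexedDatasetBuilder,
    get_bin_path,
    get_idx_path,
)

prefixes = ["data/shard_00_text", "data/shard_01_text"]  # assumed preprocessed shards
output_prefix = "data/merged_text"                       # assumed output location

# Reuse the dtype of the first shard, exactly as the script above does
dtype = MMapIndexedDataset(prefixes[0]).index.dtype
builder = MMapIndexedDatasetBuilder(get_bin_path(output_prefix), dtype=dtype)

for prefix in prefixes:
    builder.add_index(prefix)  # appends the shard's .bin data and .idx metadata

builder.finalize(get_idx_path(output_prefix))  # writes the merged .idx file
```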
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 1e9d7a5a..d1457a8d 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -72,15 +72,13 @@ def process_json_file(self, file_name):
         pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
         encoded_docs = pool.imap(encoder.encode, fin, 32)
 
-        level = "document"
-
         output_bin_files = {}
         output_idx_files = {}
         builders = {}
 
         for key in self.args.json_keys:
-            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, key, level)
-            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, key, level)
+            output_bin_files[key] = "{}_{}.bin".format(output_prefix, key)
+            output_idx_files[key] = "{}_{}.idx".format(output_prefix, key)
             builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
                 output_bin_files[key],
                 dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
@@ -217,8 +215,6 @@ def main(args):
         return
 
     # Merge bin/idx partitions
-    level = "document"
-
     output_bin_files = {}
     output_idx_files = {}
     builders = {}
@@ -226,8 +222,8 @@ def main(args):
     tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
 
     for key in args.json_keys:
-        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level)
-        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level)
+        output_bin_files[key] = "{}_{}.bin".format(args.output_prefix, key)
+        output_idx_files[key] = "{}_{}.idx".format(args.output_prefix, key)
         builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
             output_bin_files[key],
             dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
@@ -235,7 +231,7 @@ def main(args):
 
         for name in in_ss_out_names:
             parition_output_prefix = name["output_prefix"]
-            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, key, level)
+            full_partition_output_prefix = "{}_{}".format(parition_output_prefix, key)
             builders[key].add_index(full_partition_output_prefix)
         builders[key].finalize(output_idx_files[key])
 

From 0910fbe71558279af61f9d84d6739363fd557d66 Mon Sep 17 00:00:00 2001
From: tj-solergibert 
Date: Sun, 17 Mar 2024 22:07:30 +0100
Subject: [PATCH 30/73] Improved docs

---
 docs/nanoset.md               | 45 +++++++++++++++++++++--------------
 src/nanotron/config/config.py |  2 --
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/docs/nanoset.md b/docs/nanoset.md
index 78de4d87..214369de 100644
--- a/docs/nanoset.md
+++ b/docs/nanoset.md
@@ -1,31 +1,40 @@
-# Data Pipeline
+# Nanoset
 
 ## Data pre-processing
 
-For data pre-processing, refer to [Megatron-LM project](https://github.com/NVIDIA/Megatron-LM/tree/main?tab=readme-ov-file#data-preprocessing), as Nanoset requires the same kind of preprocessing. The main file used is [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py)
+Nanotron incorporates Nanosets, a streamlined version of Megatron-LM datasets. It also includes BlendedNanosets, to be able to combine different Nanosets.
 
-The training data requires preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example:
+The training data requires preprocessing. Just like Megatron, first, place your training data in a loose json format, with one json containing a text sample per line. For example:
 
 
 {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
 {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
 
-The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py). The other metadata are optional and are not used in training.
-
-The loose json is then processed into a binary format for training. To convert the json into mmap format use [`preprocess_data.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py). An example script to prepare data for Llama2 training is:
+The loose json is then processed into a binary format for training using the [`tools/preprocess_data.py`](../tools/preprocess_data.py) script. Below we show an example for processing a corpus with the Llama2 tokenizer.
 python tools/preprocess_data.py \
-       --input my_corpus.json \
-       --output-prefix my-llama2-dataset \
-       --tokenizer-type Llama2Tokenizer \
-       --tokenizer-model /models/Llama-2-7b-chat-hf/tokenizer.model \
-       --append-eod \
-       --workers 6
+       --input data/my_corpus.json \
+       --output-prefix data/processed-datasets/my-llama2-dataset \
+       --pretrained-model-name-or-path models/Llama-2-7b \
+       --workers 128 \
+       --partitions 8
 
-The output will be two files named, in this case, `my-llama2-dataset_text_document.bin` and `my-llama2-dataset_text_document.idx`. The `--data-path` specified later in training is the full path and new filename, but without the file extension.
+In `--pretrained-model-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`.
+
+The output will be two files named, in this case, `my-llama2-dataset_text.bin` and `my-llama2-dataset_text.idx`. The `data_path` specified later in training is the full path and new filename, but **without** the file extension.
+
+We also include the following scripts:
+- [`hf_datasets_to_json.py`](../tools/hf_datasets_to_json.py): A tool to transform datasets from the Hugging Face Hub (or local) to the .json format required by `preprocess_data.py`.
+- [`merge_datasets.py`](../tools/merge_datasets.py): A tool to merge preprocessed datasets (_.bin_ and _.idx_ files) into a single file.
+
+## NanosetDatasetsArgs
+
+jjj
+
+
 ----
@@ -53,7 +62,6 @@ The index file stores document-level and sequence-level metadata second:
 - In order, the number of elements per sequence
 - In order, the byte offset (pointer) per sequence
 - In order, the consecutive sequence index range `[...)` per document
-- In order, the mode per sequence (in the multimodal case)
 
 ## Data loading: construction
 
@@ -62,21 +70,22 @@ Building the data loaders is a distributed-aware process built around the follow
 
 1. `NanosetConfig`
 2. `NanosetBuilder`
 3. `MMapIndexedDataset`
-3. `Nanoset`
+4. `Nanoset`
+5. `BlendedNanoset`
 
 See the class docstrings for more details.
 
 #### NanosetConfig
 
-The `NanosetConfig` class parametrizes the `NanosetBuilder` and in turn the `Nanoset`.
+The `NanosetConfig` class parametrizes the `NanosetBuilder` and in turn the `Nanoset` and `BlendedNanoset`.
 
-Different training/inference regimes will require different extensions e.g. the `NanosetConfig`
+Refer to [`nanoset_configs.py`](../src/nanotron/data/nanoset_configs.py) to view the different configurable parameters.
 
 #### NanosetBuilder
 
 The `NanosetBuilder` class builds the highest-level data interfaces.
 
-**NB:** All ranks should attempt to build the dataset via the `NanosetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `NanosetConfig`.
+**NB:** All ranks should attempt to build the dataset via the `NanosetBuilder` or the program will hang.
 
#### MMapIndexedDataset diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index d02fcddc..418eabae 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -149,7 +149,6 @@ class GeneralArgs: Args: project: Name of the project (a project gather several runs in common tensorboard/hub-folders) - entity: Weights and bias entity name (optional) run: Name of the run step: Global step (updated when we save the checkpoint) consumed_train_samples: Number of samples consumed during training (should be actually just step*batch_size) @@ -157,7 +156,6 @@ class GeneralArgs: """ project: str - entity: Optional[str] = None run: Optional[str] = None seed: Optional[int] = None step: Optional[int] = None From 5984cf4a3e73b2f6872e847a39bbb7390d4f624c Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 18 Mar 2024 00:32:23 +0100 Subject: [PATCH 31/73] Cleaned docs --- docs/nanoset.md | 169 ++++++--------------------------------- tools/preprocess_data.py | 5 ++ 2 files changed, 31 insertions(+), 143 deletions(-) diff --git a/docs/nanoset.md b/docs/nanoset.md index 214369de..64c0c15b 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -2,7 +2,7 @@ ## Data pre-processing -Nanotron incorporates Nanosets, a streamlined version of Megatron-LM datasets. It also includes BlendedNanosets, to be able to combine different Nanosets. +Nanotron incorporates [`Nanosets`](../src/nanotron/data/nanoset.py), a streamlined version of Megatron-LM datasets. It also includes [`BlendedNanosets`](../src/nanotron/data/blended_nanoset.py), to be able to combine different Nanosets. The training data requires preprocessing. Just like Megatron, first, place your training data in a loose json format, with one json containing a text sample per line. For example: @@ -32,146 +32,29 @@ We also include the following scripts: ## NanosetDatasetsArgs -jjj - - - ----- - -Data preprocessing is built around the following classes: - -1. `MMapIndexedDatasetBuilder` -2. `MMapIndexedDataset` - -#### MMapIndexedDatasetBuilder - -The `MMapIndexedDatasetBuilder` is capable of building and merging `MMapIndexedDataset` instances. - -#### MMapIndexedDataset - -The `MMapIndexedDataset` class is the lowest-level data interface. Internally, an `MMapIndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata. - -The index file stores dataset-level metadata first: -- The index header, for backward compatibility -- The index version, for backward compatibility -- A numeric code corresponding to the data type used to write data to the data file -- The number of sequences in the dataset -- The number of documents in the dataset - -The index file stores document-level and sequence-level metadata second: -- In order, the number of elements per sequence -- In order, the byte offset (pointer) per sequence -- In order, the consecutive sequence index range `[...)` per document - -## Data loading: construction - -Building the data loaders is a distributed-aware process built around the following classes: - -1. `NanosetConfig` -2. `NanosetBuilder` -3. `MMapIndexedDataset` -4. `Nanoset` -5. `BlendedNanoset` - -See the class docstrings for more details. - -#### NanosetConfig - -The `NanosetConfig` class parametrizes the `NanosetBuilder` and in turn the `Nanoset` and `BlendedNanoset`. - -Refer to [`nanoset_configs.py`](../src/nanotron/data/nanoset_configs.py) to view the different configurable parameters. 
- -#### NanosetBuilder - -The `NanosetBuilder` class builds the highest-level data interfaces. - -**NB:** All ranks should attempt to build the dataset via the `NanosetBuilder` or the program will hang. - -#### MMapIndexedDataset - -The `MMapIndexedDataset` class is the lowest-level data interface. - -The `MMapIndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces. - - -#### Nanoset - -The `Nanoset` abstract class is a high-level data interface. It is built upon the `MMapIndexedDataset`. - -Different training/inference regimes will require different extensions e.g. the `Nanoset` - -## Data loading: implementation - -### Nanoset - -The `Nanoset` is parameterized by the following variables: the underlying `MMapIndexedDataset` instance `indexed_dataset`, the split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing), the number of samples `N`, the sequence length `S`, and the random seed `R`. - -The `Nanoset` creates three index mappings to facilitate lookup: (1) the document index, (2) the sample index, and (3) the shuffle index. - -1. The document index _Do_idx_ is a 1-D array mapping from _i_ to document index of length `E * |indexed_indices|` where `E` corresponds to the minimum number of epochs such that `E * |indexed_indices| >= N`. The document index is shuffled according to `R`. - - ``` - Given: - - N = 15 - indexed_indices = [5, 6, 7, 8, 9] - E = 3 - - Then, for example: - - Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9] - ``` - -2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. - - ``` - Given: - - S = 1024 - - Then, for example: - - Sa_idx[0] = (0, 0) - Sa_idx[1] = (0, 1024) => Do_idx[0] has length greater than S - Sa_idx[2] = (1, 512) => Do_idx[0] has length 1536 - Sa_idx[3] = (2, 0) => Do_idx[1] has length 1536 - Sa_idx[4] = (5, 300) => Do_idx[2:5] are shorter documents relative to Do_idx[0:2] - Sa_idx[5] = (6, 24) => Do_idx[5] has length 1300 - ``` - -3. The shuffle index _Sh_idx_ is a 1-D array mapping from _k_ to _j_ of length `N`. The shuffle index is shuffled according to `R`. - +To work with Nanosets, we need to configure 3 arguments: +1. `split`: The distribution we want to divide our dataset into among train, valid, and test split. +2. `path_to_cache`: Directory used to store certain metadata of Nanosets to reuse them between different runs. +3. `data_path`: This argument specifies the file/files that will compose the Nanoset. There are 2 ways to specify it: + 1. If we only indicate _one_ path (path to the files generated by preprocess_data.py WITHOUT the extension), we will create a Nanoset. + ```yaml + data: + dataset: + data_path: nanoset/SlimPajama-6B_text + split: 949,50,1 + path_to_cache: .nanoset_cache + num_loading_workers: 0 + seed: 1234 + ``` + 2. With a dictionary, we can create a BlendedNanoset where the keys are the paths to the dataset (path to the files generated by preprocess_data.py WITHOUT the extension) and the values are the weight for each dataset. 
+ ```yaml + data: + dataset: + data_path: + nanoset/SlimPajama-6B_text: 0.8 + nanoset/europarl_text: 0.2 + split: 949,50,1 + path_to_cache: .nanoset_cache + num_loading_workers: 0 + seed: 1234 ``` - Given - - N = 10 - - Then, for example: - - Sh_idx = [4, 0, 2, 6, 1, 9, 5, 8, 7, 3] - ``` - -To query the `Nanoset` for the _k_-th sample we do the following - -- Use the shuffle index to get the index _j_ into the sample index. - - ``` - j = Sh_idx[k] - ``` -- Use the sample index to get the left and right sample-bounding indices into the document index and the starting token offset for each document. - - ``` - i, offset = Sa_idx[j] - i_next, offset_next = Sa_idx[j + 1] - ``` -- Use the document index to retrieve `S` tokens from consecutive (in the document index) documents. - - ``` - sample = [] - sample += indexed_dataset[Do_idx[i]][offset:] - if i != i_next: - sample += indexed_dataset[Do_idx[i + 1:i_next]] - sample += indexed_dataset[Do_idx[i_next]][:offset_next] - ``` - -To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `Nanoset.__init__` function. diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index d1457a8d..22ec2a7a 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -155,6 +155,11 @@ def check_files_exist(in_ss_out_names, key, num_partitions): def main(args): + # Check if output directory exists + if not os.path.isdir(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))): + print(f"Creating {os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))} directory...") + os.makedirs(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir)), exist_ok=True) + in_ss_out_names = [] if args.partitions == 1: file_name, extension = os.path.splitext(args.input) From cde43c72ca03479ca34555ab27ba970f67040951 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 18 Mar 2024 01:00:06 +0100 Subject: [PATCH 32/73] added run instructions --- docs/nanoset.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/nanoset.md b/docs/nanoset.md index 64c0c15b..d5ae224d 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -58,3 +58,8 @@ To work with Nanosets, we need to configure 3 arguments: num_loading_workers: 0 seed: 1234 ``` + +Finally, to use the Nanosets, launch the training with [`run_train_nanoset.py`](../run_train_nanoset.py). +```shell +torchrun --nproc-per-node 8 run_train_nanoset.py --config configs/nanoset_llama2.yaml +``` From 9ae26d42e45960d7bce3b57eeff31e7d309e6601 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 19 Mar 2024 10:08:27 +0100 Subject: [PATCH 33/73] simplified build funct --- src/nanotron/data/dataset_builder.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index d5c69bc4..5aafa95e 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -30,7 +30,7 @@ def __init__( ): self.config = config - def build(self) -> List[Nanoset]: + def build(self) -> List[Union[BlendedNanoset, Nanoset]]: """Build all dataset splits according to the provided data_path(s) This method is distributed-aware and must be called on all ranks. @@ -39,18 +39,6 @@ def build(self) -> List[Nanoset]: config.split to build BlendedNanoset and/or Nanoset splits from the same distribution. 
- Returns: - List[Union[BlendedNanoset, Nanoset]]: A list of either Nanoset or BlendedNanoset per split - """ - return self._build_blended_dataset_splits() - - def _build_blended_dataset_splits( - self, - ) -> List[Nanoset]: - """Build all dataset splits according to the provided data_path(s) - - See the NanosetBuilder.build alias for more information. - Returns: List[Union[BlendedNanoset, Nanoset]]: A list of either Nanoset or BlendedNanoset per split """ From 698c85b4d10ce1228ee927b1245103f4e893f670 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 19 Mar 2024 11:30:03 +0100 Subject: [PATCH 34/73] simplified nanoset --- src/nanotron/data/blended_nanoset.py | 2 + src/nanotron/data/nanoset.py | 68 +++++++++------------------- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index 9467c00a..a362d87e 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -56,6 +56,8 @@ def __init__( self.size = size self.config = config + # Create unique identifier + unique_identifiers = OrderedDict() unique_identifiers["class"] = type(self).__name__ unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets] diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index d5276fbb..6f67e052 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -53,6 +53,8 @@ def __init__( self.index_split = index_split self.config = config + # Create unique identifier + self.unique_identifiers = OrderedDict() self.unique_identifiers["class"] = type(self).__name__ self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix @@ -64,14 +66,7 @@ def __init__( self.unique_description = json.dumps(self.unique_identifiers, indent=4) self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest() - self._finalize() - - def _finalize(self) -> None: - """Abstract method implementation - - Load or build/cache the document, sample, and shuffle indices - """ - assert isinstance(self.config, NanosetConfig) + # Load or build/cache the document, sample, and shuffle indices ( self.document_index, @@ -88,40 +83,15 @@ def __len__(self) -> int: return self.sample_index.shape[0] - 1 def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: - """Abstract method implementation + """Get the text (token ids) for a given index Args: idx (int): The index into the dataset Returns: - Dict[str, numpy.ndarray]: The text ids and (optionally) the document ids wrapped in a - dictionary + Dict[str, numpy.ndarray]: The token ids wrapped in a dictionary """ - text, _ = self._query_document_sample_shuffle_indices(idx) - - return {"text": text} - - @staticmethod - def _key_config_attributes() -> List[str]: - """Return all config attributes which contribute to uniquely identifying the dataset. - - These attributes will be used to build a uniquely identifying string and MD5 hash which - will be used to cache/load the dataset from run to run. 
- - Returns: - List[str]: The key config attributes - """ - return ["split", "random_seed", "sequence_length"] - def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[numpy.ndarray, numpy.ndarray]: - """Get the text (token ids) and document ids for a given index - - Args: - idx (int): The index into the dataset - - Returns: - Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids - """ # Do the shuffle mapping idx = self.shuffle_index[idx] @@ -129,16 +99,13 @@ def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[numpy.ndarra doc_index_beg, doc_index_beg_offset = self.sample_index[idx] doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] - document_ids = [] - sample_parts = [] + tokens = [] # Sample spans a single document if doc_index_beg == doc_index_end: - # Add the document id - document_ids.append(self.document_index[doc_index_beg]) # Add the entire sample - sample_parts.append( + tokens.append( self.indexed_dataset.get( self.document_index[doc_index_beg], offset=doc_index_beg_offset, @@ -149,18 +116,13 @@ def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[numpy.ndarra # Sample spans multiple documents else: for i in range(doc_index_beg, doc_index_end + 1): - # Add the document id - document_ids.append(self.document_index[i]) # Add the sample part offset = 0 if i > doc_index_beg else doc_index_beg_offset length = None if i < doc_index_end else doc_index_end_offset + 1 - sample_parts.append(self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)) + tokens.append(self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)) - return ( - numpy.array(numpy.concatenate(sample_parts), dtype=numpy.int64), - numpy.array(document_ids, dtype=numpy.int64), - ) + return {"text": numpy.array(numpy.concatenate(tokens), dtype=numpy.int64)} def _build_document_sample_shuffle_indices( self, @@ -364,6 +326,18 @@ def get_path_to(suffix): return document_index, sample_index, shuffle_index + @staticmethod + def _key_config_attributes() -> List[str]: + """Return all config attributes which contribute to uniquely identifying the dataset. + + These attributes will be used to build a uniquely identifying string and MD5 hash which + will be used to cache/load the dataset from run to run. 
+ + Returns: + List[str]: The key config attributes + """ + return ["split", "random_seed", "sequence_length"] + def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: """Calculate the number of tokens in a single epoch From 50a189faaefcf95c98f6f8f176db3316cbf2d9b4 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 20 Mar 2024 00:28:19 +0000 Subject: [PATCH 35/73] fixed dacite UnionMatchError --- src/nanotron/config/config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 418eabae..78defc59 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -98,12 +98,10 @@ def __post_init__(self): @dataclass class NanosetDatasetsArgs: data_path: Union[str, dict] - split: Optional[str] = None - path_to_cache: Optional[str] = None + split: Optional[str] = "949,50,1" + path_to_cache: Optional[str] = "_" def __post_init__(self): - if self.split is None: - self.split = "949,50,1" if not os.path.isdir(self.path_to_cache): self.path_to_cache = None From 290456228838bd725a7d016fce4b35f146aaf33e Mon Sep 17 00:00:00 2001 From: Phuc Nguyen Date: Thu, 21 Mar 2024 09:59:10 +0000 Subject: [PATCH 36/73] add loading dataloader based on training stages --- examples/config_tiny_llama.yaml | 41 +++++++--- .../debug_config_tiny_llama.yaml | 24 +++--- examples/mamba/config_mamba.yaml | 26 +++--- examples/moe/config_llamoe.yaml | 23 +++--- run_train.py | 44 +++++++--- src/nanotron/config/config.py | 33 +++++++- src/nanotron/trainer.py | 80 ++++++++++++++++--- 7 files changed, 204 insertions(+), 67 deletions(-) diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml index fc310557..f2a3287b 100644 --- a/examples/config_tiny_llama.yaml +++ b/examples/config_tiny_llama.yaml @@ -1,19 +1,36 @@ checkpoints: checkpoint_interval: 10 - checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints + checkpoints_path: checkpoints checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false -data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 + +data_stages: + - name: Stable Training Stage + start_training_step: 1 + data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_splits: train + text_column_name: completion + num_loading_workers: 1 + seed: 42 + - name: Annealing Phase + start_training_step: 10 + data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_splits: train + text_column_name: completion + num_loading_workers: 1 + seed: 42 + general: benchmark_csv_path: null consumed_train_samples: null @@ -87,5 +104,5 @@ tokens: limit_val_batches: 0 micro_batch_size: 2 sequence_length: 32 - train_steps: 10 + train_steps: 20 val_check_interval: -1 diff --git a/examples/contributor-guide/debug_config_tiny_llama.yaml b/examples/contributor-guide/debug_config_tiny_llama.yaml index 5a097082..27c24ed0 100644 --- a/examples/contributor-guide/debug_config_tiny_llama.yaml +++ 
b/examples/contributor-guide/debug_config_tiny_llama.yaml @@ -4,16 +4,20 @@ checkpoints: checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false -data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 + +data_stages: + - name: General purpose training + start_training_step: 1 + data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_splits: train + text_column_name: completion + num_loading_workers: 1 + seed: 42 general: benchmark_csv_path: null consumed_train_samples: null diff --git a/examples/mamba/config_mamba.yaml b/examples/mamba/config_mamba.yaml index 2d79880d..24a2c125 100644 --- a/examples/mamba/config_mamba.yaml +++ b/examples/mamba/config_mamba.yaml @@ -4,17 +4,21 @@ checkpoints: checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false -data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 24 - hf_dataset_config_name: null - hf_dataset_or_datasets: - roneneldan/TinyStories: 1.0 - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 1 - seed: 42 + +data_stages: + - name: General purpose training + start_training_step: 1 + data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 24 + hf_dataset_config_name: null + hf_dataset_or_datasets: + roneneldan/TinyStories: 1.0 + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 general: benchmark_csv_path: null consumed_train_samples: null diff --git a/examples/moe/config_llamoe.yaml b/examples/moe/config_llamoe.yaml index 0050050e..1b312129 100644 --- a/examples/moe/config_llamoe.yaml +++ b/examples/moe/config_llamoe.yaml @@ -4,16 +4,19 @@ checkpoints: checkpoints_path_is_shared_file_system: false resume_checkpoint_path: /fsx/nouamane/projects/nanotron/examples/checkpoints save_initial_state: true -data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 12 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 1 - seed: 42 +data_stages: + - name: General purpose training + start_training_step: 1 + data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 12 + hf_dataset_config_name: null + hf_dataset_or_datasets: roneneldan/TinyStories + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 general: benchmark_csv_path: null consumed_train_samples: null diff --git a/run_train.py b/run_train.py index 7874e424..4c36d1ef 100644 --- a/run_train.py +++ b/run_train.py @@ -8,9 +8,12 @@ ``` """ import argparse +from typing import Dict, cast from nanotron import logging from nanotron.config import ( + DataArgs, + DatasetStageArgs, PretrainDatasetsArgs, ) from nanotron.dataloader import ( @@ -25,6 +28,7 @@ from nanotron.utils import ( main_rank_first, ) +from torch.utils.data import DataLoader try: from huggingface_hub import __version__ as hf_hub_version @@ -37,14 +41,14 @@ logger = logging.get_logger(__name__) -def get_dataloader(trainer: DistributedTrainer): +def 
get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): """Returns a dataloader for training.""" # First, we need to know which ranks to feed the dataloader to input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) # Case 1: Dummy data generator - if trainer.config.data.dataset is None: + if data.dataset is None: log_rank("Using dummy data generator", logger=logger, level=logging.INFO, rank=0) dataloader = dummy_infinite_data_generator( micro_batch_size=trainer.micro_batch_size, @@ -52,12 +56,12 @@ def get_dataloader(trainer: DistributedTrainer): input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, vocab_size=trainer.model_config.vocab_size, - seed=trainer.config.data.seed, + seed=data.seed, parallel_context=trainer.parallel_context, )() # Case 2: HuggingFace datasets - elif isinstance(trainer.config.data.dataset, PretrainDatasetsArgs): + elif isinstance(data.dataset, PretrainDatasetsArgs): log_rank("Using `datasets` library", logger=logger, level=logging.INFO, rank=0) tokenizer_path = trainer.config.tokenizer.tokenizer_name_or_path log_rank( @@ -74,8 +78,8 @@ def get_dataloader(trainer: DistributedTrainer): # We load the raw dataset raw_dataset = get_datasets( - hf_dataset_or_datasets=trainer.config.data.dataset.hf_dataset_or_datasets, - splits=trainer.config.data.dataset.hf_dataset_splits, + hf_dataset_or_datasets=data.dataset.hf_dataset_or_datasets, + splits=data.dataset.hf_dataset_splits, )["train"] tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) @@ -86,9 +90,9 @@ def get_dataloader(trainer: DistributedTrainer): train_dataset = clm_process( raw_dataset=raw_dataset, tokenizer=tokenizer, - text_column_name=trainer.config.data.dataset.text_column_name, - dataset_processing_num_proc_per_process=trainer.config.data.dataset.dataset_processing_num_proc_per_process, - dataset_overwrite_cache=trainer.config.data.dataset.dataset_overwrite_cache, + text_column_name=data.dataset.text_column_name, + dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process, + dataset_overwrite_cache=data.dataset.dataset_overwrite_cache, sequence_length=trainer.sequence_length, ) @@ -101,8 +105,8 @@ def get_dataloader(trainer: DistributedTrainer): output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, consumed_train_samples=trainer.consumed_train_samples, - dataloader_num_workers=trainer.config.data.num_loading_workers, - seed_worker=trainer.config.data.seed, + dataloader_num_workers=data.num_loading_workers, + seed_worker=data.seed, dataloader_drop_last=True, ) # Check if we have enough samples for train_steps @@ -117,11 +121,27 @@ def get_dataloader(trainer: DistributedTrainer): f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.start_iteration_step}" ) else: - raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {trainer.config.data.dataset}") + raise ValueError(f"Unhandled case of `self.config.data.dataset`. 
Got: {data.dataset}") return dataloader +def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: + sorted_stages = sorted(trainer.config.data_stages, key=lambda stage: stage.start_training_step) + dataloaders = {} + for idx, stage in enumerate(sorted_stages): + # NOTE: we only create the dataloader for the first stage, + # then we lazy initialize the dataloader for the other stages + stage = cast(DatasetStageArgs, stage) + dataloader = ( + get_dataloader(trainer, stage.data) + if idx == 0 + else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data) + ) + dataloaders[stage.name] = dataloader + return dataloaders + + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file") diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index ec48f8bc..97ad2c31 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -2,7 +2,7 @@ import os from dataclasses import dataclass, fields from pathlib import Path -from typing import Optional, Type, Union +from typing import List, Optional, Type, Union import dacite import torch @@ -108,6 +108,19 @@ def __post_init__(self): self.seed = DEFAULT_SEED +@dataclass +class DatasetStageArgs: + """Arguments for loading dataset in different stages of the training process""" + + name: str + start_training_step: int + data: DataArgs + + def __post_init__(self): + if self.start_training_step < 0: + raise ValueError(f"training_steps should be a positive integer and not {self.start_training_step}") + + @dataclass class CheckpointsArgs: """Arguments related to checkpoints: @@ -298,7 +311,7 @@ class Config: logging: Optional[LoggingArgs] = None tokens: Optional[TokensArgs] = None optimizer: Optional[OptimizerArgs] = None - data: Optional[DataArgs] = None + data_stages: Optional[List[DatasetStageArgs]] = None profiler: Optional[ProfilerArgs] = None lighteval: Optional[LightEvalConfig] = None @@ -317,6 +330,22 @@ def __post_init__(self): self.tokens.train_steps - self.optimizer.learning_rate_scheduler.lr_warmup_steps ) + if self.data_stages is not None: + names = [stage.name for stage in self.data_stages] + training_steps = [stage.start_training_step for stage in self.data_stages] + assert all( + stage.start_training_step == 1 for stage in self.data_stages + ), "You must have a training stage starting at 1 in the config's data_stages" + + for stage in self.data_stages: + if names.count(stage.name) > 1: + raise ValueError(f"Each stage should have unique names and not {names}") + + if training_steps.count(stage.start_training_step) > 1: + raise ValueError( + f"Each stage should have unique starting training step, please change the starting training step for stage {stage.name}" + ) + # # if lighteval, we need tokenizer to be defined # if self.checkpoints.lighteval is not None: # assert self.tokenizer.tokenizer_name_or_path is not None diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index eb6a5b4a..19b04f51 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -16,15 +16,18 @@ Tuple, Type, Union, + cast, ) import torch from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader from nanotron import distributed as dist from nanotron import logging from nanotron.config import ( Config, + DatasetStageArgs, ExistingCheckpointInit, ParallelismArgs, RandomInit, @@ -222,6 +225,8 @@ def __init__( self.sequence_length = 
self.config.tokens.sequence_length self.iteration_step = self.start_iteration_step self.limit_val_batches = self.config.tokens.limit_val_batches + # NOTE: the dataloader currently in use for the current training stage + self.current_dataloader: Optional[DataLoader] = None self.post_init() @@ -246,24 +251,77 @@ def post_train_step(self): def post_training(self): pass + def _print_training_plan(self): + stages_info = "".join( + f"[Stage {stage.name}] start from step {stage.start_training_step} \n" for stage in self.config.data_stages + ) + full_log_message = f"[Training Plan] There are {len(self.config.data_stages)} training stages \n{stages_info}" + log_rank(full_log_message, logger=logger, level=logging.INFO, rank=0) + + def _update_dataloader_based_on_training_stages(self, dataloaders: List[DataLoader]): + assert len(dataloaders) > 0, "No dataloaders provided" + assert len(dataloaders) == len( + self.config.data_stages + ), "Number of dataloaders should match the number of dataset stages" + + def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): + import gc + + log_rank( + f"[Training Stage: {stage_name}] Clearing the previous training stage's dataloader and datasets from memory", + logger=logger, + level=logging.INFO, + ) + + # NOTE: Clear the internal state of the dataloader + del dataloader.dataset + del dataloader.sampler + del dataloader.batch_sampler + + gc.collect() + + dataloader = None + for stage_id, stage in enumerate(self.config.data_stages): + stage = cast(DatasetStageArgs, stage) + + if stage.start_training_step == self.iteration_step: + if self.current_dataloader is not None: + prev_stage_name = self.config.data_stages[stage_id - 1].name + prev_dataloader = dataloaders[prev_stage_name] + if isinstance(prev_dataloader, DataLoader): + # NOTE: we don't need to clear dummy data generator from memory + clear_dataloader_from_memory(prev_dataloader, stage_name=stage.name) + + log_rank( + f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.hf_dataset_or_datasets}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + dataloader = dataloaders[stage.name] + # NOTE: if a dataloader is lazy initialized, we need to call it to initialize it + dataloader = dataloader() if callable(dataloader) else dataloader + break + + if dataloader is not None: + self.current_dataloader = sanity_check_dataloader( + dataloader=dataloader, parallel_context=self.parallel_context, config=self.config + ) + def train( self, - dataloader_or_dls: Union[Iterator[Dict[str, Union[torch.Tensor, TensorPointer]]], Tuple[Iterator, ...]], + dataloader_or_dls: Dict[ + str, Union[Iterator[Dict[str, Union[torch.Tensor, TensorPointer]]], Tuple[Iterator, ...]] + ], **kwargs, ) -> None: self.pre_training(**kwargs) + self._print_training_plan() if self.config.checkpoints.save_initial_state and self.init_checkpoint_path is None: self.save_checkpoint() - if isinstance(dataloader_or_dls, tuple): - dataloader = dataloader_or_dls[0] - else: - dataloader = dataloader_or_dls - dataloader = sanity_check_dataloader( - dataloader=dataloader, parallel_context=self.parallel_context, config=self.config - ) - self.pipeline_engine: PipelineEngine = self.config.parallelism.pp_engine self.pipeline_engine.nb_microbatches = self.n_micro_batches_per_batch @@ -289,10 +347,12 @@ def train( for self.iteration_step in range(self.start_iteration_step + 1, self.config.tokens.train_steps + 1): if isinstance(prof, torch.profiler.profile): prof.step() + self.iteration_start_time = time.time() + 
self._update_dataloader_based_on_training_stages(dataloader_or_dls) # Training step - outputs, loss_avg = self.training_step(dataloader=dataloader) + outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) # Training Logs self.consumed_train_samples += self.global_batch_size From 3936ccfa28f9bd41a9461cd7fce9bb6c0ee2b231 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 21 Mar 2024 19:12:55 +0000 Subject: [PATCH 37/73] Added tests --- tests/helpers/data.py | 80 +++++++++++++++++ tests/test_build_nanoset_dataloader.py | 119 +++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 tests/helpers/data.py create mode 100644 tests/test_build_nanoset_dataloader.py diff --git a/tests/helpers/data.py b/tests/helpers/data.py new file mode 100644 index 00000000..544cea0b --- /dev/null +++ b/tests/helpers/data.py @@ -0,0 +1,80 @@ +import os +import sys + +# Hack to import preprocess_data.py +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir))) +from argparse import Namespace + +import torch +from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from nanotron.sanity_checks import assert_tensor_synced_across_pg + +import datasets +from tools.preprocess_data import main + + +def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 500): + + ds_dict = {"indices": list(range(n_samples))} + ds = datasets.Dataset.from_dict(ds_dict) + + def create_text(sample): + return { + "text": f"[{sample['indices']}] Hello! I'm sample {sample['indices']}! And this is my dummy text: {dummy_text}" + } + + new_ds = ds.map(create_text, batched=False).remove_columns(["indices"]) + new_ds.to_json(path_or_buf=path_to_json + ".json") + + +def preprocess_dummy_dataset(path_to_json: str): + # Create args for preprocessing + args = Namespace( + input=path_to_json + ".json", + json_keys=["text"], + output_prefix=path_to_json, + pretrained_model_name_or_path="openai-community/gpt2", + workers=int(min(os.cpu_count(), 8)), + partitions=int((min(os.cpu_count(), 8) / 2)), + append_eod=True, + log_interval=int(1000), + ) + + # tools/preprocess_data.py main + main(args) + + +def assert_batch_dataloader(batch: dict, parallel_context): + """ + batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask + + """ + for element in batch: + tensor = batch[element] + data_class = ( + 0 # 0 if tensor is from the ids, 1 if TensorPointer and 2 if mask. Used in the data parallel group check + ) + + # TensorPointer case: Check that all TensorPointers from the same tp_pg point to the same group_rank. 
Create torch.tensor with group_rank + if isinstance(tensor, TensorPointer): + tensor = torch.tensor(tensor.group_rank) + data_class = 1 + + # Attention Masks case: dtype is torch.bool --> Transform to int64 + if tensor.dtype == torch.bool: + tensor = tensor.long() + data_class = 2 + + # Assert that we have the SAME element in all the processes belonging to the same tensor parallel group + assert_tensor_synced_across_pg( + tensor=tensor.flatten(), + pg=parallel_context.tp_pg, + msg=lambda err: f"{element} is not synchronized across TP {err}", + ) + + # Assert that we have the SAME class of data in all processes belonging to the same data parallel group + assert_tensor_synced_across_pg( + tensor=torch.tensor(data_class), + pg=parallel_context.dp_pg, + msg=lambda err: f"{element} is not synchronized across DP {err}", + ) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py new file mode 100644 index 00000000..10e993be --- /dev/null +++ b/tests/test_build_nanoset_dataloader.py @@ -0,0 +1,119 @@ +import os + +import pytest +import torch +from helpers.context import TestContext +from helpers.data import assert_batch_dataloader, create_dummy_json_dataset, preprocess_dummy_dataset +from helpers.utils import get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use +from nanotron import distributed as dist +from nanotron.data.blended_nanoset import BlendedNanoset +from nanotron.data.dataloader_builder import build_nanoset_dataloader +from nanotron.data.dataset_builder import NanosetBuilder +from nanotron.data.nanoset import Nanoset +from nanotron.data.nanoset_configs import NanosetConfig +from nanotron.data.utils import compute_datasets_num_samples +from nanotron.parallel import ParallelContext + + +@pytest.mark.parametrize( + "tp,dp,pp", + [ + pytest.param(*all_3d_configs) + for gpus in range(1, min(12, 8) + 1) + for all_3d_configs in get_all_3d_configurations(gpus) + ], +) +@rerun_if_address_is_in_use() +def test_build_nanoset_dataloader(tp: int, dp: int, pp: int): + test_context = TestContext() + init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)(test_context=test_context) + + +def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_context: TestContext): + TRAIN_STEPS = 10000 + VAL_CHECK_INTERVAL = TRAIN_STEPS / 4 + VAL_STEPS = TRAIN_STEPS / 10 + MICRO_BATCH_SIZE = 4 + N_MICRO_BATCHES_PER_BATCH = 8 + GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() + + SEED = 1234 + SEQ_LENGTH = 8192 + SPLIT = "8,1,1" + + dataset_1_json_path = os.path.join(test_context.get_auto_remove_tmp_dir(), "pytest_1") + dataset_2_json_path = os.path.join(test_context.get_auto_remove_tmp_dir(), "pytest_2") + dataset_1_bin_path = dataset_1_json_path + "_text" + dataset_2_bin_path = dataset_2_json_path + "_text" + + # Compile helpers & Create dataset files + if dist.get_rank() == 0: + from nanotron.data.utils import compile_helpers + + compile_helpers() + + # Create dummy json datasets + create_dummy_json_dataset(path_to_json=dataset_1_json_path, dummy_text="Nanoset 1!") + create_dummy_json_dataset(path_to_json=dataset_2_json_path, dummy_text="Nanoset 2!") + + # Preprocess dummy json datasets + preprocess_dummy_dataset(path_to_json=dataset_1_json_path) + preprocess_dummy_dataset(path_to_json=dataset_2_json_path) + + torch.distributed.barrier() + + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + + # Create Nanoset configs: 1. Normal 2. 
Blended + split_num_samples = compute_datasets_num_samples( + train_iters=TRAIN_STEPS, + eval_interval=VAL_CHECK_INTERVAL, + eval_iters=VAL_STEPS, + global_batch_size=GLOBAL_BATCH_SIZE, + ) + + nanoset_config = NanosetConfig( + random_seed=SEED, + sequence_length=SEQ_LENGTH, + data_path=dataset_1_bin_path, + split=SPLIT, + split_num_samples=split_num_samples, + path_to_cache=test_context.get_auto_remove_tmp_dir(), + ) + + blended_nanoset_config = NanosetConfig( + random_seed=SEED, + sequence_length=SEQ_LENGTH, + data_path={dataset_1_bin_path: 0.2, dataset_2_bin_path: 0.2}, + split=SPLIT, + split_num_samples=split_num_samples, + path_to_cache=test_context.get_auto_remove_tmp_dir(), + ) + + configs = [nanoset_config, blended_nanoset_config] + dataset_types = [Nanoset, BlendedNanoset] + + for config, dataset_type in zip(configs, dataset_types): + # Create Nanosets + train_dataset, valid_dataset, test_dataset = NanosetBuilder(config).build() + # Check the type of Nanoset and the quantity of Nanosets in BlendedNanoset + assert isinstance(train_dataset, dataset_type) + if isinstance(train_dataset, BlendedNanoset): + assert len(train_dataset.datasets) > 1 + + # Create Dataloaders + dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=SEQ_LENGTH, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + seed_worker=SEED, + dataloader_drop_last=True, + ) + + # Check a batch produced by the Dataloader + batch = next(iter(dataloader)) + assert_batch_dataloader(batch=batch, parallel_context=parallel_context) From baad2f7542fd89ef678e67f35460de76d4a3b452 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 21 Mar 2024 19:14:44 +0000 Subject: [PATCH 38/73] typos --- src/nanotron/data/blended_nanoset.py | 2 +- src/nanotron/data/nanoset.py | 19 ++++--------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index a362d87e..ba49f690 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -27,7 +27,7 @@ class BlendedNanoset(torch.utils.data.Dataset): size (int): The number of samples to draw from the blend - config (BlendedNanosetConfig): The config object which informs dataset creation + config (NanosetConfig): The config object which informs dataset creation Raises: RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 6f67e052..f02fc7ae 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -3,7 +3,7 @@ import os import time from collections import OrderedDict -from typing import Dict, List, Tuple +from typing import Dict, Tuple import numpy import torch @@ -60,8 +60,9 @@ def __init__( self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix self.unique_identifiers["num_samples"] = self.num_samples self.unique_identifiers["index_split"] = self.index_split.name - for attr in self._key_config_attributes(): - self.unique_identifiers[attr] = getattr(self.config, attr) + self.unique_identifiers["split"] = self.config.split + self.unique_identifiers["random_seed"] = self.config.random_seed + self.unique_identifiers["sequence_length"] = self.config.sequence_length self.unique_description = json.dumps(self.unique_identifiers, indent=4) self.unique_description_hash = 
hashlib.md5(self.unique_description.encode("utf-8")).hexdigest() @@ -326,18 +327,6 @@ def get_path_to(suffix): return document_index, sample_index, shuffle_index - @staticmethod - def _key_config_attributes() -> List[str]: - """Return all config attributes which contribute to uniquely identifying the dataset. - - These attributes will be used to build a uniquely identifying string and MD5 hash which - will be used to cache/load the dataset from run to run. - - Returns: - List[str]: The key config attributes - """ - return ["split", "random_seed", "sequence_length"] - def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: """Calculate the number of tokens in a single epoch From 89c3a4eb7e2600caf7cb89c6202d299f982c0cc3 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 21 Mar 2024 20:08:56 +0000 Subject: [PATCH 39/73] fixed test for cuda backend --- tests/helpers/data.py | 4 ++-- tests/test_build_nanoset_dataloader.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 544cea0b..e8bd596a 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -67,14 +67,14 @@ def assert_batch_dataloader(batch: dict, parallel_context): # Assert that we have the SAME element in all the processes belonging to the same tensor parallel group assert_tensor_synced_across_pg( - tensor=tensor.flatten(), + tensor=tensor.flatten().cuda(), pg=parallel_context.tp_pg, msg=lambda err: f"{element} is not synchronized across TP {err}", ) # Assert that we have the SAME class of data in all processes belonging to the same data parallel group assert_tensor_synced_across_pg( - tensor=torch.tensor(data_class), + tensor=torch.tensor(data_class, device="cuda"), pg=parallel_context.dp_pg, msg=lambda err: f"{element} is not synchronized across DP {err}", ) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 10e993be..de9e28e5 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -4,7 +4,7 @@ import torch from helpers.context import TestContext from helpers.data import assert_batch_dataloader, create_dummy_json_dataset, preprocess_dummy_dataset -from helpers.utils import get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use +from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use from nanotron import distributed as dist from nanotron.data.blended_nanoset import BlendedNanoset from nanotron.data.dataloader_builder import build_nanoset_dataloader @@ -19,7 +19,7 @@ "tp,dp,pp", [ pytest.param(*all_3d_configs) - for gpus in range(1, min(12, 8) + 1) + for gpus in range(1, min(available_gpus(), 8) + 1) for all_3d_configs in get_all_3d_configurations(gpus) ], ) From 2f14b78bca4930e6d9d34acf2389a0da419f5a54 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 21 Mar 2024 22:06:33 +0000 Subject: [PATCH 40/73] destroy parallel_context on test end --- tests/test_build_nanoset_dataloader.py | 4 +++- tools/preprocess_data.py | 9 +++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index de9e28e5..6124338a 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -84,7 +84,7 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte blended_nanoset_config = NanosetConfig( 
         random_seed=SEED,
         sequence_length=SEQ_LENGTH,
-        data_path={dataset_1_bin_path: 0.2, dataset_2_bin_path: 0.2},
+        data_path={dataset_1_bin_path: 0.8, dataset_2_bin_path: 0.2},
         split=SPLIT,
         split_num_samples=split_num_samples,
         path_to_cache=test_context.get_auto_remove_tmp_dir(),
@@ -117,3 +117,5 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte
     # Check a batch produced by the Dataloader
     batch = next(iter(dataloader))
     assert_batch_dataloader(batch=batch, parallel_context=parallel_context)
+
+    parallel_context.destroy()
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 22ec2a7a..f7887558 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -140,9 +140,8 @@ def get_args():
 def get_file_name(args, file_id):
     file_name, extension = os.path.splitext(args.input)
     input_file_name = file_name + "_" + str(file_id) + extension
-    sentence_split_file = file_name + "_ss_" + str(file_id) + extension
     output_prefix = args.output_prefix + "_" + str(file_id)
-    file_names = {"partition": input_file_name, "sentence_split": sentence_split_file, "output_prefix": output_prefix}
+    file_names = {"partition": input_file_name, "output_prefix": output_prefix}
     return file_names
 
 
@@ -154,7 +153,8 @@ def check_files_exist(in_ss_out_names, key, num_partitions):
 
 
 def main(args):
-
+    # Check if json file is not empty
+    assert os.path.getsize(args.input), f"{args.input} is empty!"
     # Check if output directory exists
     if not os.path.isdir(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))):
         print(f"Creating {os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))} directory...")
@@ -162,11 +162,8 @@ def main(args):
 
     in_ss_out_names = []
     if args.partitions == 1:
-        file_name, extension = os.path.splitext(args.input)
-        sentence_split_file = file_name + "_ss" + extension
         file_names = {
             "partition": args.input,
-            "sentence_split": sentence_split_file,
             "output_prefix": args.output_prefix,
         }
         in_ss_out_names.append(file_names)

From d2238359cbc55650d1f2e0e1dfc01c71c36b26e7 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Thu, 21 Mar 2024 23:50:58 +0000
Subject: [PATCH 41/73] refactored test

---
 tests/helpers/data.py                  |  7 +++
 tests/test_build_nanoset_dataloader.py | 66 +++++++++++++-------------
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/tests/helpers/data.py b/tests/helpers/data.py
index e8bd596a..95add1fc 100644
--- a/tests/helpers/data.py
+++ b/tests/helpers/data.py
@@ -13,6 +13,13 @@
 from tools.preprocess_data import main
 
 
+def create_dataset_paths(tmp_dir: str, quantity: int):
+    json_dataset_path = [os.path.join(tmp_dir, f"pytest_{i}") for i in range(quantity)]
+    bin_dataset_path = [f"{path}_text" for path in json_dataset_path]
+
+    return json_dataset_path, bin_dataset_path
+
+
 def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 500):
 
     ds_dict = {"indices": list(range(n_samples))}
diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py
index 6124338a..99c53fdf 100644
--- a/tests/test_build_nanoset_dataloader.py
+++ b/tests/test_build_nanoset_dataloader.py
@@ -1,17 +1,18 @@
-import os
-
 import pytest
-import torch
 from helpers.context import TestContext
-from helpers.data import assert_batch_dataloader, create_dummy_json_dataset, preprocess_dummy_dataset
+from helpers.data import (
+    assert_batch_dataloader,
+    create_dataset_paths,
+    create_dummy_json_dataset,
+    preprocess_dummy_dataset,
+)
 from helpers.utils import available_gpus, 
get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use -from nanotron import distributed as dist from nanotron.data.blended_nanoset import BlendedNanoset from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.data.dataset_builder import NanosetBuilder from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import compute_datasets_num_samples +from nanotron.data.utils import compile_helpers, compute_datasets_num_samples from nanotron.parallel import ParallelContext @@ -19,17 +20,35 @@ "tp,dp,pp", [ pytest.param(*all_3d_configs) - for gpus in range(1, min(available_gpus(), 8) + 1) + for gpus in range(1, min(available_gpus(), 4) + 1) for all_3d_configs in get_all_3d_configurations(gpus) ], ) @rerun_if_address_is_in_use() def test_build_nanoset_dataloader(tp: int, dp: int, pp: int): test_context = TestContext() - init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)(test_context=test_context) + + # Compile helpers & Create dataset files + compile_helpers() + + json_paths, bin_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) + + # Create dummy json datasets + for idx, json_path in enumerate(json_paths): + create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!") + + # Preprocess dummy json datasets + for json_path in json_paths: + preprocess_dummy_dataset(path_to_json=json_path) + + init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)( + test_context=test_context, path_to_bin_files=bin_paths + ) -def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_context: TestContext): +def _test_build_nanoset_dataloader( + parallel_context: ParallelContext, test_context: TestContext, path_to_bin_files: str +): TRAIN_STEPS = 10000 VAL_CHECK_INTERVAL = TRAIN_STEPS / 4 VAL_STEPS = TRAIN_STEPS / 10 @@ -39,28 +58,7 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte SEED = 1234 SEQ_LENGTH = 8192 - SPLIT = "8,1,1" - - dataset_1_json_path = os.path.join(test_context.get_auto_remove_tmp_dir(), "pytest_1") - dataset_2_json_path = os.path.join(test_context.get_auto_remove_tmp_dir(), "pytest_2") - dataset_1_bin_path = dataset_1_json_path + "_text" - dataset_2_bin_path = dataset_2_json_path + "_text" - - # Compile helpers & Create dataset files - if dist.get_rank() == 0: - from nanotron.data.utils import compile_helpers - - compile_helpers() - - # Create dummy json datasets - create_dummy_json_dataset(path_to_json=dataset_1_json_path, dummy_text="Nanoset 1!") - create_dummy_json_dataset(path_to_json=dataset_2_json_path, dummy_text="Nanoset 2!") - - # Preprocess dummy json datasets - preprocess_dummy_dataset(path_to_json=dataset_1_json_path) - preprocess_dummy_dataset(path_to_json=dataset_2_json_path) - - torch.distributed.barrier() + SPLIT = "950,40,10" input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) @@ -75,7 +73,7 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte nanoset_config = NanosetConfig( random_seed=SEED, sequence_length=SEQ_LENGTH, - data_path=dataset_1_bin_path, + data_path=path_to_bin_files[0], split=SPLIT, split_num_samples=split_num_samples, path_to_cache=test_context.get_auto_remove_tmp_dir(), @@ -84,7 +82,7 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte blended_nanoset_config = NanosetConfig( random_seed=SEED, sequence_length=SEQ_LENGTH, - 
data_path={dataset_1_bin_path: 0.8, dataset_2_bin_path: 0.2}, + data_path={bin_path: 5 - idx for idx, bin_path in enumerate(path_to_bin_files)}, split=SPLIT, split_num_samples=split_num_samples, path_to_cache=test_context.get_auto_remove_tmp_dir(), @@ -118,4 +116,4 @@ def _test_build_nanoset_dataloader(parallel_context: ParallelContext, test_conte batch = next(iter(dataloader)) assert_batch_dataloader(batch=batch, parallel_context=parallel_context) - parallel_context.destroy() + parallel_context.destroy() From 2e15ab84766a04310f02b8c53229e5b4645bb954 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 22 Mar 2024 09:41:16 +0000 Subject: [PATCH 42/73] fixed dataset dependency --- tests/helpers/data.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 95add1fc..227266cd 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -1,3 +1,4 @@ +import json import os import sys @@ -9,7 +10,6 @@ from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer from nanotron.sanity_checks import assert_tensor_synced_across_pg -import datasets from tools.preprocess_data import main @@ -22,16 +22,11 @@ def create_dataset_paths(tmp_dir: str, quantity: int): def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 500): - ds_dict = {"indices": list(range(n_samples))} - ds = datasets.Dataset.from_dict(ds_dict) - - def create_text(sample): - return { - "text": f"[{sample['indices']}] Hello! I'm sample {sample['indices']}! And this is my dummy text: {dummy_text}" - } - - new_ds = ds.map(create_text, batched=False).remove_columns(["indices"]) - new_ds.to_json(path_or_buf=path_to_json + ".json") + with open(path_to_json + ".json", "a") as json_file: + for sample in range(n_samples): + sample_dict = {"text": f"[{sample}] Hello! Im sample {sample}! And this is my dummy text: {dummy_text}"} + json_file.write(json.dumps(sample_dict)) + json_file.write("\n") def preprocess_dummy_dataset(path_to_json: str): From 4924d549cfab0d11e2507279e34f319b7a06b840 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 22 Mar 2024 10:07:45 +0000 Subject: [PATCH 43/73] Added tensor shape assertion --- tests/helpers/data.py | 6 +++++- tests/test_build_nanoset_dataloader.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 227266cd..ca08b838 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -46,7 +46,7 @@ def preprocess_dummy_dataset(path_to_json: str): main(args) -def assert_batch_dataloader(batch: dict, parallel_context): +def assert_batch_dataloader(batch: dict, parallel_context, micro_batch_size: int, sequence_length: int): """ batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask @@ -57,6 +57,10 @@ def assert_batch_dataloader(batch: dict, parallel_context): 0 # 0 if tensor is from the ids, 1 if TensorPointer and 2 if mask. Used in the data parallel group check ) + # Check shape of mask and ids tensors + if isinstance(tensor, torch.Tensor): + assert tensor.shape == (micro_batch_size, sequence_length) + # TensorPointer case: Check that all TensorPointers from the same tp_pg point to the same group_rank. 
Create torch.tensor with group_rank if isinstance(tensor, TensorPointer): tensor = torch.tensor(tensor.group_rank) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 99c53fdf..819cced1 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -114,6 +114,11 @@ def _test_build_nanoset_dataloader( # Check a batch produced by the Dataloader batch = next(iter(dataloader)) - assert_batch_dataloader(batch=batch, parallel_context=parallel_context) + assert_batch_dataloader( + batch=batch, + parallel_context=parallel_context, + micro_batch_size=MICRO_BATCH_SIZE, + sequence_length=SEQ_LENGTH, + ) parallel_context.destroy() From b052a23e76864eee259c4c302d7273f7111a5a4c Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 22 Mar 2024 12:27:51 +0100 Subject: [PATCH 44/73] updated dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ebb81b8f..df3dcf5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "packaging", "safetensors", "dacite", - "tqdm" + "tqdm", + "transformers" ] [tool.setuptools.packages.find] From e6723f31f4897b59de4a82128d70792cc3832d69 Mon Sep 17 00:00:00 2001 From: Phuc Nguyen Date: Fri, 22 Mar 2024 12:07:27 +0000 Subject: [PATCH 45/73] refactor --- src/nanotron/trainer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 19b04f51..c0b4c76e 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -237,6 +237,15 @@ def post_init(self): pass def pre_training(self, *args, **kwargs): + self._print_training_plan() + + log_rank( + f"[Start training] datetime: {datetime.datetime.now()} | mbs: {self.micro_batch_size} | grad_accum: {self.n_micro_batches_per_batch} | global_batch_size: {self.global_batch_size} | sequence_length: {self.sequence_length} | train_steps: {self.config.tokens.train_steps} | start_iteration_step: {self.start_iteration_step} | consumed_train_samples: {self.consumed_train_samples}", # noqa + logger=logger, + level=logging.INFO, + rank=0, + ) + current_time = datetime.datetime.now().strftime("%d/%m/%Y_%H:%M:%S") if dist.get_rank(self.parallel_context.world_pg) == self.logger_ranks[0] and wandb is not None: wandb.init( @@ -273,7 +282,7 @@ def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): level=logging.INFO, ) - # NOTE: Clear the internal state of the dataloader + # NOTE: Clear dataloader from memory del dataloader.dataset del dataloader.sampler del dataloader.batch_sampler @@ -317,7 +326,6 @@ def train( **kwargs, ) -> None: self.pre_training(**kwargs) - self._print_training_plan() if self.config.checkpoints.save_initial_state and self.init_checkpoint_path is None: self.save_checkpoint() @@ -326,12 +334,6 @@ def train( self.pipeline_engine.nb_microbatches = self.n_micro_batches_per_batch - log_rank( - f"[Start training] datetime: {datetime.datetime.now()} | mbs: {self.micro_batch_size} | grad_accum: {self.n_micro_batches_per_batch} | global_batch_size: {self.global_batch_size} | sequence_length: {self.sequence_length} | train_steps: {self.config.tokens.train_steps} | start_iteration_step: {self.start_iteration_step} | consumed_train_samples: {self.consumed_train_samples}", # noqa - logger=logger, - level=logging.INFO, - rank=0, - ) # TODO @nouamanetazi: refactor this # Useful mapping self.unwrapped_model = self.model.module if 
isinstance(self.model, DistributedDataParallel) else self.model From 0c5df846d24597b3973ceb811f51021dfa31b5b7 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 24 Mar 2024 11:32:02 +0000 Subject: [PATCH 46/73] Simplified preprocess_data.py --- tools/preprocess_data.py | 136 +++++++++++++++------------------------ 1 file changed, 52 insertions(+), 84 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index f7887558..bf20df66 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -23,30 +23,13 @@ def initializer(self): def encode(self, json_line): data = json.loads(json_line) - ids = {} - lens = {} - for key in self.args.json_keys: - text = data[key] - if isinstance(text, list): - sentences = text - else: - sentences = [text] - doc_ids = [] - sentence_lens = [] - for sentence in sentences: - sentence_ids = Encoder.tokenizer.encode(sentence) - - if len(sentence_ids) > 0: - doc_ids.extend(sentence_ids) - sentence_lens.append(len(sentence_ids)) - - if len(doc_ids) > 0 and self.args.append_eod: - doc_ids.append(Encoder.tokenizer.eos_token_id) - sentence_lens[-1] += 1 - - ids[key] = doc_ids - lens[key] = sentence_lens - return ids, lens, len(json_line) + text = data[self.args.json_key] + + text_ids = Encoder.tokenizer.encode(text) + if self.args.append_eos: + text_ids.append(Encoder.tokenizer.eos_token_id) + + return text_ids, len(text_ids), len(json_line) class Partition(object): @@ -72,39 +55,31 @@ def process_json_file(self, file_name): pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) encoded_docs = pool.imap(encoder.encode, fin, 32) - output_bin_files = {} - output_idx_files = {} - builders = {} + output_bin_file = "{}_{}.bin".format(output_prefix, self.args.json_key) + output_idx_file = "{}_{}.idx".format(output_prefix, self.args.json_key) - for key in self.args.json_keys: - output_bin_files[key] = "{}_{}.bin".format(output_prefix, key) - output_idx_files[key] = "{}_{}.idx".format(output_prefix, key) - builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_files[key], - dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), - ) + builder = indexed_dataset.MMapIndexedDatasetBuilder( + output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) + ) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) - for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): + for i, (text_ids, len_text_ids, bytes_processed) in enumerate(encoded_docs, start=1): total_bytes_processed += bytes_processed - for key in doc.keys(): - builders[key].add_document(doc[key], sentence_lens[key]) + builder.add_document([text_ids], [len_text_ids]) self.print_processing_stats(i, proc_start, total_bytes_processed) fin.close() - builders[key].finalize(output_idx_files[key]) + builder.finalize(output_idx_file) def get_args(): parser = argparse.ArgumentParser() group = parser.add_argument_group(title="input data") group.add_argument("--input", type=str, required=True, help="Path to input JSON") - group.add_argument( - "--json-keys", nargs="+", default=["text"], help="space separate listed of keys to extract from json" - ) + group.add_argument("--json-key", type=str, default="text", help="Key to extract from json") group = parser.add_argument_group(title="tokenizer") group.add_argument( @@ -113,10 +88,12 @@ def get_args(): required=True, help="A path to a directory containing vocabulary files required by 
the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", ) - group.add_argument("--append-eod", action="store_true", help="Append an token to the end of a document.") + group.add_argument("--append-eos", action="store_true", help="Append an token to the end of a sample.") group = parser.add_argument_group(title="output data") - group.add_argument("--output-prefix", type=str, required=True, help="Path to binary output file without suffix") + group.add_argument( + "--output-prefix", type=str, required=True, help="Path to binary and index output files without suffix" + ) group = parser.add_argument_group(title="runtime") group.add_argument( @@ -145,13 +122,6 @@ def get_file_name(args, file_id): return file_names -def check_files_exist(in_ss_out_names, key, num_partitions): - for i in range(num_partitions): - if not os.path.exists(in_ss_out_names[i][key]): - return False - return True - - def main(args): # Check if json file is not empty assert os.path.getsize(args.input), f"{args.input} is empty!" @@ -175,27 +145,23 @@ def main(args): in_ss_out_name = get_file_name(args, idx) in_ss_out_names.append(in_ss_out_name) - # Check to see if paritions were already created - partitions_present = check_files_exist(in_ss_out_names, "partition", args.partitions) - - if not partitions_present: - # Populate .jsonl partition files from parent files - partitioned_input_files = [] - for idx in range(args.partitions): - partitioned_input_file = open(in_ss_out_names[idx]["partition"], "w") - partitioned_input_files.append(partitioned_input_file) + # Populate .jsonl partition files from parent files + partitioned_input_files = [] + for idx in range(args.partitions): + partitioned_input_file = open(in_ss_out_names[idx]["partition"], "w") + partitioned_input_files.append(partitioned_input_file) - index = 0 - for in_file_name in in_file_names: - fin = open(in_file_name, "r", encoding="utf-8") + index = 0 + for in_file_name in in_file_names: + fin = open(in_file_name, "r", encoding="utf-8") - for line in fin: - partitioned_input_files[index].write(line) - index = (index + 1) % args.partitions - fin.close() + for line in fin: + partitioned_input_files[index].write(line) + index = (index + 1) % args.partitions + fin.close() - for idx in range(args.partitions): - partitioned_input_files[idx].close() + for idx in range(args.partitions): + partitioned_input_files[idx].close() assert args.workers % args.partitions == 0 partition = Partition(args, args.workers // args.partitions) @@ -217,25 +183,27 @@ def main(args): return # Merge bin/idx partitions - output_bin_files = {} - output_idx_files = {} - builders = {} - tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) - for key in args.json_keys: - output_bin_files[key] = "{}_{}.bin".format(args.output_prefix, key) - output_idx_files[key] = "{}_{}.idx".format(args.output_prefix, key) - builders[key] = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_files[key], - dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), - ) + output_bin_file = "{}_{}.bin".format(args.output_prefix, args.json_key) + output_idx_file = "{}_{}.idx".format(args.output_prefix, args.json_key) - for name in in_ss_out_names: - parition_output_prefix = name["output_prefix"] - full_partition_output_prefix = "{}_{}".format(parition_output_prefix, key) - builders[key].add_index(full_partition_output_prefix) - builders[key].finalize(output_idx_files[key]) + builder = indexed_dataset.MMapIndexedDatasetBuilder( + 
output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) + ) + + for name in in_ss_out_names: + parition_output_prefix = name["output_prefix"] + full_partition_output_prefix = "{}_{}".format(parition_output_prefix, args.json_key) + builder.add_index(full_partition_output_prefix) + builder.finalize(output_idx_file) + + # Clean temporary files + for name in in_ss_out_names: + os.remove(name["partition"]) + output_prefixs = glob.glob(name["output_prefix"] + "*") + for output in output_prefixs: + os.remove(output) if __name__ == "__main__": From b9cac6e75eb0e7066074e2cbd3d5292a6cb651ff Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 24 Mar 2024 11:46:00 +0000 Subject: [PATCH 47/73] Added random generation of hyperparameters to build nanoset dataloaders test --- tests/helpers/data.py | 6 +++--- tests/test_build_nanoset_dataloader.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index ca08b838..6f39743e 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -20,7 +20,7 @@ def create_dataset_paths(tmp_dir: str, quantity: int): return json_dataset_path, bin_dataset_path -def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 500): +def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 50000): with open(path_to_json + ".json", "a") as json_file: for sample in range(n_samples): @@ -33,12 +33,12 @@ def preprocess_dummy_dataset(path_to_json: str): # Create args for preprocessing args = Namespace( input=path_to_json + ".json", - json_keys=["text"], + json_key="text", output_prefix=path_to_json, pretrained_model_name_or_path="openai-community/gpt2", workers=int(min(os.cpu_count(), 8)), partitions=int((min(os.cpu_count(), 8) / 2)), - append_eod=True, + append_eos=True, log_interval=int(1000), ) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 819cced1..cdc741d1 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -1,3 +1,5 @@ +import random + import pytest from helpers.context import TestContext from helpers.data import ( @@ -49,16 +51,18 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int): def _test_build_nanoset_dataloader( parallel_context: ParallelContext, test_context: TestContext, path_to_bin_files: str ): - TRAIN_STEPS = 10000 + SEED = 1234 + random.seed(SEED) + + TRAIN_STEPS = random.randint(1000, 10000) VAL_CHECK_INTERVAL = TRAIN_STEPS / 4 VAL_STEPS = TRAIN_STEPS / 10 - MICRO_BATCH_SIZE = 4 - N_MICRO_BATCHES_PER_BATCH = 8 + MICRO_BATCH_SIZE = 2 ** random.randint(1, 4) + N_MICRO_BATCHES_PER_BATCH = 2 ** random.randint(1, 4) GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() - SEED = 1234 - SEQ_LENGTH = 8192 - SPLIT = "950,40,10" + SEQ_LENGTH = 2 ** random.randint(10, 14) + SPLIT = "{},{},{}".format(random.randint(1, 100), random.randint(1, 100), random.randint(1, 100)) input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) @@ -82,7 +86,7 @@ def _test_build_nanoset_dataloader( blended_nanoset_config = NanosetConfig( random_seed=SEED, sequence_length=SEQ_LENGTH, - data_path={bin_path: 5 - idx for idx, bin_path in enumerate(path_to_bin_files)}, + data_path={bin_path: random.randint(1, 100) for bin_path in path_to_bin_files}, split=SPLIT, split_num_samples=split_num_samples, path_to_cache=test_context.get_auto_remove_tmp_dir(), @@ -108,7 
+112,6 @@ def _test_build_nanoset_dataloader( output_pp_rank=output_pp_rank, micro_batch_size=MICRO_BATCH_SIZE, dataloader_num_workers=0, - seed_worker=SEED, dataloader_drop_last=True, ) From a5041b7b25d46a6cd0dc0025b53b061dc56476c8 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sun, 24 Mar 2024 12:03:53 +0000 Subject: [PATCH 48/73] Simplified Nanosets --- run_train_nanoset.py | 3 - src/nanotron/data/blended_nanoset.py | 8 +- src/nanotron/data/dataloader_builder.py | 5 +- src/nanotron/data/dataset_builder.py | 24 +- src/nanotron/data/helpers.cpp | 592 ------------------------ src/nanotron/data/nanoset.py | 152 +----- 6 files changed, 40 insertions(+), 744 deletions(-) diff --git a/run_train_nanoset.py b/run_train_nanoset.py index 5b122d61..101f15d7 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -56,7 +56,6 @@ def get_dataloaders(trainer: DistributedTrainer): micro_batch_size=trainer.micro_batch_size, consumed_train_samples=trainer.consumed_train_samples, dataloader_num_workers=trainer.config.data.num_loading_workers, - seed_worker=trainer.config.data.seed, dataloader_drop_last=True, ) @@ -68,7 +67,6 @@ def get_dataloaders(trainer: DistributedTrainer): output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, dataloader_num_workers=trainer.config.data.num_loading_workers, - seed_worker=trainer.config.data.seed, dataloader_drop_last=True, ) @@ -80,7 +78,6 @@ def get_dataloaders(trainer: DistributedTrainer): output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, dataloader_num_workers=trainer.config.data.num_loading_workers, - seed_worker=trainer.config.data.seed, dataloader_drop_last=True, ) diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index ba49f690..4e945c5d 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -11,7 +11,6 @@ from nanotron import logging from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import normalize from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -48,9 +47,6 @@ def __init__( if len(datasets) == 1: log_rank("Building a BlendedNanoset for a single Nanoset", logger=logger, level=logging.WARNING, rank=0) - # Redundant normalization for bitwise identical comparison with Megatron-LM - weights = normalize(weights) - self.datasets = datasets self.weights = weights self.size = size @@ -67,7 +63,7 @@ def __init__( self.unique_description = json.dumps(unique_identifiers, indent=4) self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest() - self.dataset_index, self.dataset_sample_index = self._build_indices() + self.dataset_index, self.dataset_sample_index = self.build_indices() # Check size _ = self[self.size - 1] @@ -86,7 +82,7 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: return self.datasets[dataset_id][dataset_sample_id] - def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: + def build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: """Build and optionally cache the dataset index and the dataset sample index The dataset index is a 1-D mapping which determines the dataset to query. 
The dataset diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index 4c584047..c5467064 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -27,7 +27,6 @@ def build_nanoset_dataloader( output_pp_rank: int, micro_batch_size: int, dataloader_num_workers: int, - seed_worker: int, consumed_train_samples: int = 0, dataloader_drop_last: bool = True, dataloader_pin_memory: bool = True, @@ -55,7 +54,6 @@ def build_nanoset_dataloader( dataset=dataset, dl_ranks_size=dp_ranks_size, dl_rank=dp_rank, - seed=seed_worker, drop_last=dataloader_drop_last, consumed_train_samples=consumed_train_samples, ) @@ -152,13 +150,12 @@ def get_sampler( dl_ranks_size: int, dl_rank: int, dataset, - seed: int, consumed_train_samples: int, drop_last: Optional[bool] = True, ) -> Optional[Sampler]: """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank""" - sampler = DistributedSampler(dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last) + sampler = DistributedSampler(dataset, num_replicas=dl_ranks_size, rank=dl_rank, shuffle=False, drop_last=drop_last) if consumed_train_samples > 0: sampler = SkipBatchSampler(sampler, skip_batches=consumed_train_samples, dp_size=dl_ranks_size) diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index 5aafa95e..db42856e 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -48,13 +48,13 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: # Single Nanoset if isinstance(data_path, str): - return self._build_nanoset_dataset_splits(data_path, split, self.config.split_num_samples) + return self.build_nanoset_dataset_splits(data_path, split, self.config.split_num_samples) # Blended Nanoset prefix_per_dataset = list(data_path.keys()) weight_per_dataset = normalize(list(data_path.values())) - # NOTE: Use 0.5% target margin to ensure we satiate the network + # NOTE: Use 0.5% target margin to ensure that that we do not exhaust the indices of any dataset in the Blend sizes_per_dataset = [ [ int(math.ceil(target_num_samples * weight * 1.005)) @@ -66,9 +66,10 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: nanoset_datasets = [[] for _ in range(len(Split))] for i in range(len(prefix_per_dataset)): - nanoset_datasets_split = self._build_nanoset_dataset_splits( + nanoset_datasets_split = self.build_nanoset_dataset_splits( prefix_per_dataset[i], split, sizes_per_dataset[i] ) + for j in range(len(nanoset_datasets_split)): nanoset_datasets[j].append(nanoset_datasets_split[j]) @@ -84,9 +85,9 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: assert all(is_none) blended_nanosets.append(None) else: - assert all(is_none) or not any(is_none) + assert not any(is_none) blended_nanosets.append( - self._build_generic_dataset( + self.build_generic_dataset( BlendedNanoset, nanoset_datasets[i], weight_per_dataset, @@ -97,7 +98,7 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: return blended_nanosets - def _build_nanoset_dataset_splits( + def build_nanoset_dataset_splits( self, path_prefix: str, split: List[float], @@ -115,10 +116,9 @@ def _build_nanoset_dataset_splits( Returns: List[Nanoset]: The Nanoset per split. 
Always returns Nanosets because we build them in each and every rank """ + indexed_dataset = self.build_generic_dataset(MMapIndexedDataset, path_prefix) - indexed_dataset = self._build_generic_dataset(MMapIndexedDataset, path_prefix) - - split_idx_bounds = _get_split_indices(split, indexed_dataset.sequence_lengths.shape[0]) + split_idx_bounds = get_split_indices(split, indexed_dataset.sequence_lengths.shape[0]) split_indices = [ numpy.arange( @@ -136,14 +136,14 @@ def _build_nanoset_dataset_splits( nanoset_datasets.append(None) else: nanoset_datasets.append( - self._build_generic_dataset( + self.build_generic_dataset( Nanoset, indexed_dataset, split_indices[i], sizes[i], _split, self.config ) ) return nanoset_datasets - def _build_generic_dataset( + def build_generic_dataset( self, cls: Type[DistributedDataset], *args: Any, @@ -190,7 +190,7 @@ def _build_generic_dataset( return cls(*args) -def _get_split_indices(split: List[float], num_elements: int) -> List[int]: +def get_split_indices(split: List[float], num_elements: int) -> List[int]: """Determine the document index bounds per split Args: diff --git a/src/nanotron/data/helpers.cpp b/src/nanotron/data/helpers.cpp index 9c5b4407..d11f86f7 100644 --- a/src/nanotron/data/helpers.cpp +++ b/src/nanotron/data/helpers.cpp @@ -166,600 +166,8 @@ py::array build_sample_idx(const py::array_t &sizes_, free_when_done); // numpy array references } -inline int32_t get_target_sample_len(const int32_t short_seq_ratio, - const int32_t max_length, - std::mt19937 &rand32_gen) -{ - /* Training sample length. */ - if (short_seq_ratio == 0) - { - return max_length; - } - const auto random_number = rand32_gen(); - if ((random_number % short_seq_ratio) == 0) - { - return 2 + random_number % (max_length - 1); - } - return max_length; -} - -template -py::array build_mapping_impl(const py::array_t &docs_, - const py::array_t &sizes_, - const int32_t num_epochs, - const uint64_t max_num_samples, - const int32_t max_seq_length, - const double short_seq_prob, - const int32_t seed, - const bool verbose, - const int32_t min_num_sent) -{ - /* Build a mapping of (start-index, end-index, sequence-length) where - start and end index are the indices of the sentences in the sample - and sequence-length is the target sequence length. - */ - - // Consistency checks. - assert(num_epochs > 0); - assert(max_seq_length > 1); - assert(short_seq_prob >= 0.0); - assert(short_seq_prob <= 1.0); - assert(seed > 0); - - // Remove bound checks. - auto docs = docs_.unchecked<1>(); - auto sizes = sizes_.unchecked<1>(); - - // For efficiency, convert probability to ratio. Note: rand() generates int. 
- int32_t short_seq_ratio = 0; - if (short_seq_prob > 0) - { - short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); - } - - if (verbose) - { - const auto sent_start_index = docs[0]; - const auto sent_end_index = docs[docs_.shape(0) - 1]; - const auto num_sentences = sent_end_index - sent_start_index; - cout << " using:" << endl - << std::flush; - cout << " number of documents: " << docs_.shape(0) - 1 << endl - << std::flush; - cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl - << std::flush; - cout << " total number of sentences: " << num_sentences << endl - << std::flush; - cout << " number of epochs: " << num_epochs << endl - << std::flush; - cout << " maximum number of samples: " << max_num_samples << endl - << std::flush; - cout << " maximum sequence length: " << max_seq_length << endl - << std::flush; - cout << " short sequence probability: " << short_seq_prob << endl - << std::flush; - cout << " short sequence ration (1/prob): " << short_seq_ratio << endl - << std::flush; - cout << " seed: " << seed << endl - << std::flush; - } - - // Mapping and it's length (1D). - int64_t num_samples = -1; - DocIdx *maps = NULL; - - // Perform two iterations, in the first iteration get the size - // and allocate memory and in the second iteration populate the map. - bool second = false; - for (int32_t iteration = 0; iteration < 2; ++iteration) - { - - // Set the seed so both iterations produce the same results. - std::mt19937 rand32_gen(seed); - - // Set the flag on second iteration. - second = (iteration == 1); - - // Counters: - uint64_t empty_docs = 0; - uint64_t one_sent_docs = 0; - uint64_t long_sent_docs = 0; - - // Current map index. - uint64_t map_index = 0; - - // For each epoch: - for (int32_t epoch = 0; epoch < num_epochs; ++epoch) - { - if (map_index >= max_num_samples) - { - if (verbose && (!second)) - { - cout << " reached " << max_num_samples << " samples after " - << epoch << " epochs ..." << endl - << std::flush; - } - break; - } - // For each document: - for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) - { - - // Document sentences are in [sent_index_first, sent_index_last) - const auto sent_index_first = docs[doc]; - const auto sent_index_last = docs[doc + 1]; - - // At the beginning of the document previous index is the - // start index. - auto prev_start_index = sent_index_first; - - // Remaining documents. - auto num_remain_sent = sent_index_last - sent_index_first; - - // Some bookkeeping - if ((epoch == 0) && (!second)) - { - if (num_remain_sent == 0) - { - ++empty_docs; - } - if (num_remain_sent == 1) - { - ++one_sent_docs; - } - } - - // Detect documents with long sentences. - bool contains_long_sentence = false; - if (num_remain_sent > 1) - { - for (auto sent_index = sent_index_first; - sent_index < sent_index_last; ++sent_index) - { - if (sizes[sent_index] > LONG_SENTENCE_LEN) - { - if ((epoch == 0) && (!second)) - { - ++long_sent_docs; - } - contains_long_sentence = true; - break; - } - } - } - - // If we have more than two sentences. - if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) - { - - // Set values. - auto seq_len = int32_t{0}; - auto num_sent = int32_t{0}; - auto target_seq_len = get_target_sample_len(short_seq_ratio, - max_seq_length, - rand32_gen); - - // Loop through sentences. - for (auto sent_index = sent_index_first; - sent_index < sent_index_last; ++sent_index) - { - - // Add the size and number of sentences. 
- seq_len += sizes[sent_index]; - ++num_sent; - --num_remain_sent; - - // If we have reached the target length. - // and if not only one sentence is left in the document. - // and if we have at least two sentneces. - // and if we have reached end of the document. - if (((seq_len >= target_seq_len) && - (num_remain_sent > 1) && - (num_sent >= min_num_sent)) || - (num_remain_sent == 0)) - { - - // Check for overflow. - if ((3 * map_index + 2) > - std::numeric_limits::max()) - { - cout << "number of samples exceeded maximum " - << "allowed by type int64: " - << std::numeric_limits::max() - << endl; - throw std::overflow_error("Number of samples"); - } - - // Populate the map. - if (second) - { - const auto map_index_0 = 3 * map_index; - maps[map_index_0] = static_cast(prev_start_index); - maps[map_index_0 + 1] = static_cast(sent_index + 1); - maps[map_index_0 + 2] = static_cast(target_seq_len); - } - - // Update indices / counters. - ++map_index; - prev_start_index = sent_index + 1; - target_seq_len = get_target_sample_len(short_seq_ratio, - max_seq_length, - rand32_gen); - seq_len = 0; - num_sent = 0; - } - - } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { - - if (!second) - { - if (verbose) - { - cout << " number of empty documents: " << empty_docs << endl - << std::flush; - cout << " number of documents with one sentence: " << one_sent_docs << endl - << std::flush; - cout << " number of documents with long sentences: " << long_sent_docs << endl - << std::flush; - cout << " will create mapping for " << map_index << " samples" << endl - << std::flush; - } - assert(maps == NULL); - assert(num_samples < 0); - maps = new DocIdx[3 * map_index]; - num_samples = static_cast(map_index); - } - - } // for (int iteration=0; iteration < 2; ++iteration) { - - // Shuffle. - // We need a 64 bit random number generator as we might have more - // than 2 billion samples. - std::mt19937_64 rand64_gen(seed + 1); - for (auto i = (num_samples - 1); i > 0; --i) - { - const auto j = static_cast(rand64_gen() % (i + 1)); - const auto i0 = 3 * i; - const auto j0 = 3 * j; - // Swap values. - swap(maps[i0], maps[j0]); - swap(maps[i0 + 1], maps[j0 + 1]); - swap(maps[i0 + 2], maps[j0 + 2]); - } - - // Method to deallocate memory. - py::capsule free_when_done(maps, [](void *mem_) - { - DocIdx *mem = reinterpret_cast(mem_); - delete[] mem; }); - - // Return the numpy array. - const auto byte_size = sizeof(DocIdx); - return py::array(std::vector{num_samples, 3}, // shape - {3 * byte_size, byte_size}, // C-style contiguous strides - maps, // the data pointer - free_when_done); // numpy array references -} - -py::array build_mapping(const py::array_t &docs_, - const py::array_t &sizes_, - const int num_epochs, - const uint64_t max_num_samples, - const int max_seq_length, - const double short_seq_prob, - const int seed, - const bool verbose, - const int32_t min_num_sent) -{ - - if (sizes_.size() > std::numeric_limits::max()) - { - if (verbose) - { - cout << " using uint64 for data mapping..." << endl - << std::flush; - } - return build_mapping_impl(docs_, sizes_, num_epochs, - max_num_samples, max_seq_length, - short_seq_prob, seed, verbose, - min_num_sent); - } - else - { - if (verbose) - { - cout << " using uint32 for data mapping..." 
<< endl - << std::flush; - } - return build_mapping_impl(docs_, sizes_, num_epochs, - max_num_samples, max_seq_length, - short_seq_prob, seed, verbose, - min_num_sent); - } -} - -template -py::array build_blocks_mapping_impl(const py::array_t &docs_, - const py::array_t &sizes_, - const py::array_t &titles_sizes_, - const int32_t num_epochs, - const uint64_t max_num_samples, - const int32_t max_seq_length, - const int32_t seed, - const bool verbose, - const bool use_one_sent_blocks) -{ - /* Build a mapping of (start-index, end-index, sequence-length) where - start and end index are the indices of the sentences in the sample - and sequence-length is the target sequence length. - */ - - // Consistency checks. - assert(num_epochs > 0); - assert(max_seq_length > 1); - assert(seed > 0); - - // Remove bound checks. - auto docs = docs_.unchecked<1>(); - auto sizes = sizes_.unchecked<1>(); - auto titles_sizes = titles_sizes_.unchecked<1>(); - - if (verbose) - { - const auto sent_start_index = docs[0]; - const auto sent_end_index = docs[docs_.shape(0) - 1]; - const auto num_sentences = sent_end_index - sent_start_index; - cout << " using:" << endl - << std::flush; - cout << " number of documents: " << docs_.shape(0) - 1 << endl - << std::flush; - cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl - << std::flush; - cout << " total number of sentences: " << num_sentences << endl - << std::flush; - cout << " number of epochs: " << num_epochs << endl - << std::flush; - cout << " maximum number of samples: " << max_num_samples << endl - << std::flush; - cout << " maximum sequence length: " << max_seq_length << endl - << std::flush; - cout << " seed: " << seed << endl - << std::flush; - } - - // Mapping and its length (1D). - int64_t num_samples = -1; - DocIdx *maps = NULL; - - // Acceptable number of sentences per block. - int min_num_sent = 2; - if (use_one_sent_blocks) - { - min_num_sent = 1; - } - - // Perform two iterations, in the first iteration get the size - // and allocate memory and in the second iteration populate the map. - bool second = false; - for (int32_t iteration = 0; iteration < 2; ++iteration) - { - - // Set the flag on second iteration. - second = (iteration == 1); - - // Current map index. - uint64_t map_index = 0; - - uint64_t empty_docs = 0; - uint64_t one_sent_docs = 0; - uint64_t long_sent_docs = 0; - // For each epoch: - for (int32_t epoch = 0; epoch < num_epochs; ++epoch) - { - // assign every block a unique id - int32_t block_id = 0; - - if (map_index >= max_num_samples) - { - if (verbose && (!second)) - { - cout << " reached " << max_num_samples << " samples after " - << epoch << " epochs ..." << endl - << std::flush; - } - break; - } - // For each document: - for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) - { - - // Document sentences are in [sent_index_first, sent_index_last) - const auto sent_index_first = docs[doc]; - const auto sent_index_last = docs[doc + 1]; - const auto target_seq_len = max_seq_length - titles_sizes[doc]; - - // At the beginning of the document previous index is the - // start index. - auto prev_start_index = sent_index_first; - - // Remaining documents. - auto num_remain_sent = sent_index_last - sent_index_first; - - // Some bookkeeping - if ((epoch == 0) && (!second)) - { - if (num_remain_sent == 0) - { - ++empty_docs; - } - if (num_remain_sent == 1) - { - ++one_sent_docs; - } - } - // Detect documents with long sentences. 
- bool contains_long_sentence = false; - if (num_remain_sent >= min_num_sent) - { - for (auto sent_index = sent_index_first; - sent_index < sent_index_last; ++sent_index) - { - if (sizes[sent_index] > LONG_SENTENCE_LEN) - { - if ((epoch == 0) && (!second)) - { - ++long_sent_docs; - } - contains_long_sentence = true; - break; - } - } - } - // If we have enough sentences and no long sentences. - if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) - { - - // Set values. - auto seq_len = int32_t{0}; - auto num_sent = int32_t{0}; - - // Loop through sentences. - for (auto sent_index = sent_index_first; - sent_index < sent_index_last; ++sent_index) - { - - // Add the size and number of sentences. - seq_len += sizes[sent_index]; - ++num_sent; - --num_remain_sent; - - // If we have reached the target length. - // and there are an acceptable number of sentences left - // and if we have at least the minimum number of sentences. - // or if we have reached end of the document. - if (((seq_len >= target_seq_len) && - (num_remain_sent >= min_num_sent) && - (num_sent >= min_num_sent)) || - (num_remain_sent == 0)) - { - - // Populate the map. - if (second) - { - const auto map_index_0 = 4 * map_index; - // Each sample has 4 items: the starting sentence index, ending sentence index, - // the index of the document from which the block comes (used for fetching titles) - // and the unique id of the block (used for creating block indexes) - - maps[map_index_0] = static_cast(prev_start_index); - maps[map_index_0 + 1] = static_cast(sent_index + 1); - maps[map_index_0 + 2] = static_cast(doc); - maps[map_index_0 + 3] = static_cast(block_id); - } - - // Update indices / counters. - ++map_index; - ++block_id; - prev_start_index = sent_index + 1; - seq_len = 0; - num_sent = 0; - } - } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { - - if (!second) - { - if (verbose) - { - cout << " number of empty documents: " << empty_docs << endl - << std::flush; - cout << " number of documents with one sentence: " << one_sent_docs << endl - << std::flush; - cout << " number of documents with long sentences: " << long_sent_docs << endl - << std::flush; - cout << " will create mapping for " << map_index << " samples" << endl - << std::flush; - } - assert(maps == NULL); - assert(num_samples < 0); - maps = new DocIdx[4 * map_index]; - num_samples = static_cast(map_index); - } - - } // for (int iteration=0; iteration < 2; ++iteration) { - - // Shuffle. - // We need a 64 bit random number generator as we might have more - // than 2 billion samples. - std::mt19937_64 rand64_gen(seed + 1); - for (auto i = (num_samples - 1); i > 0; --i) - { - const auto j = static_cast(rand64_gen() % (i + 1)); - const auto i0 = 4 * i; - const auto j0 = 4 * j; - // Swap values. - swap(maps[i0], maps[j0]); - swap(maps[i0 + 1], maps[j0 + 1]); - swap(maps[i0 + 2], maps[j0 + 2]); - swap(maps[i0 + 3], maps[j0 + 3]); - } - - // Method to deallocate memory. - py::capsule free_when_done(maps, [](void *mem_) - { - DocIdx *mem = reinterpret_cast(mem_); - delete[] mem; }); - - // Return the numpy array. 
- const auto byte_size = sizeof(DocIdx); - return py::array(std::vector{num_samples, 4}, // shape - {4 * byte_size, byte_size}, // C-style contiguous strides - maps, // the data pointer - free_when_done); // numpy array references -} - -py::array build_blocks_mapping(const py::array_t &docs_, - const py::array_t &sizes_, - const py::array_t &titles_sizes_, - const int num_epochs, - const uint64_t max_num_samples, - const int max_seq_length, - const int seed, - const bool verbose, - const bool use_one_sent_blocks) -{ - - if (sizes_.size() > std::numeric_limits::max()) - { - if (verbose) - { - cout << " using uint64 for data mapping..." << endl - << std::flush; - } - return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, - num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); - } - else - { - if (verbose) - { - cout << " using uint32 for data mapping..." << endl - << std::flush; - } - return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, - num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); - } -} - PYBIND11_MODULE(helpers, m) { - m.def("build_mapping", &build_mapping); - m.def("build_blocks_mapping", &build_blocks_mapping); m.def("build_sample_idx", &build_sample_idx); m.def("build_blending_indices", &build_blending_indices); } diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index f02fc7ae..b5bf0363 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -73,15 +73,14 @@ def __init__( self.document_index, self.sample_index, self.shuffle_index, - ) = self._build_document_sample_shuffle_indices() + ) = self.build_document_sample_shuffle_indices() def __len__(self) -> int: - """Abstract method implementation - + """ Returns: int: The length of the dataset """ - return self.sample_index.shape[0] - 1 + return self.shuffle_index.shape[0] def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: """Get the text (token ids) for a given index @@ -125,7 +124,7 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: return {"text": numpy.array(numpy.concatenate(tokens), dtype=numpy.int64)} - def _build_document_sample_shuffle_indices( + def build_document_sample_shuffle_indices( self, ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: """Build the document index, the sample index, and the shuffle index @@ -136,17 +135,16 @@ def _build_document_sample_shuffle_indices( The sample index: -- 2-D - -- The document indices and offsets which mark the start of every sample + -- The document indices and offsets which mark the start of every sample to generate samples of sequence length length The shuffle index: -- 1-D -- A random permutation of index range of the sample index Returns: - Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the + Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index - TODO: Explain the 80% threshold """ path_to_cache = getattr(self.config, "path_to_cache") if path_to_cache is None: @@ -171,11 +169,11 @@ def get_path_to(suffix): ) ) - num_tokens_per_epoch = _get_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) + num_tokens_per_epoch = compute_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) sequence_length = getattr(self.config, "sequence_length") - num_epochs = _get_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples) + num_epochs = compute_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples) if 
not cache_hit and torch.distributed.get_rank() == 0: log_rank( @@ -185,37 +183,6 @@ def get_path_to(suffix): rank=0, ) - if num_epochs == 1: - separate_final_epoch = False - else: - # Get the number of samples for the last epoch - num_samples_sans_final_epoch = ((num_epochs - 1) * num_tokens_per_epoch - 1) // sequence_length - num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch - num_samples_per_epoch = (num_tokens_per_epoch - 1) // sequence_length - - # num_samples_from_final_epoch should be non-negative - assert num_samples_from_final_epoch >= 0 - - # num_samples_from_final_epoch should not exceed max value - assert num_samples_from_final_epoch <= num_samples_per_epoch + 1 - - # Separate the final epoch if it falls below the threshold - threshold = 0.80 - separate_final_epoch = num_samples_from_final_epoch < int(threshold * num_samples_per_epoch) - - log_rank( - f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}", - logger=logger, - level=logging.DEBUG, - rank=0, - ) - log_rank(f"> Threshold: {threshold}", logger=logger, level=logging.DEBUG, rank=0) - log_rank( - f"> num_samples_per_epoch: {num_samples_per_epoch}", logger=logger, level=logging.DEBUG, rank=0 - ) - - log_rank(f"> separate_final_epoch: {separate_final_epoch}", logger=logger, level=logging.DEBUG, rank=0) - numpy_random_state = numpy.random.RandomState(getattr(self.config, "random_seed")) os.makedirs(path_to_cache, exist_ok=True) @@ -232,9 +199,8 @@ def get_path_to(suffix): rank=0, ) t_beg = time.time() - document_index = _build_document_index( - self.indexed_indices, num_epochs, numpy_random_state, separate_final_epoch - ) + document_index = numpy.copy(self.indexed_indices) + numpy_random_state.shuffle(document_index) numpy.save(path_to_document_index, document_index, allow_pickle=True) t_end = time.time() log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) @@ -255,7 +221,7 @@ def get_path_to(suffix): self.indexed_dataset.sequence_lengths, document_index, sequence_length, - num_epochs, + 1, num_tokens_per_epoch, ) numpy.save(path_to_sample_index, sample_index, allow_pickle=True) @@ -270,15 +236,16 @@ def get_path_to(suffix): rank=0, ) t_beg = time.time() - if separate_final_epoch: - shuffle_index = _build_shuffle_index( - num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state - ) - else: - shuffle_index = _build_shuffle_index( - sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state - ) - numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True) + shuffle_index = numpy.arange(start=0, stop=(sample_index.shape[0] - 1), step=1, dtype=numpy.uint32) + numpy_random_state.shuffle(shuffle_index) + n_concatenations = ( + int(self.num_samples / shuffle_index.shape[0]) + 1 + ) # NOTE: To ensure that we always generate more samples than requested in num_samples + numpy.save( + path_to_shuffle_index, + numpy.concatenate([shuffle_index for _ in range(n_concatenations)]), + allow_pickle=True, + ) t_end = time.time() log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) @@ -322,14 +289,14 @@ def get_path_to(suffix): t_end = time.time() log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - log_rank(f"> Total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0) + log_rank(f"> Total number of samples: {shuffle_index.shape[0]}", logger=logger, level=logging.INFO, rank=0) 
log_rank(f"> Total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0) return document_index, sample_index, shuffle_index -def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: - """Calculate the number of tokens in a single epoch +def compute_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: + """Compute the number of tokens in a single epoch Args: indexed_dataset (MMapIndexedDataset): The underlying MMapIndexedDataset @@ -342,7 +309,7 @@ def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: nump return numpy.sum(indexed_dataset.sequence_lengths[indices]) -def _get_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int) -> int: +def compute_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int) -> int: """Calculate the number of epochs Args: @@ -365,72 +332,3 @@ def _get_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int # sample except for the last sample. if ((num_tokens - 1) // seq_length) >= num_samples: return num_epochs - - -def _build_document_index( - documents: numpy.ndarray, - num_epochs: int, - numpy_random_state: numpy.random.RandomState, - separate_final_epoch: bool, -) -> numpy.ndarray: - """Build an array with length = num epochs * num documents - - Args: - documents (numpy.ndarray): the subset of exposed document indices - - num_epochs (int): The number of epochs - - numpy_random_state (numpy.random.RandomState): The NumPy random state - - separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle - - Returns: - numpy.ndarray: The document index - - TODO: Explain separate_final_epoch - """ - if not separate_final_epoch or num_epochs == 1: - document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1] - document_index[:] = documents - document_index = document_index.reshape(-1) - document_index = document_index.astype(numpy.int32) - numpy_random_state.shuffle(document_index) - return document_index - - doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False) - doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False) - return numpy.concatenate((doc_idx_first, doc_idx_last)) - - -def _build_shuffle_index( - num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState -) -> numpy.ndarray: - """Build the range [0, size) and shuffle - - Args: - num_samples (int): The size of the first shuffle range [0, num_samples) - - total_size (int): The size of the entire index. 
If larger than 'num_samples', it defines - - the second shuffle range [num_samples, total_size) - - numpy_random_state (numpy.random.RandomState): The NumPy random state - - Returns: - numpy.ndarray: The shuffle index - - TODO: Explain [0, num_samples) [num_samples, total_size) split - """ - dtype_ = numpy.uint32 - if total_size >= (numpy.iinfo(numpy.uint32).max - 1): - dtype_ = numpy.int64 - - shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_) - numpy_random_state.shuffle(shuffle_idx_first) - if num_samples == total_size: - return shuffle_idx_first - - shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) - numpy_random_state.shuffle(shuffle_idx_last) - - return numpy.concatenate((shuffle_idx_first, shuffle_idx_last)) From a5bef2c750d16889290199dc59d754a277d91465 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 25 Mar 2024 00:12:04 +0000 Subject: [PATCH 49/73] Added Under the hood section to Docs --- docs/nanoset.md | 128 ++++++++++++++++++++++++++++++++++++++- tools/preprocess_data.py | 3 +- 2 files changed, 126 insertions(+), 5 deletions(-) diff --git a/docs/nanoset.md b/docs/nanoset.md index d5ae224d..1301fb77 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -30,13 +30,13 @@ We also include the following scripts: - [`hf_datasets_to_json.py`](../tools/hf_datasets_to_json.py): A tool to transform datasets from the Hugging Face Hub (or local) to the .json format required by `preprocess_data.py`. - [`merge_datasets.py`](../tools/merge_datasets.py): A tool to merge preprocessed datasets (_.bin_ and _.idx_ files) into a single file. -## NanosetDatasetsArgs +## Working with Nanosets To work with Nanosets, we need to configure 3 arguments: 1. `split`: The distribution we want to divide our dataset into among train, valid, and test split. 2. `path_to_cache`: Directory used to store certain metadata of Nanosets to reuse them between different runs. 3. `data_path`: This argument specifies the file/files that will compose the Nanoset. There are 2 ways to specify it: - 1. If we only indicate _one_ path (path to the files generated by preprocess_data.py WITHOUT the extension), we will create a Nanoset. + 1. If we only indicate _one_ path (path to the files generated by preprocess_data.py WITHOUT the extension), we will create a `Nanoset`. ```yaml data: dataset: @@ -46,7 +46,7 @@ To work with Nanosets, we need to configure 3 arguments: num_loading_workers: 0 seed: 1234 ``` - 2. With a dictionary, we can create a BlendedNanoset where the keys are the paths to the dataset (path to the files generated by preprocess_data.py WITHOUT the extension) and the values are the weight for each dataset. + 2. With a dictionary, we can create a `BlendedNanoset` where the keys are the paths to the dataset (path to the files generated by preprocess_data.py WITHOUT the extension) and the values are the weight for each dataset. 
```yaml data: dataset: @@ -63,3 +63,125 @@ Finally, to use the Nanosets, launch the training with [`run_train_nanoset.py`]( ```shell torchrun --nproc-per-node 8 run_train_nanoset.py --config configs/nanoset_llama2.yaml ``` + +## Under the hood +### Nanoset +A `Nanoset` is paremeterized by the following variables: +- The underlying `MMapIndexedDataset` instance (`indexed_dataset`) +- The sequence length `S` +- The split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing) +- The total number of samples `N` of the Nanoset +- The random seed `R` + +The `Nanoset` creates three index mappings to facilitate lookup: (1) The `document_index`, (2) the `sample_index`, and (3) the `shuffle_index`. + +1. The document index (`document_index`) is a 1-D array array in which the indices _i_ contain the indices of the documents (**each sample from the JSON file**) that belong to the dataset. This array is randomly shuffled with seed `R`. +``` +Given: +len(indexed_dataset) => 40 + +indexed_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + +Then, for example: + +document_index = [14, 8, 17, 0, 3, 15, 7, 19, 11, 12, 9, 2, 10, 1, 4, 16, 13, 5, 18, 6] +``` +2. The sample index (`sample_index`) is a 2-D array mapping from _j_ to pairs of (_i_, document_index[_i_] offset) of shape `[num_samples + 1, 2]`, where `num_samples` is `(tokens_per_epoch - 1) // S`. The rows _j_ and _j + 1_ serve as the left and right bounds for the j-th sample, so that each sample will contain `S` tokens. +``` +Given: + +S = 1024 + +Then, for example: + +sample_index[0] = (0, 0) +sample_index[1] = (0, 1024) => document_index[0] (Document 14) has length greater than S +sample_index[2] = (1, 512) => document_index[0] has length 1536 +sample_index[3] = (2, 100) => document_index[1] (Document 8) has length 1436 (In this sample we get 924 from document_index[1] + 100 from document_index[2]) +sample_index[4] = (3, 0) => document_index[2] has length 1124 +sample_index[5] = (5, 300) => document_index[3:5] have a total length of 724 (S - 300) +sample_index[6] = (6, 24) => document_index[5] has length 1300 +... +sample_index[15] = (18, 512) +sample_index[16] = (19, 1000) => From document_index[19] we don't have S tokens, so this sample is discarded + +len(sample_index) => 17, but only 16 useful samples +``` +3. The shuffle index (`shuffle_index`) is a 1-D array mapping from _k_ to _j_ of length `n_concatenations * (len(sample_index) - 1)`, where `n_concatenations` is defined as `(N / (len(sample_index) - 1)) + 1`, so that `len(shuffle_index)` is always greater than `N`. Before concatenating the full array, `shuffle_index` is shuffled according to `R`. +``` +Given: + +N = 70 + +Then, for example: + +shuffle_index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + +Shuffle the indices -> shuffle_index = [9, 3, 15, 12, 5, 10, 1, 4, 8, 11, 13, 7, 2, 14, 6, 0] + +n_concatenations = (70/(17 - 1)) + 1 = 5 +shuffle_index = shuffle_index concatenated 5 times + +len(shuffle_index) = 80 > N +``` + +To query the `Nanoset` for the k-th sample we do the following: +1. Use the `shuffle_index` to get the index _j_ into the `sample_index` +``` +j = shuffle_index[k] +``` +2. Use the `sample_index` to get the left and right sample-bounding indices into the `document_index` and the starting token offset for each document +``` +i, offset = sample_index[j] +i_next, offset_next = sample_index[j + 1] +``` +3. 
Use the `document_index` to retrieve `S + 1` tokens from consecutive (in the `document_index`) documents +``` +sample = [] +sample += indexed_dataset[document_index[i]][offset:] +if i != i_next: + sample += indexed_dataset[document_index[i + 1:i_next]] +sample += indexed_dataset[document_index[i_next]][:offset_next] +``` + +Despite having repeated indices in the `shuffle_index`, throughout 1 epoch, we will only observe each sample once. We achieve this by deactivating shuffling in the `DistributedDataSampler`, so that the indices of the `shuffle_index` are consumed in the order they appear by the multiple processes. It is worth noting that the `document_index` already shuffles the data so that the samples constructed in the `sample_index` do not contain data from consecutive documents, and we shuffle the data again in the `shuffle_index`. +``` +Given: + +4 Processes loading data + +(P1) idx_list = [0, 4, 8, 12, 16, ...] -> shuffle_index[idx_list] = [9, 5, 8, 2, 9, ...] +(P2) idx_list = [1, 5, 9, 13, 17, ...] -> shuffle_index[idx_list] = [3, 10, 11, 14, 3, ...] +(P3) idx_list = [2, 6, 10, 14, 18, ...] -> shuffle_index[idx_list] = [15, 1, 13, 6, 15, ...] +(P4) idx_list = [3, 7, 11, 15, 19, ...] -> shuffle_index[idx_list] = [12, 4, 7, 0, 12, ...] +``` +### BlendedNanoset +The `BlendedNanoset` is parameterized by the following variables: +- The underlying `Nanoset` instances `D` +- The weights `W` (one per dataset) +- The number of samples `U` + +The `BlendedNanoset` creates two "blending" indices to facilitate lookup: (1) The `dataset_index` and (2) the `dataset_sample_index`. + +1. The `dataset_index` is a 1-D array mapping from _i_ to dataset index from `D` of length `U`. +``` +Given: + +D = [d0, d1, d2, d3] +W = [0.1, 0.5, 0.3, 0.1] +U = 20 + +Then, for example: + +dataset_index = [1, 2, 0, 1, 3, 1, 2, 1, 2, 1, 0, 1, 2, 1, 3, 1, 2, 1, 2, 1] +``` +2. The `dataset_sample_index` is a 1-D mapping from _i_ to the sample index for dataset_index[_i_] of length `U`. +``` +dataset_index = [1, 2, 0, 1, 3, 1, 2, 1, 2, 1, 0, 1, 2, 1, 3, 1, 2, 1, 2, 1] +dataset_sample_index = [0, 0, 0, 1, 0, 2, 1, 3, 2, 4, 1, 5, 3, 6, 1, 7, 4, 8, 5, 9] +``` +To query the `BlendedNanoset` for the k-th sample we do the following: +- Use the `dataset_index` to retrieve the corresponding dataset from `D` and the `dataset_sample_index` to retrieve the corresponding sample from that dataset. 
+``` +sample = D[dataset_index[k]][dataset_sample_index[k]] +``` diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index bf20df66..a25c6068 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -201,8 +201,7 @@ def main(args): # Clean temporary files for name in in_ss_out_names: os.remove(name["partition"]) - output_prefixs = glob.glob(name["output_prefix"] + "*") - for output in output_prefixs: + for output in glob.glob(name["output_prefix"] + "*"): os.remove(output) From 454d86c5b2adab15ee74e34c0ab2509987d00bc3 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 25 Mar 2024 00:14:32 +0000 Subject: [PATCH 50/73] Added len assertion of Nanosets belonging to the BlendedNanoset to tests --- tests/helpers/data.py | 8 ++++++++ tests/test_build_nanoset_dataloader.py | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 6f39743e..c45f3a67 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -6,6 +6,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir))) from argparse import Namespace +import numpy as np import torch from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer from nanotron.sanity_checks import assert_tensor_synced_across_pg @@ -84,3 +85,10 @@ def assert_batch_dataloader(batch: dict, parallel_context, micro_batch_size: int pg=parallel_context.dp_pg, msg=lambda err: f"{element} is not synchronized across DP {err}", ) + + +def get_max_value_by_group(dataset_idx, dataset_sample_idx, group_idx): + mask = dataset_idx == group_idx + filtered_values = dataset_sample_idx[mask] + max_val = np.amax(filtered_values) + return max_val diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index cdc741d1..8e82f9a8 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -6,6 +6,7 @@ assert_batch_dataloader, create_dataset_paths, create_dummy_json_dataset, + get_max_value_by_group, preprocess_dummy_dataset, ) from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use @@ -98,10 +99,14 @@ def _test_build_nanoset_dataloader( for config, dataset_type in zip(configs, dataset_types): # Create Nanosets train_dataset, valid_dataset, test_dataset = NanosetBuilder(config).build() - # Check the type of Nanoset and the quantity of Nanosets in BlendedNanoset + # Check the type of Nanoset, the quantity of Nanosets in BlendedNanoset and the size of each Nanoset in BlendedNanoset assert isinstance(train_dataset, dataset_type) if isinstance(train_dataset, BlendedNanoset): assert len(train_dataset.datasets) > 1 + for idx, dataset in enumerate(train_dataset.datasets): + assert len(dataset) > get_max_value_by_group( + train_dataset.dataset_index, train_dataset.dataset_sample_index, idx + ) # Create Dataloaders dataloader = build_nanoset_dataloader( From b1fd615b2682ba51ab35a37cbb668f66e725732f Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 25 Mar 2024 00:16:10 +0000 Subject: [PATCH 51/73] Cleaned indexed_dataset --- src/nanotron/data/indexed_dataset.py | 115 --------------------------- 1 file changed, 115 deletions(-) diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py index 0fc0f60c..ee7c4a69 100644 --- a/src/nanotron/data/indexed_dataset.py +++ b/src/nanotron/data/indexed_dataset.py @@ -3,15 +3,11 @@ # This source code is licensed 
under the MIT license found in the # LICENSE file in the root directory of this source tree. -# Essentially re-written in entirety - -import os import shutil import struct import time from enum import Enum from functools import lru_cache -from itertools import accumulate from types import TracebackType from typing import List, Optional, Tuple, Type, Union @@ -313,47 +309,12 @@ class MMapIndexedDataset(torch.utils.data.Dataset): def __init__(self, path_prefix: str) -> None: super().__init__() - self.path_prefix = None - - self.index = None - self.bin_buffer = None - self.bin_buffer_mmap = None - - self.initialize(path_prefix) - - def initialize(self, path_prefix: str) -> None: - """Initialize the dataset - - This method is called by MMapIndexedDataset.__init__ during object creation and by - MMapIndexedDataset.__setstate__ during un-puckling - - Args: - path_prefix (str): The index (.idx) and data (.bin) prefix - - """ self.path_prefix = path_prefix self.index = _IndexReader(get_idx_path(self.path_prefix)) self.bin_buffer_mmap = numpy.memmap(get_bin_path(self.path_prefix), mode="r", order="C") self.bin_buffer = memoryview(self.bin_buffer_mmap) - def __getstate__(self) -> str: - """Get the state during pickling - - Returns: - Tuple[str, bool]: The state tuple - """ - return self.path_prefix - - def __setstate__(self, path_prefix: str) -> None: - """Set the state during un-pickling - - Args: - state (Tuple[str, bool]): The state tuple - - """ - self.initialize(path_prefix) - def __del__(self) -> None: """Clean up the object""" if self.bin_buffer_mmap is not None: @@ -369,50 +330,6 @@ def __len__(self) -> int: """ return len(self.index) - def __getitem__(self, idx: Union[int, numpy.integer, slice]) -> numpy.ndarray: - """Return from the dataset - - Args: - idx (Union[int, numpy.integer, slice]): The index or index slice into the dataset - - Raises: - ValueError: When the index slice is non-contiguous - - TypeError: When the index is of an unexpected type - - Returns: - numpy.ndarray: The sequence tokens at the index or index slice - """ - - if isinstance(idx, (int, numpy.integer)): - sequence_pointer, sequence_length = self.index[idx] - sequence = numpy.frombuffer( - self.bin_buffer, - dtype=self.index.dtype, - count=sequence_length, - offset=sequence_pointer, - ) - return sequence - - elif isinstance(idx, slice): - start, stop, step = idx.indices(len(self)) - if step != 1: - raise ValueError("Slices into indexed_dataset must be contiguous") - sequence_lengths = self.index.sequence_lengths[idx] - sequence_offsets = list(accumulate(sequence_lengths)) - sequences = numpy.split( - numpy.frombuffer( - self.bin_buffer, - dtype=self.index.dtype, - count=sum(sequence_lengths), - offset=self.index.sequence_pointers[start], - ), - sequence_offsets[:-1], - ) - return sequences - else: - raise TypeError("Unexpected type received for idx: {}".format(type(idx))) - def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: """Retrieve a single item from the dataset with the option to only return a portion of the item. @@ -445,38 +362,6 @@ def document_indices(self) -> numpy.ndarray: """ return self.index.document_indices - def get_document_indices(self) -> numpy.ndarray: - """Get the document indices - - This method is slated for deprecation. 
- - Returns: - numpy.ndarray: The document indices - """ - return self.index.document_indices - - def set_document_indices(self, document_indices: numpy.ndarray) -> None: - """Set the document indices - - This method is slated for deprecation. - - Args: - document_indices (numpy.ndarray): The document indices - """ - self.index.document_indices = document_indices - - @staticmethod - def exists(path_prefix: str) -> bool: - """Return whether the MMapIndexedDataset exists on disk at the prefix - - Args: - path_prefix (str): The prefix to the index (.idx) and data (.bin) files - - Returns: - bool: Whether the MMapIndexedDataset exists on disk at the prefix - """ - return os.path.exists(get_idx_path(path_prefix)) and os.path.exists(get_bin_path(path_prefix)) - class MMapIndexedDatasetBuilder(object): """Builder class for the MMapIndexedDataset class From ce4e7432731fd375b9238f0d54e17dc79e79390b Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 25 Mar 2024 00:50:32 +0000 Subject: [PATCH 52/73] Deleted cpp helpers --- src/nanotron/data/blended_nanoset.py | 15 +-- src/nanotron/data/helpers.cpp | 173 ------------------------- src/nanotron/data/nanoset.py | 14 +- src/nanotron/data/utils.py | 104 ++++++++++++--- src/nanotron/trainer.py | 14 -- tests/test_build_nanoset_dataloader.py | 6 +- 6 files changed, 100 insertions(+), 226 deletions(-) delete mode 100644 src/nanotron/data/helpers.cpp diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index 4e945c5d..bb2b6083 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -11,6 +11,7 @@ from nanotron import logging from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig +from nanotron.data.utils import build_blending_indices from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -120,17 +121,9 @@ def get_path_to(suffix): ) t_beg = time.time() - from nanotron.data import helpers - - dataset_index = numpy.zeros(self.size, dtype=numpy.int16) - dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64) - helpers.build_blending_indices( - dataset_index, - dataset_sample_index, - self.weights, - len(self.datasets), - self.size, - False, + + dataset_index, dataset_sample_index = build_blending_indices( + size=self.size, weights=numpy.array(self.weights) ) if path_to_cache: diff --git a/src/nanotron/data/helpers.cpp b/src/nanotron/data/helpers.cpp deleted file mode 100644 index d11f86f7..00000000 --- a/src/nanotron/data/helpers.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ - -/* Helper methods for fast index mapping builds */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace std; - -const int32_t LONG_SENTENCE_LEN = 512; - -void build_blending_indices(py::array_t &dataset_index, - py::array_t &dataset_sample_index, - const py::array_t &weights, - const int32_t num_datasets, - const int64_t size, const bool verbose) -{ - /* Given multiple datasets and a weighting array, build samples - such that it follows those weights.*/ - - if (verbose) - { - std::cout << "> building indices for blended datasets ..." << std::endl; - } - - // Get the pointer access without the checks. 
- auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); - auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); - auto weights_ptr = weights.unchecked<1>(); - - // Initialize buffer for number of samples used for each dataset. - int64_t current_samples[num_datasets]; - for (int64_t i = 0; i < num_datasets; ++i) - { - current_samples[i] = 0; - } - - // For each sample: - for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) - { - - // Determine where the max error in sampling is happening. - auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); - int64_t max_error_index = 0; - double max_error = weights_ptr[0] * sample_idx_double - - static_cast(current_samples[0]); - for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) - { - double error = weights_ptr[dataset_idx] * sample_idx_double - - static_cast(current_samples[dataset_idx]); - if (error > max_error) - { - max_error = error; - max_error_index = dataset_idx; - } - } - - // Populate the indices. - dataset_index_ptr[sample_idx] = static_cast(max_error_index); - dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; - - // Update the total samples. - current_samples[max_error_index] += 1; - } - - // print info - if (verbose) - { - std::cout << " > sample ratios:" << std::endl; - for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) - { - auto ratio = static_cast(current_samples[dataset_idx]) / - static_cast(size); - std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; - } - } -} - -py::array build_sample_idx(const py::array_t &sizes_, - const py::array_t &doc_idx_, - const int32_t seq_length, - const int32_t num_epochs, - const int64_t tokens_per_epoch) -{ - /* Sample index (sample_idx) is used for gpt2 like dataset for which - the documents are flattened and the samples are built based on this - 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] - where [..., 0] contains the index into `doc_idx` and [..., 1] is the - starting offset in that document.*/ - - // Consistency checks. - assert(seq_length > 1); - assert(num_epochs > 0); - assert(tokens_per_epoch > 1); - - // Remove bound checks. - auto sizes = sizes_.unchecked<1>(); - auto doc_idx = doc_idx_.unchecked<1>(); - - // Mapping and it's length (1D). - int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; - int32_t *sample_idx = new int32_t[2 * (num_samples + 1)]; - - // Index into sample_idx. - int64_t sample_index = 0; - // Index into doc_idx. - int64_t doc_idx_index = 0; - // Beginning offset for each document. - int32_t doc_offset = 0; - // Start with first document and no offset. - sample_idx[2 * sample_index] = doc_idx_index; - sample_idx[2 * sample_index + 1] = doc_offset; - ++sample_index; - - while (sample_index <= num_samples) - { - // Start with a fresh sequence. - int32_t remaining_seq_length = seq_length + 1; - while (remaining_seq_length != 0) - { - // Get the document length. - auto doc_id = doc_idx[doc_idx_index]; - auto doc_length = sizes[doc_id] - doc_offset; - // And add it to the current sequence. - remaining_seq_length -= doc_length; - // If we have more than a full sequence, adjust offset and set - // remaining length to zero so we return from the while loop. - // Note that -1 here is for the same reason we have -1 in - // `_num_epochs` calculations. 
- if (remaining_seq_length <= 0) - { - doc_offset += (remaining_seq_length + doc_length - 1); - remaining_seq_length = 0; - } - else - { - // Otherwise, start from the beginning of the next document. - ++doc_idx_index; - doc_offset = 0; - } - } - // Record the sequence. - sample_idx[2 * sample_index] = doc_idx_index; - sample_idx[2 * sample_index + 1] = doc_offset; - ++sample_index; - } - - // Method to deallocate memory. - py::capsule free_when_done(sample_idx, [](void *mem_) - { - int32_t *mem = reinterpret_cast(mem_); - delete[] mem; }); - - // Return the numpy array. - const auto byte_size = sizeof(int32_t); - return py::array(std::vector{num_samples + 1, 2}, // shape - {2 * byte_size, byte_size}, // C-style contiguous strides - sample_idx, // the data pointer - free_when_done); // numpy array references -} - -PYBIND11_MODULE(helpers, m) -{ - m.def("build_sample_idx", &build_sample_idx); - m.def("build_blending_indices", &build_blending_indices); -} diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index b5bf0363..f3561b05 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -11,7 +11,7 @@ from nanotron import logging from nanotron.data.indexed_dataset import MMapIndexedDataset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import Split +from nanotron.data.utils import Split, build_sample_idx from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -213,16 +213,14 @@ def get_path_to(suffix): rank=0, ) t_beg = time.time() - from nanotron.data import helpers assert document_index.dtype == numpy.int32 assert self.indexed_dataset.sequence_lengths.dtype == numpy.int32 - sample_index = helpers.build_sample_idx( - self.indexed_dataset.sequence_lengths, - document_index, - sequence_length, - 1, - num_tokens_per_epoch, + sample_index = build_sample_idx( + sizes=self.indexed_dataset.sequence_lengths, + doc_idx=document_index, + seq_length=sequence_length, + tokens_per_epoch=num_tokens_per_epoch, ) numpy.save(path_to_sample_index, sample_index, allow_pickle=True) t_end = time.time() diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index ce5d1e12..51bf03b3 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -1,6 +1,6 @@ import re from enum import Enum -from typing import List +from typing import List, Tuple import numpy @@ -16,19 +16,6 @@ class Split(Enum): test = 2 -def compile_helpers(): - """Compile C++ helper functions at runtime. Make sure this is invoked on a single process.""" - import os - import subprocess - - command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] - if subprocess.run(command).returncode != 0: - import sys - - log_rank("Failed to compile the C++ dataset helper functions", logger=logger, level=logging.ERROR, rank=0) - sys.exit(1) - - def normalize(weights: List[float]) -> List[float]: """Do non-exponentiated normalization @@ -56,8 +43,10 @@ def parse_and_normalize_split(split: str) -> List[float]: split = list(map(float, re.findall(r"[.0-9]+", split))) split = split + [0.0 for _ in range(len(Split) - len(split))] - assert len(split) == len(Split) - assert all((_ >= 0.0 for _ in split)) + assert len(split) == len( + Split + ), "Introduce the split ratios of the datasets in a comma separated string (e.g. 99,1,0)" + assert all((_ >= 0.0 for _ in split)), "Don't introduce negative values in the split config. 
Given: {split}" split = normalize(split) @@ -78,3 +67,86 @@ def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_ log_rank(" Test: {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0) return datasets_num_samples + + +def build_blending_indices(size: int, weights: numpy.ndarray) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Given multiple datasets and a weighting array, build samples + such that it follows those weights.""" + # Create empty arrays for dataset indices and dataset sample indices + dataset_index = numpy.empty((size,), dtype="uint") + dataset_sample_index = numpy.empty((size,), dtype="long") + + # Initialize buffer for number of samples used for each dataset + current_samples = numpy.zeros((len(weights),), dtype="long") + + # Iterate over all samples + for sample_idx in range(size): + + # Convert sample index to float for comparison against weights + sample_idx_float = max(sample_idx, 1.0) + + # Find the dataset with the highest error + errors = weights * sample_idx_float - current_samples + max_error_index = numpy.argmax(errors) + + # Assign the dataset index and update the sample index + dataset_index[sample_idx] = max_error_index + dataset_sample_index[sample_idx] = current_samples[max_error_index] + + # Update the total samples for the selected dataset + current_samples[max_error_index] += 1 + + return dataset_index, dataset_sample_index + + +def build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): + # Check validity of inumpyut args. + assert seq_length > 1 + assert tokens_per_epoch > 1 + + # Compute the number of samples. + num_samples = (tokens_per_epoch - 1) // seq_length + + # Allocate memory for the mapping table. + sample_idx = numpy.full([num_samples + 1, 2], fill_value=-999, dtype=numpy.int32) + + # Setup helper vars. + sample_index = 0 + doc_idx_index = 0 + doc_offset = 0 + + # Add the first entry to the mapping table. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + # Loop over the rest of the samples. + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + + # Keep adding docs until we reach the end of the sequence. + while remaining_seq_length != 0: + # Look up the current document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + + # Try to add it to the current sequence. + remaining_seq_length -= doc_length + + # If it fits, adjust offset and break out of inner loop. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise move to the next document. + doc_idx_index += 1 + doc_offset = 0 + + # Store the current sequence in the mapping table. 
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + assert not numpy.any(sample_idx == -999) + return sample_idx diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index ae1f16d1..ba0f297a 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -26,7 +26,6 @@ from nanotron.config import ( Config, ExistingCheckpointInit, - NanosetDatasetsArgs, ParallelismArgs, RandomInit, get_config_from_file, @@ -150,19 +149,6 @@ def __init__( if os.environ.get("NANOTRON_BENCHMARK", "0") == "1": log_throughput(self.config, self.parallel_context) - ######################################## - ## Compile Nanoset helpers - ######################################## - - # Only if Using Nanoset Datasets - if isinstance(self.config.data.dataset, NanosetDatasetsArgs): - if dist.get_rank() == 0: - log_rank("Compiling dataset index builder ...", logger=logger, level=logging.INFO, rank=0) - from nanotron.data.utils import compile_helpers - - compile_helpers() - log_rank("Done with dataset index builder.", logger=logger, level=logging.INFO, rank=0) - ######################################## ## Setting up our model, optimizers, schedulers, etc. ######################################## diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 8e82f9a8..b326722a 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -15,7 +15,7 @@ from nanotron.data.dataset_builder import NanosetBuilder from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import compile_helpers, compute_datasets_num_samples +from nanotron.data.utils import compute_datasets_num_samples from nanotron.parallel import ParallelContext @@ -31,9 +31,7 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int): test_context = TestContext() - # Compile helpers & Create dataset files - compile_helpers() - + # Create dataset files json_paths, bin_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) # Create dummy json datasets From c0e35eaacdc0d89aa1d965ec4d47a7cfbd526b33 Mon Sep 17 00:00:00 2001 From: Phuc Nguyen Date: Mon, 25 Mar 2024 11:55:38 +0000 Subject: [PATCH 53/73] add a single command that run all the tests --- Makefile | 16 ++++++++++++++++ README.md | 3 +-- 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..b9e18168 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +# Run nanotron's tests and examples's tests +test: + pytest \ + --color=yes \ + --durations=0 \ + --ignore tests/fp8 \ + --verbose \ + tests/ + + pip install -r examples/doremi/requirements.txt + pytest \ + --color=yes \ + --durations=0 \ + --ignore tests/fp8 \ + --verbose \ + examples/doremi/tests/ diff --git a/README.md b/README.md index 6dcd1228..437efd65 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,7 @@ pre-commit run --config .pre-commit-config.yaml --all-files *As a part of making sure we aren't slowed down as the codebase grows, we will not merge a PR if the features it introduces do not have test coverage.* -We have extensions built on top of Nanotron, with their tests located in the `/examples` folder. Since VSCode defaults to discovering tests only in the `/tests` folder, please run tests from both `/examples` and `/tests` to ensure your PR does not break these extensions. 
- +We have extensions built on top of Nanotron, with their tests located in the `/examples` folder. Since VSCode defaults to discovering tests only in the `/tests` folder, please run tests from both `/examples` and `/tests` to ensure your PR does not break these extensions. Please run `make tests` to execute all the nanotron tests and the tests in the `/examples` directory that you need to pass. Features we would like to add: - [ ] Support `torch.compile` From 3dd5f62284d34382789f53f486bd2091d9db2237 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 25 Mar 2024 21:36:23 +0000 Subject: [PATCH 54/73] First review fixes --- src/nanotron/data/blended_nanoset.py | 2 +- src/nanotron/data/dataset_builder.py | 4 +-- src/nanotron/data/nanoset.py | 6 ++-- tests/helpers/data.py | 8 +++-- tools/preprocess_data.py | 48 ++++++++++++++++------------ 5 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index bb2b6083..6a234af4 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -93,7 +93,7 @@ def build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: Returns: Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index """ - path_to_cache = getattr(self.config, "path_to_cache") + path_to_cache = self.config.path_to_cache if path_to_cache: diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index db42856e..42c45021 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -43,8 +43,8 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: List[Union[BlendedNanoset, Nanoset]]: A list of either Nanoset or BlendedNanoset per split """ - data_path = getattr(self.config, "data_path") - split = getattr(self.config, "split_vector") + data_path = self.config.data_path + split = self.config.split_vector # Single Nanoset if isinstance(data_path, str): diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index f3561b05..8001c26a 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -146,7 +146,7 @@ def build_document_sample_shuffle_indices( shuffle index """ - path_to_cache = getattr(self.config, "path_to_cache") + path_to_cache = self.config.path_to_cache if path_to_cache is None: path_to_cache = os.path.join(self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices") @@ -171,7 +171,7 @@ def get_path_to(suffix): num_tokens_per_epoch = compute_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) - sequence_length = getattr(self.config, "sequence_length") + sequence_length = self.config.sequence_length num_epochs = compute_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples) @@ -183,7 +183,7 @@ def get_path_to(suffix): rank=0, ) - numpy_random_state = numpy.random.RandomState(getattr(self.config, "random_seed")) + numpy_random_state = numpy.random.RandomState(self.config.random_seed) os.makedirs(path_to_cache, exist_ok=True) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index c45f3a67..b4755aca 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -1,9 +1,13 @@ +import importlib import json import os import sys +from pathlib import Path + +package = importlib.import_module("nanotron") +package_path = Path(package.__file__).parent.parent.parent +sys.path.append(str(package_path)) -# Hack to import preprocess_data.py 
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir))) from argparse import Namespace import numpy as np diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a25c6068..f09cc040 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -1,44 +1,52 @@ import argparse import glob +import importlib import json import multiprocessing import os import sys import time +from pathlib import Path -# TODO tj-solergibert: Keep this hack or not? Hack to be able to process data without installing nanotron. -sys.path.append(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)), "src")) +package = importlib.import_module("nanotron") +package_path = Path(package.__file__).parent.parent +sys.path.append(str(package_path)) from nanotron.data import indexed_dataset from transformers import AutoTokenizer class Encoder(object): - def __init__(self, args): - self.args = args + def __init__(self, pretrained_model_name_or_path, json_key, append_eos): + self.pretrained_model_name_or_path = pretrained_model_name_or_path + self.json_key = json_key + self.append_eos = append_eos def initializer(self): # Use Encoder class as a container for global data - Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.pretrained_model_name_or_path) + Encoder.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) def encode(self, json_line): data = json.loads(json_line) - text = data[self.args.json_key] + text = data[self.json_key] text_ids = Encoder.tokenizer.encode(text) - if self.args.append_eos: + if self.append_eos: text_ids.append(Encoder.tokenizer.eos_token_id) return text_ids, len(text_ids), len(json_line) class Partition(object): - def __init__(self, args, workers): - self.args = args - self.workers = workers + def __init__(self, args): + self.workers = args.workers // args.partitions + self.log_interval = args.log_interval + self.pretrained_model_name_or_path = args.pretrained_model_name_or_path + self.json_key = args.json_key + self.append_eos = args.append_eos def print_processing_stats(self, count, proc_start, total_bytes_processed): - if count % self.args.log_interval == 0: + if count % self.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 @@ -50,13 +58,13 @@ def process_json_file(self, file_name): fin = open(input_file_name, "r", encoding="utf-8") startup_start = time.time() - encoder = Encoder(self.args) - tokenizer = AutoTokenizer.from_pretrained(self.args.pretrained_model_name_or_path) + encoder = Encoder(self.pretrained_model_name_or_path, self.json_key, self.append_eos) + tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) encoded_docs = pool.imap(encoder.encode, fin, 32) - output_bin_file = "{}_{}.bin".format(output_prefix, self.args.json_key) - output_idx_file = "{}_{}.idx".format(output_prefix, self.args.json_key) + output_bin_file = "{}_{}.bin".format(output_prefix, self.json_key) + output_idx_file = "{}_{}.idx".format(output_prefix, self.json_key) builder = indexed_dataset.MMapIndexedDatasetBuilder( output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) @@ -114,10 +122,10 @@ def get_args(): return args -def get_file_name(args, file_id): - file_name, extension = os.path.splitext(args.input) +def get_file_name(input, output_prefix, file_id): + file_name, 
extension = os.path.splitext(input) input_file_name = file_name + "_" + str(file_id) + extension - output_prefix = args.output_prefix + "_" + str(file_id) + output_prefix = output_prefix + "_" + str(file_id) file_names = {"partition": input_file_name, "output_prefix": output_prefix} return file_names @@ -142,7 +150,7 @@ def main(args): # Create .jsonl partition files for idx in range(args.partitions): - in_ss_out_name = get_file_name(args, idx) + in_ss_out_name = get_file_name(args.input, args.output_prefix, idx) in_ss_out_names.append(in_ss_out_name) # Populate .jsonl partition files from parent files @@ -164,7 +172,7 @@ def main(args): partitioned_input_files[idx].close() assert args.workers % args.partitions == 0 - partition = Partition(args, args.workers // args.partitions) + partition = Partition(args) # Encode partition files in parallel processes = [] From 896473d3f67243fb98c04dd5a0e3ed868175ccb9 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 26 Mar 2024 20:06:37 +0000 Subject: [PATCH 55/73] Fixed samples of the datasets and updated docs. Moved helpers of Nanoset and BlendedNanoset to their respective files. Parameterized elements of the dataset and deleted random params. Updated to use collator and get_sampler func from dataloader.py --- docs/nanoset.md | 12 ++- run_train_nanoset.py | 10 +- src/nanotron/config/config.py | 1 - src/nanotron/data/Makefile | 9 -- src/nanotron/data/blended_nanoset.py | 47 ++++++++-- src/nanotron/data/dataloader_builder.py | 111 ++-------------------- src/nanotron/data/dataset_builder.py | 14 +-- src/nanotron/data/nanoset.py | 118 +++++++++++++++--------- src/nanotron/data/nanoset_configs.py | 9 +- src/nanotron/data/utils.py | 102 +------------------- src/nanotron/dataloader.py | 6 +- src/nanotron/trainer.py | 3 +- tests/test_build_nanoset_dataloader.py | 51 +++++----- 13 files changed, 169 insertions(+), 324 deletions(-) delete mode 100644 src/nanotron/data/Makefile diff --git a/docs/nanoset.md b/docs/nanoset.md index 1301fb77..1d282d11 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -65,12 +65,20 @@ torchrun --nproc-per-node 8 run_train_nanoset.py --config configs/nanoset_llama2 ``` ## Under the hood +### Number of samples + +When using Nanosets, we specify the `data_path` to the preprocessed dataset and a `split`. This `split` value will be used to divide the total number of documents into train, valid, and test sets. + +For the train split, the number of samples consumed from the Nanoset will be determined by the `number of train steps * global batch size`, so if this number is higher than the number of samples created with the documents assigned to the train split, we will see the dataset samples more than once (> 1 epoch). In the case of the valid and test split, we will see all the samples only once. + +In the case of the `BlendedNanoset`, we will also indicate the weight of each dataset to construct data batches according to the specified proportion. In this case, the train split will respect this proportion, considering that the number of samples will be calculated in the same way as in the Nanosets, so it may happen that we consume one dataset for 3 epochs and another larger dataset for only one epoch. For the valid and test splits, the same as in the Nanosets will occur; we will consume all the samples only once. 
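As a rough illustration of the sample-count arithmetic described above, here is a minimal sketch with made-up numbers; `train_steps`, `global_batch_size`, and the per-epoch sample count below are assumptions for illustration only, not values taken from this repository or its configs:

```python
# Hypothetical numbers, only to illustrate how the train split size is derived.
train_steps = 1000            # assumed value of tokens.train_steps in the config
global_batch_size = 512       # assumed micro_batch_size * batch_accumulation * dp size
train_split_samples = train_steps * global_batch_size  # samples requested from the train split

samples_per_epoch = 200_000   # assumed number of samples the train documents can produce

epochs_consumed = train_split_samples / samples_per_epoch
print(f"Requested {train_split_samples} samples -> ~{epochs_consumed:.2f} epochs of the train split")
# A value > 1 means the same train samples will be seen more than once during training.
```

No such multiplication applies to the valid and test splits, which are consumed exactly once.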
+ ### Nanoset A `Nanoset` is paremeterized by the following variables: - The underlying `MMapIndexedDataset` instance (`indexed_dataset`) - The sequence length `S` - The split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing) -- The total number of samples `N` of the Nanoset +- The total number of samples `N` of the Nanoset that we will consume during training. In the case of the valid and test splits, we will only consume the dataset once - The random seed `R` The `Nanoset` creates three index mappings to facilitate lookup: (1) The `document_index`, (2) the `sample_index`, and (3) the `shuffle_index`. @@ -107,7 +115,7 @@ sample_index[16] = (19, 1000) => From document_index[19] we don't have S tok len(sample_index) => 17, but only 16 useful samples ``` -3. The shuffle index (`shuffle_index`) is a 1-D array mapping from _k_ to _j_ of length `n_concatenations * (len(sample_index) - 1)`, where `n_concatenations` is defined as `(N / (len(sample_index) - 1)) + 1`, so that `len(shuffle_index)` is always greater than `N`. Before concatenating the full array, `shuffle_index` is shuffled according to `R`. +3. In the train split, the shuffle index (`shuffle_index`) is a 1-D array mapping from _k_ to _j_ of length `n_concatenations * (len(sample_index) - 1)`, where `n_concatenations` is defined as `(N / (len(sample_index) - 1)) + 1`, so that `len(shuffle_index)` is always greater than `N`. While for the valid and test splits, `len(shuffle_index) == len(sample_index) - 1`. Before concatenating the full array, `shuffle_index` is shuffled according to `R`. ``` Given: diff --git a/run_train_nanoset.py b/run_train_nanoset.py index 101f15d7..f4e1b7d4 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -13,7 +13,6 @@ from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.data.dataset_builder import NanosetBuilder from nanotron.data.nanoset import NanosetConfig -from nanotron.data.utils import compute_datasets_num_samples from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks from nanotron.trainer import DistributedTrainer @@ -27,19 +26,12 @@ def get_dataloaders(trainer: DistributedTrainer): input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) # Create Nanoset config - split_num_samples = compute_datasets_num_samples( - train_iters=trainer.config.tokens.train_steps, - eval_interval=trainer.config.tokens.val_check_interval, - eval_iters=trainer.config.tokens.val_steps, - global_batch_size=trainer.global_batch_size, - ) - nanoset_config = NanosetConfig( random_seed=trainer.config.data.seed, sequence_length=trainer.sequence_length, data_path=trainer.config.data.dataset.data_path, split=trainer.config.data.dataset.split, - split_num_samples=split_num_samples, + train_split_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, path_to_cache=trainer.config.data.dataset.path_to_cache, ) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 78defc59..cf425601 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -221,7 +221,6 @@ class TokensArgs: batch_accumulation_per_replica: int val_check_interval: Optional[int] = -1 - val_steps: Optional[int] = 10 limit_val_batches: Optional[int] = 0 limit_test_batches: Optional[int] = 0 diff --git a/src/nanotron/data/Makefile b/src/nanotron/data/Makefile deleted file mode 100644 index 8f9db768..00000000 --- a/src/nanotron/data/Makefile +++ 
/dev/null @@ -1,9 +0,0 @@ -CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color -CPPFLAGS += $(shell python3 -m pybind11 --includes) -LIBNAME = helpers -LIBEXT = $(shell python3-config --extension-suffix) - -default: $(LIBNAME)$(LIBEXT) - -%$(LIBEXT): %.cpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index 6a234af4..9bd043ac 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -11,7 +11,6 @@ from nanotron import logging from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import build_blending_indices from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -42,7 +41,7 @@ def __init__( ) -> None: assert len(datasets) == len(weights) assert numpy.isclose(sum(weights), 1.0) - assert all((type(_) == type(datasets[0]) for _ in datasets)) + assert all((isinstance(dataset, Nanoset) for dataset in datasets)) # Alert user to unnecessary blending if len(datasets) == 1: @@ -50,8 +49,11 @@ def __init__( self.datasets = datasets self.weights = weights - self.size = size self.config = config + # For the train split, we will have global batch size * train steps samples + # For the valid and test splits, we will consume entirely both datasets + self.dataset_sizes = [len(dataset) for dataset in self.datasets] + self.size = size if size is not None else sum(self.dataset_sizes) # Create unique identifier @@ -60,6 +62,7 @@ def __init__( unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets] unique_identifiers["weights"] = self.weights unique_identifiers["size"] = self.size + unique_identifiers["dataset_sizes"] = self.dataset_sizes self.unique_description = json.dumps(unique_identifiers, indent=4) self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest() @@ -70,9 +73,9 @@ def __init__( _ = self[self.size - 1] try: _ = self[self.size] - raise RuntimeError(f"{type(self).__name__} size is improperly bounded") + raise RuntimeError("BlendedNanoset size is improperly bounded") except IndexError: - log_rank(f"> {type(self).__name__} length: {len(self)}", logger=logger, level=logging.INFO, rank=0) + log_rank(f"> BlendedNanoset length: {len(self)}", logger=logger, level=logging.INFO, rank=0) def __len__(self) -> int: return self.size @@ -123,7 +126,7 @@ def get_path_to(suffix): t_beg = time.time() dataset_index, dataset_sample_index = build_blending_indices( - size=self.size, weights=numpy.array(self.weights) + n_samples=self.size, weights=numpy.array(self.weights), dataset_sizes=self.dataset_sizes ) if path_to_cache: @@ -167,3 +170,35 @@ def get_path_to(suffix): log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) return dataset_index, dataset_sample_index + + +def build_blending_indices( + n_samples: int, weights: numpy.ndarray, dataset_sizes: List +) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Given multiple datasets and a weighting array, build samples + such that it follows those weights.""" + # Create empty arrays for dataset indices and dataset sample indices + dataset_index = numpy.empty((n_samples,), dtype="uint") + dataset_sample_index = numpy.empty((n_samples,), dtype="long") + + # Initialize buffer for number of samples used for each dataset + current_samples = numpy.zeros((len(weights),), dtype="long") + + # Iterate over all samples + for sample_idx 
in range(n_samples): + + # Convert sample index to float for comparison against weights + sample_idx_float = max(sample_idx, 1.0) + + # Find the dataset with the highest error + errors = weights * sample_idx_float - current_samples + max_error_index = numpy.argmax(errors) + + # Assign the dataset index and update the sample index + dataset_index[sample_idx] = max_error_index + dataset_sample_index[sample_idx] = current_samples[max_error_index] % dataset_sizes[max_error_index] + + # Update the total samples for the selected dataset + current_samples[max_error_index] += 1 + + return dataset_index, dataset_sample_index diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index c5467064..1b227c4c 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -1,20 +1,14 @@ -from dataclasses import dataclass -from typing import Dict, List, Optional, Union - -import numpy as np -import torch -from torch.utils.data import DataLoader, Sampler -from torch.utils.data.distributed import DistributedSampler +from torch.utils.data import DataLoader import nanotron.distributed as dist from nanotron import logging from nanotron.dataloader import ( + DataCollatorForCLM, EmptyInfiniteDataset, - SkipBatchSampler, + _get_train_sampler, get_dataloader_worker_init, ) from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer logger = logging.get_logger(__name__) @@ -39,7 +33,7 @@ def build_nanoset_dataloader( # No need to spawn a lot of workers, we can just use main dataloader_num_workers = 0 - data_collator = NanosetDataCollatorForCLM( + data_collator = DataCollatorForCLM( sequence_length=sequence_length, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, @@ -50,8 +44,8 @@ def build_nanoset_dataloader( dp_ranks_size = parallel_context.dp_pg.size() dp_rank = parallel_context.dp_pg.rank() - sampler = get_sampler( - dataset=dataset, + sampler = _get_train_sampler( + train_dataset=dataset, dl_ranks_size=dp_ranks_size, dl_rank=dp_rank, drop_last=dataloader_drop_last, @@ -68,96 +62,3 @@ def build_nanoset_dataloader( pin_memory=dataloader_pin_memory, worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank), ) - - -@dataclass -class NanosetDataCollatorForCLM: - """ - Data collator used for causal language modeling with Nanoset. - - - sequence_length: Sequence length of the model - - input_pp_rank: Discards last input id token - - output_pp_rank: Discards first label id token - - other pp ranks: Don't have data. Instead, we use `TensorPointer` to point to the rank having the data. - """ - - sequence_length: int - input_pp_rank: int - output_pp_rank: int - parallel_context: ParallelContext - - def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Union[torch.Tensor, TensorPointer]]: - - # Process the case when current rank doesn't require data. We return `TensorPointer` that points to ranks having the data. 
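(Editor's aside, not part of the patch: the collator removed below, in favour of the shared `DataCollatorForCLM`, builds model inputs and labels out of a single buffer of `sequence_length + 1` tokens per sample. A minimal standalone illustration of that one-token shift, with toy values:)

```python
import numpy as np

# Toy illustration of the CLM input/label shift: each sample carries
# sequence_length + 1 tokens so inputs and labels are sliced from the same buffer.
sequence_length = 4
tokens = np.array([[10, 11, 12, 13, 14]])   # shape (batch_size, sequence_length + 1)

input_ids = tokens[:, :-1]   # drop the last token  -> model inputs
label_ids = tokens[:, 1:]    # drop the first token -> next-token labels

assert input_ids.shape == label_ids.shape == (1, sequence_length)
```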
- current_pp_rank = dist.get_rank(self.parallel_context.pp_pg) - - if current_pp_rank not in [ - self.input_pp_rank, - self.output_pp_rank, - ]: - return { - "input_ids": TensorPointer(group_rank=self.input_pp_rank), - "input_mask": TensorPointer(group_rank=self.input_pp_rank), - "label_ids": TensorPointer(group_rank=self.output_pp_rank), - "label_mask": TensorPointer(group_rank=self.output_pp_rank), - } - - # NOTE: Nanoset datasets key is "text" - assert all(list(example.keys()) == ["text"] for example in examples) - - # TODO @nouamanetazi: Is it better to have examples as np.array or torch.Tensor? - input_ids = np.vstack([examples[i]["text"] for i in range(len(examples))]) # (b, s) - batch_size, expanded_input_length = input_ids.shape - - result: Dict[str, Union[np.ndarray, TensorPointer]] = {} - - result["input_ids"] = TensorPointer(group_rank=self.input_pp_rank) - result["input_mask"] = TensorPointer(group_rank=self.input_pp_rank) - result["label_ids"] = TensorPointer(group_rank=self.output_pp_rank) - result["label_mask"] = TensorPointer(group_rank=self.output_pp_rank) - - assert ( - expanded_input_length == self.sequence_length + 1 - ), f"Samples should be of length {self.sequence_length + 1} (seq_len+1), but got {expanded_input_length}" - - # Process inputs: last token is the label - if current_pp_rank == self.input_pp_rank: - result["input_ids"] = input_ids[:, :-1] - result["input_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) - - # Process labels: shift them to the left - if current_pp_rank == self.output_pp_rank: - result["label_ids"] = input_ids[:, 1:] - result["label_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) - - if isinstance(result["input_ids"], torch.Tensor) and result["input_ids"].shape[-1] != self.sequence_length: - raise ValueError( - f"`labels` are incorrectly preprocessed. `labels` length is {result['input_ids'].shape[-1]}, but should be" - f" {self.sequence_length}." - ) - if isinstance(result["label_ids"], torch.Tensor) and result["label_ids"].shape[-1] != self.sequence_length: - raise ValueError( - f"`labels` are incorrectly preprocessed. `labels` length is {result['label_ids'].shape[-1]}, but should be" - f" {self.sequence_length}." 
- ) - - # Cast np.array to torch.Tensor - result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} - return result - - -def get_sampler( - dl_ranks_size: int, - dl_rank: int, - dataset, - consumed_train_samples: int, - drop_last: Optional[bool] = True, -) -> Optional[Sampler]: - """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank""" - - sampler = DistributedSampler(dataset, num_replicas=dl_ranks_size, rank=dl_rank, shuffle=False, drop_last=drop_last) - - if consumed_train_samples > 0: - sampler = SkipBatchSampler(sampler, skip_batches=consumed_train_samples, dp_size=dl_ranks_size) - - return sampler diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index 42c45021..cb23aed9 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -1,4 +1,3 @@ -import math from typing import Any, List, Type, Union import numpy @@ -55,12 +54,10 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: weight_per_dataset = normalize(list(data_path.values())) # NOTE: Use 0.5% target margin to ensure that that we do not exhaust the indices of any dataset in the Blend + # Just specify sizes for the train splits; valid and test will contain all samples reserved for those splits from all the datasets sizes_per_dataset = [ - [ - int(math.ceil(target_num_samples * weight * 1.005)) - for target_num_samples in self.config.split_num_samples - ] - for weight in weight_per_dataset + [int(self.config.split_num_samples[0] * dataset_weight * 1.005), None, None] + for dataset_weight in weight_per_dataset ] nanoset_datasets = [[] for _ in range(len(Split))] @@ -73,9 +70,6 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: for j in range(len(nanoset_datasets_split)): nanoset_datasets[j].append(nanoset_datasets_split[j]) - # Sum over all contributing datasets, per split - size_per_split = list(map(sum, zip(*sizes_per_dataset))) - blended_nanosets = [] for i in range(len(nanoset_datasets)): @@ -91,7 +85,7 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: BlendedNanoset, nanoset_datasets[i], weight_per_dataset, - size_per_split[i], + self.config.split_num_samples[i], self.config, ) ) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 8001c26a..c56129cf 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -3,7 +3,7 @@ import os import time from collections import OrderedDict -from typing import Dict, Tuple +from typing import Dict, Tuple, Union import numpy import torch @@ -11,15 +11,14 @@ from nanotron import logging from nanotron.data.indexed_dataset import MMapIndexedDataset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import Split, build_sample_idx +from nanotron.data.utils import Split from nanotron.logging import log_rank logger = logging.get_logger(__name__) class Nanoset(torch.utils.data.Dataset): - """Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py - + """ The base Nanoset dataset Args: @@ -28,7 +27,7 @@ class Nanoset(torch.utils.data.Dataset): indexed_indices (numpy.ndarray): The set of the documents indices to expose - num_samples (int): The number of samples to draw from the indexed dataset + num_samples (int): Number of samples that we will consume from the dataset. If it is None, we will consume all the samples only 1 time (valid and test splits). 
For the train split, we will introduce train steps * global batch size and compute the number of epochs based on the length of the dataset. index_split (Split): The indexed_indices Split (train, valid, test) @@ -39,13 +38,12 @@ def __init__( self, indexed_dataset: MMapIndexedDataset, indexed_indices: numpy.ndarray, - num_samples: int, + num_samples: Union[int, None], index_split: Split, config: NanosetConfig, ) -> None: assert indexed_indices.size > 0 - assert num_samples > 0 self.indexed_dataset = indexed_dataset self.indexed_indices = indexed_indices @@ -58,7 +56,6 @@ def __init__( self.unique_identifiers = OrderedDict() self.unique_identifiers["class"] = type(self).__name__ self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix - self.unique_identifiers["num_samples"] = self.num_samples self.unique_identifiers["index_split"] = self.index_split.name self.unique_identifiers["split"] = self.config.split self.unique_identifiers["random_seed"] = self.config.random_seed @@ -122,7 +119,7 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: length = None if i < doc_index_end else doc_index_end_offset + 1 tokens.append(self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)) - return {"text": numpy.array(numpy.concatenate(tokens), dtype=numpy.int64)} + return {"input_ids": numpy.array(numpy.concatenate(tokens), dtype=numpy.int64)} def build_document_sample_shuffle_indices( self, @@ -171,10 +168,6 @@ def get_path_to(suffix): num_tokens_per_epoch = compute_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) - sequence_length = self.config.sequence_length - - num_epochs = compute_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples) - if not cache_hit and torch.distributed.get_rank() == 0: log_rank( f"Build and save the {type(self).__name__} {self.index_split.name} indices", @@ -219,7 +212,7 @@ def get_path_to(suffix): sample_index = build_sample_idx( sizes=self.indexed_dataset.sequence_lengths, doc_idx=document_index, - seq_length=sequence_length, + seq_length=self.config.sequence_length, tokens_per_epoch=num_tokens_per_epoch, ) numpy.save(path_to_sample_index, sample_index, allow_pickle=True) @@ -236,12 +229,14 @@ def get_path_to(suffix): t_beg = time.time() shuffle_index = numpy.arange(start=0, stop=(sample_index.shape[0] - 1), step=1, dtype=numpy.uint32) numpy_random_state.shuffle(shuffle_index) - n_concatenations = ( - int(self.num_samples / shuffle_index.shape[0]) + 1 - ) # NOTE: To ensure that we always generate more samples than requested in num_samples + if self.num_samples is not None: # For the train split, concatenate shuffle Indexes + n_concatenations = ( + int(self.num_samples / shuffle_index.shape[0]) + 1 + ) # NOTE: To ensure that we always generate more samples than requested in num_samples + shuffle_index = numpy.concatenate([shuffle_index for _ in range(n_concatenations)]) numpy.save( path_to_shuffle_index, - numpy.concatenate([shuffle_index for _ in range(n_concatenations)]), + shuffle_index, allow_pickle=True, ) t_end = time.time() @@ -287,12 +282,70 @@ def get_path_to(suffix): t_end = time.time() log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - log_rank(f"> Total number of samples: {shuffle_index.shape[0]}", logger=logger, level=logging.INFO, rank=0) - log_rank(f"> Total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0) + log_rank(f"> Total number of samples: {sample_index.shape[0] - 1}", logger=logger, 
level=logging.INFO, rank=0) + + if ( + self.num_samples is not None + ): # Compute number of epochs we will iterate over this Nanoset. Just for training + num_epochs = round(self.num_samples / (sample_index.shape[0] - 1), 2) + log_rank(f"> Total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0) return document_index, sample_index, shuffle_index +def build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): + # Check validity of inumpyut args. + assert seq_length > 1 + assert tokens_per_epoch > 1 + + # Compute the number of samples. + num_samples = (tokens_per_epoch - 1) // seq_length + + # Allocate memory for the mapping table. + sample_idx = numpy.full([num_samples + 1, 2], fill_value=-999, dtype=numpy.int32) + + # Setup helper vars. + sample_index = 0 + doc_idx_index = 0 + doc_offset = 0 + + # Add the first entry to the mapping table. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + # Loop over the rest of the samples. + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + + # Keep adding docs until we reach the end of the sequence. + while remaining_seq_length != 0: + # Look up the current document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + + # Try to add it to the current sequence. + remaining_seq_length -= doc_length + + # If it fits, adjust offset and break out of inner loop. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise move to the next document. + doc_idx_index += 1 + doc_offset = 0 + + # Store the current sequence in the mapping table. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + assert not numpy.any(sample_idx == -999) + return sample_idx + + def compute_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: """Compute the number of tokens in a single epoch @@ -305,28 +358,3 @@ def compute_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: n int: The number of tokens in a single epoch """ return numpy.sum(indexed_dataset.sequence_lengths[indices]) - - -def compute_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int) -> int: - """Calculate the number of epochs - - Args: - num_tokens_per_epoch (int): The number of tokens in a single epoch - - seq_length (int): The sequence length in tokens - - num_samples (int): The total number of samples - - Returns: - int: The number of epochs - """ - num_epochs = 0 - num_tokens = 0 - while True: - num_epochs += 1 - num_tokens += num_tokens_per_epoch - # -1 is because we need to retrieve seq_length + 1 token each time - # but the last token will overlap with the first token of the next - # sample except for the last sample. - if ((num_tokens - 1) // seq_length) >= num_samples: - return num_epochs diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py index fac1cb2b..8e5caaba 100644 --- a/src/nanotron/data/nanoset_configs.py +++ b/src/nanotron/data/nanoset_configs.py @@ -23,7 +23,9 @@ class NanosetConfig: split_vector Optional[List[float]]): The split string, parsed and normalized post- initialization. Not to be passed to the constructor. 
- split_num_samples (list[int]): List containing the number of samples per split + train_split_samples (int): The number of samples of the train split split (global batch size * training steps) + + split_num_samples (list[int]): List containing the number of samples per split. Valid and test splits set to None in order to consume all the samples. Not to be passed to the constructor. path_to_cache (str): Where all re-useable dataset indices are to be cached. """ @@ -32,7 +34,7 @@ class NanosetConfig: sequence_length: int - split_num_samples: list[int] + train_split_samples: int data_path: Union[str, dict] @@ -42,8 +44,11 @@ class NanosetConfig: split_vector: Optional[List[float]] = field(init=False, default=None) + split_num_samples: list[int] = field(init=False, default=None) + def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. """ self.split_vector = parse_and_normalize_split(self.split) + self.split_num_samples = [self.train_split_samples, None, None] diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index 51bf03b3..64277bd7 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -1,11 +1,10 @@ import re from enum import Enum -from typing import List, Tuple +from typing import List import numpy from nanotron import logging -from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -51,102 +50,3 @@ def parse_and_normalize_split(split: str) -> List[float]: split = normalize(split) return split - - -def compute_datasets_num_samples(train_iters, eval_interval, eval_iters, global_batch_size): - - train_samples = train_iters * global_batch_size - test_iters = eval_iters - eval_iters = (train_iters // eval_interval + 1) * eval_iters - - datasets_num_samples = [train_samples, eval_iters * global_batch_size, test_iters * global_batch_size] - - log_rank(" > Datasets target sizes (minimum size):", logger=logger, level=logging.INFO, rank=0) - log_rank(" Train: {}".format(datasets_num_samples[0]), logger=logger, level=logging.INFO, rank=0) - log_rank(" Validation: {}".format(datasets_num_samples[1]), logger=logger, level=logging.INFO, rank=0) - log_rank(" Test: {}".format(datasets_num_samples[2]), logger=logger, level=logging.INFO, rank=0) - - return datasets_num_samples - - -def build_blending_indices(size: int, weights: numpy.ndarray) -> Tuple[numpy.ndarray, numpy.ndarray]: - """Given multiple datasets and a weighting array, build samples - such that it follows those weights.""" - # Create empty arrays for dataset indices and dataset sample indices - dataset_index = numpy.empty((size,), dtype="uint") - dataset_sample_index = numpy.empty((size,), dtype="long") - - # Initialize buffer for number of samples used for each dataset - current_samples = numpy.zeros((len(weights),), dtype="long") - - # Iterate over all samples - for sample_idx in range(size): - - # Convert sample index to float for comparison against weights - sample_idx_float = max(sample_idx, 1.0) - - # Find the dataset with the highest error - errors = weights * sample_idx_float - current_samples - max_error_index = numpy.argmax(errors) - - # Assign the dataset index and update the sample index - dataset_index[sample_idx] = max_error_index - dataset_sample_index[sample_idx] = current_samples[max_error_index] - - # Update the total samples for the selected dataset - current_samples[max_error_index] += 1 - - return dataset_index, 
dataset_sample_index - - -def build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): - # Check validity of inumpyut args. - assert seq_length > 1 - assert tokens_per_epoch > 1 - - # Compute the number of samples. - num_samples = (tokens_per_epoch - 1) // seq_length - - # Allocate memory for the mapping table. - sample_idx = numpy.full([num_samples + 1, 2], fill_value=-999, dtype=numpy.int32) - - # Setup helper vars. - sample_index = 0 - doc_idx_index = 0 - doc_offset = 0 - - # Add the first entry to the mapping table. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - # Loop over the rest of the samples. - while sample_index <= num_samples: - # Start with a fresh sequence. - remaining_seq_length = seq_length + 1 - - # Keep adding docs until we reach the end of the sequence. - while remaining_seq_length != 0: - # Look up the current document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - - # Try to add it to the current sequence. - remaining_seq_length -= doc_length - - # If it fits, adjust offset and break out of inner loop. - if remaining_seq_length <= 0: - doc_offset += remaining_seq_length + doc_length - 1 - remaining_seq_length = 0 - else: - # Otherwise move to the next document. - doc_idx_index += 1 - doc_offset = 0 - - # Store the current sequence in the mapping table. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - assert not numpy.any(sample_idx == -999) - return sample_idx diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index d1853286..ec64ebe3 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -403,10 +403,10 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni def _get_train_sampler( dl_ranks_size: int, dl_rank: int, - train_dataset: "Dataset", - seed: int, - use_loop_to_round_batch_size: bool, + train_dataset: Union["Dataset", torch.utils.data.Dataset], consumed_train_samples: int, + seed: int = 42, + use_loop_to_round_batch_size: bool = False, micro_batch_size: Optional[int] = None, drop_last: Optional[bool] = True, ) -> Optional[torch.utils.data.Sampler]: diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index ba0f297a..eb6a5b4a 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -476,8 +476,7 @@ def train_step_logs( { **{log_item.tag: log_item.scalar_value for log_item in log_entries}, "iteration_step": self.iteration_step, - }, - step=self.iteration_step - 1, + } ) self.loggerwriter.add_scalars_from_list(log_entries, self.iteration_step) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index b326722a..2ee424b0 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -1,5 +1,3 @@ -import random - import pytest from helpers.context import TestContext from helpers.data import ( @@ -15,7 +13,6 @@ from nanotron.data.dataset_builder import NanosetBuilder from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import compute_datasets_num_samples from nanotron.parallel import ParallelContext @@ -27,8 +24,10 @@ for all_3d_configs in get_all_3d_configurations(gpus) ], ) +@pytest.mark.parametrize("train_steps", [100, 500]) +@pytest.mark.parametrize("sequence_length", [512, 8192]) @rerun_if_address_is_in_use() -def test_build_nanoset_dataloader(tp: int, 
dp: int, pp: int): +def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, sequence_length: int): test_context = TestContext() # Create dataset files @@ -43,51 +42,45 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int): preprocess_dummy_dataset(path_to_json=json_path) init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)( - test_context=test_context, path_to_bin_files=bin_paths + test_context=test_context, + path_to_bin_files=bin_paths, + train_steps=train_steps, + sequence_length=sequence_length, ) def _test_build_nanoset_dataloader( - parallel_context: ParallelContext, test_context: TestContext, path_to_bin_files: str + parallel_context: ParallelContext, + test_context: TestContext, + path_to_bin_files: str, + train_steps: int, + sequence_length: int, ): SEED = 1234 - random.seed(SEED) - - TRAIN_STEPS = random.randint(1000, 10000) - VAL_CHECK_INTERVAL = TRAIN_STEPS / 4 - VAL_STEPS = TRAIN_STEPS / 10 - MICRO_BATCH_SIZE = 2 ** random.randint(1, 4) - N_MICRO_BATCHES_PER_BATCH = 2 ** random.randint(1, 4) + MICRO_BATCH_SIZE = 4 + N_MICRO_BATCHES_PER_BATCH = 8 GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() - SEQ_LENGTH = 2 ** random.randint(10, 14) - SPLIT = "{},{},{}".format(random.randint(1, 100), random.randint(1, 100), random.randint(1, 100)) + SPLIT = "70,20,10" input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) # Create Nanoset configs: 1. Normal 2. Blended - split_num_samples = compute_datasets_num_samples( - train_iters=TRAIN_STEPS, - eval_interval=VAL_CHECK_INTERVAL, - eval_iters=VAL_STEPS, - global_batch_size=GLOBAL_BATCH_SIZE, - ) - nanoset_config = NanosetConfig( random_seed=SEED, - sequence_length=SEQ_LENGTH, + sequence_length=sequence_length, data_path=path_to_bin_files[0], split=SPLIT, - split_num_samples=split_num_samples, + train_split_samples=train_steps * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), ) blended_nanoset_config = NanosetConfig( random_seed=SEED, - sequence_length=SEQ_LENGTH, - data_path={bin_path: random.randint(1, 100) for bin_path in path_to_bin_files}, + sequence_length=sequence_length, + data_path={path_to_bin_files[0]: 0.8, path_to_bin_files[1]: 0.2}, split=SPLIT, - split_num_samples=split_num_samples, + train_split_samples=train_steps * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), ) @@ -109,7 +102,7 @@ def _test_build_nanoset_dataloader( # Create Dataloaders dataloader = build_nanoset_dataloader( train_dataset, - sequence_length=SEQ_LENGTH, + sequence_length=sequence_length, parallel_context=parallel_context, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, @@ -124,7 +117,7 @@ def _test_build_nanoset_dataloader( batch=batch, parallel_context=parallel_context, micro_batch_size=MICRO_BATCH_SIZE, - sequence_length=SEQ_LENGTH, + sequence_length=sequence_length, ) parallel_context.destroy() From 50acff8097378fb58eda7000b3b40e1a7f5c68b0 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 26 Mar 2024 20:36:17 +0000 Subject: [PATCH 56/73] moved LICENSE --- LICENSE | 60 --------------------------------------- src/nanotron/data/LICENSE | 59 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 60 deletions(-) create mode 100644 src/nanotron/data/LICENSE diff --git a/LICENSE b/LICENSE index 208d5bf8..261eeb9e 100644 --- a/LICENSE +++ b/LICENSE @@ -199,63 +199,3 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. - -This repository also contains code from Facebook (from their Fairseq project) -and NVIDIA (from their Megatron project). Files from these organizations -have notices at the top of each file. Below are licenses used in those files, as indicated. - -------------- LICENSE FOR Facebook Fairseq code -------------- - -MIT License - -Copyright (c) Facebook, Inc. and its affiliates. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -------------- LICENSE FOR NVIDIA Megatron code -------------- - -The following applies to all files unless otherwise noted: - -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --- diff --git a/src/nanotron/data/LICENSE b/src/nanotron/data/LICENSE new file mode 100644 index 00000000..a5a0a738 --- /dev/null +++ b/src/nanotron/data/LICENSE @@ -0,0 +1,59 @@ +This repository also contains code from Facebook (from their Fairseq project) +and NVIDIA (from their Megatron project). 
Files from these organizations +have notices at the top of each file. Below are licenses used in those files, as indicated. + +------------- LICENSE FOR Facebook Fairseq code -------------- + +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------- LICENSE FOR NVIDIA Megatron code -------------- + +The following applies to all files unless otherwise noted: + +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-- From 16f44d1b8177b38a7ae3a2322a912562dbdc244a Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 26 Mar 2024 21:36:38 +0000 Subject: [PATCH 57/73] Added typing to preprocess_data & log_rank pathc --- src/nanotron/logging.py | 7 +++---- tools/preprocess_data.py | 34 +++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/nanotron/logging.py b/src/nanotron/logging.py index 17949830..3c3b1711 100644 --- a/src/nanotron/logging.py +++ b/src/nanotron/logging.py @@ -217,16 +217,15 @@ def log_rank( **kwargs, ): """Log only if the current process is the rank specified.""" - # Use default group if group is not provided - try: + if torch.distributed.is_initialized(): + # Use default group is group is not provided if group is None: group = torch_dist.distributed_c10d._get_default_group() # rank is None means everyone logs if rank is None or dist.get_rank(group) == rank: logger.log(level, msg, **kwargs) - - except: + else: logger.log(level, msg, **kwargs) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index f09cc040..e918105b 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -17,7 +17,7 @@ class Encoder(object): - def __init__(self, pretrained_model_name_or_path, json_key, append_eos): + def __init__(self, pretrained_model_name_or_path: str, json_key: str, append_eos: bool): self.pretrained_model_name_or_path = pretrained_model_name_or_path self.json_key = json_key self.append_eos = append_eos @@ -26,7 +26,7 @@ def initializer(self): # Use Encoder class as a container for global data Encoder.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) - def encode(self, json_line): + def encode(self, json_line: str): data = json.loads(json_line) text = data[self.json_key] @@ -38,21 +38,23 @@ def encode(self, json_line): class Partition(object): - def __init__(self, args): - self.workers = args.workers // args.partitions - self.log_interval = args.log_interval - self.pretrained_model_name_or_path = args.pretrained_model_name_or_path - self.json_key = args.json_key - self.append_eos = args.append_eos - - def print_processing_stats(self, count, proc_start, total_bytes_processed): + def __init__( + self, workers: int, log_interval: int, pretrained_model_name_or_path: str, json_key: str, append_eos: bool + ): + self.workers = workers + self.log_interval = log_interval + self.pretrained_model_name_or_path = pretrained_model_name_or_path + self.json_key = json_key + self.append_eos = append_eos + + def print_processing_stats(self, count: int, proc_start: float, total_bytes_processed: int): if count % self.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print(f"Processed {count} documents", f"({count/elapsed} docs/s, {mbs} MB/s).", file=sys.stderr) - def process_json_file(self, file_name): + def process_json_file(self, file_name: str): input_file_name, output_prefix = file_name print("Opening", input_file_name) fin = open(input_file_name, "r", encoding="utf-8") @@ -122,7 +124,7 @@ def get_args(): return args -def get_file_name(input, output_prefix, file_id): +def get_file_name(input: str, output_prefix: str, file_id: int): file_name, extension = os.path.splitext(input) input_file_name = file_name + "_" + str(file_id) + extension output_prefix = output_prefix + "_" + str(file_id) @@ -172,7 +174,13 @@ def main(args): partitioned_input_files[idx].close() assert args.workers % args.partitions == 0 - partition = 
Partition(args) + partition = Partition( + args.workers // args.partitions, + args.log_interval, + args.pretrained_model_name_or_path, + args.json_key, + args.append_eos, + ) # Encode partition files in parallel processes = [] From 384f3377d38c1d6bb6eb1cd5276f5c2268a05bc2 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 26 Mar 2024 22:37:36 +0000 Subject: [PATCH 58/73] Add check only input_pp_rank and output_pp_rank get Tensors --- tests/helpers/data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index b4755aca..af07eed1 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -10,6 +10,7 @@ from argparse import Namespace +import nanotron.distributed as dist import numpy as np import torch from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer @@ -58,6 +59,16 @@ def assert_batch_dataloader(batch: dict, parallel_context, micro_batch_size: int """ for element in batch: tensor = batch[element] + + # Assert that inputs are only present in input_pp_rank and outputs in output_pp_rank + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + if dist.get_rank(parallel_context.pp_pg) == input_pp_rank and element.startswith("input_"): + assert isinstance(tensor, torch.Tensor) + elif dist.get_rank(parallel_context.pp_pg) == output_pp_rank and element.startswith("label_"): + assert isinstance(tensor, torch.Tensor) + else: + assert isinstance(tensor, TensorPointer) + data_class = ( 0 # 0 if tensor is from the ids, 1 if TensorPointer and 2 if mask. Used in the data parallel group check ) From 334fcf42d7a1044645d3d22bd8d857cc0f67ef63 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 00:23:40 +0000 Subject: [PATCH 59/73] Added logging BlendedNanoset stats. 
Cant include typing due to import circular error --- src/nanotron/data/dataset_builder.py | 4 +++- src/nanotron/data/utils.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index cb23aed9..be08d980 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -8,7 +8,7 @@ from nanotron.data.indexed_dataset import MMapIndexedDataset from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig -from nanotron.data.utils import Split, normalize +from nanotron.data.utils import Split, log_BlendedNanoset_stats, normalize logger = logging.get_logger(__name__) @@ -90,6 +90,8 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: ) ) + log_BlendedNanoset_stats(blended_nanosets) + return blended_nanosets def build_nanoset_dataset_splits( diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index 64277bd7..5096557b 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -5,6 +5,7 @@ import numpy from nanotron import logging +from nanotron.logging import log_rank logger = logging.get_logger(__name__) @@ -50,3 +51,30 @@ def parse_and_normalize_split(split: str) -> List[float]: split = normalize(split) return split + + +def count_blending_indexes(dataset_idx: numpy.ndarray, n_datasets: int): + counts = [] + + for dataset in range(n_datasets): + counts.append(numpy.count_nonzero(dataset_idx == dataset)) + + return counts + + +def log_BlendedNanoset_stats(blended_nanosets): + for blended_nanoset, split_name in zip(blended_nanosets, Split): + log_rank( + f"> Total number of samples of the {split_name._name_} BlendedNanoset: {len(blended_nanoset)}", + logger=logger, + level=logging.INFO, + rank=0, + ) + nanoset_sample_count = count_blending_indexes(blended_nanoset.dataset_index, len(blended_nanoset.datasets)) + for idx, nanoset in enumerate(blended_nanoset.datasets): + log_rank( + f"> Total number of samples from the {nanoset.indexed_dataset.path_prefix.rsplit('/', 1)[-1]} Nanoset: {nanoset_sample_count[idx]} ({round(normalize(nanoset_sample_count)[idx], 2)})", + logger=logger, + level=logging.INFO, + rank=0, + ) From 531f802a215a605a9d75f770ea83ba9da600b52b Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 01:23:49 +0000 Subject: [PATCH 60/73] Adapted code to loading dataloader based on training stages --- docs/nanoset.md | 38 ++++++++++++++----------- run_train_nanoset.py | 19 +++++++------ src/nanotron/data/dataloader_builder.py | 1 + src/nanotron/dataloader.py | 3 +- src/nanotron/trainer.py | 31 ++++++++++++-------- 5 files changed, 55 insertions(+), 37 deletions(-) diff --git a/docs/nanoset.md b/docs/nanoset.md index 1d282d11..65a214a5 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -38,25 +38,31 @@ To work with Nanosets, we need to configure 3 arguments: 3. `data_path`: This argument specifies the file/files that will compose the Nanoset. There are 2 ways to specify it: 1. If we only indicate _one_ path (path to the files generated by preprocess_data.py WITHOUT the extension), we will create a `Nanoset`. 
```yaml - data: - dataset: - data_path: nanoset/SlimPajama-6B_text - split: 949,50,1 - path_to_cache: .nanoset_cache - num_loading_workers: 0 - seed: 1234 + data_stages: + - name: General purpose training (Nanoset) + start_training_step: 1 + data: + dataset: + data_path: nanoset/SlimPajama-6B_text + split: 949,50,1 + path_to_cache: .nanoset_cache + num_loading_workers: 0 + seed: 1234 ``` 2. With a dictionary, we can create a `BlendedNanoset` where the keys are the paths to the dataset (path to the files generated by preprocess_data.py WITHOUT the extension) and the values are the weight for each dataset. ```yaml - data: - dataset: - data_path: - nanoset/SlimPajama-6B_text: 0.8 - nanoset/europarl_text: 0.2 - split: 949,50,1 - path_to_cache: .nanoset_cache - num_loading_workers: 0 - seed: 1234 + data_stages: + - name: General purpose training (BlendedNanoset) + start_training_step: 1 + data: + dataset: + data_path: + nanoset/SlimPajama-6B_text: 0.8 + nanoset/europarl_text: 0.2 + split: 949,50,1 + path_to_cache: .nanoset_cache + num_loading_workers: 0 + seed: 1234 ``` Finally, to use the Nanosets, launch the training with [`run_train_nanoset.py`](../run_train_nanoset.py). diff --git a/run_train_nanoset.py b/run_train_nanoset.py index f4e1b7d4..09271b3b 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -21,18 +21,21 @@ def get_dataloaders(trainer: DistributedTrainer): """Returns train, valid and test dataloaders""" + assert ( + len(trainer.config.data_stages) == 1 + ), "Nanosets currently don't support loading DataLoaders based on training stages" # First, we need to know which ranks to feed the dataloader to input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) # Create Nanoset config nanoset_config = NanosetConfig( - random_seed=trainer.config.data.seed, + random_seed=trainer.config.data_stages[0].data.seed, sequence_length=trainer.sequence_length, - data_path=trainer.config.data.dataset.data_path, - split=trainer.config.data.dataset.split, + data_path=trainer.config.data_stages[0].data.dataset.data_path, + split=trainer.config.data_stages[0].data.dataset.split, train_split_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, - path_to_cache=trainer.config.data.dataset.path_to_cache, + path_to_cache=trainer.config.data_stages[0].data.dataset.path_to_cache, ) # Build Nanoset datasets @@ -47,7 +50,7 @@ def get_dataloaders(trainer: DistributedTrainer): output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, consumed_train_samples=trainer.consumed_train_samples, - dataloader_num_workers=trainer.config.data.num_loading_workers, + dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, dataloader_drop_last=True, ) @@ -58,7 +61,7 @@ def get_dataloaders(trainer: DistributedTrainer): input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=trainer.config.data.num_loading_workers, + dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, dataloader_drop_last=True, ) @@ -69,7 +72,7 @@ def get_dataloaders(trainer: DistributedTrainer): input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=trainer.config.data.num_loading_workers, + dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, dataloader_drop_last=True, ) @@ -92,4 +95,4 @@ def get_args(): train_dataloader, valid_dataloader, test_dataloader = 
get_dataloaders(trainer) # Train - trainer.train(train_dataloader) + trainer.train([train_dataloader]) diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index 1b227c4c..8e4e9af4 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -50,6 +50,7 @@ def build_nanoset_dataloader( dl_rank=dp_rank, drop_last=dataloader_drop_last, consumed_train_samples=consumed_train_samples, + shuffle=False, ) return DataLoader( diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index ec64ebe3..4479165c 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -409,6 +409,7 @@ def _get_train_sampler( use_loop_to_round_batch_size: bool = False, micro_batch_size: Optional[int] = None, drop_last: Optional[bool] = True, + shuffle: bool = True, ) -> Optional[torch.utils.data.Sampler]: """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank""" @@ -428,7 +429,7 @@ def _get_train_sampler( ) else: sampler = DistributedSampler( - train_dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last + train_dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last, shuffle=shuffle ) if consumed_train_samples > 0: diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 2f653735..0ef83a85 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -33,6 +33,8 @@ RandomInit, get_config_from_file, ) +from nanotron.data.blended_nanoset import BlendedNanoset +from nanotron.data.nanoset import Nanoset from nanotron.dataloader import sanity_check_dataloader from nanotron.helpers import ( _vocab_size_with_padding, @@ -210,10 +212,7 @@ def __init__( # Setup tensorboard write and log writers on output rank self.logger_ranks = self.parallel_context.get_global_rank( - ep_rank=0, - pp_rank=self.unwrapped_model.output_pp_rank, - dp_rank=0, - tp_rank=0 + ep_rank=0, pp_rank=self.unwrapped_model.output_pp_rank, dp_rank=0, tp_rank=0 ).flatten() self.loggerwriter = self.setup_log_writers() @@ -303,15 +302,23 @@ def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): if isinstance(prev_dataloader, DataLoader): # NOTE: we don't need to clear dummy data generator from memory clear_dataloader_from_memory(prev_dataloader, stage_name=stage.name) + if isinstance(dataloaders[0].dataset, Union[Nanoset, BlendedNanoset]): + log_rank( + f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.data_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) + dataloader = dataloaders[0] + else: + log_rank( + f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.hf_dataset_or_datasets}", + logger=logger, + level=logging.INFO, + rank=0, + ) + dataloader = dataloaders[stage.name] - log_rank( - f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.hf_dataset_or_datasets}", - logger=logger, - level=logging.INFO, - rank=0, - ) - - dataloader = dataloaders[stage.name] # NOTE: if a dataloader is lazy initialized, we need to call it to initialize it dataloader = dataloader() if callable(dataloader) else dataloader break From 0acf47fdb1c206534364d6b4b887a6c19aa6ca1e Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 11:01:57 +0000 Subject: [PATCH 61/73] Added dacite commento to config, renamed get_sampler, added split sum to 1 assertion in build_nanoset_datasets_split --- src/nanotron/config/config.py | 4 +++- 
src/nanotron/data/dataloader_builder.py | 4 ++-- src/nanotron/data/dataset_builder.py | 2 ++ src/nanotron/dataloader.py | 7 ++----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 52f645ac..b7f08b8f 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -99,7 +99,9 @@ def __post_init__(self): class NanosetDatasetsArgs: data_path: Union[str, dict] split: Optional[str] = "949,50,1" - path_to_cache: Optional[str] = "_" + path_to_cache: Optional[ + str + ] = "_" # If defaults to None, dacite is not able to parse the config. Set to None in __post_init__ def __post_init__(self): if not os.path.isdir(self.path_to_cache): diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index 8e4e9af4..2e2cf98b 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -5,8 +5,8 @@ from nanotron.dataloader import ( DataCollatorForCLM, EmptyInfiniteDataset, - _get_train_sampler, get_dataloader_worker_init, + get_sampler, ) from nanotron.parallel import ParallelContext @@ -44,7 +44,7 @@ def build_nanoset_dataloader( dp_ranks_size = parallel_context.dp_pg.size() dp_rank = parallel_context.dp_pg.rank() - sampler = _get_train_sampler( + sampler = get_sampler( train_dataset=dataset, dl_ranks_size=dp_ranks_size, dl_rank=dp_rank, diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index be08d980..2f440abe 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -112,6 +112,8 @@ def build_nanoset_dataset_splits( Returns: List[Nanoset]: The Nanoset per split. Always returns Nanosets because we build them in each and every rank """ + assert numpy.isclose(sum(split), 1.0), f"Split ratios must sum to 1.00. Passed {split}" + indexed_dataset = self.build_generic_dataset(MMapIndexedDataset, path_prefix) split_idx_bounds = get_split_indices(split, indexed_dataset.sequence_lengths.shape[0]) diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index 4479165c..5d6e02aa 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -241,7 +241,6 @@ def data_generator() -> Generator[Dict[str, Union[torch.Tensor, TensorPointer]], return data_generator -# QUESTION: Move it to nanotron.data.dataloader_builder? 
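(Editor's aside, not part of the patch: the `get_sampler` hunks that follow boil down to a per-data-parallel-rank `DistributedSampler`, optionally wrapped in `SkipBatchSampler` to skip samples already consumed when resuming. A tiny standalone sketch of the rank partitioning, with made-up sizes:)

```python
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

# Standalone sketch (toy dataset, hypothetical world size of 4 data-parallel ranks):
# each rank receives a disjoint, strided slice of the dataset indices.
# shuffle=False mirrors the Nanoset dataloader above, since the dataset itself
# already carries a shuffle index.
dataset = TensorDataset(torch.arange(1024))
sampler = DistributedSampler(dataset, num_replicas=4, rank=0, shuffle=False, drop_last=True)

indices = list(sampler)
print(len(indices), indices[:4])   # 256 indices for rank 0: [0, 4, 8, 12]
```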
# Adapted from https://github.com/huggingface/accelerate/blob/a73898027a211c3f6dc4460351b0ec246aa824aa/src/accelerate/data_loader.py#L781C1-L824C28 class SkipBatchSampler(BatchSampler): """ @@ -400,7 +399,7 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni # Adapted from https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L763-L835 -def _get_train_sampler( +def get_sampler( dl_ranks_size: int, dl_rank: int, train_dataset: Union["Dataset", torch.utils.data.Dataset], @@ -494,7 +493,7 @@ def get_train_dataloader( # TODO @nouamanetazi: Remove unused columns: https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L852 # TODO @nouamanetazi: Support torch.utils.data.IterableDataset: https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L855-L872 - train_sampler = _get_train_sampler( + train_sampler = get_sampler( dl_rank=dp_rank, dl_ranks_size=dp_ranks_size, train_dataset=train_dataset, @@ -519,7 +518,6 @@ def get_train_dataloader( ) -# QUESTION: Move it to nanotron.data.dataloader_builder? def get_dataloader_worker_init(dp_rank: int): """Creates random states for each worker in order to get different state in each workers""" @@ -531,7 +529,6 @@ def dataloader_worker_init(worker_id): return dataloader_worker_init -# QUESTION: Move it to nanotron.data.dataloader_builder? class EmptyInfiniteDataset: """Hack as removing all columns from a datasets.Dataset makes the number of rows 0.""" From 40262573ca17aae4e8c03dafdb65cd5d2ed367a4 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 13:00:21 +0000 Subject: [PATCH 62/73] Added deterministic creation of Nanosets & BlendedNanoset in all processes to tests/ --- tests/helpers/data.py | 57 +++++++++++++++++++++++++- tests/test_build_nanoset_dataloader.py | 16 +++++++- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index af07eed1..fda5059c 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -1,7 +1,9 @@ +import hashlib import importlib import json import os import sys +from collections import OrderedDict from pathlib import Path package = importlib.import_module("nanotron") @@ -13,6 +15,9 @@ import nanotron.distributed as dist import numpy as np import torch +from nanotron.data.blended_nanoset import BlendedNanoset +from nanotron.data.nanoset import Nanoset +from nanotron.parallel import ParallelContext from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer from nanotron.sanity_checks import assert_tensor_synced_across_pg @@ -52,7 +57,9 @@ def preprocess_dummy_dataset(path_to_json: str): main(args) -def assert_batch_dataloader(batch: dict, parallel_context, micro_batch_size: int, sequence_length: int): +def assert_batch_dataloader( + batch: dict, parallel_context: ParallelContext, micro_batch_size: int, sequence_length: int +): """ batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask @@ -102,8 +109,54 @@ def assert_batch_dataloader(batch: dict, parallel_context, micro_batch_size: int ) -def get_max_value_by_group(dataset_idx, dataset_sample_idx, group_idx): +def get_max_value_by_group(dataset_idx: np.array, dataset_sample_idx: np.array, group_idx: int): mask = dataset_idx == group_idx filtered_values = dataset_sample_idx[mask] max_val = np.amax(filtered_values) return 
max_val + + +def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): + # Extract a sample from the Nanoset + IDX_SAMPLE = 23 + + nanoset_identifiers = OrderedDict() + nanoset_identifiers["path_prefix"] = nanoset.indexed_dataset.path_prefix + nanoset_identifiers["split"] = nanoset.config.split + nanoset_identifiers["random_seed"] = nanoset.config.random_seed + nanoset_identifiers["sequence_length"] = nanoset.config.sequence_length + nanoset_identifiers["length"] = len(nanoset) + nanoset_identifiers["input_ids"] = nanoset[IDX_SAMPLE]["input_ids"].tolist() + nanoset_identifiers["indices"] = nanoset.indexed_indices.tolist() + + unique_description = json.dumps(nanoset_identifiers, indent=4) + # Create 8 digit description hash + unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**8 + assert_tensor_synced_across_pg( + tensor=torch.tensor(unique_description_hash, device="cuda"), + pg=parallel_context.world_pg, + msg=lambda err: f"Nanoset is not synchronized across all processes {err}", + ) + + +def assert_blendednanoset(blendednanoset: BlendedNanoset, parallel_context: ParallelContext): + # Extract a sample from the Nanoset + IDX_SAMPLE = 23 + + blendednanoset_identifiers = OrderedDict() + blendednanoset_identifiers["datasets"] = [dataset.unique_identifiers for dataset in blendednanoset.datasets] + blendednanoset_identifiers["weights"] = blendednanoset.weights + blendednanoset_identifiers["length"] = len(blendednanoset) + blendednanoset_identifiers["dataset_sizes"] = blendednanoset.dataset_sizes + blendednanoset_identifiers["input_ids"] = blendednanoset[IDX_SAMPLE]["input_ids"].tolist() + blendednanoset_identifiers["dataset_index"] = blendednanoset.dataset_index.tolist() + blendednanoset_identifiers["dataset_sample_index"] = blendednanoset.dataset_sample_index.tolist() + + unique_description = json.dumps(blendednanoset_identifiers, indent=4) + # Create 8 digit description hash + unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**8 + assert_tensor_synced_across_pg( + tensor=torch.tensor(unique_description_hash, device="cuda"), + pg=parallel_context.world_pg, + msg=lambda err: f"BlendedNanoset is not synchronized across all processes {err}", + ) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 2ee424b0..1e4ff9ed 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -2,6 +2,8 @@ from helpers.context import TestContext from helpers.data import ( assert_batch_dataloader, + assert_blendednanoset, + assert_nanoset, create_dataset_paths, create_dummy_json_dataset, get_max_value_by_group, @@ -90,14 +92,26 @@ def _test_build_nanoset_dataloader( for config, dataset_type in zip(configs, dataset_types): # Create Nanosets train_dataset, valid_dataset, test_dataset = NanosetBuilder(config).build() - # Check the type of Nanoset, the quantity of Nanosets in BlendedNanoset and the size of each Nanoset in BlendedNanoset + assert isinstance(train_dataset, dataset_type) + + if isinstance(train_dataset, Nanoset): + # Assert Nanoset is the same across all processes + assert_nanoset(train_dataset, parallel_context) + if isinstance(train_dataset, BlendedNanoset): + # Check the BlendedNanoset is composed of > 1 Nanoset assert len(train_dataset.datasets) > 1 + # Assert BlendedNanoset is the same across all processes + assert_blendednanoset(train_dataset, parallel_context) + # For each Nanoset in BlendedNanoset 
for idx, dataset in enumerate(train_dataset.datasets): + # Check that each Nanoset of BlendedNanoset has more samples than the ones requested by BlendedNanoset assert len(dataset) > get_max_value_by_group( train_dataset.dataset_index, train_dataset.dataset_sample_index, idx ) + # Assert Nanoset is the same across all processes + assert_nanoset(dataset, parallel_context) # Create Dataloaders dataloader = build_nanoset_dataloader( From 05be999d312855069b9cf1723d5c1c68ac5a431e Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 13:01:24 +0000 Subject: [PATCH 63/73] Patch for different datasets based on training stages --- run_train.py | 2 +- src/nanotron/config/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/run_train.py b/run_train.py index 4c36d1ef..e5be3048 100644 --- a/run_train.py +++ b/run_train.py @@ -134,7 +134,7 @@ def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: # then we lazy initialize the dataloader for the other stages stage = cast(DatasetStageArgs, stage) dataloader = ( - get_dataloader(trainer, stage.data) + get_dataloader_from_data_stage(trainer, stage.data) if idx == 0 else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data) ) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index b7f08b8f..ba59f74c 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -346,7 +346,7 @@ def __post_init__(self): if self.data_stages is not None: names = [stage.name for stage in self.data_stages] training_steps = [stage.start_training_step for stage in self.data_stages] - assert all( + assert any( stage.start_training_step == 1 for stage in self.data_stages ), "You must have a training stage starting at 1 in the config's data_stages" From f163c1d6574b5f8f1d7e9d51caa9f006ec35660a Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 13:04:42 +0000 Subject: [PATCH 64/73] Added support for multistage training with Nanosets --- run_train_nanoset.py | 53 ++++++++++++++++++++++++++++------------- src/nanotron/trainer.py | 9 ++++--- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/run_train_nanoset.py b/run_train_nanoset.py index 09271b3b..c689ec40 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -8,34 +8,37 @@ ``` """ import argparse +from typing import Dict, cast from nanotron import logging +from nanotron.config import ( + DataArgs, + DatasetStageArgs, +) from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.data.dataset_builder import NanosetBuilder from nanotron.data.nanoset import NanosetConfig from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks from nanotron.trainer import DistributedTrainer +from torch.utils.data import DataLoader logger = logging.get_logger(__name__) -def get_dataloaders(trainer: DistributedTrainer): +def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): """Returns train, valid and test dataloaders""" - assert ( - len(trainer.config.data_stages) == 1 - ), "Nanosets currently don't support loading DataLoaders based on training stages" # First, we need to know which ranks to feed the dataloader to input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) # Create Nanoset config nanoset_config = NanosetConfig( - random_seed=trainer.config.data_stages[0].data.seed, + random_seed=data.seed, sequence_length=trainer.sequence_length, - 
data_path=trainer.config.data_stages[0].data.dataset.data_path, - split=trainer.config.data_stages[0].data.dataset.split, + data_path=data.dataset.data_path, + split=data.dataset.split, train_split_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, - path_to_cache=trainer.config.data_stages[0].data.dataset.path_to_cache, + path_to_cache=data.dataset.path_to_cache, ) # Build Nanoset datasets @@ -50,33 +53,35 @@ def get_dataloaders(trainer: DistributedTrainer): output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, consumed_train_samples=trainer.consumed_train_samples, - dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, + dataloader_num_workers=data.num_loading_workers, dataloader_drop_last=True, ) - valid_dataloader = build_nanoset_dataloader( + # Valid dataloader + _ = build_nanoset_dataloader( valid_dataset, trainer.sequence_length, parallel_context=trainer.parallel_context, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, + dataloader_num_workers=data.num_loading_workers, dataloader_drop_last=True, ) - test_dataloader = build_nanoset_dataloader( + # Test dataloader + _ = build_nanoset_dataloader( test_dataset, trainer.sequence_length, parallel_context=trainer.parallel_context, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=trainer.config.data_stages[0].data.num_loading_workers, + dataloader_num_workers=data.num_loading_workers, dataloader_drop_last=True, ) - return train_dataloader, valid_dataloader, test_dataloader + return train_dataloader def get_args(): @@ -85,6 +90,22 @@ def get_args(): return parser.parse_args() +def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: + sorted_stages = sorted(trainer.config.data_stages, key=lambda stage: stage.start_training_step) + dataloaders = {} + for idx, stage in enumerate(sorted_stages): + # NOTE: we only create the dataloader for the first stage, + # then we lazy initialize the dataloader for the other stages + stage = cast(DatasetStageArgs, stage) + dataloader = ( + get_dataloader_from_data_stage(trainer, stage.data) + if idx == 0 + else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data) + ) + dataloaders[stage.name] = dataloader + return dataloaders + + if __name__ == "__main__": args = get_args() config_file = args.config_file @@ -92,7 +113,7 @@ def get_args(): # Load trainer and data trainer = DistributedTrainer(config_file) - train_dataloader, valid_dataloader, test_dataloader = get_dataloaders(trainer) + train_dataloader = get_dataloader(trainer) # Train - trainer.train([train_dataloader]) + trainer.train(train_dataloader) diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 0ef83a85..889bbff7 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -29,12 +29,11 @@ Config, DatasetStageArgs, ExistingCheckpointInit, + NanosetDatasetsArgs, ParallelismArgs, RandomInit, get_config_from_file, ) -from nanotron.data.blended_nanoset import BlendedNanoset -from nanotron.data.nanoset import Nanoset from nanotron.dataloader import sanity_check_dataloader from nanotron.helpers import ( _vocab_size_with_padding, @@ -302,14 +301,13 @@ def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): if isinstance(prev_dataloader, DataLoader): # NOTE: we don't need to clear dummy data generator 
from memory clear_dataloader_from_memory(prev_dataloader, stage_name=stage.name) - if isinstance(dataloaders[0].dataset, Union[Nanoset, BlendedNanoset]): + if isinstance(stage.data.dataset, NanosetDatasetsArgs): log_rank( f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.data_path}", logger=logger, level=logging.INFO, rank=0, ) - dataloader = dataloaders[0] else: log_rank( f"[Training Stage: {stage.name}] Switching to a new dataset {stage.data.dataset.hf_dataset_or_datasets}", @@ -317,7 +315,8 @@ def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): level=logging.INFO, rank=0, ) - dataloader = dataloaders[stage.name] + + dataloader = dataloaders[stage.name] # NOTE: if a dataloader is lazy initialized, we need to call it to initialize it dataloader = dataloader() if callable(dataloader) else dataloader From fc55f6ed001219be00577aaaba01e9254f142349 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 27 Mar 2024 13:39:32 +0000 Subject: [PATCH 65/73] Fixed redundant logging --- src/nanotron/data/blended_nanoset.py | 2 +- src/nanotron/data/indexed_dataset.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index 9bd043ac..0790e69e 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -75,7 +75,7 @@ def __init__( _ = self[self.size] raise RuntimeError("BlendedNanoset size is improperly bounded") except IndexError: - log_rank(f"> BlendedNanoset length: {len(self)}", logger=logger, level=logging.INFO, rank=0) + pass def __len__(self) -> int: return self.size diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py index ee7c4a69..bb0de613 100644 --- a/src/nanotron/data/indexed_dataset.py +++ b/src/nanotron/data/indexed_dataset.py @@ -261,7 +261,6 @@ def __init__(self, idx_path: str) -> None: assert self.sequence_lengths.shape[0] == self.sequence_count assert self.sequence_lengths.shape[0] == self.document_indices[-1] - log_rank(f"> Total number of sequences: {len(self)}", logger=logger, level=logging.INFO, rank=0) log_rank( f"> Total number of documents: {self.document_indices.shape[0] - 1}", logger=logger, From acbc3658c6b31c06395b630e69b9e783eba13b8a Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Thu, 28 Mar 2024 14:23:15 +0000 Subject: [PATCH 66/73] Add Nanoset/BlendedNanoset recovery from failure test --- tests/helpers/data.py | 57 ++++++++++++++++++++++---- tests/test_build_nanoset_dataloader.py | 24 +++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index fda5059c..079d9eac 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -116,7 +116,20 @@ def get_max_value_by_group(dataset_idx: np.array, dataset_sample_idx: np.array, return max_val +def compute_hash(identifier: OrderedDict, n_digit: int = 8) -> int: + """ + Creates a sha256 hash from the elements of a OrderedDict + """ + unique_description = json.dumps(identifier, indent=4) + # Create n_digit description hash + unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**n_digit + return unique_description_hash + + def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): + """ + Checks that the same Nanoset is created in all processes + """ # Extract a sample from the Nanoset IDX_SAMPLE = 23 @@ -129,9 +142,7 @@ def assert_nanoset(nanoset: Nanoset, parallel_context: 
ParallelContext): nanoset_identifiers["input_ids"] = nanoset[IDX_SAMPLE]["input_ids"].tolist() nanoset_identifiers["indices"] = nanoset.indexed_indices.tolist() - unique_description = json.dumps(nanoset_identifiers, indent=4) - # Create 8 digit description hash - unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**8 + unique_description_hash = compute_hash(nanoset_identifiers) assert_tensor_synced_across_pg( tensor=torch.tensor(unique_description_hash, device="cuda"), pg=parallel_context.world_pg, @@ -140,7 +151,10 @@ def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): def assert_blendednanoset(blendednanoset: BlendedNanoset, parallel_context: ParallelContext): - # Extract a sample from the Nanoset + """ + Checks that the same BlendedNanoset is created in all processes + """ + # Extract a sample from the BlendedNanoset IDX_SAMPLE = 23 blendednanoset_identifiers = OrderedDict() @@ -152,11 +166,40 @@ def assert_blendednanoset(blendednanoset: BlendedNanoset, parallel_context: Para blendednanoset_identifiers["dataset_index"] = blendednanoset.dataset_index.tolist() blendednanoset_identifiers["dataset_sample_index"] = blendednanoset.dataset_sample_index.tolist() - unique_description = json.dumps(blendednanoset_identifiers, indent=4) - # Create 8 digit description hash - unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**8 + unique_description_hash = compute_hash(blendednanoset_identifiers) assert_tensor_synced_across_pg( tensor=torch.tensor(unique_description_hash, device="cuda"), pg=parallel_context.world_pg, msg=lambda err: f"BlendedNanoset is not synchronized across all processes {err}", ) + + +def compute_batch_hash(batch: dict) -> int: + """ + Checks that the Nanoset/BlendedNanoset is in the same state after recovering from a crash + + batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask + + """ + batch_identifiers = OrderedDict() + + for element in batch: + tensor = batch[element] + + # TensorPointer + if isinstance(tensor, TensorPointer): + identifier = tensor.group_rank + + # Attention Masks case: dtype is torch.bool --> Transform to int64 + elif tensor.dtype == torch.bool: + identifier = tensor.long().tolist() + + # Input IDs tensor + else: + identifier = tensor.tolist() + + batch_identifiers[element] = identifier + + unique_description_hash = compute_hash(batch_identifiers) + + return unique_description_hash diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 1e4ff9ed..c9753d8d 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -4,6 +4,7 @@ assert_batch_dataloader, assert_blendednanoset, assert_nanoset, + compute_batch_hash, create_dataset_paths, create_dummy_json_dataset, get_max_value_by_group, @@ -134,4 +135,27 @@ def _test_build_nanoset_dataloader( sequence_length=sequence_length, ) + # Recover from failures + SKIPED_BATCHES = 20 + dataloader = iter(dataloader) + for _ in range(SKIPED_BATCHES + 1): # In order to compare with the first batch of the recovered DataLoader + batch = next(dataloader) + + recovered_dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=sequence_length, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + dataloader_drop_last=True, + # NOTE The dataloader 
serves batches of micro_batch_size despite of batch_accumulation_per_replica + consumed_train_samples=SKIPED_BATCHES * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(), + ) + + recovered_first_batch = next(iter(recovered_dataloader)) + + assert compute_batch_hash(batch) == compute_batch_hash(recovered_first_batch) + parallel_context.destroy() From e3190be3406cb19bf9d4e5129bebe68d59b030c4 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 29 Mar 2024 01:43:46 +0000 Subject: [PATCH 67/73] Refractored MMapIndexedDataset --- docs/nanoset.md | 106 ++---- pyproject.toml | 3 +- run_train_nanoset.py | 2 +- src/nanotron/data/LICENSE | 59 ---- src/nanotron/data/blended_nanoset.py | 35 +- src/nanotron/data/dataset_builder.py | 81 +++-- src/nanotron/data/indexed_dataset.py | 452 ++----------------------- src/nanotron/data/nanoset.py | 270 +++------------ src/nanotron/data/nanoset_configs.py | 3 +- src/nanotron/data/utils.py | 14 +- src/nanotron/logging.py | 15 +- tests/helpers/data.py | 14 +- tests/test_build_nanoset_dataloader.py | 12 +- tools/hf_datasets_to_json.py | 44 --- tools/merge_datasets.py | 76 ----- tools/preprocess_data.py | 247 ++++---------- 16 files changed, 255 insertions(+), 1178 deletions(-) delete mode 100644 src/nanotron/data/LICENSE delete mode 100644 tools/hf_datasets_to_json.py delete mode 100644 tools/merge_datasets.py diff --git a/docs/nanoset.md b/docs/nanoset.md index 65a214a5..2b375cac 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -2,33 +2,29 @@ ## Data pre-processing -Nanotron incorporates [`Nanosets`](../src/nanotron/data/nanoset.py), a streamlined version of Megatron-LM datasets. It also includes [`BlendedNanosets`](../src/nanotron/data/blended_nanoset.py), to be able to combine different Nanosets. +Nanotron incorporates [`Nanosets`](../src/nanotron/data/nanoset.py), a kind of datasets based on numpy memory-mapped arrays. It also includes [`BlendedNanosets`](../src/nanotron/data/blended_nanoset.py), to be able to combine different `Nanosets`. -The training data requires preprocessing. Just like Megatron, first, place your training data in a loose json format, with one json containing a text sample per line. For example: + +To use these datasets, first, we need to preprocess the data. The input format can either be a column of a Hugging Face Dataset or a .json file containing a text sample per line. For example:
 {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
 {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
 
-The loose json is then processed into a binary format for training using the [`tools/preprocess_data.py`](../tools/preprocess_data.py) script. Below we show an example for processing a corpus with the Llama2 tokenizer. +The dataset is then processed into a mmap format for training using the [`tools/preprocess_data.py`](../tools/preprocess_data.py) script. Below we show an example for processing a corpus with the Llama2 tokenizer.
 python tools/preprocess_data.py \
        --input data/my_corpus.json \
        --output-prefix data/processed-datasets/my-llama2-dataset \
        --pretrained-model-name-or-path models/Llama-2-7b \
-       --workers 128 \
-       --partitions 8
+       --workers 128
 
In `--pretrained-model-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`. -The output will be two files named, in this case, `my-llama2-dataset_text.bin` and `my-llama2-dataset_text.idx`. The `data_path` specified later in training is the full path and new filename, but **without** the file extension. - -We also include the following scripts: -- [`hf_datasets_to_json.py`](../tools/hf_datasets_to_json.py): A tool to transform datasets from the Hugging Face Hub (or local) to the .json format required by `preprocess_data.py`. -- [`merge_datasets.py`](../tools/merge_datasets.py): A tool to merge preprocessed datasets (_.bin_ and _.idx_ files) into a single file. +The output will be one file named, in this case, `my-llama2-dataset_input_ids.npy`. We will then have to specify this file in the `data_path` field in the config file. ## Working with Nanosets @@ -36,20 +32,20 @@ To work with Nanosets, we need to configure 3 arguments: 1. `split`: The distribution we want to divide our dataset into among train, valid, and test split. 2. `path_to_cache`: Directory used to store certain metadata of Nanosets to reuse them between different runs. 3. `data_path`: This argument specifies the file/files that will compose the Nanoset. There are 2 ways to specify it: - 1. If we only indicate _one_ path (path to the files generated by preprocess_data.py WITHOUT the extension), we will create a `Nanoset`. + 1. If we specify a single path, we will create a `Nanoset`. ```yaml data_stages: - name: General purpose training (Nanoset) start_training_step: 1 data: dataset: - data_path: nanoset/SlimPajama-6B_text + data_path: nanosets/SlimPajama-6B_input_ids.npy split: 949,50,1 path_to_cache: .nanoset_cache num_loading_workers: 0 seed: 1234 ``` - 2. With a dictionary, we can create a `BlendedNanoset` where the keys are the paths to the dataset (path to the files generated by preprocess_data.py WITHOUT the extension) and the values are the weight for each dataset. + 2. With a dictionary, we can create a `BlendedNanoset` where the keys are the paths to the dataset files and the values are the weights for each dataset. ```yaml data_stages: - name: General purpose training (BlendedNanoset) @@ -57,8 +53,8 @@ To work with Nanosets, we need to configure 3 arguments: data: dataset: data_path: - nanoset/SlimPajama-6B_text: 0.8 - nanoset/europarl_text: 0.2 + nanoset/SlimPajama-6B_input_ids.npy: 0.8 + nanoset/europarl_input_ids.npy: 0.2 split: 949,50,1 path_to_cache: .nanoset_cache num_loading_workers: 0 @@ -73,68 +69,38 @@ torchrun --nproc-per-node 8 run_train_nanoset.py --config configs/nanoset_llama2 ## Under the hood ### Number of samples -When using Nanosets, we specify the `data_path` to the preprocessed dataset and a `split`. This `split` value will be used to divide the total number of documents into train, valid, and test sets. +When using Nanosets, we specify the `data_path` to the preprocessed dataset and a `split`. This `split` value will be used to divide the total number of tokens into train, valid, and test sets. The number of samples will be the `number of tokens in split / sequence_lenght`. -For the train split, the number of samples consumed from the Nanoset will be determined by the `number of train steps * global batch size`, so if this number is higher than the number of samples created with the documents assigned to the train split, we will see the dataset samples more than once (> 1 epoch). 
In the case of the valid and test split, we will see all the samples only once. +For the train split, the number of samples consumed from the Nanoset will be determined by the `number of train steps * global batch size`, so if this number is higher than the number of samples in the train split, we will see the dataset samples more than once (> 1 epoch). In the case of the valid and test split, we will see all the samples only once. -In the case of the `BlendedNanoset`, we will also indicate the weight of each dataset to construct data batches according to the specified proportion. In this case, the train split will respect this proportion, considering that the number of samples will be calculated in the same way as in the Nanosets, so it may happen that we consume one dataset for 3 epochs and another larger dataset for only one epoch. For the valid and test splits, the same as in the Nanosets will occur; we will consume all the samples only once. +In the case of the `BlendedNanoset`, we will also indicate the weight of each dataset to construct data batches according to the specified proportion. In this case, the train split will respect this proportion, considering that the number of samples will be computed in the same way as in the `Nanosets`, so it may happen that we consume one dataset for 3 epochs and another larger dataset for only one epoch. For the valid and test splits, the same as in the `Nanosets` will occur; we will consume all the samples only once. ### Nanoset A `Nanoset` is paremeterized by the following variables: - The underlying `MMapIndexedDataset` instance (`indexed_dataset`) - The sequence length `S` -- The split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing) +- The split indices `indexed_indices` (the congituous subset of sample indices used for training, validation, and testing) - The total number of samples `N` of the Nanoset that we will consume during training. In the case of the valid and test splits, we will only consume the dataset once - The random seed `R` -The `Nanoset` creates three index mappings to facilitate lookup: (1) The `document_index`, (2) the `sample_index`, and (3) the `shuffle_index`. - -1. The document index (`document_index`) is a 1-D array array in which the indices _i_ contain the indices of the documents (**each sample from the JSON file**) that belong to the dataset. This array is randomly shuffled with seed `R`. -``` -Given: -len(indexed_dataset) => 40 - -indexed_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] - -Then, for example: - -document_index = [14, 8, 17, 0, 3, 15, 7, 19, 11, 12, 9, 2, 10, 1, 4, 16, 13, 5, 18, 6] -``` -2. The sample index (`sample_index`) is a 2-D array mapping from _j_ to pairs of (_i_, document_index[_i_] offset) of shape `[num_samples + 1, 2]`, where `num_samples` is `(tokens_per_epoch - 1) // S`. The rows _j_ and _j + 1_ serve as the left and right bounds for the j-th sample, so that each sample will contain `S` tokens. -``` -Given: - -S = 1024 +The `Nanoset` creates a single index (`shuffle_index`) to map the indices of the Nanoset (0, 1, 2, ... `N`) to the indices of the `MMapIndexedDataset` for the specific split (`indexed_indices`). 
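As a rough illustration of the single shuffle index described here (not the actual `Nanoset.build_shuffle_indices` implementation, which additionally caches the result under `path_to_cache`), the construction can be sketched with plain NumPy. The helper name and arguments below are illustrative only; the `n_concatenations` formula follows the description given in this section, and for the valid and test splits no concatenation is needed, so the length stays `len(indexed_indices)`.

```python
import numpy as np

def build_shuffle_index_sketch(indexed_indices: np.ndarray, num_samples: int, seed: int) -> np.ndarray:
    # Shuffle the split's sample indices once with the dataset seed R
    rng = np.random.RandomState(seed)
    shuffled = np.copy(indexed_indices)
    rng.shuffle(shuffled)
    # Concatenate enough copies so that len(shuffle_index) > num_samples (train split)
    n_concatenations = (num_samples // len(shuffled)) + 1
    return np.tile(shuffled, n_concatenations)

# 20 split indices and N = 70 requested samples -> 4 concatenations, length 80 > 70
shuffle_index = build_shuffle_index_sketch(np.arange(20), num_samples=70, seed=1234)
assert len(shuffle_index) == 80
```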
-Then, for example: - -sample_index[0] = (0, 0) -sample_index[1] = (0, 1024) => document_index[0] (Document 14) has length greater than S -sample_index[2] = (1, 512) => document_index[0] has length 1536 -sample_index[3] = (2, 100) => document_index[1] (Document 8) has length 1436 (In this sample we get 924 from document_index[1] + 100 from document_index[2]) -sample_index[4] = (3, 0) => document_index[2] has length 1124 -sample_index[5] = (5, 300) => document_index[3:5] have a total length of 724 (S - 300) -sample_index[6] = (6, 24) => document_index[5] has length 1300 -... -sample_index[15] = (18, 512) -sample_index[16] = (19, 1000) => From document_index[19] we don't have S tokens, so this sample is discarded - -len(sample_index) => 17, but only 16 useful samples -``` -3. In the train split, the shuffle index (`shuffle_index`) is a 1-D array mapping from _k_ to _j_ of length `n_concatenations * (len(sample_index) - 1)`, where `n_concatenations` is defined as `(N / (len(sample_index) - 1)) + 1`, so that `len(shuffle_index)` is always greater than `N`. While for the valid and test splits, `len(shuffle_index) == len(sample_index) - 1`. Before concatenating the full array, `shuffle_index` is shuffled according to `R`. +In the train split, the shuffle index (`shuffle_index`) is a 1-D array mapping from _k_ to _j_ of length `n_concatenations * len(indexed_indices)`, where `n_concatenations` is defined as `(N / len(indexed_indices)) + 1`, so that `len(shuffle_index)` is always greater than `N`. While for the valid and test splits, `len(shuffle_index) == len(indexed_indices)`. Before concatenating the full array, `shuffle_index` is shuffled according to `R`. ``` Given: N = 70 +indexed_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + Then, for example: -shuffle_index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] +shuffle_index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] -Shuffle the indices -> shuffle_index = [9, 3, 15, 12, 5, 10, 1, 4, 8, 11, 13, 7, 2, 14, 6, 0] +Shuffle the indices -> shuffle_index = [19, 9, 3, 18, 15, 12, 5, 10, 17, 1, 4, 8, 11, 16, 13, 7, 2, 14, 6, 0] -n_concatenations = (70/(17 - 1)) + 1 = 5 -shuffle_index = shuffle_index concatenated 5 times +n_concatenations = (70/(20)) + 1 = 4 +shuffle_index = shuffle_index concatenated 4 times len(shuffle_index) = 80 > N ``` @@ -144,30 +110,24 @@ To query the `Nanoset` for the k-th sample we do the following: ``` j = shuffle_index[k] ``` -2. Use the `sample_index` to get the left and right sample-bounding indices into the `document_index` and the starting token offset for each document -``` -i, offset = sample_index[j] -i_next, offset_next = sample_index[j + 1] +2. To retrieve `S + 1` tokens from the `indexed_dataset` we have to specify the `offset` (`idx * sequence length` for CausalLM) and the length (the number of tokens to extract) ``` -3. Use the `document_index` to retrieve `S + 1` tokens from consecutive (in the `document_index`) documents -``` -sample = [] -sample += indexed_dataset[document_index[i]][offset:] -if i != i_next: - sample += indexed_dataset[document_index[i + 1:i_next]] -sample += indexed_dataset[document_index[i_next]][:offset_next] +offset = j * sequence_length +sample = indexed_dataset[offset:offset + sequence_length + 1] ``` -Despite having repeated indices in the `shuffle_index`, throughout 1 epoch, we will only observe each sample once. 
We achieve this by deactivating shuffling in the `DistributedDataSampler`, so that the indices of the `shuffle_index` are consumed in the order they appear by the multiple processes. It is worth noting that the `document_index` already shuffles the data so that the samples constructed in the `sample_index` do not contain data from consecutive documents, and we shuffle the data again in the `shuffle_index`. +Despite having repeated indices in the `shuffle_index`, throughout 1 epoch, we will only observe each sample once. We achieve this by deactivating shuffling in the `DistributedDataSampler`, so that the indices of the `shuffle_index` are consumed in the order they appear by the multiple processes. It is worth noting that the samples are already shuffled in the `shuffle_index`. ``` Given: 4 Processes loading data -(P1) idx_list = [0, 4, 8, 12, 16, ...] -> shuffle_index[idx_list] = [9, 5, 8, 2, 9, ...] -(P2) idx_list = [1, 5, 9, 13, 17, ...] -> shuffle_index[idx_list] = [3, 10, 11, 14, 3, ...] -(P3) idx_list = [2, 6, 10, 14, 18, ...] -> shuffle_index[idx_list] = [15, 1, 13, 6, 15, ...] -(P4) idx_list = [3, 7, 11, 15, 19, ...] -> shuffle_index[idx_list] = [12, 4, 7, 0, 12, ...] +[19, 9, 3, 18, 15, 12, 5, 10, 17, 1, 4, 8, 11, 16, 13, 7, 2, 14, 6, 0] + +(P1) idx_list = [0, 4, 8, 12, 16, ...] -> shuffle_index[idx_list] = [19, 15, 17, 11, 2, 19, ...] +(P2) idx_list = [1, 5, 9, 13, 17, ...] -> shuffle_index[idx_list] = [9, 12, 1, 16, 14, 9, ...] +(P3) idx_list = [2, 6, 10, 14, 18, ...] -> shuffle_index[idx_list] = [3, 5, 4, 13, 6, 3, ...] +(P4) idx_list = [3, 7, 11, 15, 19, ...] -> shuffle_index[idx_list] = [18, 10, 8, 7, 0, 18, ...] ``` ### BlendedNanoset The `BlendedNanoset` is parameterized by the following variables: diff --git a/pyproject.toml b/pyproject.toml index df3dcf5f..532c0c42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,8 @@ dependencies = [ "safetensors", "dacite", "tqdm", - "transformers" + "transformers", + "datasets" ] [tool.setuptools.packages.find] diff --git a/run_train_nanoset.py b/run_train_nanoset.py index c689ec40..214fe34d 100644 --- a/run_train_nanoset.py +++ b/run_train_nanoset.py @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) -def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): +def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs) -> DataLoader: """Returns train, valid and test dataloaders""" # First, we need to know which ranks to feed the dataloader to diff --git a/src/nanotron/data/LICENSE b/src/nanotron/data/LICENSE deleted file mode 100644 index a5a0a738..00000000 --- a/src/nanotron/data/LICENSE +++ /dev/null @@ -1,59 +0,0 @@ -This repository also contains code from Facebook (from their Fairseq project) -and NVIDIA (from their Megatron project). Files from these organizations -have notices at the top of each file. Below are licenses used in those files, as indicated. - -------------- LICENSE FOR Facebook Fairseq code -------------- - -MIT License - -Copyright (c) Facebook, Inc. and its affiliates. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -------------- LICENSE FOR NVIDIA Megatron code -------------- - -The following applies to all files unless otherwise noted: - -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- --- diff --git a/src/nanotron/data/blended_nanoset.py b/src/nanotron/data/blended_nanoset.py index 0790e69e..3f309e9e 100644 --- a/src/nanotron/data/blended_nanoset.py +++ b/src/nanotron/data/blended_nanoset.py @@ -17,7 +17,8 @@ class BlendedNanoset(torch.utils.data.Dataset): - """Conjugating class for a set of Nanoset instances + """ + Conjugating class for a set of Nanoset instances Args: datasets (List[Nanoset]): The Nanoset instances to blend @@ -27,9 +28,6 @@ class BlendedNanoset(torch.utils.data.Dataset): size (int): The number of samples to draw from the blend config (NanosetConfig): The config object which informs dataset creation - - Raises: - RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization """ def __init__( @@ -43,13 +41,10 @@ def __init__( assert numpy.isclose(sum(weights), 1.0) assert all((isinstance(dataset, Nanoset) for dataset in datasets)) - # Alert user to unnecessary blending - if len(datasets) == 1: - log_rank("Building a BlendedNanoset for a single Nanoset", logger=logger, level=logging.WARNING, rank=0) - self.datasets = datasets self.weights = weights self.config = config + # For the train split, we will have global batch size * train steps samples # For the valid and test splits, we will consume entirely both datasets self.dataset_sizes = [len(dataset) for dataset in self.datasets] @@ -87,7 +82,8 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: return self.datasets[dataset_id][dataset_sample_id] def build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: - """Build and optionally cache the dataset index and the dataset sample index + """ + Build and optionally cache the dataset index and the dataset sample index The dataset index is a 1-D mapping which determines the dataset to query. 
The dataset sample index is a 1-D mapping which determines the sample to request from the queried @@ -103,25 +99,16 @@ def build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: def get_path_to(suffix): return os.path.join(path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}") - path_to_description = get_path_to("description.txt") path_to_dataset_index = get_path_to("dataset_index.npy") path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy") - cache_hit = all( - map( - os.path.isfile, - [path_to_description, path_to_dataset_index, path_to_dataset_sample_index], - ) - ) + cache_hit = all(map(os.path.isfile, [path_to_dataset_index, path_to_dataset_sample_index])) + else: cache_hit = False if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0): - log_rank(f"Build and save the {type(self).__name__} indices", logger=logger, level=logging.INFO, rank=0) - # Build the dataset and dataset sample indexes - log_rank( - "\tBuild and save the dataset and dataset sample indexes", logger=logger, level=logging.INFO, rank=0 - ) + log_rank(f"Build and save the {type(self).__name__} indices", logger=logger, level=logging.INFO, rank=0) t_beg = time.time() @@ -131,9 +118,7 @@ def get_path_to(suffix): if path_to_cache: os.makedirs(path_to_cache, exist_ok=True) - # Write the description - with open(path_to_description, "wt") as writer: - writer.write(self.unique_description) + # Save the indexes numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True) numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True) @@ -175,7 +160,7 @@ def get_path_to(suffix): def build_blending_indices( n_samples: int, weights: numpy.ndarray, dataset_sizes: List ) -> Tuple[numpy.ndarray, numpy.ndarray]: - """Given multiple datasets and a weighting array, build samples + """Given multiple datasets and a weighting array, build samples indexes such that it follows those weights.""" # Create empty arrays for dataset indices and dataset sample indices dataset_index = numpy.empty((n_samples,), dtype="uint") diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index 2f440abe..b4113845 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -30,7 +30,8 @@ def __init__( self.config = config def build(self) -> List[Union[BlendedNanoset, Nanoset]]: - """Build all dataset splits according to the provided data_path(s) + """ + Build all dataset splits according to the provided data_path(s) This method is distributed-aware and must be called on all ranks. 
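The body of `build_blending_indices` is not shown in this hunk, but the `dataset_index` / `dataset_sample_index` mapping described above can be approximated with a simple greedy scheme that always serves the dataset lagging furthest behind its target weight. The function below is only a sketch under that assumption (the real helper also receives `dataset_sizes`), not the implementation in `blended_nanoset.py`.

```python
import numpy as np

def build_blending_indices_sketch(n_samples: int, weights: np.ndarray):
    dataset_index = np.empty(n_samples, dtype=np.int64)         # which Nanoset to query
    dataset_sample_index = np.empty(n_samples, dtype=np.int64)  # which sample of that Nanoset
    drawn = np.zeros(len(weights), dtype=np.int64)              # samples drawn so far per dataset
    for i in range(n_samples):
        # Pick the dataset that lags its target share the most
        errors = weights * (i + 1) - drawn
        chosen = int(np.argmax(errors))
        dataset_index[i] = chosen
        dataset_sample_index[i] = drawn[chosen]
        drawn[chosen] += 1
    return dataset_index, dataset_sample_index

# An 80/20 blend over 10 samples draws 8 samples from dataset 0 and 2 from dataset 1
dataset_index, dataset_sample_index = build_blending_indices_sketch(10, np.array([0.8, 0.2]))
```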
@@ -47,7 +48,9 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: # Single Nanoset if isinstance(data_path, str): - return self.build_nanoset_dataset_splits(data_path, split, self.config.split_num_samples) + return self.build_nanoset_dataset_splits( + data_path, self.config.sequence_length, split, self.config.split_num_samples + ) # Blended Nanoset prefix_per_dataset = list(data_path.keys()) @@ -64,7 +67,7 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: for i in range(len(prefix_per_dataset)): nanoset_datasets_split = self.build_nanoset_dataset_splits( - prefix_per_dataset[i], split, sizes_per_dataset[i] + prefix_per_dataset[i], self.config.sequence_length, split, sizes_per_dataset[i] ) for j in range(len(nanoset_datasets_split)): @@ -97,14 +100,18 @@ def build(self) -> List[Union[BlendedNanoset, Nanoset]]: def build_nanoset_dataset_splits( self, path_prefix: str, + sequence_length: int, split: List[float], sizes: List[int], ) -> List[Nanoset]: - """Build each Nanoset split from a single MMapIndexedDataset + """ + Build each Nanoset split from a single MMapIndexedDataset Args: path_prefix (str): The MMapIndexedDataset .bin and .idx file prefix + sequence_length (int): The number of tokens MMapIndexedDataset has to extract for each sample + split (List[float]): The dataset split ratios (must sum to 1.00) sizes (List[int]): The number of total samples to draw from each split @@ -116,7 +123,7 @@ def build_nanoset_dataset_splits( indexed_dataset = self.build_generic_dataset(MMapIndexedDataset, path_prefix) - split_idx_bounds = get_split_indices(split, indexed_dataset.sequence_lengths.shape[0]) + split_idx_bounds = get_split_indices(split, len(indexed_dataset), sequence_length) split_indices = [ numpy.arange( @@ -129,13 +136,13 @@ def build_nanoset_dataset_splits( ] nanoset_datasets = [] - for i, _split in enumerate(Split): + for i, split_name in enumerate(Split): if split[i] == 0.0: nanoset_datasets.append(None) else: nanoset_datasets.append( self.build_generic_dataset( - Nanoset, indexed_dataset, split_indices[i], sizes[i], _split, self.config + Nanoset, indexed_dataset, split_indices[i], sizes[i], split_name, self.config ) ) @@ -146,7 +153,8 @@ def build_generic_dataset( cls: Type[DistributedDataset], *args: Any, ) -> DistributedDataset: - """Build the DistributedDataset + """ + Build the DistributedDataset Args: cls (Type[DistributedDataset]): The DistributedDataset class to be built @@ -154,59 +162,48 @@ def build_generic_dataset( args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class - Raises: - Exception: When the dataset constructor raises an OSError - Returns: DistributedDataset: The DistributedDataset instantion """ - if torch.distributed.is_initialized(): - rank = torch.distributed.get_rank() - - dataset = None - - # First, build on rank 0 - if rank == 0: - try: - dataset = cls(*args) - except OSError as err: - log = ( - "Failed to write dataset materials to the data cache directory. " - + "Please supply a directory to which you have write access via " - + "the path_to_cache attribute in NanosetConfig and " - + "retry. Refer to the preserved traceback above for more information." 
- ) - raise Exception(log) from err - torch.distributed.barrier() + rank = torch.distributed.get_rank() + dataset = None - if rank != 0: - dataset = cls(*args) + # First, build on rank 0 + if rank == 0: + dataset = cls(*args) - return dataset + torch.distributed.barrier() - return cls(*args) + # Then, in the other ranks + if rank != 0: + dataset = cls(*args) + return dataset -def get_split_indices(split: List[float], num_elements: int) -> List[int]: - """Determine the document index bounds per split + +def get_split_indices(split: List[float], number_of_tokens: int, sequence_length: int) -> List[int]: + """ + Determine the sample index bounds per split. The division is done at the Token level Args: split (List[float]): The dataset split ratios (must sum to 1.00) - num_elements (int): The number of elements, e.g. sequences or documents, available for - the split + number_of_tokens (int): The number of tokens available in the MMapIndexedDataset + + sequence_length (int): The number of tokens to extract from MMapIndexedDataset for each sample Returns: - List[int]: The indices for all three splits e.g. [0, 900, 990, 1000] for a 1000-document - set and a [90.0, 9.0, 1.0] split + List[int]: The indices for all three splits e.g. [0, 800, 900, 1000] for a 1024000 token dataset, + 1024 sequence length and a [8, 1, 1] split """ + number_of_samples = int(number_of_tokens / sequence_length) split_indices = [0] for split_pct in split: - split_indices.append(split_indices[-1] + int(round(split_pct * float(num_elements)))) - split_indices[1:] = [_ - (split_indices[-1] - num_elements) for _ in split_indices[1:]] + split_indices.append(split_indices[-1] + int(round(split_pct * float(number_of_samples)))) + split_indices[1:] = [_ - (split_indices[-1] - number_of_samples) for _ in split_indices[1:]] assert len(split_indices) == len(split) + 1 - assert split_indices[-1] == num_elements + assert split_indices[-1] == number_of_samples return split_indices diff --git a/src/nanotron/data/indexed_dataset.py b/src/nanotron/data/indexed_dataset.py index bb0de613..625f8673 100644 --- a/src/nanotron/data/indexed_dataset.py +++ b/src/nanotron/data/indexed_dataset.py @@ -1,458 +1,64 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import shutil -import struct -import time -from enum import Enum -from functools import lru_cache -from types import TracebackType -from typing import List, Optional, Tuple, Type, Union - import numpy import torch from nanotron import logging -from nanotron.logging import log_rank logger = logging.get_logger(__name__) -_INDEX_HEADER = b"MMIDIDX\x00\x00" - - -class DType(Enum): - """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices""" - - uint8 = 1 - int8 = 2 - int16 = 3 - int32 = 4 - int64 = 5 - float64 = 6 - float32 = 7 - uint16 = 8 - - @classmethod - def code_from_dtype(cls, value: Type[numpy.number]) -> int: - """Get the code from the dtype - - Args: - value (Type[numpy.number]): The dtype - - Returns: - int: The code - """ - return cls[value.__name__].value - - @classmethod - def dtype_from_code(cls, value: int) -> Type[numpy.number]: - """Get the dtype from the code - - Args: - value (int): The code - - Returns: - Type[numpy.number]: The dtype - """ - return getattr(numpy, cls(value).name) - - @staticmethod - def size(key: Union[int, Type[numpy.number]]) -> int: - """Get the size of the dtype/code in bytes - - Args: - key (Union[int, Type[numpy.number]]): The dtype or code - - Raises: - ValueError: If the key is neither dtype nor integer code - - Returns: - int: The size of the dtype/code in in bytes - """ - if isinstance(key, int): - return DType.dtype_from_code(key)().itemsize - elif numpy.number in key.__mro__: - return key().itemsize - else: - raise ValueError - - @staticmethod - def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: - """Get the dtype to use for an index of a certain cardinality - - Args: - cardinality (Optional[int]): The number of elements to be indexed - - Returns: - Type[numpy.number]: The dtype to use for the index - """ - if cardinality is not None and cardinality < 65500: - return numpy.uint16 - else: - return numpy.int32 - -class _IndexWriter(object): - """Object class to write the index (.idx) file +class MMapIndexedDataset(torch.utils.data.Dataset): + """The low-level interface dataset class Args: - idx_path (str): The path to the index file + path_to_mmap (str): Path to the mmap dataset - dtype (Type[numpy.number]): The dtype of the index file """ - def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: - self.idx_path = idx_path - self.dtype = dtype - - def __enter__(self) -> "_IndexWriter": - """Enter the context introduced by the 'with' keyword - - Returns: - _IndexWriter: The instance - """ - self.idx_writer = open(self.idx_path, "wb") - # fixed, vestigial practice - self.idx_writer.write(_INDEX_HEADER) - # fixed, vestigial practice - self.idx_writer.write(struct.pack(" Optional[bool]: - """Exit the context introduced by the 'with' keyword - - Args: - exc_type (Optional[Type[BaseException]]): Exception type - - exc_val (Optional[BaseException]): Exception value - - exc_tb (Optional[TracebackType]): Exception traceback object - - Returns: - Optional[bool]: Whether to silence the exception - """ - self.idx_writer.close() - - def write( - self, - sequence_lengths: List[int], - document_indices: List[int], - ) -> None: - """Write the index (.idx) file + def __init__(self, path_to_mmap: str) -> None: + super().__init__() + self.path_to_mmap = path_to_mmap - Args: - sequence_lengths (List[int]): The length of each sequence + self.bin_buffer_mmap = numpy.memmap(self.path_to_mmap, mode="r", order="C") + self.bin_buffer = memoryview(self.bin_buffer_mmap) - document_indices (List[int]): The seqyebce 
indices demarcating the end of each document + def get(self, idx: int, length: int, offset: int = None) -> numpy.ndarray: """ - sequence_pointers = self._sequence_pointers(sequence_lengths) - - # the number of sequences in the dataset - sequence_count = len(sequence_lengths) - self.idx_writer.write(struct.pack(" List[int]: - """Build the sequence pointers per the sequence lengths and dtype size + Returns length + 1 tokens from the memmap dataset + For CausalLM length == sequence_length Args: - sequence_lengths (List[int]): The length of each sequence - - Returns: - List[int]: The pointer to the beginning of each sequence + idx (int): The index into the memmap dataset + length (int): The quantity of tokens to extract from the memmap dataset + offset (int): The offset in the memmap dataset to extract tokens from """ - itemsize = DType.size(self.dtype) - curr_ptr = 0 - list_ptr = [] - for length in sequence_lengths: - list_ptr.append(curr_ptr) - curr_ptr += length * itemsize - return list_ptr - - -class _IndexReader(object): - """Object class to read the index (.idx) file - - Args: - idx_path (str): The path to the index file + if offset is None: + # dtype=uint16, 2 bytes per token + offset = idx * length * 2 - """ - - def __init__(self, idx_path: str) -> None: - - log_rank(f"Load the {type(self).__name__} from {idx_path}", logger=logger, level=logging.INFO, rank=0) - - with open(idx_path, "rb") as stream: - header = stream.read(9) - assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}" - - version = struct.unpack(" Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - log_rank("\tExtract the sequence pointers", logger=logger, level=logging.INFO, rank=0) - t_beg = time.time() - self.sequence_pointers = numpy.frombuffer( - self.bin_buffer, - dtype=numpy.int64, - count=self.sequence_count, - offset=offset + self.sequence_lengths.nbytes, - ) - t_end = time.time() - log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - log_rank("\tExtract the document indices", logger=logger, level=logging.INFO, rank=0) - t_beg = time.time() - self.document_indices = numpy.frombuffer( - self.bin_buffer, - dtype=numpy.int64, - count=self.document_count, - offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes, - ) - t_end = time.time() - log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - assert self.sequence_lengths.shape[0] == len(self) - assert self.sequence_lengths.shape[0] == self.sequence_count - assert self.sequence_lengths.shape[0] == self.document_indices[-1] - - log_rank( - f"> Total number of documents: {self.document_indices.shape[0] - 1}", - logger=logger, - level=logging.INFO, - rank=0, - ) - - def __del__(self) -> None: - """Clean up the object""" - self.bin_buffer_mmap._mmap.close() - del self.bin_buffer_mmap - - def __len__(self) -> int: - """Return the length of the dataset + sequence = numpy.frombuffer(self.bin_buffer, dtype=numpy.uint16, count=length + 1, offset=offset) + return sequence - Returns: - int: The length of the dataset + def __getitem__(self, idx: int) -> numpy.ndarray: """ - return self.sequence_count - - @lru_cache(maxsize=8) - def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: - """Return the pointer, length, and mode at the index + Returns 1 token from the memmap dataset. 
Check get method Args: - idx (int): The index into the dataset - - Returns: - Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The pointer, length and mode at - the index + idx (int): The index of the token """ - return ( - self.sequence_pointers[idx], - self.sequence_lengths[idx], - ) - - -class MMapIndexedDataset(torch.utils.data.Dataset): - """The low-level interface dataset class - - Args: - path_prefix (str): The index (.idx) and data (.bin) prefix - - """ - - def __init__(self, path_prefix: str) -> None: - super().__init__() - self.path_prefix = path_prefix - - self.index = _IndexReader(get_idx_path(self.path_prefix)) - self.bin_buffer_mmap = numpy.memmap(get_bin_path(self.path_prefix), mode="r", order="C") - self.bin_buffer = memoryview(self.bin_buffer_mmap) + # dtype=uint16, 2 bytes per token + offset = idx * 2 + token = numpy.frombuffer(self.bin_buffer, dtype=numpy.uint16, count=1, offset=offset) + return token def __del__(self) -> None: """Clean up the object""" if self.bin_buffer_mmap is not None: self.bin_buffer_mmap._mmap.close() del self.bin_buffer_mmap - del self.index def __len__(self) -> int: - """Return the length of the dataset i.e. the number of sequences in the index - - Returns: - int: The length of the dataset - """ - return len(self.index) - - def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: - """Retrieve a single item from the dataset with the option to only - return a portion of the item. - - get(idx) is the same as [idx] but get() does not support slicing. - """ - - sequence_pointer, sequence_length = self.index[idx] - if length is None: - length = sequence_length - offset - sequence_pointer += offset * DType.size(self.index.dtype) - sequence = numpy.frombuffer(self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer) - return sequence - - @property - def sequence_lengths(self) -> numpy.ndarray: - """Get the sequence lengths - - Returns: - numpy.ndarray: The sequence lengths - """ - return self.index.sequence_lengths - - @property - def document_indices(self) -> numpy.ndarray: - """Get the document indices - - Returns: - numpy.ndarray: The document indices - """ - return self.index.document_indices - - -class MMapIndexedDatasetBuilder(object): - """Builder class for the MMapIndexedDataset class - - Args: - bin_path (str): The path to the data (.bin) file - - dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32. 
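To make the byte arithmetic of the refactored `MMapIndexedDataset.get` concrete, the same lookup can be written as a few lines of NumPy mirroring the added code above. The file name is purely illustrative; only the offset/count computation is the point here.

```python
import numpy as np

SEQUENCE_LENGTH = 1024
TOKEN_BYTES = 2  # tokens are stored as uint16, i.e. 2 bytes per token

# Hypothetical preprocessed token file produced by tools/preprocess_data.py
buffer = memoryview(np.memmap("my-llama2-dataset_input_ids.npy", mode="r", order="C"))

def get_sample(idx: int) -> np.ndarray:
    # Sample idx starts at idx * sequence_length tokens, i.e. idx * sequence_length * 2 bytes;
    # CausalLM training reads sequence_length + 1 tokens (inputs plus shifted labels)
    offset = idx * SEQUENCE_LENGTH * TOKEN_BYTES
    return np.frombuffer(buffer, dtype=np.uint16, count=SEQUENCE_LENGTH + 1, offset=offset)
```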
- """ - - def __init__(self, bin_path: str, dtype: Type[numpy.number] = numpy.int32) -> None: - self.data_file = open(bin_path, "wb") - self.dtype = dtype - - self.sequence_lengths = [] - self.document_indices = [0] - - def add_item(self, tensor: torch.Tensor) -> None: - """Add a single item to the dataset - - Args: - tensor (torch.Tensor): The item to add to the data file - """ - np_array = numpy.array(tensor.numpy(), dtype=self.dtype) - self.data_file.write(np_array.tobytes(order="C")) - self.sequence_lengths.append(np_array.size) - - def add_document(self, tensor: torch.Tensor, lengths: List[int]) -> None: - """Add an entire document to the dataset - - Args: - tensor (torch.Tensor): The document to add - lengths (List[int]): The lengths of each item in the document + Returns the number of tokens in the file """ - np_array = numpy.array(tensor, dtype=self.dtype) - self.data_file.write(np_array.tobytes(order="C")) - self.sequence_lengths.extend(lengths) - self.document_indices.append(len(self.sequence_lengths)) - - def end_document(self) -> None: - """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item""" - self.document_indices.append(len(self.sequence_lengths)) - - def add_index(self, path_prefix: str) -> None: - """Add an entire MMapIndexedDataset to the dataset - - Args: - path_prefix (str): The index (.idx) and data (.bin) prefix - """ - # Concatenate index - index = _IndexReader(get_idx_path(path_prefix)) - assert index.dtype == self.dtype - - offset = len(self.sequence_lengths) - self.sequence_lengths.extend(index.sequence_lengths) - self.document_indices.extend((offset + index.document_indices)[1:]) - - # Concatenate data - with open(get_bin_path(path_prefix), "rb") as f: - shutil.copyfileobj(f, self.data_file) - - def finalize(self, idx_path: str) -> None: - """Clean up and write the index (.idx) file - - Args: - idx_path (str): The path to the index file - """ - self.data_file.close() - with _IndexWriter(idx_path, self.dtype) as writer: - writer.write(self.sequence_lengths, self.document_indices) - - -def get_idx_path(path_prefix: str) -> str: - """Get the path to the index file from the prefix - - Args: - path_prefix (str): The prefix - - Returns: - str: The path to the index file - """ - return path_prefix + ".idx" - - -def get_bin_path(path_prefix: str) -> str: - """Get the path to the data file from the prefix - - Args: - path_prefix (str): The prefix - - Returns: - str: The path to the data file - """ - return path_prefix + ".bin" + # dtype=uint16, 2 bytes per token + return int(len(self.bin_buffer) / 2) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index c56129cf..0bec7fd6 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -3,7 +3,7 @@ import os import time from collections import OrderedDict -from typing import Dict, Tuple, Union +from typing import Dict, Union import numpy import torch @@ -25,9 +25,12 @@ class Nanoset(torch.utils.data.Dataset): indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the Nanoset - indexed_indices (numpy.ndarray): The set of the documents indices to expose + indexed_indices (numpy.ndarray): The set of indexes to sample from the MMapIndexedDataset dataset - num_samples (int): Number of samples that we will consume from the dataset. If it is None, we will consume all the samples only 1 time (valid and test splits). 
For the train split, we will introduce train steps * global batch size and compute the number of epochs based on the length of the dataset. + num_samples (int): Number of samples that we will consume from the dataset. If it is None, we will + consume all the samples only 1 time (valid and test splits). For the train split, + we will introduce train steps * global batch size and compute the number of epochs + based on the number of samples of the dataset. index_split (Split): The indexed_indices Split (train, valid, test) @@ -43,8 +46,6 @@ def __init__( config: NanosetConfig, ) -> None: - assert indexed_indices.size > 0 - self.indexed_dataset = indexed_dataset self.indexed_indices = indexed_indices self.num_samples = num_samples @@ -55,7 +56,7 @@ def __init__( self.unique_identifiers = OrderedDict() self.unique_identifiers["class"] = type(self).__name__ - self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix + self.unique_identifiers["path_to_mmap"] = self.indexed_dataset.path_to_mmap self.unique_identifiers["index_split"] = self.index_split.name self.unique_identifiers["split"] = self.config.split self.unique_identifiers["random_seed"] = self.config.random_seed @@ -64,170 +65,78 @@ def __init__( self.unique_description = json.dumps(self.unique_identifiers, indent=4) self.unique_description_hash = hashlib.md5(self.unique_description.encode("utf-8")).hexdigest() - # Load or build/cache the document, sample, and shuffle indices + self.shuffle_index = self.build_shuffle_indices() - ( - self.document_index, - self.sample_index, - self.shuffle_index, - ) = self.build_document_sample_shuffle_indices() + # Check size + _ = self[len(self) - 1] + try: + _ = self[len(self)] + raise RuntimeError("Nanoset size is improperly bounded") + except IndexError: + pass def __len__(self) -> int: """ Returns: - int: The length of the dataset + int: The number of samples of the Nanoset """ + return self.shuffle_index.shape[0] def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: - """Get the text (token ids) for a given index + """Get the input ids for a given index Args: idx (int): The index into the dataset Returns: - Dict[str, numpy.ndarray]: The token ids wrapped in a dictionary + Dict[str, numpy.ndarray]: The input ids wrapped in a dictionary """ # Do the shuffle mapping idx = self.shuffle_index[idx] + # torch can't convert np.ndarray of type numpy.uint16 + return {"input_ids": self.indexed_dataset.get(idx, self.config.sequence_length).astype(numpy.int32)} - # Get the beginning and end documents and offsets - doc_index_beg, doc_index_beg_offset = self.sample_index[idx] - doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] - - tokens = [] - - # Sample spans a single document - if doc_index_beg == doc_index_end: - - # Add the entire sample - tokens.append( - self.indexed_dataset.get( - self.document_index[doc_index_beg], - offset=doc_index_beg_offset, - length=doc_index_end_offset - doc_index_beg_offset + 1, - ) - ) - - # Sample spans multiple documents - else: - for i in range(doc_index_beg, doc_index_end + 1): - - # Add the sample part - offset = 0 if i > doc_index_beg else doc_index_beg_offset - length = None if i < doc_index_end else doc_index_end_offset + 1 - tokens.append(self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)) - - return {"input_ids": numpy.array(numpy.concatenate(tokens), dtype=numpy.int64)} - - def build_document_sample_shuffle_indices( - self, - ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: - """Build the 
document index, the sample index, and the shuffle index - - The document index: - -- 1-D - -- An ordered array of document ids - - The sample index: - -- 2-D - -- The document indices and offsets which mark the start of every sample to generate samples of sequence length length + def build_shuffle_indices(self) -> numpy.ndarray: + """ + Build the shuffle index - The shuffle index: + Shuffle index: -- 1-D - -- A random permutation of index range of the sample index + -- A random permutation of index range of the indexed indices of the memmap dataset for this split Returns: - Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the - shuffle index - + numpy.ndarray: The shuffle index """ + path_to_cache = self.config.path_to_cache if path_to_cache is None: - path_to_cache = os.path.join(self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices") + path_to_cache = os.path.join( + self.indexed_dataset.path_prefix[:-4], "cache", f"{type(self).__name__}_indices" + ) def get_path_to(suffix): return os.path.join(path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}") - path_to_description = get_path_to("description.txt") - path_to_document_index = get_path_to("document_index.npy") - path_to_sample_index = get_path_to("sample_index.npy") path_to_shuffle_index = get_path_to("shuffle_index.npy") - cache_hit = all( - map( - os.path.isfile, - [ - path_to_description, - path_to_document_index, - path_to_sample_index, - path_to_shuffle_index, - ], - ) - ) - - num_tokens_per_epoch = compute_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices) + cache_hit = os.path.isfile(path_to_shuffle_index) if not cache_hit and torch.distributed.get_rank() == 0: - log_rank( - f"Build and save the {type(self).__name__} {self.index_split.name} indices", - logger=logger, - level=logging.INFO, - rank=0, - ) - - numpy_random_state = numpy.random.RandomState(self.config.random_seed) os.makedirs(path_to_cache, exist_ok=True) - # Write the description - with open(path_to_description, "wt") as writer: - writer.write(self.unique_description) - - # Build the document index - log_rank( - f"\tBuild and save the document index to {os.path.basename(path_to_document_index)}", - logger=logger, - level=logging.INFO, - rank=0, - ) - t_beg = time.time() - document_index = numpy.copy(self.indexed_indices) - numpy_random_state.shuffle(document_index) - numpy.save(path_to_document_index, document_index, allow_pickle=True) - t_end = time.time() - log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - # Build the sample index - log_rank( - f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}", - logger=logger, - level=logging.INFO, - rank=0, - ) - t_beg = time.time() - - assert document_index.dtype == numpy.int32 - assert self.indexed_dataset.sequence_lengths.dtype == numpy.int32 - sample_index = build_sample_idx( - sizes=self.indexed_dataset.sequence_lengths, - doc_idx=document_index, - seq_length=self.config.sequence_length, - tokens_per_epoch=num_tokens_per_epoch, - ) - numpy.save(path_to_sample_index, sample_index, allow_pickle=True) - t_end = time.time() - log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) + numpy_random_state = numpy.random.RandomState(self.config.random_seed) # Build the shuffle index log_rank( - f"\tBuild and save the shuffle index to {os.path.basename(path_to_shuffle_index)}", + f"Build and save 
the shuffle index to {os.path.basename(path_to_shuffle_index)} for the {type(self).__name__} {self.index_split.name} split", logger=logger, level=logging.INFO, rank=0, ) t_beg = time.time() - shuffle_index = numpy.arange(start=0, stop=(sample_index.shape[0] - 1), step=1, dtype=numpy.uint32) + shuffle_index = self.indexed_indices.copy() numpy_random_state.shuffle(shuffle_index) if self.num_samples is not None: # For the train split, concatenate shuffle Indexes n_concatenations = ( @@ -243,36 +152,7 @@ def get_path_to(suffix): log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) log_rank( - f"Load the {type(self).__name__} {self.index_split.name} indices", - logger=logger, - level=logging.INFO, - rank=0, - ) - - log_rank( - f"\tLoad the document index from {os.path.basename(path_to_document_index)}", - logger=logger, - level=logging.INFO, - rank=0, - ) - t_beg = time.time() - document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode="r") - t_end = time.time() - log_rank(f"\t> time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - log_rank( - f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", - logger=logger, - level=logging.INFO, - rank=0, - ) - t_beg = time.time() - sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r") - t_end = time.time() - log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - - log_rank( - f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}", + f"Load the shuffle index from {os.path.basename(path_to_shuffle_index)} for the {type(self).__name__} {self.index_split.name} split", logger=logger, level=logging.INFO, rank=0, @@ -282,79 +162,11 @@ def get_path_to(suffix): t_end = time.time() log_rank(f"\t> Time elapsed: {t_end - t_beg:4f} seconds", logger=logger, level=logging.DEBUG, rank=0) - log_rank(f"> Total number of samples: {sample_index.shape[0] - 1}", logger=logger, level=logging.INFO, rank=0) + log_rank(f"> Total number of samples: {len(self.indexed_indices)}", logger=logger, level=logging.INFO, rank=0) - if ( - self.num_samples is not None - ): # Compute number of epochs we will iterate over this Nanoset. Just for training - num_epochs = round(self.num_samples / (sample_index.shape[0] - 1), 2) + if self.num_samples is not None: + # Compute number of epochs we will iterate over this Nanoset. Just for training + num_epochs = round(self.num_samples / (len(self.indexed_indices)), 2) log_rank(f"> Total number of epochs: {num_epochs}", logger=logger, level=logging.INFO, rank=0) - return document_index, sample_index, shuffle_index - - -def build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): - # Check validity of inumpyut args. - assert seq_length > 1 - assert tokens_per_epoch > 1 - - # Compute the number of samples. - num_samples = (tokens_per_epoch - 1) // seq_length - - # Allocate memory for the mapping table. - sample_idx = numpy.full([num_samples + 1, 2], fill_value=-999, dtype=numpy.int32) - - # Setup helper vars. - sample_index = 0 - doc_idx_index = 0 - doc_offset = 0 - - # Add the first entry to the mapping table. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - # Loop over the rest of the samples. - while sample_index <= num_samples: - # Start with a fresh sequence. 
- remaining_seq_length = seq_length + 1 - - # Keep adding docs until we reach the end of the sequence. - while remaining_seq_length != 0: - # Look up the current document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - - # Try to add it to the current sequence. - remaining_seq_length -= doc_length - - # If it fits, adjust offset and break out of inner loop. - if remaining_seq_length <= 0: - doc_offset += remaining_seq_length + doc_length - 1 - remaining_seq_length = 0 - else: - # Otherwise move to the next document. - doc_idx_index += 1 - doc_offset = 0 - - # Store the current sequence in the mapping table. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - assert not numpy.any(sample_idx == -999) - return sample_idx - - -def compute_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int: - """Compute the number of tokens in a single epoch - - Args: - indexed_dataset (MMapIndexedDataset): The underlying MMapIndexedDataset - - indices (numpy.ndarray): The subset of indices into the underlying MMapIndexedDataset - - Returns: - int: The number of tokens in a single epoch - """ - return numpy.sum(indexed_dataset.sequence_lengths[indices]) + return shuffle_index diff --git a/src/nanotron/data/nanoset_configs.py b/src/nanotron/data/nanoset_configs.py index 8e5caaba..9d512193 100644 --- a/src/nanotron/data/nanoset_configs.py +++ b/src/nanotron/data/nanoset_configs.py @@ -6,7 +6,8 @@ @dataclass class NanosetConfig: - """Configuration object for Nanoset datasets + """ + Configuration object for Nanoset datasets Attributes: diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py index 5096557b..78e6650a 100644 --- a/src/nanotron/data/utils.py +++ b/src/nanotron/data/utils.py @@ -17,7 +17,8 @@ class Split(Enum): def normalize(weights: List[float]) -> List[float]: - """Do non-exponentiated normalization + """ + Normalize elements of a list Args: weights (List[float]): The weights @@ -32,13 +33,14 @@ def normalize(weights: List[float]) -> List[float]: def parse_and_normalize_split(split: str) -> List[float]: - """Parse the dataset split ratios from a string + """ + Parse the dataset split ratios from a string Args: - split (str): The train valid test split string e.g. "99,1,0" + split (str): The train, valid, test split string e.g. "90,8,2" Returns: - List[float]: The trian valid test split ratios e.g. [99.0, 1.0, 0.0] + List[float]: The trian, valid, test split ratios e.g. 
[0.9, 0.08, 0.02] """ split = list(map(float, re.findall(r"[.0-9]+", split))) split = split + [0.0 for _ in range(len(Split) - len(split))] @@ -64,6 +66,8 @@ def count_blending_indexes(dataset_idx: numpy.ndarray, n_datasets: int): def log_BlendedNanoset_stats(blended_nanosets): for blended_nanoset, split_name in zip(blended_nanosets, Split): + if blended_nanoset is None: + break log_rank( f"> Total number of samples of the {split_name._name_} BlendedNanoset: {len(blended_nanoset)}", logger=logger, @@ -73,7 +77,7 @@ def log_BlendedNanoset_stats(blended_nanosets): nanoset_sample_count = count_blending_indexes(blended_nanoset.dataset_index, len(blended_nanoset.datasets)) for idx, nanoset in enumerate(blended_nanoset.datasets): log_rank( - f"> Total number of samples from the {nanoset.indexed_dataset.path_prefix.rsplit('/', 1)[-1]} Nanoset: {nanoset_sample_count[idx]} ({round(normalize(nanoset_sample_count)[idx], 2)})", + f"> Total number of samples from the {nanoset.indexed_dataset.path_to_mmap.rsplit('/', 1)[-1]} Nanoset: {nanoset_sample_count[idx]} ({round(normalize(nanoset_sample_count)[idx], 2)})", logger=logger, level=logging.INFO, rank=0, diff --git a/src/nanotron/logging.py b/src/nanotron/logging.py index 3c3b1711..708393b5 100644 --- a/src/nanotron/logging.py +++ b/src/nanotron/logging.py @@ -217,15 +217,12 @@ def log_rank( **kwargs, ): """Log only if the current process is the rank specified.""" - if torch.distributed.is_initialized(): - # Use default group is group is not provided - if group is None: - group = torch_dist.distributed_c10d._get_default_group() - - # rank is None means everyone logs - if rank is None or dist.get_rank(group) == rank: - logger.log(level, msg, **kwargs) - else: + # Use default group is group is not provided + if group is None: + group = torch_dist.distributed_c10d._get_default_group() + + # rank is None means everyone logs + if rank is None or dist.get_rank(group) == rank: logger.log(level, msg, **kwargs) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 079d9eac..141ff5a9 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -26,9 +26,9 @@ def create_dataset_paths(tmp_dir: str, quantity: int): json_dataset_path = [os.path.join(tmp_dir, f"pytest_{i}") for i in range(quantity)] - bin_dataset_path = [f"{path}_text" for path in json_dataset_path] + mmap_dataset_path = [f"{path}_input_ids.npy" for path in json_dataset_path] - return json_dataset_path, bin_dataset_path + return json_dataset_path, mmap_dataset_path def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 50000): @@ -44,13 +44,11 @@ def preprocess_dummy_dataset(path_to_json: str): # Create args for preprocessing args = Namespace( input=path_to_json + ".json", - json_key="text", + column="text", output_prefix=path_to_json, pretrained_model_name_or_path="openai-community/gpt2", - workers=int(min(os.cpu_count(), 8)), - partitions=int((min(os.cpu_count(), 8) / 2)), - append_eos=True, - log_interval=int(1000), + num_workers=int(min(os.cpu_count(), 4)), + add_special_tokens=False, ) # tools/preprocess_data.py main @@ -134,7 +132,7 @@ def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): IDX_SAMPLE = 23 nanoset_identifiers = OrderedDict() - nanoset_identifiers["path_prefix"] = nanoset.indexed_dataset.path_prefix + nanoset_identifiers["path_to_mmap"] = nanoset.indexed_dataset.path_to_mmap nanoset_identifiers["split"] = nanoset.config.split nanoset_identifiers["random_seed"] = nanoset.config.random_seed 
nanoset_identifiers["sequence_length"] = nanoset.config.sequence_length diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index c9753d8d..c90140fb 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -34,11 +34,11 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, s test_context = TestContext() # Create dataset files - json_paths, bin_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) + json_paths, mmap_dataset_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) # Create dummy json datasets for idx, json_path in enumerate(json_paths): - create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!") + create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000) # Preprocess dummy json datasets for json_path in json_paths: @@ -46,7 +46,7 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, s init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)( test_context=test_context, - path_to_bin_files=bin_paths, + path_to_mmap_files=mmap_dataset_paths, train_steps=train_steps, sequence_length=sequence_length, ) @@ -55,7 +55,7 @@ def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, s def _test_build_nanoset_dataloader( parallel_context: ParallelContext, test_context: TestContext, - path_to_bin_files: str, + path_to_mmap_files: str, train_steps: int, sequence_length: int, ): @@ -72,7 +72,7 @@ def _test_build_nanoset_dataloader( nanoset_config = NanosetConfig( random_seed=SEED, sequence_length=sequence_length, - data_path=path_to_bin_files[0], + data_path=path_to_mmap_files[0], split=SPLIT, train_split_samples=train_steps * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), @@ -81,7 +81,7 @@ def _test_build_nanoset_dataloader( blended_nanoset_config = NanosetConfig( random_seed=SEED, sequence_length=sequence_length, - data_path={path_to_bin_files[0]: 0.8, path_to_bin_files[1]: 0.2}, + data_path={path_to_mmap_files[0]: 0.8, path_to_mmap_files[1]: 0.2}, split=SPLIT, train_split_samples=train_steps * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), diff --git a/tools/hf_datasets_to_json.py b/tools/hf_datasets_to_json.py deleted file mode 100644 index e8ac70b9..00000000 --- a/tools/hf_datasets_to_json.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse - -from datasets import concatenate_datasets, load_dataset - - -def main(args): - # Load all the Dataset - if args.split is None: - ds = load_dataset(args.dataset_path, num_proc=args.num_workers) - ds = concatenate_datasets([ds[splits] for splits in ds.keys()]) - # Load a split of the Dataset - else: - ds = load_dataset(args.dataset_path, split=args.split, num_proc=args.num_workers) - - ds = ds.select_columns(args.column_name) - - if args.column_name not in ["text"]: - print(f"Renaming {ds.column_names[0]} to 'text'") - ds = ds.rename_column(ds.column_names[0], "text") - - # Store dataset to json file - ds.to_json(path_or_buf=args.output_json, num_proc=args.num_workers) - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset-path", type=str, required=True, help="Path to local stored dataset or repository on the HF hub" - ) - parser.add_argument("--split", type=str, help="Which split of the data to process") - parser.add_argument( - "--column-name", 
type=str, required=True, help="Name of the column containing the data to process" - ) - parser.add_argument("--output-json", type=str, required=True, help="Path to the json output file") - parser.add_argument( - "--num-workers", type=int, default=8, help="Number of processes to load the dataset and store the json file" - ) - - return parser.parse_args() - - -if __name__ == "__main__": - _args = get_args() - main(_args) diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py deleted file mode 100644 index 8e4f0b04..00000000 --- a/tools/merge_datasets.py +++ /dev/null @@ -1,76 +0,0 @@ -import argparse -import os - -from nanotron.data.indexed_dataset import ( - MMapIndexedDataset, - MMapIndexedDatasetBuilder, - get_bin_path, - get_idx_path, -) - - -def get_args(): - parser = argparse.ArgumentParser() - - group = parser.add_argument_group(title="input data") - group.add_argument( - "--input", - type=str, - required=True, - help="Path to directory containing all document files to merge", - ) - - group = parser.add_argument_group(title="output data") - group.add_argument( - "--output-prefix", - type=str, - required=True, - help="Path to binary output file without suffix", - ) - - args = parser.parse_args() - - assert os.path.isdir(args.input), f"ERROR: {args.input} is not a directory or does not exist" - - assert os.path.isdir( - os.path.dirname(args.output_prefix) - ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" - - return args - - -def main(): - args = get_args() - - prefixes = set() - for basename in os.listdir(args.input): - prefix, ext = os.path.splitext(basename) - - if prefix in prefixes: - continue - - if not os.path.isfile(os.path.join(args.input, basename)): - continue - - ext_pair = ".bin" if ext == ".idx" else ".idx" - assert os.path.isfile( - os.path.join(args.input, prefix) + ext_pair - ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" - - prefixes.add(prefix) - - builder = None - for prefix in sorted(prefixes): - if builder is None: - dataset = MMapIndexedDataset(os.path.join(args.input, prefix)) - builder = MMapIndexedDatasetBuilder(get_bin_path(args.output_prefix), dtype=dataset.index.dtype) - del dataset - - builder.add_index(os.path.join(args.input, prefix)) - - builder.finalize(get_idx_path(args.output_prefix)) - - -if __name__ == "__main__": - - main() diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index e918105b..8811863f 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -1,95 +1,38 @@ import argparse -import glob -import importlib -import json import multiprocessing import os -import sys -import time -from pathlib import Path - -package = importlib.import_module("nanotron") -package_path = Path(package.__file__).parent.parent -sys.path.append(str(package_path)) - -from nanotron.data import indexed_dataset -from transformers import AutoTokenizer - - -class Encoder(object): - def __init__(self, pretrained_model_name_or_path: str, json_key: str, append_eos: bool): - self.pretrained_model_name_or_path = pretrained_model_name_or_path - self.json_key = json_key - self.append_eos = append_eos - - def initializer(self): - # Use Encoder class as a container for global data - Encoder.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) - - def encode(self, json_line: str): - data = json.loads(json_line) - text = data[self.json_key] - - text_ids = Encoder.tokenizer.encode(text) - if self.append_eos: - 
text_ids.append(Encoder.tokenizer.eos_token_id) - - return text_ids, len(text_ids), len(json_line) - - -class Partition(object): - def __init__( - self, workers: int, log_interval: int, pretrained_model_name_or_path: str, json_key: str, append_eos: bool - ): - self.workers = workers - self.log_interval = log_interval - self.pretrained_model_name_or_path = pretrained_model_name_or_path - self.json_key = json_key - self.append_eos = append_eos - - def print_processing_stats(self, count: int, proc_start: float, total_bytes_processed: int): - if count % self.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed / elapsed / 1024 / 1024 - print(f"Processed {count} documents", f"({count/elapsed} docs/s, {mbs} MB/s).", file=sys.stderr) - - def process_json_file(self, file_name: str): - input_file_name, output_prefix = file_name - print("Opening", input_file_name) - fin = open(input_file_name, "r", encoding="utf-8") - - startup_start = time.time() - encoder = Encoder(self.pretrained_model_name_or_path, self.json_key, self.append_eos) - tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) - pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, 32) - - output_bin_file = "{}_{}.bin".format(output_prefix, self.json_key) - output_idx_file = "{}_{}.idx".format(output_prefix, self.json_key) - - builder = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) - ) +import shutil + +import numpy as np +from transformers import AutoTokenizer, PreTrainedTokenizerBase - startup_end = time.time() - proc_start = time.time() - total_bytes_processed = 0 - print("Time to startup:", startup_end - startup_start) - for i, (text_ids, len_text_ids, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - builder.add_document([text_ids], [len_text_ids]) - self.print_processing_stats(i, proc_start, total_bytes_processed) +from datasets import Dataset, concatenate_datasets, load_dataset - fin.close() - builder.finalize(output_idx_file) + +def preprocess_shard( + dataset_shard: Dataset, output_file: str, tokenizer: PreTrainedTokenizerBase, column: str, add_special_tokens: bool +): + dataset_shard = dataset_shard.map( + lambda x: {"input_ids": tokenizer.encode(x, add_special_tokens=add_special_tokens)}, + input_columns=column, + batched=False, + remove_columns=[column], + ) + input_ids_file = open(output_file, "wb") + for sample in dataset_shard: + np_array = np.array(sample["input_ids"], dtype=np.uint16) + input_ids_file.write(np_array.tobytes(order="C")) + input_ids_file.close() def get_args(): parser = argparse.ArgumentParser() group = parser.add_argument_group(title="input data") - group.add_argument("--input", type=str, required=True, help="Path to input JSON") - group.add_argument("--json-key", type=str, default="text", help="Key to extract from json") + group.add_argument( + "--input", type=str, required=True, help="Path to local stored dataset or repository on the Hugging Face hub" + ) + group.add_argument("--column", type=str, default="text", help="Column to preprocess from the Dataset") + parser.add_argument("--split", type=str, default="train", help="Which split of the data to process") group = parser.add_argument_group(title="tokenizer") group.add_argument( @@ -98,127 +41,79 @@ def get_args(): required=True, help="A path to a directory containing vocabulary 
files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", ) - group.add_argument("--append-eos", action="store_true", help="Append an token to the end of a sample.") - - group = parser.add_argument_group(title="output data") group.add_argument( - "--output-prefix", type=str, required=True, help="Path to binary and index output files without suffix" + "--add-special-tokens", + action="store_true", + help="Whether or not to add special tokens when encoding the sequences. This will be passed to the Tokenizer", ) + group = parser.add_argument_group(title="output data") + group.add_argument("--output-prefix", type=str, required=True, help="Path to the output processed dataset file") + group = parser.add_argument_group(title="runtime") - group.add_argument( - "--workers", - type=int, - required=True, - help=( - "Number of worker processes to launch." - "A good default for fast pre-processing " - "is: (workers * partitions) = available CPU cores." - ), - ) - group.add_argument("--partitions", type=int, default=1, help="Number of file partitions") - group.add_argument("--log-interval", type=int, default=1000, help="Interval between progress updates") + group.add_argument("--num-workers", type=int, default=8, help="Number of workers processing the dataset") args = parser.parse_args() return args -def get_file_name(input: str, output_prefix: str, file_id: int): - file_name, extension = os.path.splitext(input) - input_file_name = file_name + "_" + str(file_id) + extension - output_prefix = output_prefix + "_" + str(file_id) - file_names = {"partition": input_file_name, "output_prefix": output_prefix} - return file_names - - def main(args): - # Check if json file is not empty - assert os.path.getsize(args.input), f"{args.input} is empty!" 
+ # Check if output directory exists if not os.path.isdir(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))): print(f"Creating {os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))} directory...") os.makedirs(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir)), exist_ok=True) - in_ss_out_names = [] - if args.partitions == 1: - file_names = { - "partition": args.input, - "output_prefix": args.output_prefix, - } - in_ss_out_names.append(file_names) + if args.input.endswith(".json"): # For processing JSON files (Cross compatibility with other projects) + ds = load_dataset("json", data_files=args.input) + ds = concatenate_datasets([ds[splits] for splits in ds.keys()]) # Return DatasetDict else: - in_file_names = glob.glob(args.input) - - # Create .jsonl partition files - for idx in range(args.partitions): - in_ss_out_name = get_file_name(args.input, args.output_prefix, idx) - in_ss_out_names.append(in_ss_out_name) - - # Populate .jsonl partition files from parent files - partitioned_input_files = [] - for idx in range(args.partitions): - partitioned_input_file = open(in_ss_out_names[idx]["partition"], "w") - partitioned_input_files.append(partitioned_input_file) - - index = 0 - for in_file_name in in_file_names: - fin = open(in_file_name, "r", encoding="utf-8") - - for line in fin: - partitioned_input_files[index].write(line) - index = (index + 1) % args.partitions - fin.close() - - for idx in range(args.partitions): - partitioned_input_files[idx].close() - - assert args.workers % args.partitions == 0 - partition = Partition( - args.workers // args.partitions, - args.log_interval, - args.pretrained_model_name_or_path, - args.json_key, - args.append_eos, - ) + ds = load_dataset(args.input, split=args.split, num_proc=args.num_workers) - # Encode partition files in parallel + ds = ds.select_columns(args.column) + + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) + + # Create tmp directory for worker outputs + tmp_folder = os.path.abspath(os.path.join(args.output_prefix, os.pardir, "tmp")) + os.makedirs(tmp_folder, exist_ok=True) + workers_output_files = [] processes = [] - input_key = "partition" - for name in in_ss_out_names: + + print("Creating worker output files...") + for worker in range(args.num_workers): + worker_output_file = os.path.join(tmp_folder, f"worker_{worker}_input_ids.npy") + workers_output_files.append(worker_output_file) + p = multiprocessing.Process( - target=partition.process_json_file, args=((name[input_key], name["output_prefix"]),) + target=preprocess_shard, + args=( + ds.shard(num_shards=args.num_workers, index=worker, contiguous=True), + worker_output_file, + tokenizer, + args.column, + args.add_special_tokens, + ), ) + p.start() processes.append(p) for p in processes: p.join() - if args.partitions == 1: - return - - # Merge bin/idx partitions - tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) - - output_bin_file = "{}_{}.bin".format(args.output_prefix, args.json_key) - output_idx_file = "{}_{}.idx".format(args.output_prefix, args.json_key) - - builder = indexed_dataset.MMapIndexedDatasetBuilder( - output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) - ) - - for name in in_ss_out_names: - parition_output_prefix = name["output_prefix"] - full_partition_output_prefix = "{}_{}".format(parition_output_prefix, args.json_key) - builder.add_index(full_partition_output_prefix) - builder.finalize(output_idx_file) - - # Clean temporary files - for name 
in in_ss_out_names: - os.remove(name["partition"]) - for output in glob.glob(name["output_prefix"] + "*"): - os.remove(output) + print("Merging worker output files...") + output_file = f"{args.output_prefix}_input_ids.npy" + input_ids_file = open(output_file, "wb") + for worker_output_file in workers_output_files: + with open(worker_output_file, "rb") as f: + shutil.copyfileobj(f, input_ids_file) + os.remove(worker_output_file) + + input_ids_file.close() + os.rmdir(tmp_folder) + print(f"Done! {args.input} processed dataset stored in {output_file}") if __name__ == "__main__": From fb60c4fb37686babfa86756eac4ff01d76796e16 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 3 Apr 2024 00:51:24 +0000 Subject: [PATCH 68/73] Updated Tests & added Nanosets to run_train.py --- docs/nanoset.md | 2 +- examples/config_nanoset.yaml | 103 +++++++++++++++++++++ run_train.py | 40 +++++++-- run_train_nanoset.py | 119 ------------------------- src/nanotron/data/nanoset.py | 2 +- tests/helpers/data.py | 4 +- tests/test_build_nanoset_dataloader.py | 114 ++++++++++++++++++++--- 7 files changed, 243 insertions(+), 141 deletions(-) create mode 100644 examples/config_nanoset.yaml delete mode 100644 run_train_nanoset.py diff --git a/docs/nanoset.md b/docs/nanoset.md index 2b375cac..7c2c7463 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -19,7 +19,7 @@ python tools/preprocess_data.py \ --input data/my_corpus.json \ --output-prefix data/processed-datasets/my-llama2-dataset \ --pretrained-model-name-or-path models/Llama-2-7b \ - --workers 128 + --num-workers 128
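The file this command produces is a headerless stream of `uint16` token ids (2 bytes per token), so it is read back with `numpy.memmap`/`numpy.frombuffer` rather than `numpy.load`. A minimal sanity check, as a sketch assuming the output prefix used above:

```python
import numpy as np

# Raw uint16 token stream, despite the .npy suffix
tokens = np.memmap("data/processed-datasets/my-llama2-dataset_input_ids.npy", dtype=np.uint16, mode="r")
print(f"Total tokens: {len(tokens)}")  # file size in bytes / 2
```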
In `--pretrained-model-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`. diff --git a/examples/config_nanoset.yaml b/examples/config_nanoset.yaml new file mode 100644 index 00000000..f5b2f5bc --- /dev/null +++ b/examples/config_nanoset.yaml @@ -0,0 +1,103 @@ +checkpoints: + checkpoint_interval: 200 + checkpoints_path: checkpoints + checkpoints_path_is_shared_file_system: false + resume_checkpoint_path: null + save_initial_state: false + +data_stages: + - name: General purpose training (Nanoset) + start_training_step: 1 + data: + dataset: + data_path: datasets/testing_alpaca_small_input_ids.npy + split: 8,1,1 + num_loading_workers: 0 + seed: 1234 + + - name: Second purpose training (BlendedNanoset) + start_training_step: 15 + data: + dataset: + data_path: + datasets/testing_alpaca_small_input_ids.npy: 0.8 + datasets/yelp_review_full_input_ids.npy: 0.2 + split: 6,2,2 + num_loading_workers: 0 + seed: 1234 + +general: + benchmark_csv_path: null + consumed_train_samples: null + ignore_sanity_checks: false + project: debug + run: tiny_llama_%date_%jobid + seed: 42 + step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info +model: + ddp_bucket_cap_mb: 25 + dtype: bfloat16 + init_method: + std: 0.025 + make_vocab_size_divisible_by: 1 + model_config: + bos_token_id: 1 + eos_token_id: 2 + hidden_act: silu + hidden_size: 16 + initializer_range: 0.02 + intermediate_size: 64 + is_llama_config: true + max_position_embeddings: 256 + num_attention_heads: 4 + num_hidden_layers: 2 + num_key_value_heads: 4 + pad_token_id: null + pretraining_tp: 1 + rms_norm_eps: 1.0e-05 + rope_scaling: null + tie_word_embeddings: true + use_cache: true + vocab_size: 50257 +optimizer: + accumulate_grad_in_fp32: true + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + clip_grad: 1.0 + learning_rate_scheduler: + learning_rate: 0.0003 + lr_decay_starting_step: null + lr_decay_steps: 8 + lr_decay_style: cosine + lr_warmup_steps: 2 + lr_warmup_style: linear + min_decay_lr: 1.0e-05 + torch_adam_is_fused: true + weight_decay: 0.01 + zero_stage: 0 +parallelism: + dp: 2 + pp: 1 + pp_engine: 1f1b + tp: 2 + tp_linear_async_communication: true + tp_mode: REDUCE_SCATTER +profiler: null +tokenizer: + tokenizer_max_length: null + tokenizer_name_or_path: gpt2 + tokenizer_revision: null +tokens: + batch_accumulation_per_replica: 1 + limit_test_batches: 0 + limit_val_batches: 0 + micro_batch_size: 2 + sequence_length: 32 + train_steps: 100 + val_check_interval: -1 diff --git a/run_train.py b/run_train.py index e5be3048..689d193d 100644 --- a/run_train.py +++ b/run_train.py @@ -11,11 +11,10 @@ from typing import Dict, cast from nanotron import logging -from nanotron.config import ( - DataArgs, - DatasetStageArgs, - PretrainDatasetsArgs, -) +from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs +from nanotron.data.dataloader_builder import build_nanoset_dataloader +from nanotron.data.dataset_builder import NanosetBuilder +from nanotron.data.nanoset_configs import NanosetConfig from nanotron.dataloader import ( clm_process, dummy_infinite_data_generator, @@ -120,6 +119,37 @@ def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.start_iteration_step}" ) + + # 
Case 3: Nanosets + elif isinstance(data.dataset, NanosetDatasetsArgs): + # Create Nanoset config + nanoset_config = NanosetConfig( + random_seed=data.seed, + sequence_length=trainer.sequence_length, + data_path=data.dataset.data_path, + split=data.dataset.split, + train_split_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, + path_to_cache=data.dataset.path_to_cache, + ) + + # Build Nanoset datasets + train_dataset, _, _ = NanosetBuilder(nanoset_config).build() + + # Prepare train dataloaders + train_dataloader = build_nanoset_dataloader( + train_dataset, + trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=trainer.micro_batch_size, + consumed_train_samples=trainer.consumed_train_samples, + dataloader_num_workers=data.num_loading_workers, + dataloader_drop_last=True, + ) + + return train_dataloader + else: raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}") diff --git a/run_train_nanoset.py b/run_train_nanoset.py deleted file mode 100644 index 214fe34d..00000000 --- a/run_train_nanoset.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Nanotron training script. - -Usage: -``` -export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations -torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml -``` -""" -import argparse -from typing import Dict, cast - -from nanotron import logging -from nanotron.config import ( - DataArgs, - DatasetStageArgs, -) -from nanotron.data.dataloader_builder import build_nanoset_dataloader -from nanotron.data.dataset_builder import NanosetBuilder -from nanotron.data.nanoset import NanosetConfig -from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks -from nanotron.trainer import DistributedTrainer -from torch.utils.data import DataLoader - -logger = logging.get_logger(__name__) - - -def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs) -> DataLoader: - """Returns train, valid and test dataloaders""" - - # First, we need to know which ranks to feed the dataloader to - input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) - - # Create Nanoset config - nanoset_config = NanosetConfig( - random_seed=data.seed, - sequence_length=trainer.sequence_length, - data_path=data.dataset.data_path, - split=data.dataset.split, - train_split_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, - path_to_cache=data.dataset.path_to_cache, - ) - - # Build Nanoset datasets - train_dataset, valid_dataset, test_dataset = NanosetBuilder(nanoset_config).build() - - # Prepare train, valid and test dataloaders - train_dataloader = build_nanoset_dataloader( - train_dataset, - trainer.sequence_length, - parallel_context=trainer.parallel_context, - input_pp_rank=input_pp_rank, - output_pp_rank=output_pp_rank, - micro_batch_size=trainer.micro_batch_size, - consumed_train_samples=trainer.consumed_train_samples, - dataloader_num_workers=data.num_loading_workers, - dataloader_drop_last=True, - ) - - # Valid dataloader - _ = build_nanoset_dataloader( - valid_dataset, - trainer.sequence_length, - parallel_context=trainer.parallel_context, - input_pp_rank=input_pp_rank, - output_pp_rank=output_pp_rank, - micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=data.num_loading_workers, - dataloader_drop_last=True, - ) - - # Test dataloader - _ = build_nanoset_dataloader( - test_dataset, - 
trainer.sequence_length, - parallel_context=trainer.parallel_context, - input_pp_rank=input_pp_rank, - output_pp_rank=output_pp_rank, - micro_batch_size=trainer.micro_batch_size, - dataloader_num_workers=data.num_loading_workers, - dataloader_drop_last=True, - ) - - return train_dataloader - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file") - return parser.parse_args() - - -def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: - sorted_stages = sorted(trainer.config.data_stages, key=lambda stage: stage.start_training_step) - dataloaders = {} - for idx, stage in enumerate(sorted_stages): - # NOTE: we only create the dataloader for the first stage, - # then we lazy initialize the dataloader for the other stages - stage = cast(DatasetStageArgs, stage) - dataloader = ( - get_dataloader_from_data_stage(trainer, stage.data) - if idx == 0 - else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data) - ) - dataloaders[stage.name] = dataloader - return dataloaders - - -if __name__ == "__main__": - args = get_args() - config_file = args.config_file - - # Load trainer and data - trainer = DistributedTrainer(config_file) - - train_dataloader = get_dataloader(trainer) - - # Train - trainer.train(train_dataloader) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 0bec7fd6..77c1c9bf 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -113,7 +113,7 @@ def build_shuffle_indices(self) -> numpy.ndarray: path_to_cache = self.config.path_to_cache if path_to_cache is None: path_to_cache = os.path.join( - self.indexed_dataset.path_prefix[:-4], "cache", f"{type(self).__name__}_indices" + self.indexed_dataset.path_to_mmap[:-4], "cache", f"{type(self).__name__}_indices" ) def get_path_to(suffix): diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 141ff5a9..fd515e07 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -124,7 +124,7 @@ def compute_hash(identifier: OrderedDict, n_digit: int = 8) -> int: return unique_description_hash -def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): +def assert_nanoset_sync_across_all_ranks(nanoset: Nanoset, parallel_context: ParallelContext): """ Checks that the same Nanoset is created in all processes """ @@ -148,7 +148,7 @@ def assert_nanoset(nanoset: Nanoset, parallel_context: ParallelContext): ) -def assert_blendednanoset(blendednanoset: BlendedNanoset, parallel_context: ParallelContext): +def assert_blendednanoset_sync_across_all_ranks(blendednanoset: BlendedNanoset, parallel_context: ParallelContext): """ Checks that the same BlendedNanoset is created in all processes """ diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index c90140fb..c3faacda 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -2,15 +2,15 @@ from helpers.context import TestContext from helpers.data import ( assert_batch_dataloader, - assert_blendednanoset, - assert_nanoset, + assert_blendednanoset_sync_across_all_ranks, + assert_nanoset_sync_across_all_ranks, compute_batch_hash, create_dataset_paths, create_dummy_json_dataset, get_max_value_by_group, preprocess_dummy_dataset, ) -from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use +from helpers.utils import available_gpus, get_all_3d_configurations, 
init_distributed from nanotron.data.blended_nanoset import BlendedNanoset from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.data.dataset_builder import NanosetBuilder @@ -27,9 +27,8 @@ for all_3d_configs in get_all_3d_configurations(gpus) ], ) -@pytest.mark.parametrize("train_steps", [100, 500]) +@pytest.mark.parametrize("train_steps", [5, 10]) @pytest.mark.parametrize("sequence_length", [512, 8192]) -@rerun_if_address_is_in_use() def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, sequence_length: int): test_context = TestContext() @@ -92,19 +91,19 @@ def _test_build_nanoset_dataloader( for config, dataset_type in zip(configs, dataset_types): # Create Nanosets - train_dataset, valid_dataset, test_dataset = NanosetBuilder(config).build() + train_dataset, _, _ = NanosetBuilder(config).build() assert isinstance(train_dataset, dataset_type) if isinstance(train_dataset, Nanoset): # Assert Nanoset is the same across all processes - assert_nanoset(train_dataset, parallel_context) + assert_nanoset_sync_across_all_ranks(train_dataset, parallel_context) if isinstance(train_dataset, BlendedNanoset): # Check the BlendedNanoset is composed of > 1 Nanoset assert len(train_dataset.datasets) > 1 # Assert BlendedNanoset is the same across all processes - assert_blendednanoset(train_dataset, parallel_context) + assert_blendednanoset_sync_across_all_ranks(train_dataset, parallel_context) # For each Nanoset in BlendedNanoset for idx, dataset in enumerate(train_dataset.datasets): # Check that each Nanoset of BlendedNanoset has more samples than the ones requested by BlendedNanoset @@ -112,7 +111,7 @@ def _test_build_nanoset_dataloader( train_dataset.dataset_index, train_dataset.dataset_sample_index, idx ) # Assert Nanoset is the same across all processes - assert_nanoset(dataset, parallel_context) + assert_nanoset_sync_across_all_ranks(dataset, parallel_context) # Create Dataloaders dataloader = build_nanoset_dataloader( @@ -135,15 +134,104 @@ def _test_build_nanoset_dataloader( sequence_length=sequence_length, ) + parallel_context.destroy() + + +@pytest.mark.parametrize( + "tp,dp,pp", + [ + pytest.param(*all_3d_configs) + for gpus in range(1, min(available_gpus(), 4) + 1) + for all_3d_configs in get_all_3d_configurations(gpus) + ], +) +@pytest.mark.parametrize("train_steps", [100, 500]) +@pytest.mark.parametrize("skipped_batches", [20, 50]) +def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, skipped_batches: int): + test_context = TestContext() + + # Create dataset files + json_paths, mmap_dataset_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) + + # Create dummy json datasets + for idx, json_path in enumerate(json_paths): + create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000) + + # Preprocess dummy json datasets + for json_path in json_paths: + preprocess_dummy_dataset(path_to_json=json_path) + + init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)( + test_context=test_context, + path_to_mmap_files=mmap_dataset_paths, + train_steps=train_steps, + skipped_batches=skipped_batches, + ) + + +def _test_recover_nanoset_dataloader( + parallel_context: ParallelContext, + test_context: TestContext, + path_to_mmap_files: str, + train_steps: int, + skipped_batches: int, +): + SEED = 1234 + MICRO_BATCH_SIZE = 4 + N_MICRO_BATCHES_PER_BATCH = 8 + GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * 
parallel_context.dp_pg.size() + SEQUENCE_LENGTH = 1024 + + SPLIT = "70,20,10" + + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + + # Create Nanoset configs: 1. Normal 2. Blended + nanoset_config = NanosetConfig( + random_seed=SEED, + sequence_length=SEQUENCE_LENGTH, + data_path=path_to_mmap_files[0], + split=SPLIT, + train_split_samples=train_steps * GLOBAL_BATCH_SIZE, + path_to_cache=test_context.get_auto_remove_tmp_dir(), + ) + + blended_nanoset_config = NanosetConfig( + random_seed=SEED, + sequence_length=SEQUENCE_LENGTH, + data_path={path_to_mmap_files[0]: 0.8, path_to_mmap_files[1]: 0.2}, + split=SPLIT, + train_split_samples=train_steps * GLOBAL_BATCH_SIZE, + path_to_cache=test_context.get_auto_remove_tmp_dir(), + ) + + configs = [nanoset_config, blended_nanoset_config] + + for config in configs: + # Create Nanosets + train_dataset, _, _ = NanosetBuilder(config).build() + + # Create initial Dataloader + dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=SEQUENCE_LENGTH, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + dataloader_drop_last=True, + ) + # Recover from failures - SKIPED_BATCHES = 20 dataloader = iter(dataloader) - for _ in range(SKIPED_BATCHES + 1): # In order to compare with the first batch of the recovered DataLoader + for _ in range(skipped_batches + 1): # In order to compare with the first batch of the recovered DataLoader batch = next(dataloader) + # Create recover Dataloader recovered_dataloader = build_nanoset_dataloader( train_dataset, - sequence_length=sequence_length, + sequence_length=SEQUENCE_LENGTH, parallel_context=parallel_context, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, @@ -151,7 +239,7 @@ def _test_build_nanoset_dataloader( dataloader_num_workers=0, dataloader_drop_last=True, # NOTE The dataloader serves batches of micro_batch_size despite of batch_accumulation_per_replica - consumed_train_samples=SKIPED_BATCHES * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(), + consumed_train_samples=skipped_batches * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(), ) recovered_first_batch = next(iter(recovered_dataloader)) From b0c5b09635a044193c8f06289401d03cacdde682 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 3 Apr 2024 09:22:48 +0000 Subject: [PATCH 69/73] Reducing number of tests --- tests/test_build_nanoset_dataloader.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index c3faacda..ee05a61a 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -145,9 +145,8 @@ def _test_build_nanoset_dataloader( for all_3d_configs in get_all_3d_configurations(gpus) ], ) -@pytest.mark.parametrize("train_steps", [100, 500]) @pytest.mark.parametrize("skipped_batches", [20, 50]) -def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, skipped_batches: int): +def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int): test_context = TestContext() # Create dataset files @@ -164,7 +163,6 @@ def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)( test_context=test_context, path_to_mmap_files=mmap_dataset_paths, - train_steps=train_steps, skipped_batches=skipped_batches, ) @@ -173,7 
+171,6 @@ def _test_recover_nanoset_dataloader( parallel_context: ParallelContext, test_context: TestContext, path_to_mmap_files: str, - train_steps: int, skipped_batches: int, ): SEED = 1234 @@ -181,6 +178,7 @@ def _test_recover_nanoset_dataloader( N_MICRO_BATCHES_PER_BATCH = 8 GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() SEQUENCE_LENGTH = 1024 + TRAIN_STEPS = 100 SPLIT = "70,20,10" @@ -192,7 +190,7 @@ def _test_recover_nanoset_dataloader( sequence_length=SEQUENCE_LENGTH, data_path=path_to_mmap_files[0], split=SPLIT, - train_split_samples=train_steps * GLOBAL_BATCH_SIZE, + train_split_samples=TRAIN_STEPS * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), ) @@ -201,7 +199,7 @@ def _test_recover_nanoset_dataloader( sequence_length=SEQUENCE_LENGTH, data_path={path_to_mmap_files[0]: 0.8, path_to_mmap_files[1]: 0.2}, split=SPLIT, - train_split_samples=train_steps * GLOBAL_BATCH_SIZE, + train_split_samples=TRAIN_STEPS * GLOBAL_BATCH_SIZE, path_to_cache=test_context.get_auto_remove_tmp_dir(), ) From ad21be06bae6feead121f308cfa02255e0942633 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Wed, 3 Apr 2024 09:44:19 +0000 Subject: [PATCH 70/73] Adding back @rerun_if_address_is_in_use() decorator --- tests/test_build_nanoset_dataloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index ee05a61a..5eca9b7b 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -10,7 +10,7 @@ get_max_value_by_group, preprocess_dummy_dataset, ) -from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed +from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use from nanotron.data.blended_nanoset import BlendedNanoset from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.data.dataset_builder import NanosetBuilder @@ -29,6 +29,7 @@ ) @pytest.mark.parametrize("train_steps", [5, 10]) @pytest.mark.parametrize("sequence_length", [512, 8192]) +@rerun_if_address_is_in_use() def test_build_nanoset_dataloader(tp: int, dp: int, pp: int, train_steps: int, sequence_length: int): test_context = TestContext() @@ -146,6 +147,7 @@ def _test_build_nanoset_dataloader( ], ) @pytest.mark.parametrize("skipped_batches", [20, 50]) +@rerun_if_address_is_in_use() def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int): test_context = TestContext() From 3523ca3f034fda23f76b6c1e5007eb6546c38af3 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sat, 20 Apr 2024 14:39:54 +0000 Subject: [PATCH 71/73] Added nanosets flavour --- .github/workflows/3d_parallelism_unit_tests.yaml | 1 + docs/nanoset.md | 12 +++++++++--- pyproject.toml | 7 +++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/3d_parallelism_unit_tests.yaml b/.github/workflows/3d_parallelism_unit_tests.yaml index 73804d6c..28b70dd9 100644 --- a/.github/workflows/3d_parallelism_unit_tests.yaml +++ b/.github/workflows/3d_parallelism_unit_tests.yaml @@ -45,6 +45,7 @@ jobs: pip install -e . 
pip install -e .[dev] pip install -e .[test] + pip install -e .[nanosets] - name: Show installed libraries and their versions run: pip freeze | tee installed.txt diff --git a/docs/nanoset.md b/docs/nanoset.md index 7c2c7463..c94c95fd 100644 --- a/docs/nanoset.md +++ b/docs/nanoset.md @@ -1,4 +1,10 @@ -# Nanoset +# Nanosets + +## Install +To use `Nanosets`, it's necessary to install Nanotron with the `nanosets` flavor. +``` +pip install -e '.[nanosets]' +``` ## Data pre-processing @@ -18,11 +24,11 @@ The dataset is then processed into a mmap format for training using the [`tools/ python tools/preprocess_data.py \ --input data/my_corpus.json \ --output-prefix data/processed-datasets/my-llama2-dataset \ - --pretrained-model-name-or-path models/Llama-2-7b \ + --tokenizer-name-or-path meta-llama/Llama-2-7b-hf \ --num-workers 128 -In `--pretrained-model-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`. +In `--tokenizer-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`. The output will be one file named, in this case, `my-llama2-dataset_input_ids.npy`. We will then have to specify this file in the `data_path` field in the config file. diff --git a/pyproject.toml b/pyproject.toml index 532c0c42..a9fbafda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,6 @@ dependencies = [ "safetensors", "dacite", "tqdm", - "transformers", - "datasets" ] [tool.setuptools.packages.find] @@ -49,6 +47,11 @@ fast-modeling = [ "flash-attn>=2.5.0", ] +nanosets = [ + "transformers", + "datasets", +] + [build-system] requires = [ "setuptools", From 0c0f58d384b1c7766ba8c345a10201680b06fdaf Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sat, 20 Apr 2024 14:42:54 +0000 Subject: [PATCH 72/73] Added world process group to NanosetBuilder (main_rank_first) --- run_train.py | 2 +- src/nanotron/data/dataset_builder.py | 24 +++++++----------------- tests/test_build_nanoset_dataloader.py | 4 ++-- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/run_train.py b/run_train.py index 689d193d..ae2d4f85 100644 --- a/run_train.py +++ b/run_train.py @@ -133,7 +133,7 @@ def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): ) # Build Nanoset datasets - train_dataset, _, _ = NanosetBuilder(nanoset_config).build() + train_dataset, _, _ = NanosetBuilder(nanoset_config, trainer.parallel_context.world_pg).build() # Prepare train dataloaders train_dataloader = build_nanoset_dataloader( diff --git a/src/nanotron/data/dataset_builder.py b/src/nanotron/data/dataset_builder.py index b4113845..db9bc97e 100644 --- a/src/nanotron/data/dataset_builder.py +++ b/src/nanotron/data/dataset_builder.py @@ -1,14 +1,15 @@ from typing import Any, List, Type, Union import numpy -import torch +from nanotron import distributed as dist from nanotron import logging from nanotron.data.blended_nanoset import BlendedNanoset from nanotron.data.indexed_dataset import MMapIndexedDataset from nanotron.data.nanoset import Nanoset from nanotron.data.nanoset_configs import NanosetConfig from nanotron.data.utils import Split, log_BlendedNanoset_stats, normalize +from nanotron.utils import main_rank_first logger = logging.get_logger(__name__) @@ -21,13 +22,12 @@ class NanosetBuilder(object): Args: config (NanosetConfig): The config object which informs dataset creation + world_pg (dist.ProcessGroup): World parallel group for creating the Nanosets in main rank first """ - def 
__init__( - self, - config: NanosetConfig, - ): + def __init__(self, config: NanosetConfig, world_pg: dist.ProcessGroup): self.config = config + self.world_pg = world_pg def build(self) -> List[Union[BlendedNanoset, Nanoset]]: """ @@ -166,17 +166,7 @@ def build_generic_dataset( DistributedDataset: The DistributedDataset instantion """ - rank = torch.distributed.get_rank() - dataset = None - - # First, build on rank 0 - if rank == 0: - dataset = cls(*args) - - torch.distributed.barrier() - - # Then, in the other ranks - if rank != 0: + with main_rank_first(self.world_pg): dataset = cls(*args) return dataset @@ -195,7 +185,7 @@ def get_split_indices(split: List[float], number_of_tokens: int, sequence_length Returns: List[int]: The indices for all three splits e.g. [0, 800, 900, 1000] for a 1024000 token dataset, - 1024 sequence length and a [8, 1, 1] split + 1024 sequence length and a [0.8, 0.1, 0.1] split """ number_of_samples = int(number_of_tokens / sequence_length) split_indices = [0] diff --git a/tests/test_build_nanoset_dataloader.py b/tests/test_build_nanoset_dataloader.py index 5eca9b7b..b931323d 100644 --- a/tests/test_build_nanoset_dataloader.py +++ b/tests/test_build_nanoset_dataloader.py @@ -92,7 +92,7 @@ def _test_build_nanoset_dataloader( for config, dataset_type in zip(configs, dataset_types): # Create Nanosets - train_dataset, _, _ = NanosetBuilder(config).build() + train_dataset, _, _ = NanosetBuilder(config, parallel_context.world_pg).build() assert isinstance(train_dataset, dataset_type) @@ -209,7 +209,7 @@ def _test_recover_nanoset_dataloader( for config in configs: # Create Nanosets - train_dataset, _, _ = NanosetBuilder(config).build() + train_dataset, _, _ = NanosetBuilder(config, parallel_context.world_pg).build() # Create initial Dataloader dataloader = build_nanoset_dataloader( From b70720c886961648df284517c4dfd884fe3ae3ef Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sat, 20 Apr 2024 14:44:55 +0000 Subject: [PATCH 73/73] Modified preprocess_data --- tests/helpers/data.py | 2 +- tools/preprocess_data.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index fd515e07..20901e0b 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -46,7 +46,7 @@ def preprocess_dummy_dataset(path_to_json: str): input=path_to_json + ".json", column="text", output_prefix=path_to_json, - pretrained_model_name_or_path="openai-community/gpt2", + tokenizer_name_or_path="openai-community/gpt2", num_workers=int(min(os.cpu_count(), 4)), add_special_tokens=False, ) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 8811863f..f230f15a 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -4,6 +4,7 @@ import shutil import numpy as np +from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizerBase from datasets import Dataset, concatenate_datasets, load_dataset @@ -13,9 +14,10 @@ def preprocess_shard( dataset_shard: Dataset, output_file: str, tokenizer: PreTrainedTokenizerBase, column: str, add_special_tokens: bool ): dataset_shard = dataset_shard.map( - lambda x: {"input_ids": tokenizer.encode(x, add_special_tokens=add_special_tokens)}, + lambda x: {"input_ids": tokenizer(x, add_special_tokens=add_special_tokens).input_ids}, input_columns=column, - batched=False, + batched=True, + desc="Tokenizing Dataset", remove_columns=[column], ) input_ids_file = open(output_file, "wb") @@ -36,7 +38,7 @@ def get_args(): group = 
parser.add_argument_group(title="tokenizer") group.add_argument( - "--pretrained-model-name-or-path", + "--tokenizer-name-or-path", type=str, required=True, help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", @@ -67,13 +69,15 @@ def main(args): if args.input.endswith(".json"): # For processing JSON files (Cross compatibility with other projects) ds = load_dataset("json", data_files=args.input) - ds = concatenate_datasets([ds[splits] for splits in ds.keys()]) # Return DatasetDict + ds = concatenate_datasets( + [ds[splits] for splits in ds.keys()] + ) # load_dataset returns DatasetDict and we want a Dataset else: ds = load_dataset(args.input, split=args.split, num_proc=args.num_workers) ds = ds.select_columns(args.column) - tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) # Create tmp directory for worker outputs tmp_folder = os.path.abspath(os.path.join(args.output_prefix, os.pardir, "tmp")) @@ -103,10 +107,9 @@ def main(args): for p in processes: p.join() - print("Merging worker output files...") output_file = f"{args.output_prefix}_input_ids.npy" input_ids_file = open(output_file, "wb") - for worker_output_file in workers_output_files: + for worker_output_file in tqdm(workers_output_files, desc="Merging worker output files"): with open(worker_output_file, "rb") as f: shutil.copyfileobj(f, input_ids_file) os.remove(worker_output_file)
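
PATCH 72/73 replaces the explicit rank-0 build plus `torch.distributed.barrier()` dance in `build_generic_dataset` with the `main_rank_first(self.world_pg)` context manager. A minimal sketch of that pattern follows, assuming standard `torch.distributed` barrier semantics; the body of `nanotron.utils.main_rank_first` may differ in detail, so treat this as an illustration rather than the library's implementation.

```python
# Sketch only: approximates the "main rank first" pattern used by
# NanosetBuilder.build_generic_dataset; nanotron.utils.main_rank_first
# may be implemented differently.
from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def main_rank_first(group: dist.ProcessGroup):
    """Run the wrapped block on rank 0 of `group` before every other rank."""
    if dist.get_rank(group) == 0:
        yield                       # rank 0 builds (and caches) the dataset
        dist.barrier(group=group)   # then releases the waiting ranks
    else:
        dist.barrier(group=group)   # wait for rank 0 to finish
        yield                       # other ranks now reuse the cached artifacts


# Usage, mirroring the patched builder:
# with main_rank_first(self.world_pg):
#     dataset = cls(*args)
```

Compared with the removed rank check and barrier in the old `build_generic_dataset`, the context manager keeps the ordering logic in one place, so every dataset class built through it gets the same rank-first behaviour for free.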
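
PATCH 73/73 also moves `preprocess_shard` from per-row `tokenizer.encode(...)` calls (`batched=False`) to a batched `datasets.map` that hands the tokenizer whole batches. The snippet below is a self-contained illustration of that change on dummy data; the column name and the GPT-2 checkpoint (the one used in `tests/helpers/data.py`) are stand-ins, not the script's actual inputs.

```python
# Illustrative only: shows the batched=True tokenization used by the updated
# preprocess_shard, on a tiny in-memory dataset instead of a real shard.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
ds = Dataset.from_dict({"text": ["first toy document", "second toy document"]})

# With batched=True the mapped function receives a list of strings, so the fast
# (Rust-backed) tokenizer can encode them in one call instead of row by row.
ds = ds.map(
    lambda batch: {"input_ids": tokenizer(batch, add_special_tokens=False).input_ids},
    input_columns="text",
    batched=True,
    desc="Tokenizing Dataset",
    remove_columns=["text"],
)
print(ds[0]["input_ids"])  # token ids for the first document
```

Per the diff, only the map call changes; the final merge of worker output files is the same logic, now wrapped in a `tqdm` progress bar instead of a plain print.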