From 8db345cf4faa68f6e5b1f4551acd3278458767af Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov
Date: Thu, 9 May 2024 19:17:21 +0300
Subject: [PATCH 1/2] Bump LitData to at least 0.2.6

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 63cc9d6d86..7627a17691 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ all = [
     "sentencepiece>=0.2.0",      # llama-based models
     "tokenizers>=0.15.2",        # pythia, falcon, redpajama
     "requests>=2.31.0",          # litgpt.data
-    "litdata>=0.2.2,<0.2.6",     # litgpt.data
+    "litdata>=0.2.6",            # litgpt.data
     "litserve>=0.1.0",           # litgpt.deploy
     "zstandard>=0.22.0",         # litgpt.data.prepare_slimpajama.py
     "pandas>=1.9.0",             # litgpt.data.prepare_starcoder.py

From 292b2b38edcb0e87415f03244c7f6e5ae0231b41 Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov
Date: Thu, 9 May 2024 19:17:48 +0300
Subject: [PATCH 2/2] Set iterate_over_all to False for CombinedStreamingDataset

---
 litgpt/data/tinyllama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/litgpt/data/tinyllama.py b/litgpt/data/tinyllama.py
index 0f32507aa2..d0267f2ff7 100644
--- a/litgpt/data/tinyllama.py
+++ b/litgpt/data/tinyllama.py
@@ -69,7 +69,9 @@ def train_dataloader(self) -> DataLoader:
 
         # Mix SlimPajama data and Starcoder data with these proportions:
         weights = (0.693584, 0.306416)
-        combined_dataset = CombinedStreamingDataset(datasets=train_datasets, seed=self.seed, weights=weights)
+        combined_dataset = CombinedStreamingDataset(
+            datasets=train_datasets, seed=self.seed, weights=weights, iterate_over_all=False
+        )
         train_dataloader = StreamingDataLoader(
             combined_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True
         )
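
Note (not part of the patch): below is a minimal sketch of how the changed call behaves with litdata >= 0.2.6, based on my reading of its CombinedStreamingDataset API. The input directories "data/slimpajama" and "data/starcoder", the batch size, and the worker count are placeholders; the weights are the ones from the diff above. It assumes you already have litdata-optimized datasets at those hypothetical paths.

from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset

# Hypothetical paths to two litdata-optimized datasets (placeholders, not real data).
slimpajama = StreamingDataset(input_dir="data/slimpajama", shuffle=True, drop_last=True)
starcoder = StreamingDataset(input_dir="data/starcoder", shuffle=True, drop_last=True)

# Same mixing proportions as litgpt/data/tinyllama.py.
weights = (0.693584, 0.306416)

# litdata 0.2.6 introduced `iterate_over_all` with a default of True, which walks through
# every sample of every dataset. Passing False keeps the behaviour the TinyLlama recipe
# relies on: samples are drawn from the two datasets according to `weights`, and iteration
# stops once one of them is exhausted.
combined = CombinedStreamingDataset(
    datasets=[slimpajama, starcoder], seed=42, weights=weights, iterate_over_all=False
)

loader = StreamingDataLoader(combined, batch_size=8, num_workers=4, drop_last=True)
for batch in loader:
    ...  # training step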