Skip to content

Commit

Permalink
Install litdata instead of lightning[data] (#1046)
Browse files Browse the repository at this point in the history
  • Loading branch information
carmocca authored and awaelchli committed Mar 15, 2024
1 parent 0929fe5 commit e3d8e3f
Show file tree
Hide file tree
Showing 8 changed files with 14 additions and 15 deletions.
6 changes: 3 additions & 3 deletions litgpt/data/openwebtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def connect(

def prepare_data(self) -> None:
from datasets import Dataset, load_dataset
- from lightning.data import optimize
+ from litdata import optimize

if str(self.data_path).startswith("s3://"):
print(f"The OpenWebText data path points to an S3 location: {self.data_path}. Skipping preprocessing.")
Expand Down Expand Up @@ -81,7 +81,7 @@ def tokenize(data: Dataset, index: int):
)

def train_dataloader(self) -> DataLoader:
- from lightning.data.streaming import StreamingDataLoader, StreamingDataset, TokensLoader
+ from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader

train_dataset = StreamingDataset(
input_dir=self.data_path_train,
Expand All @@ -95,7 +95,7 @@ def train_dataloader(self) -> DataLoader:
return train_dataloader

def val_dataloader(self) -> DataLoader:
- from lightning.data.streaming import StreamingDataset, TokensLoader
+ from litdata.streaming import StreamingDataset, TokensLoader

val_dataset = StreamingDataset(
input_dir=self.data_path_val,
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/prepare_slimpajama.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def prepare(
chunk_size: int = (2049 * 16384),
fast_dev_run: bool = False,
) -> None:
- from lightning.data.streaming import DataProcessor
+ from litdata.processing.data_processor import DataProcessor

tokenizer = Tokenizer(tokenizer_path)
data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
Expand Down
4 changes: 2 additions & 2 deletions litgpt/data/prepare_starcoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

_LITDATA_AVAILABLE = RequirementCache("litdata")
if _LITDATA_AVAILABLE:
- from lightning.data.streaming import DataChunkRecipe
+ from litdata.processing.data_processor import DataChunkRecipe
else:
DataChunkRecipe = object

Expand Down Expand Up @@ -57,7 +57,7 @@ def prepare(
chunk_size: int = (2049 * 8192),
fast_dev_run: bool = False,
) -> None:
- from lightning.data.streaming import DataProcessor
+ from litdata.processing.data_processor import DataProcessor

tokenizer = Tokenizer(tokenizer_path)
data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
Expand Down
4 changes: 2 additions & 2 deletions litgpt/data/tinyllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def prepare_data(self) -> None:
)

def train_dataloader(self) -> DataLoader:
- from lightning.data.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader
+ from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader

train_datasets = [
StreamingDataset(
Expand All @@ -79,7 +79,7 @@ def train_dataloader(self) -> DataLoader:
return train_dataloader

def val_dataloader(self) -> DataLoader:
- from lightning.data.streaming import StreamingDataset, TokensLoader
+ from litdata.streaming import StreamingDataset, TokensLoader

val_dataset = StreamingDataset(
input_dir=self.slimpajama_val,
Expand Down
3 changes: 1 addition & 2 deletions requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
requests # litgpt.data
+ litdata # litgpt.data
zstandard # litgpt.data.prepare_slimpajama.py
pandas # litgpt.data.prepare_starcoder.py
pyarrow # litgpt.data.prepare_starcoder.py
tensorboard # litgpt.pretrain
torchmetrics # litgpt.pretrain
# eval
git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
- # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py
- lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
4 changes: 2 additions & 2 deletions tests/data/test_openwebtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@


@pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.")
- @mock.patch("lightning.data.optimize")
+ @mock.patch("litdata.optimize")
@mock.patch("datasets.load_dataset")
def test_openwebtext(_, optimize_mock, tmp_path, monkeypatch, mock_tokenizer):
from litgpt.data import OpenWebText
- from lightning.data.streaming import StreamingDataLoader, StreamingDataset
+ from litdata.streaming import StreamingDataLoader, StreamingDataset

data = OpenWebText(data_path=(tmp_path / "openwebtext"))
assert data.seq_length == 2048
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_tinyllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_tinyllama(tmp_path, monkeypatch):
from litgpt.data import TinyLlama
- from lightning.data.streaming import StreamingDataLoader, StreamingDataset, CombinedStreamingDataset
+ from litdata.streaming import StreamingDataLoader, StreamingDataset, CombinedStreamingDataset

data = TinyLlama(data_path=(tmp_path / "data"))
assert data.seq_length == 2048
Expand Down
4 changes: 2 additions & 2 deletions tutorials/pretrain_tinyllama.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ Around 1.2 TB of disk space is required to store both datasets.

## Prepare the datasets for training

- In order to start pretraining litgpt on it, you need to read, tokenize, and write the data in binary chunks. This will leverage our `lightning.data` optimization pipeline and streaming dataset that comes with Lightning.
+ In order to start pretraining litgpt on it, you need to read, tokenize, and write the data in binary chunks. This will leverage the `litdata` optimization pipeline and streaming dataset.

First, install additional dependencies for preprocessing:

```bash
- pip install 'lightning[data]' torchmetrics tensorboard sentencepiece zstandard pandas pyarrow 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub'
+ pip install -r requirements-all.txt
```

You will need to have the tokenizer config available:
Expand Down

0 comments on commit e3d8e3f

Please sign in to comment.