Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install litdata instead of lightning[data] #1046

Merged
merged 3 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions litgpt/data/openwebtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def connect(

def prepare_data(self) -> None:
from datasets import Dataset, load_dataset
from lightning.data import optimize
from litdata import optimize

if str(self.data_path).startswith("s3://"):
print(f"The OpenWebText data path points to an S3 location: {self.data_path}. Skipping preprocessing.")
Expand Down Expand Up @@ -81,7 +81,7 @@ def tokenize(data: Dataset, index: int):
)

def train_dataloader(self) -> DataLoader:
from lightning.data.streaming import StreamingDataLoader, StreamingDataset, TokensLoader
from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader

train_dataset = StreamingDataset(
input_dir=self.data_path_train,
Expand All @@ -95,7 +95,7 @@ def train_dataloader(self) -> DataLoader:
return train_dataloader

def val_dataloader(self) -> DataLoader:
from lightning.data.streaming import StreamingDataset, TokensLoader
from litdata.streaming import StreamingDataset, TokensLoader

val_dataset = StreamingDataset(
input_dir=self.data_path_val,
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/prepare_slimpajama.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def prepare(
chunk_size: int = (2049 * 16384),
fast_dev_run: bool = False,
) -> None:
from lightning.data.streaming import DataProcessor
from litdata.processing.data_processor import DataProcessor

tokenizer = Tokenizer(tokenizer_path)
data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
Expand Down
4 changes: 2 additions & 2 deletions litgpt/data/prepare_starcoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

_LITDATA_AVAILABLE = RequirementCache("litdata")
if _LITDATA_AVAILABLE:
from lightning.data.streaming import DataChunkRecipe
from litdata.processing.data_processor import DataChunkRecipe
else:
DataChunkRecipe = object

Expand Down Expand Up @@ -57,7 +57,7 @@ def prepare(
chunk_size: int = (2049 * 8192),
fast_dev_run: bool = False,
) -> None:
from lightning.data.streaming import DataProcessor
from litdata.processing.data_processor import DataProcessor

tokenizer = Tokenizer(tokenizer_path)
data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
Expand Down
4 changes: 2 additions & 2 deletions litgpt/data/tinyllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def prepare_data(self) -> None:
)

def train_dataloader(self) -> DataLoader:
from lightning.data.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader
from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader

train_datasets = [
StreamingDataset(
Expand All @@ -79,7 +79,7 @@ def train_dataloader(self) -> DataLoader:
return train_dataloader

def val_dataloader(self) -> DataLoader:
from lightning.data.streaming import StreamingDataset, TokensLoader
from litdata.streaming import StreamingDataset, TokensLoader

val_dataset = StreamingDataset(
input_dir=self.slimpajama_val,
Expand Down
3 changes: 1 addition & 2 deletions requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
requests # litgpt.data
litdata # litgpt.data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just tested the latest version in the studio, and everything works.

zstandard # litgpt.data.prepare_slimpajama.py
pandas # litgpt.data.prepare_starcoder.py
pyarrow # litgpt.data.prepare_starcoder.py
tensorboard # litgpt.pretrain
torchmetrics # litgpt.pretrain
# eval
git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
# scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py
lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
4 changes: 2 additions & 2 deletions tests/data/test_openwebtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@


@pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.")
@mock.patch("lightning.data.optimize")
@mock.patch("litdata.optimize")
@mock.patch("datasets.load_dataset")
def test_openwebtext(_, optimize_mock, tmp_path, monkeypatch, mock_tokenizer):
from litgpt.data import OpenWebText
from lightning.data.streaming import StreamingDataLoader, StreamingDataset
from litdata.streaming import StreamingDataLoader, StreamingDataset

data = OpenWebText(data_path=(tmp_path / "openwebtext"))
assert data.seq_length == 2048
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_tinyllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_tinyllama(tmp_path, monkeypatch):
from litgpt.data import TinyLlama
from lightning.data.streaming import StreamingDataLoader, StreamingDataset, CombinedStreamingDataset
from litdata.streaming import StreamingDataLoader, StreamingDataset, CombinedStreamingDataset

data = TinyLlama(data_path=(tmp_path / "data"))
assert data.seq_length == 2048
Expand Down
4 changes: 2 additions & 2 deletions tutorials/pretrain_tinyllama.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ Around 1.2 TB of disk space is required to store both datasets.

## Prepare the datasets for training

In order to start pretraining litgpt on it, you need to read, tokenize, and write the data in binary chunks. This will leverage our `lightning.data` optimization pipeline and streaming dataset that comes with Lightning.
To start pretraining litgpt on these datasets, you first need to read, tokenize, and write the data in binary chunks. This step uses the `litdata` optimization pipeline and streaming dataset.

First, install additional dependencies for preprocessing:

```bash
pip install 'lightning[data]' torchmetrics tensorboard sentencepiece zstandard pandas pyarrow 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub'
pip install -r requirements-all.txt
```

You will need to have the tokenizer config available:
Expand Down
Loading