Skip to content

Commit

Permalink
text files data module
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Apr 2, 2024
1 parent a475405 commit c2aa49e
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 1 deletion.
2 changes: 2 additions & 0 deletions litgpt/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from litgpt.data.lima import LIMA
from litgpt.data.lit_data import LitData
from litgpt.data.longform import LongForm
from litgpt.data.text_files import TextFiles
from litgpt.data.tinyllama import TinyLlama
from litgpt.data.tinystories import TinyStories
from litgpt.data.openwebtext import OpenWebText
Expand All @@ -30,6 +31,7 @@
"LongForm",
"OpenWebText",
"SFTDataset",
"TextFiles",
"TinyLlama",
"TinyStories",
"get_sft_collate_fn",
Expand Down
160 changes: 160 additions & 0 deletions litgpt/data/text_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
import glob
import os
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from tqdm import tqdm
from typing import Optional

from torch.utils.data import DataLoader

from litgpt import Tokenizer
from litgpt.data import DataModule


def get_half_workers() -> int:
    """Return half of the CPUs that remain after reserving one, but at least 1.

    ``os.cpu_count()`` returns ``None`` when the number of CPUs cannot be
    determined; that case is treated as a single CPU so the function still
    returns a usable worker count instead of raising ``TypeError``.
    """
    cpu_count = os.cpu_count() or 1
    return max(1, (cpu_count - 1) // 2)


@dataclass
class TextFiles(DataModule):
    """The TextFile data module used for pretraining.

    Reads in text data from plaintext files contained in a data folder
    and provides training and validation dataloaders that return batches of tokens.
    Every sample is set to a fixed length.
    """

    train_data_path: Path = Path("data/")
    """The path to the data directory used for training that
    contains .txt files"""
    val_data_path: Optional[str] = None
    """The path to the data directory used for validation that
    contains .txt files. Splits off data for validation from the
    training set if None."""
    seed: int = 42
    """The seed to use for shuffling the dataset.
    NOTE(review): the seed is not forwarded to the streaming datasets created in this class."""
    num_workers: Optional[int] = None
    """The number of workers to use for data loading.
    If None, tokenization uses the number of available CPUs minus one and
    the dataloaders use the value returned by ``get_half_workers()``."""

    tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False)
    batch_size: int = field(default=1, init=False, repr=False)
    max_seq_length: int = field(default=-1, init=False, repr=False)

    def connect(self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: int = -1) -> None:
        """Store the tokenizer, batch size and sequence length used by the other methods."""
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length + 1  # Increase by one because we need the next token as well

    def __post_init__(self) -> None:
        """Derive the output folders ("train" and "val") for the tokenized data."""
        # Wrapping in Path() keeps this working when a plain string was passed in.
        self.data_path_train = Path(self.train_data_path) / "train"
        val_root = self.train_data_path if self.val_data_path is None else self.val_data_path
        self.data_path_val = Path(val_root) / "val"

    def prepare_data(self) -> None:
        """Tokenize the .txt files into the train/val output folders.

        Each output folder is only created if it does not exist yet, so an
        already prepared dataset is reused. When no validation directory is
        configured, the first training file (in sorted order) is held out
        for validation.

        Raises:
            AssertionError: If no suitable .txt files are found.
        """
        from litdata import optimize

        train_dir = Path(self.train_data_path)
        train_files = sorted(glob.glob(str(train_dir / "*.txt")))
        assert len(train_files) > 0, f"No .txt files found in train data {train_dir}"

        if self.val_data_path is not None:
            self.val_data_path = Path(self.val_data_path)
            val_files = sorted(glob.glob(str(self.val_data_path / "*.txt")))
            assert len(val_files) > 0, f"No .txt files found in validation data {self.val_data_path}"
        else:
            # Two files are only required when one of them has to be held out for validation.
            assert len(train_files) > 1, (
                f"Expected at least two .txt files in {train_dir} "
                "because one file is split off for validation"
            )
            # train/val split: use only the first file for validation, the rest for training
            held_out, *train_files = train_files
            val_files = [held_out]

        if self.num_workers is None:
            # os.cpu_count() may return None; always keep at least one worker.
            num_workers = max(1, (os.cpu_count() or 1) - 1)
        else:
            num_workers = self.num_workers
        # Workers beyond the number of input files would have nothing to process.
        num_workers = min(num_workers, len(train_files))

        if not Path(self.data_path_train).is_dir():
            optimize(
                fn=partial(tokenize, tokenizer=self.tokenizer),
                inputs=train_files,
                output_dir=str(self.data_path_train),
                num_workers=num_workers,
                chunk_bytes="50MB",
            )
        if not Path(self.data_path_val).is_dir():
            optimize(
                fn=partial(tokenize, tokenizer=self.tokenizer),
                inputs=val_files,
                output_dir=str(self.data_path_val),
                num_workers=1,  # the validation set is expected to be small (often a single file)
                chunk_bytes="50MB",
            )

    def _dataloader_workers(self) -> int:
        """Return the configured worker count, or the ``get_half_workers()`` default."""
        if self.num_workers is None:
            return get_half_workers()
        return self.num_workers

    def train_dataloader(self) -> DataLoader:
        """Build the streaming dataloader over the tokenized training data."""
        from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader

        train_dataset = StreamingDataset(
            input_dir=str(self.data_path_train),
            item_loader=TokensLoader(block_size=self.max_seq_length),
            shuffle=True,
            drop_last=True,
        )
        train_dataloader = StreamingDataLoader(
            train_dataset,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=self._dataloader_workers(),
            drop_last=True,
        )
        return train_dataloader

    def val_dataloader(self) -> DataLoader:
        """Build the dataloader over the tokenized validation data."""
        from litdata.streaming import StreamingDataset, TokensLoader

        val_dataset = StreamingDataset(
            input_dir=str(self.data_path_val),
            item_loader=TokensLoader(block_size=self.max_seq_length),
            shuffle=True,
            # Consider setting to False, but we would lose some samples due to truncation when world size > 1
            drop_last=True,
        )
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=self._dataloader_workers(),
            drop_last=True,
        )
        return val_dataloader


def tokenize(filename: str, tokenizer: Tokenizer):
    """Read a UTF-8 text file, split it into ten character chunks and yield each chunk's tokens.

    Args:
        filename: Path of the text file to tokenize.
        tokenizer: Tokenizer whose ``encode`` method is applied to every chunk
            (called with ``bos=True, eos=False``).

    Yields:
        The result of ``tokenizer.encode`` for each of the chunks, in file order.

    Raises:
        ValueError: If ``tokenizer`` is None.
    """
    if tokenizer is None:
        raise ValueError(
            "Tokenizer is None. If you are using this data module via `litgpt pretrain`, "
            "please provide a valid `--tokenizer_dir` path."
        )
    with open(filename, "r", encoding="utf-8") as file:
        text = file.read()
    text = text.strip()

    num_chunks = 10
    chunk_size = len(text) // num_chunks
    # All chunks but the last have exactly `chunk_size` characters ...
    chunks = [text[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks - 1)]
    # ... and the last chunk also takes the remainder of the integer division.
    chunks.append(text[(num_chunks - 1) * chunk_size:])

    # The rank variables are only used to position the progress bar; they are
    # expected to be provided by the data-optimizer worker process. Fall back
    # to a single-worker layout when they are absent instead of failing.
    global_rank = int(os.environ.get("DATA_OPTIMIZER_GLOBAL_RANK", 0))
    num_workers = max(1, int(os.environ.get("DATA_OPTIMIZER_NUM_WORKERS", 1)))
    local_rank = global_rank % num_workers
    for example in tqdm(chunks, position=local_rank):
        tokens = tokenizer.encode(example.strip(), bos=True, eos=False)  # encode the text, use BOS
        yield tokens

5 changes: 5 additions & 0 deletions litgpt/data/tinystories.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ def val_dataloader(self) -> DataLoader:


def tokenize(filename: str, tokenizer: Tokenizer):
if tokenizer is None:
raise ValueError(
"Tokenizer is None. If you are using this data module via `litgpt pretrain`, "
"please provide a valid `--tokenizer_dir` path."
)
with open(filename, "r") as f:
data = json.load(f)
global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
Expand Down
59 changes: 58 additions & 1 deletion tutorials/pretrain.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This document explains how to pretrain LLMs using LitGPT.

&nbsp;
## The Pretraining API
## The `pretrain` API

You can pretrain models in LitGPT using the `litgpt pretrain` API starting with any of the available architectures listed by calling `litgpt pretrain` without any additional arguments:

Expand Down Expand Up @@ -36,8 +36,65 @@ litgpt pretrain \
```


&nbsp;
## Pretrain on custom data

The simplest way to get started with pretraining on custom data is by using the `TextFiles` data module, which lets you pretrain a model on a dataset from a folder containing plain text files.

For instance, assume you stored a number of text files in a `custom_pretraining_data` folder (we recommend avoiding small files and concatenating them to files of at least 50 MB for efficiency):

```bash
~ ls -lh custom_pretraining_data
total 3225M
-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_1.txt
-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_2.txt
-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_3.txt
-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_4.txt
-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_5.txt
...
```

You can then use the `TextFiles` API to pretrain a model (here a small `pythia-14m` model for illustration purposes) from scratch as follows:

```bash
litgpt download \
--repo_id EleutherAI/pythia-14m \
--tokenizer_only true

litgpt pretrain \
--model_name pythia-14m \
--data TextFiles \
--tokenizer_dir checkpoints/EleutherAI/pythia-14m \
--data.train_data_path custom_pretraining_data \
--train.learning_rate 0.005 \
--train.lr_warmup_steps=200
```


&nbsp;
## Continued pretraining on custom data

Often, it makes sense to adopt an existing pretrained model and further pretrain it on our own custom data. The existing pretrained model can be either our own pretrained model or a model downloaded from a model hub.

For instance, let's assume we download a Pythia model:

```bash
litgpt download --repo_id EleutherAI/pythia-14m
```

Next, assume we have a custom dataset stored in text files similar to the *Pretrain on custom data* section above. We can further pretrain the Pythia model via the `--initial_checkpoint_dir` setting as follows:

```bash
litgpt pretrain \
--model_name pythia-14m \
--initial_checkpoint_dir checkpoints/EleutherAI/pythia-14m \
--out_dir new_phi-2_checkpoint \
--data TextFiles \
--data.train_data_path custom_pretraining_data \
--train.learning_rate 0.005 \
--train.lr_warmup_steps=200
```

&nbsp;
## Pretrain a 1.1B TinyLlama model

Expand Down

0 comments on commit c2aa49e

Please sign in to comment.