From c2aa49e0a1110c105a253526fca6b029a114b5ba Mon Sep 17 00:00:00 2001 From: rasbt Date: Tue, 2 Apr 2024 20:41:05 +0000 Subject: [PATCH] text files data module --- litgpt/data/__init__.py | 2 + litgpt/data/text_files.py | 160 +++++++++++++++++++++++++++++++++++++ litgpt/data/tinystories.py | 5 ++ tutorials/pretrain.md | 59 +++++++++++++- 4 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 litgpt/data/text_files.py diff --git a/litgpt/data/__init__.py b/litgpt/data/__init__.py index f3043ca301..97fb67aa05 100644 --- a/litgpt/data/__init__.py +++ b/litgpt/data/__init__.py @@ -11,6 +11,7 @@ from litgpt.data.lima import LIMA from litgpt.data.lit_data import LitData from litgpt.data.longform import LongForm +from litgpt.data.text_files import TextFiles from litgpt.data.tinyllama import TinyLlama from litgpt.data.tinystories import TinyStories from litgpt.data.openwebtext import OpenWebText @@ -30,6 +31,7 @@ "LongForm", "OpenWebText", "SFTDataset", + "TextFiles", "TinyLlama", "TinyStories", "get_sft_collate_fn", diff --git a/litgpt/data/text_files.py b/litgpt/data/text_files.py new file mode 100644 index 0000000000..7ac6e3b41c --- /dev/null +++ b/litgpt/data/text_files.py @@ -0,0 +1,160 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import glob +import os +from dataclasses import dataclass, field +from functools import partial +from pathlib import Path +from tqdm import tqdm +from typing import Optional + +from torch.utils.data import DataLoader + +from litgpt import Tokenizer +from litgpt.data import DataModule + + +def get_half_workers(): + num_workers = (os.cpu_count() - 1) // 2 + if num_workers < 1: + return 1 + else: + return num_workers + + +@dataclass +class TextFiles(DataModule): + """The TextFile data module used for pretraining. + + Reads in text data from plaintext files contained in a data folder + and provides training and validation dataloaders that return batches of tokens. 
+ Every sample is set to a fixed length. + """ + train_data_path: Path = Path("data/") + """The path to the data directory used for training that + contains .txt files""" + val_data_path: Optional[str] = None + """The path to the data directory used for validation that + contains .txt files. Splits off data for validation from the + training set if None.""" + seed: int = 42 + """The seed to use for shuffling the dataset.""" + num_workers: Optional[int] = None + """The number of workers to use for data loading. + Sets the number of workers equal to the number of available CPUs-1 by default.""" + + tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False) + batch_size: int = field(default=1, init=False, repr=False) + max_seq_length: int = field(default=-1, init=False, repr=False) + + def connect(self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: int = -1) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = max_seq_length + 1 # Increase by one because we need the next token as well + + def __post_init__(self) -> None: + self.data_path_train = self.train_data_path / "train" + if self.val_data_path is None: + self.data_path_val = self.train_data_path / "val" + else: + self.data_path_val = Path(self.val_data_path) / "val" + + def prepare_data(self) -> None: + from litdata import optimize + + train_files = sorted(glob.glob(str(self.train_data_path / "*.txt"))) + assert len(train_files) > 0, f"No .txt files found in train data {train_files}" + assert len(train_files) > 1, f"Expected at least two .txt files in {train_files}" + + if self.val_data_path is not None: + self.val_data_path = Path(self.val_data_path) + val_files = sorted(glob.glob(str(self.val_data_path / "*.txt"))) + assert len(val_files) > 0, f"No .txt files found in validation data {val_files}" + # train/test split. 
let's use only shard 0 for test split, rest train + else: + val_files, *train_files = train_files + + if self.num_workers is None: + num_workers = os.cpu_count() - 1 + else: + num_workers = self.num_workers + + if not Path(self.data_path_train).is_dir(): + optimize( + fn=partial(tokenize, tokenizer=self.tokenizer), + inputs=train_files, + output_dir=str(self.data_path_train), + num_workers=num_workers, + chunk_bytes="50MB", + ) + if not Path(self.data_path_val).is_dir(): + optimize( + fn=partial(tokenize, tokenizer=self.tokenizer), + inputs=[val_files] if not isinstance(val_files, list) else val_files, + output_dir=str(self.data_path_val), + num_workers=1, # there's only 1 file + chunk_bytes="50MB", + ) + + def train_dataloader(self) -> DataLoader: + from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader + + train_dataset = StreamingDataset( + input_dir=str(self.data_path_train), + item_loader=TokensLoader(block_size=self.max_seq_length), + shuffle=True, + drop_last=True, + ) + if self.num_workers is None: + num_workers = get_half_workers() + + train_dataloader = StreamingDataLoader( + train_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=num_workers, drop_last=True + ) + return train_dataloader + + def val_dataloader(self) -> DataLoader: + from litdata.streaming import StreamingDataset, TokensLoader + + if self.num_workers is None: + num_workers = get_half_workers() + + val_dataset = StreamingDataset( + input_dir=str(self.data_path_val), + item_loader=TokensLoader(block_size=self.max_seq_length), + shuffle=True, + # Consider setting to False, but we would lose some samples due to truncation when world size > 1 + drop_last=True, + ) + val_dataloader = DataLoader( + val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=num_workers, drop_last=True + ) + return val_dataloader + + +def tokenize(filename: str, tokenizer: Tokenizer): + if tokenizer is None: + raise ValueError( + "Tokenizer is None. 
If you are using this data module via `litgpt pretrain`, " + "please provide a valid `--tokenizer_dir` path." + ) + with open(filename, "r", encoding="utf-8") as file: + text = file.read() + text = text.strip() + + chunks = [] + total_length = len(text) + num_chunks = 10 + chunk_size = total_length // num_chunks + for i in range(num_chunks): + start_index = i * chunk_size + end_index = (i + 1) * chunk_size if i < 9 else total_length + chunks.append(text[start_index:end_index]) + + + global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"]) + num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"]) + local_rank = global_rank % num_workers + for example in tqdm(chunks, position=local_rank): + tokens = tokenizer.encode(example.strip(), bos=True, eos=False) # encode the text, use BOS + yield tokens + diff --git a/litgpt/data/tinystories.py b/litgpt/data/tinystories.py index 90fce42341..95c3007fec 100644 --- a/litgpt/data/tinystories.py +++ b/litgpt/data/tinystories.py @@ -103,6 +103,11 @@ def val_dataloader(self) -> DataLoader: def tokenize(filename: str, tokenizer: Tokenizer): + if tokenizer is None: + raise ValueError( + "Tokenizer is None. If you are using this data module via `litgpt pretrain`, " + "please provide a valid `--tokenizer_dir` path." + ) with open(filename, "r") as f: data = json.load(f) global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"]) diff --git a/tutorials/pretrain.md b/tutorials/pretrain.md index 4a8db678e1..358b3243ea 100644 --- a/tutorials/pretrain.md +++ b/tutorials/pretrain.md @@ -4,7 +4,7 @@ This document explains how to pretrain LLMs using LitGPT.   
-## The Pretraining API +## The `pretrain` API You can pretrain models in LitGPT using the `litgpt pretrain` API starting with any of the available architectures listed by calling `litgpt pretrain` without any additional arguments: @@ -36,8 +36,65 @@ litgpt pretrain \ ``` +  +## Pretrain on custom data + +The simplest way to get started with pretraining on custom data is by using the `TextFiles` data module, which lets you pretrain on a dataset from a folder containing plain text files. + +For instance, assume you stored a number of text files in a `custom_pretraining_data` folder (we recommend avoiding small files and concatenating them into files of at least 50 MB for efficiency): + +```bash +~ ls -lh custom_pretraining_data +total 3225M +-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_1.txt +-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_2.txt +-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_3.txt +-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_4.txt +-rw-r--r-- 1 sebastian 50M Apr 2 18:31 combined_5.txt +... +``` + +You can then use the `TextFiles` API to pretrain a model (here a small `pythia-14m` model for illustration purposes) from scratch as follows: + +```bash +litgpt download \ + --repo_id EleutherAI/pythia-14m \ + --tokenizer_only true + +litgpt pretrain \ + --model_name pythia-14m \ + --data TextFiles \ + --tokenizer_dir checkpoints/EleutherAI/pythia-14m \ + --data.train_data_path custom_pretraining_data \ + --train.learning_rate 0.005 \ + --train.lr_warmup_steps=200 +``` +  +## Continued pretraining on custom data + +Often, it makes sense to adopt an existing pretrained model and further pretrain it on our own custom data. The existing pretrained model can be either our own pretrained model or a model downloaded from a model hub. 
+For instance, let's assume we download a Pythia model: + +```bash +litgpt download --repo_id EleutherAI/pythia-14m +``` + +Next, assume we have a custom dataset stored in text files similar to the one in the *Pretrain on custom data* section above. We can further pretrain the Pythia model via the `--initial_checkpoint_dir` setting as follows: + +```bash +litgpt pretrain \ + --model_name pythia-14m \ + --initial_checkpoint_dir checkpoints/EleutherAI/pythia-14m \ + --out_dir new_phi-2_checkpoint \ + --data TextFiles \ + --data.train_data_path custom_pretraining_data \ + --train.learning_rate 0.005 \ + --train.lr_warmup_steps=200 +``` +   ## Pretrain a 1.1B TinyLlama model