diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml index eb7cb8ede2..3f7cdfbd8a 100644 --- a/config_hub/finetune/llama-2-7b/full.yaml +++ b/config_hub/finetune/llama-2-7b/full.yaml @@ -6,7 +6,7 @@ data: class_path: litgpt.data.AlpacaGPT4 init_args: mask_prompt: false - test_split_fraction: 0.03847 + val_split_fraction: 0.03847 prompt_style: "alpaca" ignore_index: -1 seed: 42 diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml index 54fedd574b..486929d8f3 100644 --- a/config_hub/finetune/llama-2-7b/lora.yaml +++ b/config_hub/finetune/llama-2-7b/lora.yaml @@ -15,7 +15,7 @@ data: class_path: litgpt.data.AlpacaGPT4 init_args: mask_prompt: false - test_split_fraction: 0.03847 + val_split_fraction: 0.03847 prompt_style: "alpaca" ignore_index: -1 seed: 42 diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml index a59d2ded6e..af7d41f07e 100644 --- a/config_hub/finetune/tiny-llama/lora.yaml +++ b/config_hub/finetune/tiny-llama/lora.yaml @@ -15,7 +15,7 @@ data: class_path: litgpt.data.AlpacaGPT4 init_args: mask_prompt: false - test_split_fraction: 0.03847 + val_split_fraction: 0.03847 prompt_style: "alpaca" ignore_index: -1 seed: 42 diff --git a/litgpt/data/alpaca.py b/litgpt/data/alpaca.py index 3b1d830d74..70d9c3d2c4 100644 --- a/litgpt/data/alpaca.py +++ b/litgpt/data/alpaca.py @@ -22,8 +22,8 @@ class Alpaca(LitDataModule): mask_prompt: bool = False """Whether to mask the prompt section from the label (with ``ignore_index``).""" - test_split_fraction: float = 0.03865 # to get exactly 2000 test samples, - """The fraction of the dataset to use for the test/validation dataset. The rest is used for training.""" + val_split_fraction: float = 0.03865 # to get exactly 2000 validation samples, + """The fraction of the dataset to use for the validation dataset. 
The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" ignore_index: int = -1 @@ -70,7 +70,7 @@ def setup(self, stage: str = "") -> None: # Partition the dataset into train and test train_data, test_data = random_split( data, - [1.0 - self.test_split_fraction, self.test_split_fraction], + [1.0 - self.val_split_fraction, self.val_split_fraction], generator=torch.Generator().manual_seed(self.seed) ) train_data, test_data = list(train_data), list(test_data) diff --git a/litgpt/data/alpaca_2k.py b/litgpt/data/alpaca_2k.py index 8c29d7266a..bb9ea8f8ec 100644 --- a/litgpt/data/alpaca_2k.py +++ b/litgpt/data/alpaca_2k.py @@ -11,8 +11,8 @@ class Alpaca2k(Alpaca): """Alpaca2k data module for supervised finetuning.""" - test_split_fraction: float = 0.05 # to get exactly 100 test samples, - """The fraction of the dataset to use for the test/validation dataset. The rest is used for training.""" + val_split_fraction: float = 0.05 # to get exactly 100 validation samples, + """The fraction of the dataset to use for the validation dataset. 
The rest is used for training.""" download_dir: Path = Path("./data/alpaca2k") """The directory in which the downloaded datasetgets saved.""" repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test") @@ -30,7 +30,7 @@ def setup(self, stage: str = "") -> None: dataset = load_dataset(self.repo_id, cache_dir=self.download_dir) - train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed) + train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed) train_data = train_validation_split["train"] test_data = train_validation_split["test"] diff --git a/litgpt/data/alpaca_gpt4.py b/litgpt/data/alpaca_gpt4.py index 6a7cdc140b..9a66193a2e 100644 --- a/litgpt/data/alpaca_gpt4.py +++ b/litgpt/data/alpaca_gpt4.py @@ -13,8 +13,8 @@ class AlpacaGPT4(Alpaca): """AlpacaGPT4 data module for supervised finetuning.""" - test_split_fraction: float = 0.03847 # to get exactly 2000 test samples, - """The fraction of the dataset to use for the test/validation dataset. The rest is used for training.""" + val_split_fraction: float = 0.03847 # to get exactly 2000 validation samples, + """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" download_dir: Path = Path("./data/alpacagpt4") """The directory in which the downloaded datasetgets saved.""" file_url: str = field(repr=False, default=_URL) diff --git a/litgpt/data/dolly.py b/litgpt/data/dolly.py index 0a7af16440..9824e8d13c 100644 --- a/litgpt/data/dolly.py +++ b/litgpt/data/dolly.py @@ -20,8 +20,8 @@ class Dolly(Alpaca): mask_prompt: bool = False """Whether to mask the prompt section from the label (with ``ignore_index``).""" - test_split_fraction: float = 0.1 - """The fraction of the dataset to use for the test/validation dataset. The rest is used for training.""" + val_split_fraction: float = 0.1 + """The fraction of the dataset to use for the validation dataset. 
The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" ignore_index: int = -1 @@ -49,7 +49,7 @@ def setup(self, stage: str = "") -> None: # Partition the dataset into train and test train_data, test_data = random_split( data, - [1.0 - self.test_split_fraction, self.test_split_fraction], + [1.0 - self.val_split_fraction, self.val_split_fraction], generator=torch.Generator().manual_seed(self.seed) ) train_data, test_data = list(train_data), list(test_data) diff --git a/litgpt/data/json.py b/litgpt/data/json.py index 20d227563b..3115d4639e 100644 --- a/litgpt/data/json.py +++ b/litgpt/data/json.py @@ -23,7 +23,7 @@ class JSON(LitDataModule): and can optionally have a key 'input' (see Alpaca).""" mask_prompt: bool = False """Whether to mask the prompt section from the label (with ``ignore_index``).""" - test_split_fraction: Optional[float] = None + val_split_fraction: Optional[float] = None """The fraction of the dataset to use for the validation dataset. The rest is used for training. Only applies if you passed in a single file to `json_path`.""" prompt_style: Union[str, PromptStyle] = "alpaca" @@ -39,13 +39,13 @@ class JSON(LitDataModule): batch_size: int = field(default=1, init=False, repr=False) max_seq_length: int = field(default=-1, init=False, repr=False) train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) - test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) + val_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) def __post_init__(self): - if self.json_path.is_dir() and self.test_split_fraction is not None: + if self.json_path.is_dir() and self.val_split_fraction is not None: raise ValueError( "If `json_path` is a directory, it must contain 'train.json' and 'val.json' files and" - f" hence `test_split_fraction` should not be set. 
Got `{self.test_split_fraction=}`." + f" hence `val_split_fraction` should not be set. Got `{self.val_split_fraction=}`." ) if not self.json_path.exists(): raise FileNotFoundError( @@ -112,7 +112,7 @@ def get_splits(self) -> Tuple: # Partition the dataset into train and test train_data, test_data = random_split( data, - [1.0 - self.test_split_fraction, self.test_split_fraction], + [1.0 - self.val_split_fraction, self.val_split_fraction], generator=torch.Generator().manual_seed(self.seed) ) return train_data, test_data diff --git a/litgpt/data/lima.py b/litgpt/data/lima.py index b059c1bc63..263b80dd15 100644 --- a/litgpt/data/lima.py +++ b/litgpt/data/lima.py @@ -19,8 +19,8 @@ class LIMA(LitDataModule): mask_prompt: bool = False """Whether to mask the prompt section from the label (with ``ignore_index``).""" - test_split_fraction: float = 0.1 - """The fraction of the dataset to use for the test/validation dataset. The rest is used for training.""" + val_split_fraction: float = 0.1 + """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. 
See `litgpt.prompts` for a list of available styles.""" ignore_index: int = -1 @@ -77,7 +77,7 @@ def setup(self, stage: str = "") -> None: # Partition the dataset into train and test train_data, test_data = random_split( data, - [1.0 - self.test_split_fraction, self.test_split_fraction], + [1.0 - self.val_split_fraction, self.val_split_fraction], generator=torch.Generator().manual_seed(self.seed) ) train_data, test_data = list(train_data), list(test_data) diff --git a/litgpt/data/openwebtext.py b/litgpt/data/openwebtext.py index 2ac408718e..d50e1f84d5 100644 --- a/litgpt/data/openwebtext.py +++ b/litgpt/data/openwebtext.py @@ -18,8 +18,8 @@ class OpenWebText(LitDataModule): data_path: Union[str, Path] = Path("data/openwebtext") """The path to the data directory, containing two folders 'train' and 'val' which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://).""" - test_split_fraction: float = 0.0005 - """The fraction of data that should be put aside for validation/testing.""" + val_split_fraction: float = 0.0005 + """The fraction of data that should be put aside for validation.""" seed: int = 42 """The seed to use for shuffling the training data.""" num_workers: int = 8 @@ -59,7 +59,7 @@ def prepare_data(self) -> None: dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True) # Split the data in training and validation - split_dataset = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed, shuffle=True) + split_dataset = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed, shuffle=True) split_dataset["val"] = split_dataset.pop("test") # rename the test split to val def tokenize(data: Dataset, index: int): diff --git a/tests/data/test_alpaca.py b/tests/data/test_alpaca.py index 9097c7190c..a82364c9db 100644 --- a/tests/data/test_alpaca.py +++ b/tests/data/test_alpaca.py @@ -5,7 +5,7 @@ def test_alpaca(mock_tokenizer, 
alpaca_path): from litgpt.prompts import Alpaca as AlpacaPromptStyle alpaca = Alpaca( - test_split_fraction=0.5, + val_split_fraction=0.5, download_dir=alpaca_path.parent, file_name=alpaca_path.name, num_workers=0, diff --git a/tests/data/test_dolly.py b/tests/data/test_dolly.py index 76540ef858..25f9c9879e 100644 --- a/tests/data/test_dolly.py +++ b/tests/data/test_dolly.py @@ -5,7 +5,7 @@ def test_dolly(mock_tokenizer, dolly_path): from litgpt.prompts import Alpaca as AlpacaPromptStyle alpaca = Dolly( - test_split_fraction=0.5, + val_split_fraction=0.5, download_dir=dolly_path.parent, file_name=dolly_path.name, num_workers=0, diff --git a/tests/data/test_json.py b/tests/data/test_json.py index bf1356a3c4..8e180b4802 100644 --- a/tests/data/test_json.py +++ b/tests/data/test_json.py @@ -24,7 +24,7 @@ def apply(self, prompt, **kwargs): with open(json_path, "w", encoding="utf-8") as fp: json.dump(mock_data, fp) - data = JSON(json_path, test_split_fraction=0.5, prompt_style=Style(), num_workers=0) + data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0) data.connect(tokenizer=mock_tokenizer, batch_size=2) data.prepare_data() # does nothing data.setup() @@ -61,8 +61,8 @@ def test_json_input_validation(tmp_path): with pytest.raises(FileNotFoundError, match="The `json_path` must be a file or a directory"): JSON(tmp_path / "not exist") - with pytest.raises(ValueError, match="`test_split_fraction` should not be set"): - JSON(tmp_path, test_split_fraction=0.5) + with pytest.raises(ValueError, match="`val_split_fraction` should not be set"): + JSON(tmp_path, val_split_fraction=0.5) data = JSON(tmp_path) data.prepare_data() # does nothing diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 5071233c4a..199bc38412 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -73,7 +73,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path) data=Alpaca( download_dir=alpaca_path.parent, 
file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0 ), checkpoint_dir=fake_checkpoint_dir, @@ -173,7 +173,7 @@ def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0, ), precision="16-true", diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py index 8bcc0bc9b5..485f4128b3 100644 --- a/tests/test_adapter_v2.py +++ b/tests/test_adapter_v2.py @@ -96,7 +96,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_pa data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0 ), checkpoint_dir=fake_checkpoint_dir, @@ -262,7 +262,7 @@ def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alp data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0 ), precision="16-true", diff --git a/tests/test_full.py b/tests/test_full.py index 8a6ceba1ae..b1f5ede9ed 100644 --- a/tests/test_full.py +++ b/tests/test_full.py @@ -30,7 +30,7 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0 ), checkpoint_dir=fake_checkpoint_dir, diff --git a/tests/test_lora.py b/tests/test_lora.py index c80f8d6208..c9cf3fc79e 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -203,7 +203,7 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0 ), checkpoint_dir=fake_checkpoint_dir, @@ -631,7 +631,7 @@ def 
test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa data=Alpaca( download_dir=alpaca_path.parent, file_name=alpaca_path.name, - test_split_fraction=0.5, + val_split_fraction=0.5, num_workers=0, ), checkpoint_dir=fake_checkpoint_dir, diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index d35b258372..a52410641d 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -345,7 +345,7 @@ python litgpt/finetune/lora.py \ You can also customize how the dataset is read by using these additional parameters -- `test_split_fraction`: The fraction of the data to split. Defaults to `0.1` +- `val_split_fraction`: The fraction of the data to hold out as the validation split. Defaults to `0.1` - `seed`: The seed value to reproduce the same random splits for train and test data. @@ -359,7 +359,7 @@ To use the settings described above, you can add the respective command line arg python litgpt/finetune/lora.py \ --data JSON \ --data.json_path path/to/your/data.json \ - --data.test_split_fraction 0.1 \ + --data.val_split_fraction 0.1 \ --data.seed 42 \ --data.mask_inputs False \ --data.ignore_index -1 \ diff --git a/xla/scripts/prepare_alpaca.py b/xla/scripts/prepare_alpaca.py index 61ca7bf3b5..ac395be5a8 100644 --- a/xla/scripts/prepare_alpaca.py +++ b/xla/scripts/prepare_alpaca.py @@ -23,7 +23,7 @@ def prepare( destination_path: Path = Path("data/alpaca"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + val_split_fraction: float = 0.03865, # to get exactly 2000 validation samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora data_file_name: str = "alpaca_data_cleaned_archive.json", @@ -53,7 +53,7 @@ def prepare( # Partition the dataset into train and test train_set, test_set = random_split( - data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + data, [1.0 - 
val_split_fraction, val_split_fraction], generator=torch.Generator().manual_seed(seed) ) train_set, test_set = list(train_set), list(test_set)