Test split -> Val split (#1065)

rasbt authored and awaelchli committed Mar 15, 2024
1 parent a19c9a0 commit a5ecf5d
Showing 19 changed files with 41 additions and 41 deletions.
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/full.yaml
@@ -6,7 +6,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/lora.yaml
@@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
2 changes: 1 addition & 1 deletion config_hub/finetune/tiny-llama/lora.yaml
@@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
6 changes: 3 additions & 3 deletions litgpt/data/alpaca.py
@@ -22,8 +22,8 @@ class Alpaca(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.03865  # to get exactly 2000 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.03865  # to get exactly 2000 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -70,7 +70,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
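
Note: the fractional random_split pattern in this hunk is worth seeing standalone. Below is a minimal sketch, assuming only PyTorch >= 1.13 (the first release that accepts fractional lengths); the toy list and the 0.1 fraction are illustrative, not values from this commit.

import torch
from torch.utils.data import random_split

data = list(range(1000))  # toy stand-in for the parsed instruction records
val_split_fraction = 0.1  # illustrative fraction, not the module default

# Fractions summing to 1.0 are turned into subset sizes internally; seeding
# the generator makes the train/val partition reproducible across runs.
train_data, val_data = random_split(
    data,
    [1.0 - val_split_fraction, val_split_fraction],
    generator=torch.Generator().manual_seed(42),
)
print(len(train_data), len(val_data))  # 900 100
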
6 changes: 3 additions & 3 deletions litgpt/data/alpaca_2k.py
@@ -11,8 +11,8 @@
class Alpaca2k(Alpaca):
"""Alpaca2k data module for supervised finetuning."""

-test_split_fraction: float = 0.05  # to get exactly 100 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.05  # to get exactly 100 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpaca2k")
"""The directory in which the downloaded datasetgets saved."""
repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
@@ -30,7 +30,7 @@ def setup(self, stage: str = "") -> None:

dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)

-train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
+train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed)
train_data = train_validation_split["train"]
test_data = train_validation_split["test"]

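
Note: Hugging Face's train_test_split keeps calling the held-out portion "test" even when it serves as validation data, which is why the code above still indexes ["test"]. A minimal sketch with an in-memory dataset (nothing is downloaded; the column name is made up):

from datasets import Dataset

ds = Dataset.from_dict({"instruction": [f"q{i}" for i in range(2000)]})

# `test_size` receives the *validation* fraction; the returned DatasetDict
# nevertheless labels the held-out split "test".
split = ds.train_test_split(test_size=0.05, seed=42)
train_data = split["train"]  # 1900 rows
val_data = split["test"]     # 100 rows
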
4 changes: 2 additions & 2 deletions litgpt/data/alpaca_gpt4.py
@@ -13,8 +13,8 @@
class AlpacaGPT4(Alpaca):
"""AlpacaGPT4 data module for supervised finetuning."""

-test_split_fraction: float = 0.03847  # to get exactly 2000 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.03847  # to get exactly 2000 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpacagpt4")
"""The directory in which the downloaded datasetgets saved."""
file_url: str = field(repr=False, default=_URL)
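
Note: the odd-looking default of 0.03847 is reverse-engineered from the dataset size. Assuming AlpacaGPT4 contains 52,002 records (the count of the original Alpaca release; this number is an assumption, not stated in the diff), the fraction comes out to the "exactly 2000" validation samples the comment promises:

n_records = 52002  # assumed dataset size, not taken from this commit
print(int(n_records * 0.03847))  # 2000
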
6 changes: 3 additions & 3 deletions litgpt/data/dolly.py
@@ -20,8 +20,8 @@ class Dolly(Alpaca):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.1
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.1
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -49,7 +49,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
10 changes: 5 additions & 5 deletions litgpt/data/json.py
@@ -23,7 +23,7 @@ class JSON(LitDataModule):
and can optionally have a key 'input' (see Alpaca)."""
mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: Optional[float] = None
+val_split_fraction: Optional[float] = None
"""The fraction of the dataset to use for the validation dataset. The rest is used for training.
Only applies if you passed in a single file to `json_path`."""
prompt_style: Union[str, PromptStyle] = "alpaca"
@@ -39,13 +39,13 @@ class JSON(LitDataModule):
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
-test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
+val_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

def __post_init__(self):
-if self.json_path.is_dir() and self.test_split_fraction is not None:
+if self.json_path.is_dir() and self.val_split_fraction is not None:
raise ValueError(
"If `json_path` is a directory, it must contain 'train.json' and 'val.json' files and"
f" hence `test_split_fraction` should not be set. Got `{self.test_split_fraction=}`."
f" hence `val_split_fraction` should not be set. Got `{self.val_split_fraction=}`."
)
if not self.json_path.exists():
raise FileNotFoundError(
@@ -112,7 +112,7 @@ def get_splits(self) -> Tuple:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
return train_data, test_data
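
Note: the __post_init__ check above gives the JSON module two mutually exclusive modes. A minimal usage sketch — the import path and file paths are assumptions for illustration, and the paths must point at existing files:

from pathlib import Path

from litgpt.data import JSON  # assumed import path

# Single-file mode: the file is split in-memory, with `val_split_fraction`
# deciding how much is held out for validation (seeded via the `seed` field).
data = JSON(json_path=Path("data/mydata.json"), val_split_fraction=0.1)

# Directory mode: expects train.json and val.json inside the directory;
# passing `val_split_fraction` here raises the ValueError shown above.
data = JSON(json_path=Path("data/finetune_dir"))
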
6 changes: 3 additions & 3 deletions litgpt/data/lima.py
@@ -19,8 +19,8 @@ class LIMA(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.1
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.1
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -77,7 +77,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
6 changes: 3 additions & 3 deletions litgpt/data/openwebtext.py
@@ -18,8 +18,8 @@ class OpenWebText(LitDataModule):
data_path: Union[str, Path] = Path("data/openwebtext")
"""The path to the data directory, containing two folders 'train' and 'val'
which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://)."""
-test_split_fraction: float = 0.0005
-"""The fraction of data that should be put aside for validation/testing."""
+val_split_fraction: float = 0.0005
+"""The fraction of data that should be put aside for validation."""
seed: int = 42
"""The seed to use for shuffling the training data."""
num_workers: int = 8
@@ -59,7 +59,7 @@ def prepare_data(self) -> None:
dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True)

# Split the data in training and validation
-split_dataset = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed, shuffle=True)
+split_dataset = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed, shuffle=True)
split_dataset["val"] = split_dataset.pop("test") # rename the test split to val

def tokenize(data: Dataset, index: int):
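
Note: because train_test_split always names the held-out split "test", the hunk above renames the key afterwards. DatasetDict behaves like a plain dict, so pop works — a standalone sketch with an in-memory stand-in for OpenWebText:

from datasets import Dataset

split_dataset = Dataset.from_dict({"text": ["doc"] * 10_000}).train_test_split(
    test_size=0.0005, seed=42, shuffle=True
)
split_dataset["val"] = split_dataset.pop("test")  # rename the test split to val
print({k: len(v) for k, v in split_dataset.items()})  # {'train': 9995, 'val': 5}
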
2 changes: 1 addition & 1 deletion tests/data/test_alpaca.py
@@ -5,7 +5,7 @@ def test_alpaca(mock_tokenizer, alpaca_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Alpaca(
-test_split_fraction=0.5,
+val_split_fraction=0.5,
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
num_workers=0,
2 changes: 1 addition & 1 deletion tests/data/test_dolly.py
@@ -5,7 +5,7 @@ def test_dolly(mock_tokenizer, dolly_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Dolly(
-test_split_fraction=0.5,
+val_split_fraction=0.5,
download_dir=dolly_path.parent,
file_name=dolly_path.name,
num_workers=0,
6 changes: 3 additions & 3 deletions tests/data/test_json.py
@@ -24,7 +24,7 @@ def apply(self, prompt, **kwargs):
with open(json_path, "w", encoding="utf-8") as fp:
json.dump(mock_data, fp)

-data = JSON(json_path, test_split_fraction=0.5, prompt_style=Style(), num_workers=0)
+data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0)
data.connect(tokenizer=mock_tokenizer, batch_size=2)
data.prepare_data() # does nothing
data.setup()
@@ -61,8 +61,8 @@ def test_json_input_validation(tmp_path):
with pytest.raises(FileNotFoundError, match="The `json_path` must be a file or a directory"):
JSON(tmp_path / "not exist")

-with pytest.raises(ValueError, match="`test_split_fraction` should not be set"):
-    JSON(tmp_path, test_split_fraction=0.5)
+with pytest.raises(ValueError, match="`val_split_fraction` should not be set"):
+    JSON(tmp_path, val_split_fraction=0.5)

data = JSON(tmp_path)
data.prepare_data() # does nothing
4 changes: 2 additions & 2 deletions tests/test_adapter.py
@@ -73,7 +73,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path)
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -173,7 +173,7 @@ def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0,
),
precision="16-true",
4 changes: 2 additions & 2 deletions tests/test_adapter_v2.py
@@ -96,7 +96,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -262,7 +262,7 @@ def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alp
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
precision="16-true",
2 changes: 1 addition & 1 deletion tests/test_full.py
@@ -30,7 +30,7 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
4 changes: 2 additions & 2 deletions tests/test_lora.py
@@ -203,7 +203,7 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -631,7 +631,7 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0,
),
checkpoint_dir=fake_checkpoint_dir,
4 changes: 2 additions & 2 deletions tutorials/prepare_dataset.md
@@ -345,7 +345,7 @@ python litgpt/finetune/lora.py \

You can also customize how the dataset is read by using these additional parameters

-- `test_split_fraction`: The fraction of the data to split. Defaults to `0.1`
+- `val_split_fraction`: The fraction of the data that is held out for validation. Defaults to `0.1`

- `seed`: The seed value to reproduce the same random splits for train and test data.

@@ -359,7 +359,7 @@ To use the settings described above, you can add the respective command line arg
python litgpt/finetune/lora.py \
--data JSON \
--data.json_path path/to/your/data.json \
---data.test_split_fraction 0.1 \
+--data.val_split_fraction 0.1 \
--data.seed 42 \
--data.mask_inputs False \
--data.ignore_index -1 \
4 changes: 2 additions & 2 deletions xla/scripts/prepare_alpaca.py
@@ -23,7 +23,7 @@
def prepare(
destination_path: Path = Path("data/alpaca"),
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
-test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples
+val_split_fraction: float = 0.03865,  # to get exactly 2000 validation samples
seed: int = 42,
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
@@ -53,7 +53,7 @@ def prepare(

# Partition the dataset into train and test
train_set, test_set = random_split(
-data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed)
+data, [1.0 - val_split_fraction, val_split_fraction], generator=torch.Generator().manual_seed(seed)
)
train_set, test_set = list(train_set), list(test_set)

