From 3dc3fb96e189849806c0f8c80130003833f08942 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Fri, 8 Mar 2024 16:23:36 +0000
Subject: [PATCH] Test split -> Val split

---
 config_hub/finetune/llama-2-7b/full.yaml |  2 +-
 config_hub/finetune/llama-2-7b/lora.yaml |  2 +-
 config_hub/finetune/tiny-llama/lora.yaml |  2 +-
 litgpt/data/alpaca.py                    |  6 +++---
 litgpt/data/alpaca_2k.py                 |  6 +++---
 litgpt/data/alpaca_gpt4.py               |  4 ++--
 litgpt/data/dolly.py                     |  6 +++---
 litgpt/data/json.py                      | 10 +++++-----
 litgpt/data/lima.py                      |  6 +++---
 litgpt/data/openwebtext.py               |  6 +++---
 tests/data/test_alpaca.py                |  2 +-
 tests/data/test_dolly.py                 |  2 +-
 tests/data/test_json.py                  |  6 +++---
 tests/test_adapter.py                    |  4 ++--
 tests/test_adapter_v2.py                 |  4 ++--
 tests/test_full.py                       |  2 +-
 tests/test_lora.py                       |  4 ++--
 tutorials/prepare_dataset.md             |  4 ++--
 xla/scripts/prepare_alpaca.py            |  4 ++--
 19 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml
index eb7cb8ede2..3f7cdfbd8a 100644
--- a/config_hub/finetune/llama-2-7b/full.yaml
+++ b/config_hub/finetune/llama-2-7b/full.yaml
@@ -6,7 +6,7 @@ data:
   class_path: litgpt.data.AlpacaGPT4
   init_args:
     mask_prompt: false
-    test_split_fraction: 0.03847
+    val_split_fraction: 0.03847
     prompt_style: "alpaca"
     ignore_index: -1
     seed: 42
diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml
index 54fedd574b..486929d8f3 100644
--- a/config_hub/finetune/llama-2-7b/lora.yaml
+++ b/config_hub/finetune/llama-2-7b/lora.yaml
@@ -15,7 +15,7 @@ data:
   class_path: litgpt.data.AlpacaGPT4
   init_args:
     mask_prompt: false
-    test_split_fraction: 0.03847
+    val_split_fraction: 0.03847
     prompt_style: "alpaca"
     ignore_index: -1
     seed: 42
diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml
index a59d2ded6e..af7d41f07e 100644
--- a/config_hub/finetune/tiny-llama/lora.yaml
+++ b/config_hub/finetune/tiny-llama/lora.yaml
@@ -15,7 +15,7 @@ data:
   class_path: litgpt.data.AlpacaGPT4
   init_args:
     mask_prompt: false
-    test_split_fraction: 0.03847
+    val_split_fraction: 0.03847
     prompt_style: "alpaca"
     ignore_index: -1
     seed: 42
diff --git a/litgpt/data/alpaca.py b/litgpt/data/alpaca.py
index 3b1d830d74..70d9c3d2c4 100644
--- a/litgpt/data/alpaca.py
+++ b/litgpt/data/alpaca.py
@@ -22,8 +22,8 @@ class Alpaca(LitDataModule):
 
     mask_prompt: bool = False
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
-    test_split_fraction: float = 0.03865  # to get exactly 2000 test samples,
-    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+    val_split_fraction: float = 0.03865  # to get exactly 2000 validation samples,
+    """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
     ignore_index: int = -1
@@ -70,7 +70,7 @@ def setup(self, stage: str = "") -> None:
         # Partition the dataset into train and test
         train_data, test_data = random_split(
             data,
-            [1.0 - self.test_split_fraction, self.test_split_fraction],
+            [1.0 - self.val_split_fraction, self.val_split_fraction],
             generator=torch.Generator().manual_seed(self.seed)
         )
         train_data, test_data = list(train_data), list(test_data)
diff --git a/litgpt/data/alpaca_2k.py b/litgpt/data/alpaca_2k.py
index 8c29d7266a..bb9ea8f8ec 100644
--- a/litgpt/data/alpaca_2k.py
+++ b/litgpt/data/alpaca_2k.py
@@ -11,8 +11,8 @@ class Alpaca2k(Alpaca):
     """Alpaca2k data module for supervised finetuning."""
 
-    test_split_fraction: float = 0.05  # to get exactly 100 test samples,
-    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+    val_split_fraction: float = 0.05  # to get exactly 100 validation samples,
+    """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     download_dir: Path = Path("./data/alpaca2k")
     """The directory in which the downloaded datasetgets saved."""
     repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
 
@@ -30,7 +30,7 @@ def setup(self, stage: str = "") -> None:
 
         dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)
 
-        train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
+        train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed)
         train_data = train_validation_split["train"]
         test_data = train_validation_split["test"]
 
diff --git a/litgpt/data/alpaca_gpt4.py b/litgpt/data/alpaca_gpt4.py
index 6a7cdc140b..9a66193a2e 100644
--- a/litgpt/data/alpaca_gpt4.py
+++ b/litgpt/data/alpaca_gpt4.py
@@ -13,8 +13,8 @@ class AlpacaGPT4(Alpaca):
     """AlpacaGPT4 data module for supervised finetuning."""
 
-    test_split_fraction: float = 0.03847  # to get exactly 2000 test samples,
-    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+    val_split_fraction: float = 0.03847  # to get exactly 2000 validation samples,
+    """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     download_dir: Path = Path("./data/alpacagpt4")
     """The directory in which the downloaded datasetgets saved."""
     file_url: str = field(repr=False, default=_URL)
 
diff --git a/litgpt/data/dolly.py b/litgpt/data/dolly.py
index 0a7af16440..9824e8d13c 100644
--- a/litgpt/data/dolly.py
+++ b/litgpt/data/dolly.py
@@ -20,8 +20,8 @@ class Dolly(Alpaca):
 
     mask_prompt: bool = False
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
-    test_split_fraction: float = 0.1
-    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+    val_split_fraction: float = 0.1
+    """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
     ignore_index: int = -1
@@ -49,7 +49,7 @@ def setup(self, stage: str = "") -> None:
         # Partition the dataset into train and test
         train_data, test_data = random_split(
             data,
-            [1.0 - self.test_split_fraction, self.test_split_fraction],
+            [1.0 - self.val_split_fraction, self.val_split_fraction],
             generator=torch.Generator().manual_seed(self.seed)
         )
         train_data, test_data = list(train_data), list(test_data)
diff --git a/litgpt/data/json.py b/litgpt/data/json.py
index 20d227563b..3115d4639e 100644
--- a/litgpt/data/json.py
+++ b/litgpt/data/json.py
@@ -23,7 +23,7 @@ class JSON(LitDataModule):
     and can optionally have a key 'input' (see Alpaca)."""
     mask_prompt: bool = False
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
-    test_split_fraction: Optional[float] = None
+    val_split_fraction: Optional[float] = None
     """The fraction of the dataset to use for the validation dataset. The rest is used for training.
     Only applies if you passed in a single file to `json_path`."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
@@ -39,13 +39,13 @@ class JSON(LitDataModule):
     batch_size: int = field(default=1, init=False, repr=False)
     max_seq_length: int = field(default=-1, init=False, repr=False)
     train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
-    test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
+    val_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
 
     def __post_init__(self):
-        if self.json_path.is_dir() and self.test_split_fraction is not None:
+        if self.json_path.is_dir() and self.val_split_fraction is not None:
             raise ValueError(
                 "If `json_path` is a directory, it must contain 'train.json' and 'val.json' files and"
-                f" hence `test_split_fraction` should not be set. Got `{self.test_split_fraction=}`."
+                f" hence `val_split_fraction` should not be set. Got `{self.val_split_fraction=}`."
             )
         if not self.json_path.exists():
             raise FileNotFoundError(
@@ -112,7 +112,7 @@ def get_splits(self) -> Tuple:
         # Partition the dataset into train and test
         train_data, test_data = random_split(
             data,
-            [1.0 - self.test_split_fraction, self.test_split_fraction],
+            [1.0 - self.val_split_fraction, self.val_split_fraction],
             generator=torch.Generator().manual_seed(self.seed)
         )
         return train_data, test_data
diff --git a/litgpt/data/lima.py b/litgpt/data/lima.py
index b059c1bc63..263b80dd15 100644
--- a/litgpt/data/lima.py
+++ b/litgpt/data/lima.py
@@ -19,8 +19,8 @@ class LIMA(LitDataModule):
 
     mask_prompt: bool = False
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
-    test_split_fraction: float = 0.1
-    """The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+    val_split_fraction: float = 0.1
+    """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
     ignore_index: int = -1
@@ -77,7 +77,7 @@ def setup(self, stage: str = "") -> None:
         # Partition the dataset into train and test
         train_data, test_data = random_split(
             data,
-            [1.0 - self.test_split_fraction, self.test_split_fraction],
+            [1.0 - self.val_split_fraction, self.val_split_fraction],
             generator=torch.Generator().manual_seed(self.seed)
         )
         train_data, test_data = list(train_data), list(test_data)
diff --git a/litgpt/data/openwebtext.py b/litgpt/data/openwebtext.py
index 2ac408718e..d50e1f84d5 100644
--- a/litgpt/data/openwebtext.py
+++ b/litgpt/data/openwebtext.py
@@ -18,8 +18,8 @@ class OpenWebText(LitDataModule):
     data_path: Union[str, Path] = Path("data/openwebtext")
     """The path to the data directory, containing two folders 'train' and 'val'
     which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://)."""
-    test_split_fraction: float = 0.0005
-    """The fraction of data that should be put aside for validation/testing."""
+    val_split_fraction: float = 0.0005
+    """The fraction of data that should be put aside for validation."""
     seed: int = 42
     """The seed to use for shuffling the training data."""
     num_workers: int = 8
@@ -59,7 +59,7 @@ def prepare_data(self) -> None:
         dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True)
 
         # Split the data in training and validation
-        split_dataset = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed, shuffle=True)
+        split_dataset = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed, shuffle=True)
         split_dataset["val"] = split_dataset.pop("test")  # rename the test split to val
 
         def tokenize(data: Dataset, index: int):
diff --git a/tests/data/test_alpaca.py b/tests/data/test_alpaca.py
index 9097c7190c..a82364c9db 100644
--- a/tests/data/test_alpaca.py
+++ b/tests/data/test_alpaca.py
@@ -5,7 +5,7 @@ def test_alpaca(mock_tokenizer, alpaca_path):
     from litgpt.prompts import Alpaca as AlpacaPromptStyle
 
     alpaca = Alpaca(
-        test_split_fraction=0.5,
+        val_split_fraction=0.5,
         download_dir=alpaca_path.parent,
         file_name=alpaca_path.name,
         num_workers=0,
diff --git a/tests/data/test_dolly.py b/tests/data/test_dolly.py
index 76540ef858..25f9c9879e 100644
--- a/tests/data/test_dolly.py
+++ b/tests/data/test_dolly.py
@@ -5,7 +5,7 @@ def test_dolly(mock_tokenizer, dolly_path):
     from litgpt.prompts import Alpaca as AlpacaPromptStyle
 
     alpaca = Dolly(
-        test_split_fraction=0.5,
+        val_split_fraction=0.5,
         download_dir=dolly_path.parent,
         file_name=dolly_path.name,
         num_workers=0,
diff --git a/tests/data/test_json.py b/tests/data/test_json.py
index bf1356a3c4..8e180b4802 100644
--- a/tests/data/test_json.py
+++ b/tests/data/test_json.py
@@ -24,7 +24,7 @@ def apply(self, prompt, **kwargs):
     with open(json_path, "w", encoding="utf-8") as fp:
         json.dump(mock_data, fp)
 
-    data = JSON(json_path, test_split_fraction=0.5, prompt_style=Style(), num_workers=0)
+    data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0)
     data.connect(tokenizer=mock_tokenizer, batch_size=2)
     data.prepare_data()  # does nothing
     data.setup()
@@ -61,8 +61,8 @@ def test_json_input_validation(tmp_path):
     with pytest.raises(FileNotFoundError, match="The `json_path` must be a file or a directory"):
         JSON(tmp_path / "not exist")
 
-    with pytest.raises(ValueError, match="`test_split_fraction` should not be set"):
-        JSON(tmp_path, test_split_fraction=0.5)
+    with pytest.raises(ValueError, match="`val_split_fraction` should not be set"):
+        JSON(tmp_path, val_split_fraction=0.5)
 
     data = JSON(tmp_path)
     data.prepare_data()  # does nothing
diff --git a/tests/test_adapter.py b/tests/test_adapter.py
index 5071233c4a..199bc38412 100644
--- a/tests/test_adapter.py
+++ b/tests/test_adapter.py
@@ -73,7 +73,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path)
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0
         ),
         checkpoint_dir=fake_checkpoint_dir,
@@ -173,7 +173,7 @@ def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0,
         ),
         precision="16-true",
diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py
index 8bcc0bc9b5..485f4128b3 100644
--- a/tests/test_adapter_v2.py
+++ b/tests/test_adapter_v2.py
@@ -96,7 +96,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_pa
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0
         ),
         checkpoint_dir=fake_checkpoint_dir,
@@ -262,7 +262,7 @@ def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alp
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0
         ),
         precision="16-true",
diff --git a/tests/test_full.py b/tests/test_full.py
index 8a6ceba1ae..b1f5ede9ed 100644
--- a/tests/test_full.py
+++ b/tests/test_full.py
@@ -30,7 +30,7 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0
         ),
         checkpoint_dir=fake_checkpoint_dir,
diff --git a/tests/test_lora.py b/tests/test_lora.py
index c80f8d6208..c9cf3fc79e 100644
--- a/tests/test_lora.py
+++ b/tests/test_lora.py
@@ -203,7 +203,7 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0
         ),
         checkpoint_dir=fake_checkpoint_dir,
@@ -631,7 +631,7 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa
         data=Alpaca(
             download_dir=alpaca_path.parent,
             file_name=alpaca_path.name,
-            test_split_fraction=0.5,
+            val_split_fraction=0.5,
             num_workers=0,
         ),
         checkpoint_dir=fake_checkpoint_dir,
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md
index d35b258372..a52410641d 100644
--- a/tutorials/prepare_dataset.md
+++ b/tutorials/prepare_dataset.md
@@ -345,7 +345,7 @@ python litgpt/finetune/lora.py \
 
 You can also customize how the dataset is read by using these additional parameters
 
-- `test_split_fraction`: The fraction of the data to split. Defaults to `0.1`
+- `val_split_fraction`: The fraction of the data to split. Defaults to `0.1`
 
 - `seed`: The seed value to reproduce the same random splits for train and test data.
 
@@ -359,7 +359,7 @@ To use the settings described above, you can add the respective command line arg
 python litgpt/finetune/lora.py \
   --data JSON \
   --data.json_path path/to/your/data.json \
-  --data.test_split_fraction 0.1 \
+  --data.val_split_fraction 0.1 \
   --data.seed 42 \
   --data.mask_inputs False \
   --data.ignore_index -1 \
diff --git a/xla/scripts/prepare_alpaca.py b/xla/scripts/prepare_alpaca.py
index 61ca7bf3b5..ac395be5a8 100644
--- a/xla/scripts/prepare_alpaca.py
+++ b/xla/scripts/prepare_alpaca.py
@@ -23,7 +23,7 @@ def prepare(
     destination_path: Path = Path("data/alpaca"),
     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
-    test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples,
+    val_split_fraction: float = 0.03865,  # to get exactly 2000 validation samples,
     seed: int = 42,
     mask_inputs: bool = False,  # as in alpaca-lora
     data_file_name: str = "alpaca_data_cleaned_archive.json",
@@ -53,7 +53,7 @@ def prepare(
 
     # Partition the dataset into train and test
     train_set, test_set = random_split(
-        data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed)
+        data, [1.0 - val_split_fraction, val_split_fraction], generator=torch.Generator().manual_seed(seed)
    )
     train_set, test_set = list(train_set), list(test_set)
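
For callers, the rename is a pure keyword change. A minimal sketch from a user's perspective (the `Alpaca` class and the `val_split_fraction`/`num_workers` arguments come from the diff above; the import path mirrors the `litgpt.data` class paths used in the configs and is otherwise an assumption):

```python
from litgpt.data import Alpaca

# The former `test_split_fraction` keyword is now `val_split_fraction`.
# 0.03865 is the default shown in litgpt/data/alpaca.py above: roughly
# 2000 validation samples out of Alpaca's ~52k instruction pairs.
data = Alpaca(val_split_fraction=0.03865, num_workers=0)
```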