Test split -> Val split (#1065)

rasbt authored and awaelchli committed Mar 15, 2024
1 parent a19c9a0 commit a5ecf5d
Showing 19 changed files with 41 additions and 41 deletions.
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/full.yaml
@@ -6,7 +6,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/lora.yaml
@@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
2 changes: 1 addition & 1 deletion config_hub/finetune/tiny-llama/lora.yaml
@@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
-test_split_fraction: 0.03847
+val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
6 changes: 3 additions & 3 deletions litgpt/data/alpaca.py
@@ -22,8 +22,8 @@ class Alpaca(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.03865  # to get exactly 2000 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.03865  # to get exactly 2000 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -70,7 +70,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
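
Note: the fractional random_split pattern in this hunk is worth seeing standalone. Below is a minimal sketch, assuming only PyTorch >= 1.13 (the first release that accepts fractional lengths); the toy list and the 0.1 fraction are illustrative, not values from this commit.

import torch
from torch.utils.data import random_split

data = list(range(1000))  # toy stand-in for the parsed instruction records
val_split_fraction = 0.1  # illustrative fraction, not the module default

# Fractions summing to 1.0 are turned into subset sizes internally; seeding
# the generator makes the train/val partition reproducible across runs.
train_data, val_data = random_split(
    data,
    [1.0 - val_split_fraction, val_split_fraction],
    generator=torch.Generator().manual_seed(42),
)
print(len(train_data), len(val_data))  # 900 100
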
6 changes: 3 additions & 3 deletions litgpt/data/alpaca_2k.py
@@ -11,8 +11,8 @@
class Alpaca2k(Alpaca):
"""Alpaca2k data module for supervised finetuning."""

-test_split_fraction: float = 0.05  # to get exactly 100 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.05  # to get exactly 100 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpaca2k")
"""The directory in which the downloaded datasetgets saved."""
repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
@@ -30,7 +30,7 @@ def setup(self, stage: str = "") -> None:

dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)

-train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
+train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed)
train_data = train_validation_split["train"]
test_data = train_validation_split["test"]

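
Note: Hugging Face's train_test_split keeps calling the held-out portion "test" even when it serves as validation data, which is why the code above still indexes ["test"]. A minimal sketch with an in-memory dataset (nothing is downloaded; the column name is made up):

from datasets import Dataset

ds = Dataset.from_dict({"instruction": [f"q{i}" for i in range(2000)]})

# `test_size` receives the *validation* fraction; the returned DatasetDict
# nevertheless labels the held-out split "test".
split = ds.train_test_split(test_size=0.05, seed=42)
train_data = split["train"]  # 1900 rows
val_data = split["test"]     # 100 rows
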
4 changes: 2 additions & 2 deletions litgpt/data/alpaca_gpt4.py
@@ -13,8 +13,8 @@
class AlpacaGPT4(Alpaca):
"""AlpacaGPT4 data module for supervised finetuning."""

-test_split_fraction: float = 0.03847  # to get exactly 2000 test samples
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.03847  # to get exactly 2000 validation samples
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpacagpt4")
"""The directory in which the downloaded datasetgets saved."""
file_url: str = field(repr=False, default=_URL)
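
Note: the odd-looking default of 0.03847 is reverse-engineered from the dataset size. Assuming AlpacaGPT4 contains 52,002 records (the count of the original Alpaca release; this number is an assumption, not stated in the diff), the fraction comes out to the "exactly 2000" validation samples the comment promises:

n_records = 52002  # assumed dataset size, not taken from this commit
print(int(n_records * 0.03847))  # 2000
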
6 changes: 3 additions & 3 deletions litgpt/data/dolly.py
@@ -20,8 +20,8 @@ class Dolly(Alpaca):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.1
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.1
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -49,7 +49,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
10 changes: 5 additions & 5 deletions litgpt/data/json.py
@@ -23,7 +23,7 @@ class JSON(LitDataModule):
and can optionally have a key 'input' (see Alpaca)."""
mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: Optional[float] = None
+val_split_fraction: Optional[float] = None
"""The fraction of the dataset to use for the validation dataset. The rest is used for training.
Only applies if you passed in a single file to `json_path`."""
prompt_style: Union[str, PromptStyle] = "alpaca"
@@ -39,13 +39,13 @@ class JSON(LitDataModule):
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
-test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
+val_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

def __post_init__(self):
-if self.json_path.is_dir() and self.test_split_fraction is not None:
+if self.json_path.is_dir() and self.val_split_fraction is not None:
raise ValueError(
"If `json_path` is a directory, it must contain 'train.json' and 'val.json' files and"
f" hence `test_split_fraction` should not be set. Got `{self.test_split_fraction=}`."
f" hence `val_split_fraction` should not be set. Got `{self.val_split_fraction=}`."
)
if not self.json_path.exists():
raise FileNotFoundError(
@@ -112,7 +112,7 @@ def get_splits(self) -> Tuple:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
return train_data, test_data
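
Note: the __post_init__ check above gives the JSON module two mutually exclusive modes. A minimal usage sketch — the import path and file paths are assumptions for illustration, and the paths must point at existing files:

from pathlib import Path

from litgpt.data import JSON  # assumed import path

# Single-file mode: the file is split in-memory, with `val_split_fraction`
# deciding how much is held out for validation (seeded via the `seed` field).
data = JSON(json_path=Path("data/mydata.json"), val_split_fraction=0.1)

# Directory mode: expects train.json and val.json inside the directory;
# passing `val_split_fraction` here raises the ValueError shown above.
data = JSON(json_path=Path("data/finetune_dir"))
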
6 changes: 3 additions & 3 deletions litgpt/data/lima.py
@@ -19,8 +19,8 @@ class LIMA(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
-test_split_fraction: float = 0.1
-"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
+val_split_fraction: float = 0.1
+"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
@@ -77,7 +77,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
-[1.0 - self.test_split_fraction, self.test_split_fraction],
+[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
6 changes: 3 additions & 3 deletions litgpt/data/openwebtext.py
@@ -18,8 +18,8 @@ class OpenWebText(LitDataModule):
data_path: Union[str, Path] = Path("data/openwebtext")
"""The path to the data directory, containing two folders 'train' and 'val'
which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://)."""
-test_split_fraction: float = 0.0005
-"""The fraction of data that should be put aside for validation/testing."""
+val_split_fraction: float = 0.0005
+"""The fraction of data that should be put aside for validation."""
seed: int = 42
"""The seed to use for shuffling the training data."""
num_workers: int = 8
@@ -59,7 +59,7 @@ def prepare_data(self) -> None:
dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True)

# Split the data in training and validation
-split_dataset = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed, shuffle=True)
+split_dataset = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed, shuffle=True)
split_dataset["val"] = split_dataset.pop("test") # rename the test split to val

def tokenize(data: Dataset, index: int):
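
Note: because train_test_split always names the held-out split "test", the hunk above renames the key afterwards. DatasetDict behaves like a plain dict, so pop works — a standalone sketch with an in-memory stand-in for OpenWebText:

from datasets import Dataset

split_dataset = Dataset.from_dict({"text": ["doc"] * 10_000}).train_test_split(
    test_size=0.0005, seed=42, shuffle=True
)
split_dataset["val"] = split_dataset.pop("test")  # rename the test split to val
print({k: len(v) for k, v in split_dataset.items()})  # {'train': 9995, 'val': 5}
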
2 changes: 1 addition & 1 deletion tests/data/test_alpaca.py
@@ -5,7 +5,7 @@ def test_alpaca(mock_tokenizer, alpaca_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Alpaca(
-test_split_fraction=0.5,
+val_split_fraction=0.5,
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
num_workers=0,
2 changes: 1 addition & 1 deletion tests/data/test_dolly.py
@@ -5,7 +5,7 @@ def test_dolly(mock_tokenizer, dolly_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Dolly(
-test_split_fraction=0.5,
+val_split_fraction=0.5,
download_dir=dolly_path.parent,
file_name=dolly_path.name,
num_workers=0,
6 changes: 3 additions & 3 deletions tests/data/test_json.py
@@ -24,7 +24,7 @@ def apply(self, prompt, **kwargs):
with open(json_path, "w", encoding="utf-8") as fp:
json.dump(mock_data, fp)

-data = JSON(json_path, test_split_fraction=0.5, prompt_style=Style(), num_workers=0)
+data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0)
data.connect(tokenizer=mock_tokenizer, batch_size=2)
data.prepare_data() # does nothing
data.setup()
@@ -61,8 +61,8 @@ def test_json_input_validation(tmp_path):
with pytest.raises(FileNotFoundError, match="The `json_path` must be a file or a directory"):
JSON(tmp_path / "not exist")

-with pytest.raises(ValueError, match="`test_split_fraction` should not be set"):
-    JSON(tmp_path, test_split_fraction=0.5)
+with pytest.raises(ValueError, match="`val_split_fraction` should not be set"):
+    JSON(tmp_path, val_split_fraction=0.5)

data = JSON(tmp_path)
data.prepare_data() # does nothing
4 changes: 2 additions & 2 deletions tests/test_adapter.py
@@ -73,7 +73,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path)
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -173,7 +173,7 @@ def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0,
),
precision="16-true",
4 changes: 2 additions & 2 deletions tests/test_adapter_v2.py
@@ -96,7 +96,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -262,7 +262,7 @@ def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alp
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
precision="16-true",
2 changes: 1 addition & 1 deletion tests/test_full.py
@@ -30,7 +30,7 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
4 changes: 2 additions & 2 deletions tests/test_lora.py
@@ -203,7 +203,7 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
@@ -631,7 +631,7 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
-test_split_fraction=0.5,
+val_split_fraction=0.5,
num_workers=0,
),
checkpoint_dir=fake_checkpoint_dir,
4 changes: 2 additions & 2 deletions tutorials/prepare_dataset.md
@@ -345,7 +345,7 @@ python litgpt/finetune/lora.py \

You can also customize how the dataset is read by using these additional parameters

-- `test_split_fraction`: The fraction of the data to split. Defaults to `0.1`
+- `val_split_fraction`: The fraction of the data that is held out for validation. Defaults to `0.1`

- `seed`: The seed value to reproduce the same random splits for train and test data.

@@ -359,7 +359,7 @@ To use the settings described above, you can add the respective command line arg
python litgpt/finetune/lora.py \
--data JSON \
--data.json_path path/to/your/data.json \
---data.test_split_fraction 0.1 \
+--data.val_split_fraction 0.1 \
--data.seed 42 \
--data.mask_inputs False \
--data.ignore_index -1 \
4 changes: 2 additions & 2 deletions xla/scripts/prepare_alpaca.py
@@ -23,7 +23,7 @@
def prepare(
destination_path: Path = Path("data/alpaca"),
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
-test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples
+val_split_fraction: float = 0.03865,  # to get exactly 2000 validation samples
seed: int = 42,
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
@@ -53,7 +53,7 @@ def prepare(

# Partition the dataset into train and test
train_set, test_set = random_split(
-data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed)
+data, [1.0 - val_split_fraction, val_split_fraction], generator=torch.Generator().manual_seed(seed)
)
train_set, test_set = list(train_set), list(test_set)

