Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test split -> Val split #1065

Merged
merged 1 commit into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
test_split_fraction: 0.03847
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
Expand Down
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
test_split_fraction: 0.03847
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
Expand Down
2 changes: 1 addition & 1 deletion config_hub/finetune/tiny-llama/lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ data:
class_path: litgpt.data.AlpacaGPT4
init_args:
mask_prompt: false
test_split_fraction: 0.03847
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
seed: 42
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Alpaca(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
test_split_fraction: float = 0.03865 # to get exactly 2000 test samples,
"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
val_split_fraction: float = 0.03865 # to get exactly 2000 validation samples,
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
Expand Down Expand Up @@ -70,7 +70,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
[1.0 - self.test_split_fraction, self.test_split_fraction],
[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/alpaca_2k.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
class Alpaca2k(Alpaca):
"""Alpaca2k data module for supervised finetuning."""

test_split_fraction: float = 0.05 # to get exactly 100 test samples,
"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
val_split_fraction: float = 0.05 # to get exactly 100 validation samples,
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpaca2k")
"""The directory in which the downloaded dataset gets saved."""
repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test")
Expand All @@ -30,7 +30,7 @@ def setup(self, stage: str = "") -> None:

dataset = load_dataset(self.repo_id, cache_dir=self.download_dir)

train_validation_split = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed)
train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed)
train_data = train_validation_split["train"]
test_data = train_validation_split["test"]

Expand Down
4 changes: 2 additions & 2 deletions litgpt/data/alpaca_gpt4.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
class AlpacaGPT4(Alpaca):
"""AlpacaGPT4 data module for supervised finetuning."""

test_split_fraction: float = 0.03847 # to get exactly 2000 test samples,
"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
val_split_fraction: float = 0.03847  # to get exactly 2000 validation samples,
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
download_dir: Path = Path("./data/alpacagpt4")
"""The directory in which the downloaded dataset gets saved."""
file_url: str = field(repr=False, default=_URL)
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ class Dolly(Alpaca):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
test_split_fraction: float = 0.1
"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
val_split_fraction: float = 0.1
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
Expand Down Expand Up @@ -49,7 +49,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
[1.0 - self.test_split_fraction, self.test_split_fraction],
[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
Expand Down
10 changes: 5 additions & 5 deletions litgpt/data/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class JSON(LitDataModule):
and can optionally have a key 'input' (see Alpaca)."""
mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
test_split_fraction: Optional[float] = None
val_split_fraction: Optional[float] = None
"""The fraction of the dataset to use for the validation dataset. The rest is used for training.
Only applies if you passed in a single file to `json_path`."""
prompt_style: Union[str, PromptStyle] = "alpaca"
Expand All @@ -39,13 +39,13 @@ class JSON(LitDataModule):
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
val_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

def __post_init__(self):
if self.json_path.is_dir() and self.test_split_fraction is not None:
if self.json_path.is_dir() and self.val_split_fraction is not None:
raise ValueError(
"If `json_path` is a directory, it must contain 'train.json' and 'val.json' files and"
f" hence `test_split_fraction` should not be set. Got `{self.test_split_fraction=}`."
f" hence `val_split_fraction` should not be set. Got `{self.val_split_fraction=}`."
)
if not self.json_path.exists():
raise FileNotFoundError(
Expand Down Expand Up @@ -112,7 +112,7 @@ def get_splits(self) -> Tuple:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
[1.0 - self.test_split_fraction, self.test_split_fraction],
[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
return train_data, test_data
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/lima.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ class LIMA(LitDataModule):

mask_prompt: bool = False
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
test_split_fraction: float = 0.1
"""The fraction of the dataset to use for the test/validation dataset. The rest is used for training."""
val_split_fraction: float = 0.1
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
Expand Down Expand Up @@ -77,7 +77,7 @@ def setup(self, stage: str = "") -> None:
# Partition the dataset into train and test
train_data, test_data = random_split(
data,
[1.0 - self.test_split_fraction, self.test_split_fraction],
[1.0 - self.val_split_fraction, self.val_split_fraction],
generator=torch.Generator().manual_seed(self.seed)
)
train_data, test_data = list(train_data), list(test_data)
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/openwebtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class OpenWebText(LitDataModule):
data_path: Union[str, Path] = Path("data/openwebtext")
"""The path to the data directory, containing two folders 'train' and 'val'
which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://)."""
test_split_fraction: float = 0.0005
"""The fraction of data that should be put aside for validation/testing."""
val_split_fraction: float = 0.0005
"""The fraction of data that should be put aside for validation."""
seed: int = 42
"""The seed to use for shuffling the training data."""
num_workers: int = 8
Expand Down Expand Up @@ -59,7 +59,7 @@ def prepare_data(self) -> None:
dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True)

# Split the data in training and validation
split_dataset = dataset["train"].train_test_split(test_size=self.test_split_fraction, seed=self.seed, shuffle=True)
split_dataset = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed, shuffle=True)
split_dataset["val"] = split_dataset.pop("test") # rename the test split to val

def tokenize(data: Dataset, index: int):
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def test_alpaca(mock_tokenizer, alpaca_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Alpaca(
test_split_fraction=0.5,
val_split_fraction=0.5,
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
num_workers=0,
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def test_dolly(mock_tokenizer, dolly_path):
from litgpt.prompts import Alpaca as AlpacaPromptStyle

alpaca = Dolly(
test_split_fraction=0.5,
val_split_fraction=0.5,
download_dir=dolly_path.parent,
file_name=dolly_path.name,
num_workers=0,
Expand Down
6 changes: 3 additions & 3 deletions tests/data/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def apply(self, prompt, **kwargs):
with open(json_path, "w", encoding="utf-8") as fp:
json.dump(mock_data, fp)

data = JSON(json_path, test_split_fraction=0.5, prompt_style=Style(), num_workers=0)
data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0)
data.connect(tokenizer=mock_tokenizer, batch_size=2)
data.prepare_data() # does nothing
data.setup()
Expand Down Expand Up @@ -61,8 +61,8 @@ def test_json_input_validation(tmp_path):
with pytest.raises(FileNotFoundError, match="The `json_path` must be a file or a directory"):
JSON(tmp_path / "not exist")

with pytest.raises(ValueError, match="`test_split_fraction` should not be set"):
JSON(tmp_path, test_split_fraction=0.5)
with pytest.raises(ValueError, match="`val_split_fraction` should not be set"):
JSON(tmp_path, val_split_fraction=0.5)

data = JSON(tmp_path)
data.prepare_data() # does nothing
Expand Down
4 changes: 2 additions & 2 deletions tests/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path)
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
Expand Down Expand Up @@ -173,7 +173,7 @@ def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0,
),
precision="16-true",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_adapter_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
Expand Down Expand Up @@ -262,7 +262,7 @@ def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alp
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0
),
precision="16-true",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
Expand Down
4 changes: 2 additions & 2 deletions tests/test_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path):
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0
),
checkpoint_dir=fake_checkpoint_dir,
Expand Down Expand Up @@ -631,7 +631,7 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa
data=Alpaca(
download_dir=alpaca_path.parent,
file_name=alpaca_path.name,
test_split_fraction=0.5,
val_split_fraction=0.5,
num_workers=0,
),
checkpoint_dir=fake_checkpoint_dir,
Expand Down
4 changes: 2 additions & 2 deletions tutorials/prepare_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ python litgpt/finetune/lora.py \

You can also customize how the dataset is read by using these additional parameters

- `test_split_fraction`: The fraction of the data to split. Defaults to `0.1`
- `val_split_fraction`: The fraction of the data to split. Defaults to `0.1`

- `seed`: The seed value to reproduce the same random splits for train and test data.

Expand All @@ -359,7 +359,7 @@ To use the settings described above, you can add the respective command line arg
python litgpt/finetune/lora.py \
--data JSON \
--data.json_path path/to/your/data.json \
--data.test_split_fraction 0.1 \
--data.val_split_fraction 0.1 \
--data.seed 42 \
--data.mask_inputs False \
--data.ignore_index -1 \
Expand Down
4 changes: 2 additions & 2 deletions xla/scripts/prepare_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
def prepare(
destination_path: Path = Path("data/alpaca"),
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
test_split_fraction: float = 0.03865, # to get exactly 2000 test samples,
val_split_fraction: float = 0.03865, # to get exactly 2000 validation samples,
seed: int = 42,
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
Expand Down Expand Up @@ -53,7 +53,7 @@ def prepare(

# Partition the dataset into train and test
train_set, test_set = random_split(
data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed)
data, [1.0 - val_split_fraction, val_split_fraction], generator=torch.Generator().manual_seed(seed)
)
train_set, test_set = list(train_set), list(test_set)

Expand Down
Loading