From 58dd2a89336b979e3f62e11198f5977687f873ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sat, 9 Mar 2024 20:02:01 +0100 Subject: [PATCH] Match the default ignore index to PyTorch's (#1076) --- config_hub/finetune/llama-2-7b/full.yaml | 2 +- config_hub/finetune/llama-2-7b/lora.yaml | 2 +- config_hub/finetune/tiny-llama/lora.yaml | 2 +- litgpt/data/alpaca.py | 2 +- litgpt/data/base.py | 6 +++--- litgpt/data/deita.py | 2 +- litgpt/data/dolly.py | 2 +- litgpt/data/flan.py | 2 +- litgpt/data/json.py | 2 +- litgpt/data/lima.py | 2 +- litgpt/data/longform.py | 2 +- litgpt/utils.py | 2 +- tests/test_utils.py | 2 +- tutorials/prepare_dataset.md | 4 ++-- xla/scripts/prepare_alpaca.py | 2 +- 15 files changed, 18 insertions(+), 18 deletions(-) diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml index 3f7cdfbd8a..e7465c82b5 100644 --- a/config_hub/finetune/llama-2-7b/full.yaml +++ b/config_hub/finetune/llama-2-7b/full.yaml @@ -8,7 +8,7 @@ data: mask_prompt: false val_split_fraction: 0.03847 prompt_style: "alpaca" - ignore_index: -1 + ignore_index: -100 seed: 42 num_workers: 4 download_dir: data/alpacagpt4 diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml index 486929d8f3..7fba7ac095 100644 --- a/config_hub/finetune/llama-2-7b/lora.yaml +++ b/config_hub/finetune/llama-2-7b/lora.yaml @@ -17,7 +17,7 @@ data: mask_prompt: false val_split_fraction: 0.03847 prompt_style: "alpaca" - ignore_index: -1 + ignore_index: -100 seed: 42 num_workers: 4 download_dir: data/alpacagpt4 diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml index af7d41f07e..278d4c2bc4 100644 --- a/config_hub/finetune/tiny-llama/lora.yaml +++ b/config_hub/finetune/tiny-llama/lora.yaml @@ -17,7 +17,7 @@ data: mask_prompt: false val_split_fraction: 0.03847 prompt_style: "alpaca" - ignore_index: -1 + ignore_index: -100 seed: 42 num_workers: 4 download_dir: data/alpacagpt4 diff --git a/litgpt/data/alpaca.py b/litgpt/data/alpaca.py index 70d9c3d2c4..78699c8ae4 100644 --- a/litgpt/data/alpaca.py +++ b/litgpt/data/alpaca.py @@ -26,7 +26,7 @@ class Alpaca(LitDataModule): """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for creating the train/val splits and shuffling the dataset.""" diff --git a/litgpt/data/base.py b/litgpt/data/base.py index 9188058093..73dcd80c49 100644 --- a/litgpt/data/base.py +++ b/litgpt/data/base.py @@ -60,7 +60,7 @@ def __init__( prompt_style: Union[str, PromptStyle], max_seq_length: int = -1, mask_prompt: bool = True, - ignore_index: int = -1, + ignore_index: int = -100, transform: Optional[Callable[[Any], Any]] = None ) -> None: self.data = data @@ -97,7 +97,7 @@ def __getitem__(self, idx: int) -> Dict[str, Tensor]: return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)} -def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1): +def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100): """Returns the collate function for supervised finetuning (needed in the DataLoader). The collate function gets a list of dicts with keys `input_ids` and `labels`. @@ -108,7 +108,7 @@ def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: def _sft_collate_fn( - samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1 + samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100 ) -> Dict[str, Tensor]: batched = {} diff --git a/litgpt/data/deita.py b/litgpt/data/deita.py index cb242f12b1..6dacb979b0 100644 --- a/litgpt/data/deita.py +++ b/litgpt/data/deita.py @@ -21,7 +21,7 @@ class Deita(LitDataModule): """Whether to mask the prompt section from the label (with ``ignore_index``).""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for shuffling the dataset.""" diff --git a/litgpt/data/dolly.py b/litgpt/data/dolly.py index 9824e8d13c..473fe3a0d4 100644 --- a/litgpt/data/dolly.py +++ b/litgpt/data/dolly.py @@ -24,7 +24,7 @@ class Dolly(Alpaca): """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for creating the train/val splits and shuffling the dataset.""" diff --git a/litgpt/data/flan.py b/litgpt/data/flan.py index e00620576c..31aac2927d 100644 --- a/litgpt/data/flan.py +++ b/litgpt/data/flan.py @@ -26,7 +26,7 @@ class FLAN(LitDataModule): """Whether to mask the prompt section from the label (with ``ignore_index``).""" prompt_style: Union[str, PromptStyle] = "flan" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for shuffling the dataset.""" diff --git a/litgpt/data/json.py b/litgpt/data/json.py index 3115d4639e..9789e90eca 100644 --- a/litgpt/data/json.py +++ b/litgpt/data/json.py @@ -28,7 +28,7 @@ class JSON(LitDataModule): Only applies if you passed in a single file to `json_path`.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for creating the train/val splits and shuffling the dataset.""" diff --git a/litgpt/data/lima.py b/litgpt/data/lima.py index 263b80dd15..9313aac951 100644 --- a/litgpt/data/lima.py +++ b/litgpt/data/lima.py @@ -23,7 +23,7 @@ class LIMA(LitDataModule): """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" prompt_style: Union[str, PromptStyle] = "alpaca" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for creating the train/val splits and shuffling the dataset.""" diff --git a/litgpt/data/longform.py b/litgpt/data/longform.py index a174dbf5e6..ffa1f84be2 100644 --- a/litgpt/data/longform.py +++ b/litgpt/data/longform.py @@ -25,7 +25,7 @@ class LongForm(LitDataModule): """Whether to mask the prompt section from the label (with ``ignore_index``).""" prompt_style: Union[str, PromptStyle] = "longform" """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" - ignore_index: int = -1 + ignore_index: int = -100 """The index to use for elements to be ignored in the label.""" seed: int = 42 """The random seed for shuffling the dataset.""" diff --git a/litgpt/utils.py b/litgpt/utils.py index 500c8beaa0..e2a0fe834a 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -236,7 +236,7 @@ def chunked_cross_entropy( logits: Union[torch.Tensor, List[torch.Tensor]], targets: torch.Tensor, chunk_size: int = 128, - ignore_index: int = -1, + ignore_index: int = -100, ) -> torch.Tensor: # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate # the memory usage in fine-tuning settings with low number of parameters. diff --git a/tests/test_utils.py b/tests/test_utils.py index 95a845f012..56c7b0959c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -122,7 +122,7 @@ def test_chunked_cross_entropy(ignore_index, B): ignore_index=(ignore_index if ignore_index is not None else -100), ) - ignore_index = ignore_index if ignore_index is not None else -1 + ignore_index = ignore_index if ignore_index is not None else -100 regular_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=0, ignore_index=ignore_index) assert torch.equal(baseline_loss, regular_loss) assert regular_loss.numel() == 1 diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index a52410641d..96049ff5a3 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -351,7 +351,7 @@ You can also customize how the dataset is read by using these additional paramet - `mask_inputs`: Whether to mask the prompt section from the label (with `ignore_index`). -- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-1` (used when `mask_inputs` is `True`). +- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-100` (used when `mask_inputs` is `True`). To use the settings described above, you can add the respective command line arguments when calling the finetuning scripts as shown in the example below: @@ -362,7 +362,7 @@ python litgpt/finetune/lora.py \ --data.val_split_fraction 0.1 \ --data.seed 42 \ --data.mask_inputs False \ - --data.ignore_index -1 \ + --data.ignore_index -100 \ --checkpoint_dir "checkpoints/tiiuae/falcon-7b" ``` diff --git a/xla/scripts/prepare_alpaca.py b/xla/scripts/prepare_alpaca.py index f36deedc5c..0a7f208f33 100644 --- a/xla/scripts/prepare_alpaca.py +++ b/xla/scripts/prepare_alpaca.py @@ -22,7 +22,7 @@ def prepare( mask_inputs: bool = False, # as in alpaca-lora data_file_name: str = "alpaca_data_cleaned_archive.json", data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", - ignore_index: int = -1, + ignore_index: int = -100, max_seq_length: Optional[int] = None, ) -> None: """Prepare the Alpaca dataset for instruction tuning.