From 58dd2a89336b979e3f62e11198f5977687f873ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Sat, 9 Mar 2024 20:02:01 +0100
Subject: [PATCH] Match the default ignore index to PyTorch's (#1076)

---
 config_hub/finetune/llama-2-7b/full.yaml | 2 +-
 config_hub/finetune/llama-2-7b/lora.yaml | 2 +-
 config_hub/finetune/tiny-llama/lora.yaml | 2 +-
 litgpt/data/alpaca.py                    | 2 +-
 litgpt/data/base.py                      | 6 +++---
 litgpt/data/deita.py                     | 2 +-
 litgpt/data/dolly.py                     | 2 +-
 litgpt/data/flan.py                      | 2 +-
 litgpt/data/json.py                      | 2 +-
 litgpt/data/lima.py                      | 2 +-
 litgpt/data/longform.py                  | 2 +-
 litgpt/utils.py                          | 2 +-
 tests/test_utils.py                      | 2 +-
 tutorials/prepare_dataset.md             | 4 ++--
 xla/scripts/prepare_alpaca.py            | 2 +-
 15 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml
index 3f7cdfbd8a..e7465c82b5 100644
--- a/config_hub/finetune/llama-2-7b/full.yaml
+++ b/config_hub/finetune/llama-2-7b/full.yaml
@@ -8,7 +8,7 @@ data:
     mask_prompt: false
     val_split_fraction: 0.03847
     prompt_style: "alpaca"
-    ignore_index: -1
+    ignore_index: -100
     seed: 42
     num_workers: 4
     download_dir: data/alpacagpt4
diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml
index 486929d8f3..7fba7ac095 100644
--- a/config_hub/finetune/llama-2-7b/lora.yaml
+++ b/config_hub/finetune/llama-2-7b/lora.yaml
@@ -17,7 +17,7 @@ data:
     mask_prompt: false
     val_split_fraction: 0.03847
     prompt_style: "alpaca"
-    ignore_index: -1
+    ignore_index: -100
     seed: 42
     num_workers: 4
     download_dir: data/alpacagpt4
diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml
index af7d41f07e..278d4c2bc4 100644
--- a/config_hub/finetune/tiny-llama/lora.yaml
+++ b/config_hub/finetune/tiny-llama/lora.yaml
@@ -17,7 +17,7 @@ data:
     mask_prompt: false
     val_split_fraction: 0.03847
     prompt_style: "alpaca"
-    ignore_index: -1
+    ignore_index: -100
     seed: 42
     num_workers: 4
     download_dir: data/alpacagpt4
diff --git a/litgpt/data/alpaca.py b/litgpt/data/alpaca.py
index 70d9c3d2c4..78699c8ae4 100644
--- a/litgpt/data/alpaca.py
+++ b/litgpt/data/alpaca.py
@@ -26,7 +26,7 @@ class Alpaca(LitDataModule):
     """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for creating the train/val splits and shuffling the dataset."""
diff --git a/litgpt/data/base.py b/litgpt/data/base.py
index 9188058093..73dcd80c49 100644
--- a/litgpt/data/base.py
+++ b/litgpt/data/base.py
@@ -60,7 +60,7 @@ def __init__(
         prompt_style: Union[str, PromptStyle],
         max_seq_length: int = -1,
         mask_prompt: bool = True,
-        ignore_index: int = -1,
+        ignore_index: int = -100,
         transform: Optional[Callable[[Any], Any]] = None
     ) -> None:
         self.data = data
@@ -97,7 +97,7 @@ def __getitem__(self, idx: int) -> Dict[str, Tensor]:
         return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)}
 
 
-def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1):
+def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100):
     """Returns the collate function for supervised finetuning (needed in the DataLoader).
 
     The collate function gets a list of dicts with keys `input_ids` and `labels`.
@@ -108,7 +108,7 @@ def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index:
 
 
 def _sft_collate_fn(
-    samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1
+    samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100
 ) -> Dict[str, Tensor]:
 
     batched = {}
diff --git a/litgpt/data/deita.py b/litgpt/data/deita.py
index cb242f12b1..6dacb979b0 100644
--- a/litgpt/data/deita.py
+++ b/litgpt/data/deita.py
@@ -21,7 +21,7 @@ class Deita(LitDataModule):
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for shuffling the dataset."""
diff --git a/litgpt/data/dolly.py b/litgpt/data/dolly.py
index 9824e8d13c..473fe3a0d4 100644
--- a/litgpt/data/dolly.py
+++ b/litgpt/data/dolly.py
@@ -24,7 +24,7 @@ class Dolly(Alpaca):
     """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for creating the train/val splits and shuffling the dataset."""
diff --git a/litgpt/data/flan.py b/litgpt/data/flan.py
index e00620576c..31aac2927d 100644
--- a/litgpt/data/flan.py
+++ b/litgpt/data/flan.py
@@ -26,7 +26,7 @@ class FLAN(LitDataModule):
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
     prompt_style: Union[str, PromptStyle] = "flan"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for shuffling the dataset."""
diff --git a/litgpt/data/json.py b/litgpt/data/json.py
index 3115d4639e..9789e90eca 100644
--- a/litgpt/data/json.py
+++ b/litgpt/data/json.py
@@ -28,7 +28,7 @@ class JSON(LitDataModule):
     Only applies if you passed in a single file to `json_path`."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for creating the train/val splits and shuffling the dataset."""
diff --git a/litgpt/data/lima.py b/litgpt/data/lima.py
index 263b80dd15..9313aac951 100644
--- a/litgpt/data/lima.py
+++ b/litgpt/data/lima.py
@@ -23,7 +23,7 @@ class LIMA(LitDataModule):
     """The fraction of the dataset to use for the validation dataset. The rest is used for training."""
     prompt_style: Union[str, PromptStyle] = "alpaca"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for creating the train/val splits and shuffling the dataset."""
diff --git a/litgpt/data/longform.py b/litgpt/data/longform.py
index a174dbf5e6..ffa1f84be2 100644
--- a/litgpt/data/longform.py
+++ b/litgpt/data/longform.py
@@ -25,7 +25,7 @@ class LongForm(LitDataModule):
     """Whether to mask the prompt section from the label (with ``ignore_index``)."""
     prompt_style: Union[str, PromptStyle] = "longform"
     """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
-    ignore_index: int = -1
+    ignore_index: int = -100
     """The index to use for elements to be ignored in the label."""
     seed: int = 42
     """The random seed for shuffling the dataset."""
diff --git a/litgpt/utils.py b/litgpt/utils.py
index 500c8beaa0..e2a0fe834a 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -236,7 +236,7 @@ def chunked_cross_entropy(
     logits: Union[torch.Tensor, List[torch.Tensor]],
     targets: torch.Tensor,
     chunk_size: int = 128,
-    ignore_index: int = -1,
+    ignore_index: int = -100,
 ) -> torch.Tensor:
     # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate
     # the memory usage in fine-tuning settings with low number of parameters.
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 95a845f012..56c7b0959c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -122,7 +122,7 @@ def test_chunked_cross_entropy(ignore_index, B):
         ignore_index=(ignore_index if ignore_index is not None else -100),
     )
 
-    ignore_index = ignore_index if ignore_index is not None else -1
+    ignore_index = ignore_index if ignore_index is not None else -100
     regular_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=0, ignore_index=ignore_index)
     assert torch.equal(baseline_loss, regular_loss)
     assert regular_loss.numel() == 1
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md
index a52410641d..96049ff5a3 100644
--- a/tutorials/prepare_dataset.md
+++ b/tutorials/prepare_dataset.md
@@ -351,7 +351,7 @@ You can also customize how the dataset is read by using these additional paramet
 
 - `mask_inputs`: Whether to mask the prompt section from the label (with `ignore_index`).
 
-- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-1` (used when `mask_inputs` is `True`).
+- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-100` (used when `mask_inputs` is `True`).
 
 To use the settings described above, you can add the respective command line arguments when calling the finetuning scripts as shown in the example below:
 
@@ -362,7 +362,7 @@ python litgpt/finetune/lora.py \
   --data.val_split_fraction 0.1 \
   --data.seed 42 \
   --data.mask_inputs False \
-  --data.ignore_index -1 \
+  --data.ignore_index -100 \
   --checkpoint_dir "checkpoints/tiiuae/falcon-7b"
 ```
 
diff --git a/xla/scripts/prepare_alpaca.py b/xla/scripts/prepare_alpaca.py
index f36deedc5c..0a7f208f33 100644
--- a/xla/scripts/prepare_alpaca.py
+++ b/xla/scripts/prepare_alpaca.py
@@ -22,7 +22,7 @@ def prepare(
     mask_inputs: bool = False,  # as in alpaca-lora
     data_file_name: str = "alpaca_data_cleaned_archive.json",
     data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json",
-    ignore_index: int = -1,
+    ignore_index: int = -100,
     max_seq_length: Optional[int] = None,
 ) -> None:
     """Prepare the Alpaca dataset for instruction tuning.