Skip to content

Commit

Permalink
Merge branch 'wip' into toml
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt authored Mar 11, 2024
2 parents 4bc698c + 2667fec commit a7ab349
Show file tree
Hide file tree
Showing 16 changed files with 65 additions and 32 deletions.
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ data:
mask_prompt: false
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
ignore_index: -100
seed: 42
num_workers: 4
download_dir: data/alpacagpt4
Expand Down
2 changes: 1 addition & 1 deletion config_hub/finetune/llama-2-7b/lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ data:
mask_prompt: false
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
ignore_index: -100
seed: 42
num_workers: 4
download_dir: data/alpacagpt4
Expand Down
2 changes: 1 addition & 1 deletion config_hub/finetune/tiny-llama/lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ data:
mask_prompt: false
val_split_fraction: 0.03847
prompt_style: "alpaca"
ignore_index: -1
ignore_index: -100
seed: 42
num_workers: 4
download_dir: data/alpacagpt4
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Alpaca(LitDataModule):
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for creating the train/val splits and shuffling the dataset."""
Expand Down
6 changes: 3 additions & 3 deletions litgpt/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(
prompt_style: Union[str, PromptStyle],
max_seq_length: int = -1,
mask_prompt: bool = True,
ignore_index: int = -1,
ignore_index: int = -100,
transform: Optional[Callable[[Any], Any]] = None
) -> None:
self.data = data
Expand Down Expand Up @@ -97,7 +97,7 @@ def __getitem__(self, idx: int) -> Dict[str, Tensor]:
return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)}


def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1):
def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100):
"""Returns the collate function for supervised finetuning (needed in the DataLoader).
The collate function gets a list of dicts with keys `input_ids` and `labels`.
Expand All @@ -108,7 +108,7 @@ def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index:


def _sft_collate_fn(
samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1
samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100
) -> Dict[str, Tensor]:

batched = {}
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/deita.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Deita(LitDataModule):
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for shuffling the dataset."""
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Dolly(Alpaca):
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for creating the train/val splits and shuffling the dataset."""
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/flan.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class FLAN(LitDataModule):
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
prompt_style: Union[str, PromptStyle] = "flan"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for shuffling the dataset."""
Expand Down
26 changes: 19 additions & 7 deletions litgpt/data/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

@dataclass
class JSON(LitDataModule):
"""Loads JSON data for supervised finetuning."""
"""Loads JSON or JSONL data for supervised finetuning."""

json_path: Path
"""A path to a JSON file or a directory with `train.json` and `val.json` containing the data.
Expand All @@ -28,7 +28,7 @@ class JSON(LitDataModule):
Only applies if you passed in a single file to `json_path`."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for creating the train/val splits and shuffling the dataset."""
Expand Down Expand Up @@ -118,16 +118,28 @@ def get_splits(self) -> Tuple:
return train_data, test_data

# A directory containing train.json and val.json
if (self.json_path / "train.json").is_file() and (self.json_path / f"val.json").is_file():
train_data = load_split(self.json_path / "train.json")
test_data = load_split(self.json_path / f"val.json")
if (train_file := self.find_split("train")) and (val_file := self.find_split("val")):
train_data = load_split(train_file)
test_data = load_split(val_file)
return train_data, test_data

raise FileNotFoundError(
"The `json_path` must be a file or a directory containing 'train.json' and 'val.json' files."
)

def find_split(self, split_name: str) -> Optional[Path]:
    """Return the data file for ``split_name`` inside the ``json_path`` directory.

    Two candidate file names are checked, in this order: ``<split_name>.json``
    and then ``<split_name>.jsonl``. The first one that exists as a regular
    file is returned.

    Args:
        split_name: Base name of the split file without extension, e.g. ``"train"``.

    Returns:
        The path of the first matching file, or ``None`` when neither exists.
    """
    # Order matters: a `.json` file takes precedence over a `.jsonl` one.
    candidates = (
        self.json_path / f"{split_name}.json",
        self.json_path / f"{split_name}.jsonl",
    )
    return next((candidate for candidate in candidates if candidate.is_file()), None)


def load_split(json_path: Path) -> Any:
    """Load one data split from a ``.json`` or ``.jsonl`` file.

    Args:
        json_path: Path to the file to read. The format is selected from the
            file suffix: ``.json`` is parsed as a single JSON document,
            ``.jsonl`` as one JSON document per line.

    Returns:
        The parsed JSON document for ``.json`` files, or a list with one parsed
        entry per non-blank line for ``.jsonl`` files.

    Raises:
        ValueError: If the file suffix is neither ``.json`` nor ``.jsonl``.
    """
    suffix = json_path.suffix
    # Reject unsupported formats before touching the file system.
    if suffix not in (".json", ".jsonl"):
        raise ValueError(f"Unsupported file format: {json_path.suffix}. Expected `.json` or `.jsonl`.")

    with open(json_path, "r", encoding="utf-8") as file:
        if suffix == ".json":
            return json.load(file)
        # JSON Lines: skip blank lines (e.g. a trailing empty line at the end of
        # the file), which would otherwise make `json.loads` raise.
        return [json.loads(line) for line in file if line.strip()]
2 changes: 1 addition & 1 deletion litgpt/data/lima.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class LIMA(LitDataModule):
"""The fraction of the dataset to use for the validation dataset. The rest is used for training."""
prompt_style: Union[str, PromptStyle] = "alpaca"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for creating the train/val splits and shuffling the dataset."""
Expand Down
2 changes: 1 addition & 1 deletion litgpt/data/longform.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class LongForm(LitDataModule):
"""Whether to mask the prompt section from the label (with ``ignore_index``)."""
prompt_style: Union[str, PromptStyle] = "longform"
"""The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles."""
ignore_index: int = -1
ignore_index: int = -100
"""The index to use for elements to be ignored in the label."""
seed: int = 42
"""The random seed for shuffling the dataset."""
Expand Down
2 changes: 1 addition & 1 deletion litgpt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def chunked_cross_entropy(
logits: Union[torch.Tensor, List[torch.Tensor]],
targets: torch.Tensor,
chunk_size: int = 128,
ignore_index: int = -1,
ignore_index: int = -100,
) -> torch.Tensor:
# with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate
# the memory usage in fine-tuning settings with low number of parameters.
Expand Down
37 changes: 29 additions & 8 deletions tests/data/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import pytest


def test_json(tmp_path, mock_tokenizer):
@pytest.mark.parametrize("as_jsonl", [False, True])
def test_json(as_jsonl, tmp_path, mock_tokenizer):
from litgpt.data import JSON
from litgpt.prompts import PromptStyle

class Style(PromptStyle):
def apply(self, prompt, **kwargs):
return f"X: {prompt} {kwargs['input']} Y:"

json_path = tmp_path / "data.json"
json_path = tmp_path / ("data.jsonl" if as_jsonl else "data.json")
mock_data = [
{"instruction": "Add", "input": "2+2", "output": "4"},
{"instruction": "Subtract", "input": "5-3", "output": "2"},
Expand All @@ -22,7 +23,12 @@ def apply(self, prompt, **kwargs):
]

with open(json_path, "w", encoding="utf-8") as fp:
json.dump(mock_data, fp)
if as_jsonl:
for line in mock_data:
json.dump(line, fp)
fp.write("\n")
else:
json.dump(mock_data, fp)

data = JSON(json_path, val_split_fraction=0.5, prompt_style=Style(), num_workers=0)
data.connect(tokenizer=mock_tokenizer, batch_size=2)
Expand Down Expand Up @@ -77,7 +83,8 @@ def test_json_input_validation(tmp_path):
data.setup()


def test_json_with_splits(tmp_path, mock_tokenizer):
@pytest.mark.parametrize("as_jsonl", [False, True])
def test_json_with_splits(as_jsonl, tmp_path, mock_tokenizer):
from litgpt.data import JSON

mock_train_data = [
Expand All @@ -89,10 +96,24 @@ def test_json_with_splits(tmp_path, mock_tokenizer):
{"instruction": "Multiply", "input": "6*4", "output": "24"},
{"instruction": "Divide", "input": "10/2", "output": "5"},
]
with open(tmp_path / "train.json", "w", encoding="utf-8") as fp:
json.dump(mock_train_data, fp)
with open(tmp_path / "val.json", "w", encoding="utf-8") as fp:
json.dump(mock_test_data, fp)

train_file = tmp_path / ("train.jsonl" if as_jsonl else "train.json")
val_file = tmp_path / ("val.jsonl" if as_jsonl else "val.json")

with open(train_file, "w", encoding="utf-8") as fp:
if as_jsonl:
for line in mock_train_data:
json.dump(line, fp)
fp.write("\n")
else:
json.dump(mock_train_data, fp)
with open(val_file, "w", encoding="utf-8") as fp:
if as_jsonl:
for line in mock_test_data:
json.dump(line, fp)
fp.write("\n")
else:
json.dump(mock_test_data, fp)

data = JSON(tmp_path, num_workers=0)
data.connect(tokenizer=mock_tokenizer, batch_size=2)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_chunked_cross_entropy(ignore_index, B):
ignore_index=(ignore_index if ignore_index is not None else -100),
)

ignore_index = ignore_index if ignore_index is not None else -1
ignore_index = ignore_index if ignore_index is not None else -100
regular_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=0, ignore_index=ignore_index)
assert torch.equal(baseline_loss, regular_loss)
assert regular_loss.numel() == 1
Expand Down
4 changes: 2 additions & 2 deletions tutorials/prepare_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ You can also customize how the dataset is read by using these additional paramet

- `mask_inputs`: Whether to mask the prompt section from the label (with `ignore_index`).

- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-1` (used when `mask_inputs` is `True`).
- `ignore_index`: The index to use for labels that should be ignored. Defaults to `-100` (used when `mask_inputs` is `True`).

To use the settings described above, you can add the respective command line arguments when calling the finetuning scripts as shown in the example below:

Expand All @@ -362,7 +362,7 @@ python litgpt/finetune/lora.py \
--data.val_split_fraction 0.1 \
--data.seed 42 \
--data.mask_inputs False \
--data.ignore_index -1 \
--data.ignore_index -100 \
--checkpoint_dir "checkpoints/tiiuae/falcon-7b"
```

Expand Down
2 changes: 1 addition & 1 deletion xla/scripts/prepare_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def prepare(
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json",
ignore_index: int = -1,
ignore_index: int = -100,
max_seq_length: Optional[int] = None,
) -> None:
"""Prepare the Alpaca dataset for instruction tuning.
Expand Down

0 comments on commit a7ab349

Please sign in to comment.