From bde8093551c0f49269ac940947ba5cd9ceea043a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 7 Mar 2024 18:56:48 +0100
Subject: [PATCH] Move `prepare_*` scripts inside the package (#1044)

---
 .github/workflows/cpu-tests.yml               |  2 +-
 .../data}/prepare_slimpajama.py               | 13 +++++--------
 {scripts => litgpt/data}/prepare_starcoder.py | 19 ++++++++++++-------
 requirements-all.txt                          |  8 ++++----
 tutorials/convert_lit_models.md               | 17 ++++-------------
 tutorials/pretrain_tinyllama.md               |  8 ++++----
 6 files changed, 30 insertions(+), 37 deletions(-)
 rename {scripts => litgpt/data}/prepare_slimpajama.py (89%)
 rename {scripts => litgpt/data}/prepare_starcoder.py (86%)

diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml
index 3b0de8debb..58cbd9690a 100644
--- a/.github/workflows/cpu-tests.yml
+++ b/.github/workflows/cpu-tests.yml
@@ -52,7 +52,7 @@ jobs:
         # make sure all modules are importable
         modules=$(
           find * -type f -name "*.py" | \
-            grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | grep -v prepare_slimpajama | grep -v prepare_starcoder | \
+            grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | \
             sed 's/\.py$//' | sed 's/\//./g' | \
             sed 's/.__init__//g' | xargs -I {} echo "import {};"
         )
diff --git a/scripts/prepare_slimpajama.py b/litgpt/data/prepare_slimpajama.py
similarity index 89%
rename from scripts/prepare_slimpajama.py
rename to litgpt/data/prepare_slimpajama.py
index 35c4b2718c..c89b199564 100644
--- a/scripts/prepare_slimpajama.py
+++ b/litgpt/data/prepare_slimpajama.py
@@ -2,19 +2,12 @@
 
 import json
 import os
-import sys
 import time
 from pathlib import Path
 
-import zstandard as zstd
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
 from litgpt import Tokenizer
 from litgpt.utils import CLI
+from litgpt.data.prepare_starcoder import DataChunkRecipe
 
 
 class SlimPajamaDataRecipe(DataChunkRecipe):
@@ -27,6 +20,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, filepath):
+        import zstandard as zstd
+
         with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
             for row in f:
                 text = json.loads(row)["text"]
@@ -43,6 +38,8 @@ def prepare(
     chunk_size: int = (2049 * 16384),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
     data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
     data_processor = DataProcessor(
diff --git a/scripts/prepare_starcoder.py b/litgpt/data/prepare_starcoder.py
similarity index 86%
rename from scripts/prepare_starcoder.py
rename to litgpt/data/prepare_starcoder.py
index 45bd6b478d..d7d1bdff45 100644
--- a/scripts/prepare_starcoder.py
+++ b/litgpt/data/prepare_starcoder.py
@@ -1,21 +1,22 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
 import os
-import sys
 import time
 import traceback
 from pathlib import Path
 
-import pyarrow.parquet as pq
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
+from lightning_utilities.core.imports import RequirementCache
 
 from litgpt import Tokenizer
 from litgpt.utils import CLI
 
+_LITDATA_AVAILABLE = RequirementCache("litdata")
+if _LITDATA_AVAILABLE:
+    from lightning.data.streaming import DataChunkRecipe
+else:
+    DataChunkRecipe = object
+
 
 class StarcoderDataRecipe(DataChunkRecipe):
     def __init__(self, tokenizer: Tokenizer, chunk_size: int):
@@ -27,6 +28,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, item_metadata):
+        import pyarrow.parquet as pq
+
         filepath = item_metadata
         start = time.time()
 
@@ -54,6 +57,8 @@ def prepare(
     chunk_size: int = (2049 * 8192),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
     data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
     data_processor = DataProcessor(
diff --git a/requirements-all.txt b/requirements-all.txt
index 2823ca818e..81dde77b6f 100644
--- a/requirements-all.txt
+++ b/requirements-all.txt
@@ -6,12 +6,12 @@ sentencepiece # llama-based models
 tokenizers # pythia, falcon, redpajama
 datasets # eval
 requests # litgpt.data
-zstandard # scripts/prepare_starcoder.py
-pandas # scripts/prepare_starcoder.py
-pyarrow # scripts/prepare_starcoder.py
+zstandard # litgpt.data.prepare_slimpajama.py
+pandas # litgpt.data.prepare_starcoder.py
+pyarrow # litgpt.data.prepare_starcoder.py
 tensorboard # litgpt.pretrain
 torchmetrics # litgpt.pretrain
 # eval
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
 # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py
-lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
+lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
\ No newline at end of file
diff --git a/tutorials/convert_lit_models.md b/tutorials/convert_lit_models.md
index 13b4017a07..53ba412798 100644
--- a/tutorials/convert_lit_models.md
+++ b/tutorials/convert_lit_models.md
@@ -78,16 +78,7 @@ Then, we download the model we specified via `$repo_id` above:
 python scripts/download.py --repo_id $repo_id
 ```
 
-
-2. Prepare a dataset for finetuning:
-
-```bash
-python scripts/prepare_alpaca.py \
-    --checkpoint_dir checkpoints/$repo_id \
-    --destination_path data/alpaca
-```
-
-3. Finetune the model:
+2. Finetune the model:
 
 ```bash
@@ -100,7 +91,7 @@ python litgpt/finetune/lora.py \
     --data Alpaca
 ```
 
-4. Merge LoRA weights:
+3. Merge LoRA weights:
 
 Note that this step only applies if the model was finetuned with `lora.py`
 above and not when `full.py` was used for finetuning.
@@ -110,7 +101,7 @@ python scripts/merge_lora.py \
 ```
 
-5. Convert the finetuning model back into a HF format:
+4. Convert the finetuning model back into a HF format:
 
 ```bash
 python scripts/convert_lit_checkpoint.py \
@@ -120,7 +111,7 @@
 ```
 
-6. Load the model into a `transformers` model:
+5. Load the model into a `transformers` model:
 
 ```python
 import torch
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index 044c83cd24..8438fe3a64 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -67,7 +67,7 @@ You will require **1.1 TB** of disk space for Starcoder and **2.5** TB of space
 **Starcoder:**
 
 ```bash
-python scripts/prepare_starcoder.py \
+python litgpt/data/prepare_starcoder.py \
   --input_dir data/starcoderdata-raw \
   --output_dir data/starcoder \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
@@ -76,17 +76,17 @@
 **SlimPajama:**
 
 ```bash
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/validation \
   --output_dir data/slimpajama/val \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/test \
   --output_dir data/slimpajama/test \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/train \
   --output_dir data/slimpajama/train \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
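For reference, the trick that lets the moved scripts live inside the package (and lets the CI workflow drop its `grep -v prepare_slimpajama | grep -v prepare_starcoder` exclusions) is the optional-dependency pattern visible in the `prepare_starcoder.py` hunk above: guard the import with `RequirementCache`, fall back to a plain base class, and defer heavy imports to call time. A minimal standalone sketch of that pattern, using only names that appear in the diff (the `pq.read_table` call is hypothetical usage for illustration):

```python
# The module must import cleanly even when the optional `litdata` package
# (which backs `lightning.data`) is not installed.
from lightning_utilities.core.imports import RequirementCache

_LITDATA_AVAILABLE = RequirementCache("litdata")
if _LITDATA_AVAILABLE:
    from lightning.data.streaming import DataChunkRecipe
else:
    # Placeholder base class: subclassing still works, so the module stays
    # importable; actually running the recipe requires `litdata`.
    DataChunkRecipe = object


class StarcoderDataRecipe(DataChunkRecipe):
    def prepare_item(self, item_metadata):
        # Heavy optional dependencies are imported lazily, inside the method
        # that needs them, so importing this module never requires pyarrow.
        import pyarrow.parquet as pq

        table = pq.read_table(item_metadata)  # hypothetical usage
        ...
```

The same per-function lazy import is applied to `DataProcessor` and `zstandard` in both scripts: anything not needed at import time is pulled in only by the function that uses it.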