From bde8093551c0f49269ac940947ba5cd9ceea043a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 7 Mar 2024 18:56:48 +0100
Subject: [PATCH] Move `prepare_*` scripts inside the package (#1044)

---
 .github/workflows/cpu-tests.yml               |  2 +-
 .../data}/prepare_slimpajama.py               | 13 +++++--------
 {scripts => litgpt/data}/prepare_starcoder.py | 19 ++++++++++++-------
 requirements-all.txt                          |  8 ++++----
 tutorials/convert_lit_models.md               | 17 ++++-------------
 tutorials/pretrain_tinyllama.md               |  8 ++++----
 6 files changed, 30 insertions(+), 37 deletions(-)
 rename {scripts => litgpt/data}/prepare_slimpajama.py (89%)
 rename {scripts => litgpt/data}/prepare_starcoder.py (86%)

diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml
index 3b0de8debb..58cbd9690a 100644
--- a/.github/workflows/cpu-tests.yml
+++ b/.github/workflows/cpu-tests.yml
@@ -52,7 +52,7 @@ jobs:
         # make sure all modules are importable
         modules=$(
           find * -type f -name "*.py" | \
-            grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | grep -v prepare_slimpajama | grep -v prepare_starcoder | \
+            grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | \
             sed 's/\.py$//' | sed 's/\//./g' | \
             sed 's/.__init__//g' | xargs -I {} echo "import {};"
         )
diff --git a/scripts/prepare_slimpajama.py b/litgpt/data/prepare_slimpajama.py
similarity index 89%
rename from scripts/prepare_slimpajama.py
rename to litgpt/data/prepare_slimpajama.py
index 35c4b2718c..c89b199564 100644
--- a/scripts/prepare_slimpajama.py
+++ b/litgpt/data/prepare_slimpajama.py
@@ -2,19 +2,12 @@
 
 import json
 import os
-import sys
 import time
 from pathlib import Path
 
-import zstandard as zstd
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
 from litgpt import Tokenizer
 from litgpt.utils import CLI
+from litgpt.data.prepare_starcoder import DataChunkRecipe
 
 
 class SlimPajamaDataRecipe(DataChunkRecipe):
@@ -27,6 +20,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, filepath):
+        import zstandard as zstd
+
         with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
             for row in f:
                 text = json.loads(row)["text"]
@@ -43,6 +38,8 @@ def prepare(
     chunk_size: int = (2049 * 16384),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
     data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
     data_processor = DataProcessor(
diff --git a/scripts/prepare_starcoder.py b/litgpt/data/prepare_starcoder.py
similarity index 86%
rename from scripts/prepare_starcoder.py
rename to litgpt/data/prepare_starcoder.py
index 45bd6b478d..d7d1bdff45 100644
--- a/scripts/prepare_starcoder.py
+++ b/litgpt/data/prepare_starcoder.py
@@ -1,21 +1,22 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
 import os
-import sys
 import time
 import traceback
 from pathlib import Path
 
-import pyarrow.parquet as pq
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
+from lightning_utilities.core.imports import RequirementCache
 
 from litgpt import Tokenizer
 from litgpt.utils import CLI
 
+_LITDATA_AVAILABLE = RequirementCache("litdata")
+if _LITDATA_AVAILABLE:
+    from lightning.data.streaming import DataChunkRecipe
+else:
+    DataChunkRecipe = object
+
 
 class StarcoderDataRecipe(DataChunkRecipe):
     def __init__(self, tokenizer: Tokenizer, chunk_size: int):
@@ -27,6 +28,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, item_metadata):
+        import pyarrow.parquet as pq
+
         filepath = item_metadata
         start = time.time()
 
@@ -54,6 +57,8 @@ def prepare(
     chunk_size: int = (2049 * 8192),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
     data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
     data_processor = DataProcessor(
diff --git a/requirements-all.txt b/requirements-all.txt
index 2823ca818e..81dde77b6f 100644
--- a/requirements-all.txt
+++ b/requirements-all.txt
@@ -6,12 +6,12 @@ sentencepiece # llama-based models
 tokenizers # pythia, falcon, redpajama
 datasets # eval
 requests # litgpt.data
-zstandard # scripts/prepare_starcoder.py
-pandas # scripts/prepare_starcoder.py
-pyarrow # scripts/prepare_starcoder.py
+zstandard # litgpt.data.prepare_slimpajama.py
+pandas # litgpt.data.prepare_starcoder.py
+pyarrow # litgpt.data.prepare_starcoder.py
 tensorboard # litgpt.pretrain
 torchmetrics # litgpt.pretrain
 # eval
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
 # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py
-lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
+lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
\ No newline at end of file
diff --git a/tutorials/convert_lit_models.md b/tutorials/convert_lit_models.md
index 13b4017a07..53ba412798 100644
--- a/tutorials/convert_lit_models.md
+++ b/tutorials/convert_lit_models.md
@@ -78,16 +78,7 @@ Then, we download the model we specified via `$repo_id` above:
 python scripts/download.py --repo_id $repo_id
 ```
 
-
-2. Prepare a dataset for finetuning:
-
-```bash
-python scripts/prepare_alpaca.py \
-    --checkpoint_dir checkpoints/$repo_id \
-    --destination_path data/alpaca
-```
-
-3. Finetune the model:
+2. Finetune the model:
 
 ```bash
@@ -100,7 +91,7 @@ python litgpt/finetune/lora.py \
     --data Alpaca
 ```
 
-4. Merge LoRA weights:
+3. Merge LoRA weights:
 
 Note that this step only applies if the model was finetuned with `lora.py`
 above and not when `full.py` was used for finetuning.
@@ -110,7 +101,7 @@ python scripts/merge_lora.py \
 ```
 
-5. Convert the finetuning model back into a HF format:
+4. Convert the finetuning model back into a HF format:
 
 ```bash
 python scripts/convert_lit_checkpoint.py \
@@ -120,7 +111,7 @@
 ```
 
-6. Load the model into a `transformers` model:
+5. Load the model into a `transformers` model:
 
 ```python
 import torch
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index 044c83cd24..8438fe3a64 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -67,7 +67,7 @@ You will require **1.1 TB** of disk space for Starcoder and **2.5** TB of space
 **Starcoder:**
 
 ```bash
-python scripts/prepare_starcoder.py \
+python litgpt/data/prepare_starcoder.py \
   --input_dir data/starcoderdata-raw \
   --output_dir data/starcoder \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
@@ -76,17 +76,17 @@
 **SlimPajama:**
 
 ```bash
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/validation \
   --output_dir data/slimpajama/val \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/test \
   --output_dir data/slimpajama/test \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
   --input_dir data/slimpajama-raw/train \
   --output_dir data/slimpajama/train \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
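For reference, the trick that lets the moved scripts live inside the package (and lets the CI workflow drop its `grep -v prepare_slimpajama | grep -v prepare_starcoder` exclusions) is the optional-dependency pattern visible in the `prepare_starcoder.py` hunk above: guard the import with `RequirementCache`, fall back to a plain base class, and defer heavy imports to call time. A minimal standalone sketch of that pattern, using only names that appear in the diff (the `pq.read_table` call is hypothetical usage for illustration):

```python
# The module must import cleanly even when the optional `litdata` package
# (which backs `lightning.data`) is not installed.
from lightning_utilities.core.imports import RequirementCache

_LITDATA_AVAILABLE = RequirementCache("litdata")
if _LITDATA_AVAILABLE:
    from lightning.data.streaming import DataChunkRecipe
else:
    # Placeholder base class: subclassing still works, so the module stays
    # importable; actually running the recipe requires `litdata`.
    DataChunkRecipe = object


class StarcoderDataRecipe(DataChunkRecipe):
    def prepare_item(self, item_metadata):
        # Heavy optional dependencies are imported lazily, inside the method
        # that needs them, so importing this module never requires pyarrow.
        import pyarrow.parquet as pq

        table = pq.read_table(item_metadata)  # hypothetical usage
        ...
```

The same per-function lazy import is applied to `DataProcessor` and `zstandard` in both scripts: anything not needed at import time is pulled in only by the function that uses it.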