Move prepare_* scripts inside the package (#1044)
carmocca authored Mar 7, 2024
1 parent e511ba6 commit bde8093
Showing 6 changed files with 30 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cpu-tests.yml
@@ -52,7 +52,7 @@ jobs:
         # make sure all modules are importable
         modules=$(
           find * -type f -name "*.py" | \
-          grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | grep -v prepare_slimpajama | grep -v prepare_starcoder | \
+          grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | \
           sed 's/\.py$//' | sed 's/\//./g' | \
           sed 's/.__init__//g' | xargs -I {} echo "import {};"
        )
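For readers unpacking the shell one-liner above: it turns every package `.py` path into an `import x.y;` statement so CI can verify that all modules import cleanly. With the `prepare_*` scripts moved into `litgpt.data`, they no longer need to be excluded. Below is a rough Python rendering of the same filtering logic, included purely as an illustration; the workflow itself runs the shell version.

```python
# Rough Python equivalent of the CI shell pipeline above (illustrative only):
# collect package modules as "import x.y;" statements, skipping excluded patterns.
from pathlib import Path

EXCLUDES = ("tests", "lm_eval", "xla")  # prepare_* scripts are no longer excluded

statements = []
for path in Path(".").rglob("*.py"):
    posix = path.as_posix()
    # mirrors `grep "/"` (subdirectories only) and the `grep -v` filters
    if "/" not in posix or any(token in posix for token in EXCLUDES):
        continue
    # mirrors the sed calls: strip ".py", turn "/" into ".", drop ".__init__"
    module = posix[: -len(".py")].replace("/", ".").replace(".__init__", "")
    statements.append(f"import {module};")
print("\n".join(statements))
```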
13 changes: 5 additions & 8 deletions scripts/prepare_slimpajama.py → litgpt/data/prepare_slimpajama.py

@@ -2,19 +2,12 @@
 
 import json
 import os
-import sys
 import time
 from pathlib import Path
 
-import zstandard as zstd
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
 from litgpt import Tokenizer
 from litgpt.utils import CLI
+from litgpt.data.prepare_starcoder import DataChunkRecipe
 
 
 class SlimPajamaDataRecipe(DataChunkRecipe):
@@ -27,6 +20,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, filepath):
+        import zstandard as zstd
+
         with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
             for row in f:
                 text = json.loads(row)["text"]
@@ -43,6 +38,8 @@ def prepare(
     chunk_size: int = (2049 * 16384),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
    data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
    data_processor = DataProcessor(
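The `+` hunks above apply a deferred-import pattern: `zstandard` and `DataProcessor` move from module scope into the bodies that use them, so importing `litgpt.data.prepare_slimpajama` (as the CI import check above now does) succeeds without the data-preparation extras installed. A minimal sketch of the pattern, with an illustrative function rather than the module's exact code:

```python
import json


def prepare_item(filepath: str):
    # Deferred import: zstandard is only required when this function runs,
    # not when the module is merely imported (e.g. by the CI import check).
    import zstandard as zstd

    with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
        for row in f:
            yield json.loads(row)["text"]
```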
19 changes: 12 additions & 7 deletions scripts/prepare_starcoder.py → litgpt/data/prepare_starcoder.py
@@ -1,21 +1,22 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
 import os
-import sys
 import time
 import traceback
 from pathlib import Path
 
-import pyarrow.parquet as pq
-from lightning.data.streaming import DataChunkRecipe, DataProcessor
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
+from lightning_utilities.core.imports import RequirementCache
 
 from litgpt import Tokenizer
 from litgpt.utils import CLI
 
+_LITDATA_AVAILABLE = RequirementCache("litdata")
+if _LITDATA_AVAILABLE:
+    from lightning.data.streaming import DataChunkRecipe
+else:
+    DataChunkRecipe = object
+
 
 class StarcoderDataRecipe(DataChunkRecipe):
     def __init__(self, tokenizer: Tokenizer, chunk_size: int):
@@ -27,6 +28,8 @@ def prepare_structure(self, input_dir):
         return [str(file) for file in files]
 
     def prepare_item(self, item_metadata):
+        import pyarrow.parquet as pq
+
         filepath = item_metadata
         start = time.time()
 
@@ -54,6 +57,8 @@ def prepare(
     chunk_size: int = (2049 * 8192),
     fast_dev_run: bool = False,
 ) -> None:
+    from lightning.data.streaming import DataProcessor
+
     tokenizer = Tokenizer(tokenizer_path)
    data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
    data_processor = DataProcessor(
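The `RequirementCache` guard is what keeps this module importable when the optional `litdata` dependency is absent: the real `DataChunkRecipe` base class is used when available, and plain `object` stands in otherwise so the class definition still parses. Condensed from the hunk above:

```python
from lightning_utilities.core.imports import RequirementCache

# Truthy only if the optional dependency is installed.
_LITDATA_AVAILABLE = RequirementCache("litdata")
if _LITDATA_AVAILABLE:
    from lightning.data.streaming import DataChunkRecipe
else:
    # Fallback base class: the module imports cleanly; code that actually
    # runs the recipe will fail later if litdata is missing.
    DataChunkRecipe = object


class StarcoderDataRecipe(DataChunkRecipe):
    ...
```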
8 changes: 4 additions & 4 deletions requirements-all.txt
@@ -6,12 +6,12 @@ sentencepiece # llama-based models
 tokenizers # pythia, falcon, redpajama
 datasets # eval
 requests # litgpt.data
-zstandard # scripts/prepare_starcoder.py
-pandas # scripts/prepare_starcoder.py
-pyarrow # scripts/prepare_starcoder.py
+zstandard # litgpt.data.prepare_slimpajama.py
+pandas # litgpt.data.prepare_starcoder.py
+pyarrow # litgpt.data.prepare_starcoder.py
 tensorboard # litgpt.pretrain
 torchmetrics # litgpt.pretrain
 # eval
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
 # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py
-lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
+lightning[data] @ git+https://github.com/Lightning-AI/lightning@f23b3b1e7fdab1d325f79f69a28706d33144f27e
17 changes: 4 additions & 13 deletions tutorials/convert_lit_models.md
@@ -78,16 +78,7 @@ Then, we download the model we specified via `$repo_id` above:
 python scripts/download.py --repo_id $repo_id
 ```
 
-
-2. Prepare a dataset for finetuning:
-
-```bash
-python scripts/prepare_alpaca.py \
-    --checkpoint_dir checkpoints/$repo_id \
-    --destination_path data/alpaca
-```
-
-3. Finetune the model:
+2. Finetune the model:
 
 
 ```bash
@@ -100,7 +91,7 @@ python litgpt/finetune/lora.py \
     --data Alpaca
 ```
 
-4. Merge LoRA weights:
+3. Merge LoRA weights:
 
 Note that this step only applies if the model was finetuned with `lora.py` above and not when `full.py` was used for finetuning.
 
@@ -110,7 +101,7 @@
 python scripts/merge_lora.py \
 ```
 
 
-5. Convert the finetuning model back into a HF format:
+4. Convert the finetuning model back into a HF format:
 
 ```bash
 python scripts/convert_lit_checkpoint.py \
@@ -120,7 +111,7 @@
 ```
 
 
-6. Load the model into a `transformers` model:
+5. Load the model into a `transformers` model:
 
 ```python
 import torch
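The `python` block above is truncated by the diff view after `import torch`. As a hedged sketch of what loading the converted checkpoint into `transformers` can look like (the paths and checkpoint directory below are placeholders, not necessarily the tutorial's exact code):

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder path: substitute the converted checkpoint from the previous step.
state_dict = torch.load("out/hf-checkpoint/model.pth")

model = AutoModelForCausalLM.from_pretrained(
    "checkpoints/EleutherAI/pythia-1b",  # placeholder local checkpoint directory
    local_files_only=True,
    state_dict=state_dict,  # override the weights with the converted ones
)
```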
8 changes: 4 additions & 4 deletions tutorials/pretrain_tinyllama.md
@@ -67,7 +67,7 @@ You will require **1.1 TB** of disk space for Starcoder and **2.5 TB** of space
 **Starcoder:**
 
 ```bash
-python scripts/prepare_starcoder.py \
+python litgpt/data/prepare_starcoder.py \
     --input_dir data/starcoderdata-raw \
     --output_dir data/starcoder \
     --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
@@ -76,17 +76,17 @@
 **SlimPajama:**
 
 ```bash
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
     --input_dir data/slimpajama-raw/validation \
     --output_dir data/slimpajama/val \
     --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
     --input_dir data/slimpajama-raw/test \
     --output_dir data/slimpajama/test \
     --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 
-python scripts/prepare_slimpajama.py \
+python litgpt/data/prepare_slimpajama.py \
    --input_dir data/slimpajama-raw/train \
    --output_dir data/slimpajama/train \
    --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
