Skip to content

Commit

Permalink
Fix base requirements (#916)
Browse files · Browse the repository at this point in the history
  • Loading branch information
carmocca authored Feb 5, 2024
1 parent cca2986 commit d80842c
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 68 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,7 @@ git clone https://github.com/Lightning-AI/lit-gpt
cd lit-gpt
```

Install the minimal dependencies:

```bash
pip install -r requirements.txt
```

Install with all dependencies (including quantization, sentencepiece, tokenizers for Llama models, etc.):
Install with all dependencies (including CLI, quantization, tokenizers for all models, etc.):

```bash
pip install -r requirements-all.txt
Expand Down
22 changes: 12 additions & 10 deletions requirements-all.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
-r requirements.txt
bitsandbytes==0.41.0 # quantization
scipy # required by bitsandbytes
sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py
pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py
pyarrow # scripts/prepare_starcoder.py
tensorboard # pretrain/tinyllama.py
torchmetrics # pretrain/tinyllama.py
jsonargparse[signatures] # CLI
bitsandbytes==0.41.0 # quantization
scipy # required by bitsandbytes
sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
requests # scripts/prepare_*
zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py
pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py
pyarrow # scripts/prepare_starcoder.py
tensorboard # pretrain/tinyllama.py
torchmetrics # pretrain/tinyllama.py
# eval
git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
# scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch>=2.2.0
lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af
jsonargparse[signatures] # CLI
7 changes: 6 additions & 1 deletion scripts/prepare_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from lightning_utilities.core.imports import RequirementCache
from torch.utils.data import random_split
from tqdm import tqdm

Expand Down Expand Up @@ -90,6 +90,11 @@ def download_if_missing(file_path: Path, file_url: str) -> None:
"""Downloads the raw json data file and saves it in the given destination."""
if file_path.exists() and file_path.stat().st_size > 0:
return
requests_available = RequirementCache("requests")
if not requests_available:
raise ModuleNotFoundError(str(requests_available))
import requests

with open(file_path, "w", encoding="utf-8") as f:
f.write(requests.get(file_url).text)

Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from torch.utils.data import random_split
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
Expand Down Expand Up @@ -92,14 +93,6 @@ def prepare(
torch.save(test_set, destination_path / "test.pt")


def download_if_missing(file_path: Path, file_url: str) -> None:
    """Downloads the raw json data file and saves it in the given destination.

    The download is skipped only when the destination already holds a
    non-empty file, so a zero-byte leftover from an interrupted previous
    download is fetched again instead of being silently kept.
    """
    # Size check keeps this consistent with the other prepare_* helpers,
    # which treat an empty file as "missing".
    if file_path.exists() and file_path.stat().st_size > 0:
        return
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(requests.get(file_url).text)


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
"""Processes a single sample.
Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_flan.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,18 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from lit_gpt.tokenizer import Tokenizer


def download_if_missing(file_path: Path, file_url: str):
    """Fetch ``file_url`` and store it at ``file_path``.

    Nothing is downloaded when the destination already contains a
    non-empty file.
    """
    # A zero-byte file is treated as missing: it is the residue of a
    # previously failed download.
    if not (file_path.exists() and file_path.stat().st_size > 0):
        with open(file_path, "w", encoding="utf-8") as destination:
            destination.write(requests.get(file_url).text)


def load_jsonl(filename):
data = []
with open(filename, "r", encoding="utf-8") as f:
Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_longform.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
Expand Down Expand Up @@ -90,14 +91,6 @@ def prepare(
torch.save(test_data, destination_path / "test.pt")


def download_if_missing(file_path: Path, file_url: str) -> None:
    """Save the contents of ``file_url`` at ``file_path``.

    The request is skipped when a non-empty file already sits at the
    destination.
    """
    # Short-circuit guards the stat() call when the file does not exist.
    needs_download = not file_path.exists() or file_path.stat().st_size == 0
    if not needs_download:
        return
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(requests.get(file_url).text)


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
"""Processes a single sample.
Expand Down
15 changes: 6 additions & 9 deletions tests/test_packed_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,22 @@
from unittest.mock import MagicMock

import pytest
import requests
from torch.utils.data import IterableDataset


def maybe_get_file(url, file_path):
    """Download ``url`` to ``file_path`` unless a usable copy already exists.

    A zero-byte file is treated as missing so that a truncated download
    left behind by an interrupted earlier test run is fetched again,
    matching the behavior of the scripts' ``download_if_missing`` helpers.
    """
    if not file_path.exists() or file_path.stat().st_size == 0:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(requests.get(url).text)
from scripts.prepare_alpaca import download_if_missing


def test_packed_dataset(tmp_path):
tmp_path.mkdir(parents=True, exist_ok=True)

vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)

tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)

from lit_gpt import Tokenizer
Expand Down
24 changes: 11 additions & 13 deletions tests/test_prepare_redpajama.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,17 @@
from pathlib import Path
from unittest import mock

import requests


def maybe_get_file(url, file_path):
    """Ensure ``file_path`` holds the contents of ``url``, downloading on first use."""
    # Guard clause instead of nesting: a present file means nothing to do.
    if file_path.exists():
        return
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.write(requests.get(url).text)
from scripts.prepare_alpaca import download_if_missing


def test_prepare_sample(tmp_path):
vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)
tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)
with open(tmp_path / "lit_config.json", "w") as f:
json.dump({"block_size": 2048}, f)
Expand Down Expand Up @@ -66,10 +62,12 @@ def test_prepare_sample(tmp_path):

def test_prepare_full(tmp_path):
vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)
tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)
with open(tmp_path / "lit_config.json", "w") as f:
json.dump({"block_size": 2048}, f)
Expand Down

0 comments on commit d80842c

Please sign in to comment.