Skip to content

Commit

Permalink
Fix base requirements (#916)
Browse files · Browse the repository at this point in the history
  • Loading branch information
carmocca authored Feb 5, 2024
1 parent cca2986 commit d80842c
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 68 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,7 @@ git clone https://github.com/Lightning-AI/lit-gpt
cd lit-gpt
```

Install the minimal dependencies:

```bash
pip install -r requirements.txt
```

Install with all dependencies (including quantization, sentencepiece, tokenizers for Llama models, etc.):
Install with all dependencies (including CLI, quantization, tokenizers for all models, etc.):

```bash
pip install -r requirements-all.txt
Expand Down
22 changes: 12 additions & 10 deletions requirements-all.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
-r requirements.txt
bitsandbytes==0.41.0 # quantization
scipy # required by bitsandbytes
sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py
pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py
pyarrow # scripts/prepare_starcoder.py
tensorboard # pretrain/tinyllama.py
torchmetrics # pretrain/tinyllama.py
jsonargparse[signatures] # CLI
bitsandbytes==0.41.0 # quantization
scipy # required by bitsandbytes
sentencepiece # llama-based models
tokenizers # pythia, falcon, redpajama
datasets # eval
requests # scripts/prepare_*
zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py
pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py
pyarrow # scripts/prepare_starcoder.py
tensorboard # pretrain/tinyllama.py
torchmetrics # pretrain/tinyllama.py
# eval
git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529
# scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch>=2.2.0
lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af
jsonargparse[signatures] # CLI
7 changes: 6 additions & 1 deletion scripts/prepare_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from lightning_utilities.core.imports import RequirementCache
from torch.utils.data import random_split
from tqdm import tqdm

Expand Down Expand Up @@ -90,6 +90,11 @@ def download_if_missing(file_path: Path, file_url: str) -> None:
"""Downloads the raw json data file and saves it in the given destination."""
if file_path.exists() and file_path.stat().st_size > 0:
return
requests_available = RequirementCache("requests")
if not requests_available:
raise ModuleNotFoundError(str(requests_available))
import requests

with open(file_path, "w", encoding="utf-8") as f:
f.write(requests.get(file_url).text)

Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from torch.utils.data import random_split
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
Expand Down Expand Up @@ -92,14 +93,6 @@ def prepare(
torch.save(test_set, destination_path / "test.pt")


def download_if_missing(file_path: Path, file_url: str) -> None:
    """Downloads the raw json data file and saves it in the given destination.

    The download is skipped only when the destination already holds a
    non-empty file, so a zero-byte leftover from an interrupted previous
    download is fetched again instead of being silently kept.
    """
    # Size check keeps this consistent with the other prepare_* helpers,
    # which treat an empty file as "missing".
    if file_path.exists() and file_path.stat().st_size > 0:
        return
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(requests.get(file_url).text)


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
"""Processes a single sample.
Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_flan.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,18 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from lit_gpt.tokenizer import Tokenizer


def download_if_missing(file_path: Path, file_url: str):
    """Fetch ``file_url`` and store it at ``file_path``.

    Nothing is downloaded when the destination already contains a
    non-empty file.
    """
    # A zero-byte file is treated as missing: it is the residue of a
    # previously failed download.
    if not (file_path.exists() and file_path.stat().st_size > 0):
        with open(file_path, "w", encoding="utf-8") as destination:
            destination.write(requests.get(file_url).text)


def load_jsonl(filename):
data = []
with open(filename, "r", encoding="utf-8") as f:
Expand Down
11 changes: 2 additions & 9 deletions scripts/prepare_longform.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from pathlib import Path
from typing import Optional

import requests
import torch
from tqdm import tqdm

from scripts.prepare_alpaca import download_if_missing

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
Expand Down Expand Up @@ -90,14 +91,6 @@ def prepare(
torch.save(test_data, destination_path / "test.pt")


def download_if_missing(file_path: Path, file_url: str) -> None:
    """Save the contents of ``file_url`` at ``file_path``.

    The request is skipped when a non-empty file already sits at the
    destination.
    """
    # Short-circuit guards the stat() call when the file does not exist.
    needs_download = not file_path.exists() or file_path.stat().st_size == 0
    if not needs_download:
        return
    with open(file_path, "w", encoding="utf-8") as out:
        out.write(requests.get(file_url).text)


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
"""Processes a single sample.
Expand Down
15 changes: 6 additions & 9 deletions tests/test_packed_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,22 @@
from unittest.mock import MagicMock

import pytest
import requests
from torch.utils.data import IterableDataset


def maybe_get_file(url, file_path):
    """Download ``url`` to ``file_path`` unless a usable copy already exists.

    A zero-byte file is treated as missing so that a truncated download
    left behind by an interrupted earlier test run is fetched again,
    matching the behavior of the scripts' ``download_if_missing`` helpers.
    """
    if not file_path.exists() or file_path.stat().st_size == 0:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(requests.get(url).text)
from scripts.prepare_alpaca import download_if_missing


def test_packed_dataset(tmp_path):
tmp_path.mkdir(parents=True, exist_ok=True)

vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)

tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)

from lit_gpt import Tokenizer
Expand Down
24 changes: 11 additions & 13 deletions tests/test_prepare_redpajama.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,17 @@
from pathlib import Path
from unittest import mock

import requests


def maybe_get_file(url, file_path):
    """Ensure ``file_path`` holds the contents of ``url``, downloading on first use."""
    # Guard clause instead of nesting: a present file means nothing to do.
    if file_path.exists():
        return
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.write(requests.get(url).text)
from scripts.prepare_alpaca import download_if_missing


def test_prepare_sample(tmp_path):
vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)
tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)
with open(tmp_path / "lit_config.json", "w") as f:
json.dump({"block_size": 2048}, f)
Expand Down Expand Up @@ -66,10 +62,12 @@ def test_prepare_sample(tmp_path):

def test_prepare_full(tmp_path):
vocabulary_path = tmp_path / "tokenizer.json"
maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path)
download_if_missing(
vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json"
)
tokenizer_path = tmp_path / "tokenizer_config.json"
maybe_get_file(
"https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path
download_if_missing(
tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json"
)
with open(tmp_path / "lit_config.json", "w") as f:
json.dump({"block_size": 2048}, f)
Expand Down

0 comments on commit d80842c

Please sign in to comment.