Merge branch 'main' into multiline
lantiga authored Apr 16, 2024
2 parents 2a7c397 + 410a712 commit 74f33d2
Showing 21 changed files with 145 additions and 51 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -1 +1,2 @@
* @awaelchli @carmocca @lantiga
+/README.md @williamfalcon @lantiga
14 changes: 7 additions & 7 deletions .github/workflows/cpu-tests.yml
@@ -39,14 +39,14 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

+      - name: Install uv
+        run: pip install uv
-          cache: 'pip'
-          cache-dependency-path: |
-            pyproject.toml
- name: Install minimal dependencies
run: |
+          uv pip install --system .
+          uv pip list
-          pip install .
-          pip list
# make sure all modules are still importable with only the minimal dependencies available
modules=$(
find litgpt -type f -name "*.py" | \
@@ -58,8 +58,8 @@ jobs:
- name: Install all dependencies
run: |
+          uv pip install --system '.[all,test]'
+          uv pip list
-          pip install '.[all,test]'
-          pip list
- name: Run tests
run: |
84 changes: 68 additions & 16 deletions README.md
@@ -15,7 +15,7 @@ Uses the latest state-of-the-art techniques:


![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-lightning)
-![cpu-tests](https://github.com/lightning-AI/lit-stablelm/actions/workflows/cpu-tests.yml/badge.svg) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lit-stablelm/blob/master/LICENSE) [![Discord](https://img.shields.io/discord/1077906959069626439?style=plastic)](https://discord.gg/VptPCZkGNa)
+![cpu-tests](https://github.com/lightning-AI/lit-stablelm/actions/workflows/cpu-tests.yml/badge.svg) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lit-stablelm/blob/master/LICENSE) [![Discord](https://img.shields.io/discord/1077906959069626439)](https://discord.gg/VptPCZkGNa)

<p align="center">
<a href="https://lightning.ai/">Lightning.ai</a> •
@@ -25,14 +25,24 @@ Uses the latest state-of-the-art techniques:
<a href="#finetune-an-llm">Finetune, pretrain LLMs</a> •
<a href="#choose-from-20-llms">Models</a> •
<a href="#state-of-the-art-features">Features</a> •
<a href="#training-recipes">Training recipes (YAML)</a> •
<a href="#litgpt-design-principles">Design principles</a>
<a href="#training-recipes">Training recipes (YAML)</a>
</p>

</div>

&nbsp;

+## Finetune, pretrain and deploy AI models Lightning fast ⚡⚡
+LitGPT is a command-line tool to use, pretrain, finetune and deploy LLMs. It is built on configs with highly optimized recipes for training the world's largest, most powerful open-source LLMs.
+
+We've reimplemented all the model architectures and training recipes for three reasons:
+
+1. Remove all abstraction layers and have single-file implementations.
+2. Guarantee Apache 2.0 compliance to enable enterprise use without limits.
+3. Optimize every detail of every model for the fastest possible performance, lowering cost and training time.
+
+&nbsp;

## Install LitGPT

Install LitGPT with all dependencies (including CLI, quantization, tokenizers for all models, etc.):
@@ -60,8 +70,7 @@ pip install -e '.[all]'
---

# Get started
-LitGPT is a command-line tool to use, pretrain, finetune and deploy LLMs.
-
+LitGPT is CLI and config-based. Select the model and the action you want to take on that model (finetune, pretrain, evaluate, deploy, etc.):

&nbsp;

@@ -97,32 +106,64 @@ litgpt finetune lora \
--checkpoint_dir checkpoints/microsoft/phi-2 \
--data JSON \
--data.json_path my_custom_dataset.json \
-  --val_split_fraction 0.1 \
+  --data.val_split_fraction 0.1 \
--out_dir out/phi-2-lora

# 3) Chat with the model
litgpt chat \
--checkpoint_dir out/phi-2-lora/final
```
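
The `--data JSON` flag above points the finetuning run at a list of instruction records. As a minimal sketch (the records themselves are hypothetical; the instruction/input/output field names follow the Alpaca-style schema LitGPT's JSON loader expects), such a file can be generated like this:

```python
import json

# Hypothetical toy dataset in the instruction/input/output schema
# used by LitGPT's JSON data module.
records = [
    {"instruction": "Name the capital of France.", "input": "", "output": "Paris."},
    {"instruction": "Summarize the text.", "input": "LitGPT finetunes LLMs.", "output": "A tool that finetunes LLMs."},
]

# Write UTF-8 explicitly so the file is portable across platforms.
with open("my_custom_dataset.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)
```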

&nbsp;

-### Pretrain an LLM
-Train an LLM from scratch on your own data via [pretraining](tutorials/pretrain.md):
+### Pretrain an LLM
+Train an LLM from scratch on your own data via pretraining:

```bash
mkdir -p custom_texts
curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt

# 1) Download a tokenizer
litgpt download \
--repo_id EleutherAI/pythia-160m \
--tokenizer_only True

# 2) Pretrain the model
litgpt pretrain \
--model_name pythia-160m \
--tokenizer_dir checkpoints/EleutherAI/pythia-160m \
--data TextFiles \
--data.train_data_path "custom_texts/" \
--train.max_tokens 10_000_000 \
--out_dir out/custom-model

# 3) Chat with the model
litgpt chat \
--checkpoint_dir out/custom-model/final
```

### Continue pretraining an LLM
This is another way of finetuning that specializes an already pretrained model by training it on custom data:

```bash
mkdir -p custom_texts
curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_texts/book1.txt
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt
# 1) Download a pretrained model
-litgpt download --repo_id microsoft/phi-2
+litgpt download --repo_id EleutherAI/pythia-160m
-# 2) Finetune the model
+# 2) Continue pretraining the model
litgpt pretrain \
-  --initial_checkpoint_dir checkpoints/microsoft/phi-2 \
-  --data Alpaca2k \
-  --out_dir out/custom-phi-2
+  --model_name pythia-160m \
+  --initial_checkpoint_dir checkpoints/EleutherAI/pythia-160m \
+  --data TextFiles \
+  --data.train_data_path "custom_texts/" \
+  --train.max_tokens 10_000_000 \
+  --out_dir out/custom-model
# 3) Chat with the model
litgpt chat \
-  --checkpoint_dir out/phi-2-lora/final
+  --checkpoint_dir out/custom-model/final
```

&nbsp;
@@ -436,6 +477,17 @@ The LitGPT repository was the official starter kit for the [NeurIPS 2023 LLM Eff

LitGPT powered the [TinyLlama project](https://github.com/jzhang38/TinyLlama) and [TinyLlama: An Open-Source Small Language Model](https://arxiv.org/abs/2401.02385) research paper.

+&nbsp;
+
+**🍪 MicroLlama: MicroLlama-300M**
+
+[MicroLlama](https://github.com/keeeeenw/MicroLlama) is a 300M-parameter Llama model pretrained on 50B tokens, powered by TinyLlama and LitGPT.
+
+&nbsp;
+
+**🔬 Pre-training Small Base LMs with Fewer Tokens**
+
+The research paper ["Pre-training Small Base LMs with Fewer Tokens"](https://arxiv.org/abs/2404.08634), which utilizes LitGPT, develops smaller base language models by inheriting a few transformer blocks from larger models and training on a tiny fraction of the data used by the larger models. It demonstrates that these smaller models can perform comparably to larger models despite using significantly less training data and resources.

&nbsp;

3 changes: 2 additions & 1 deletion extensions/thunder/pretrain.py
@@ -26,6 +26,7 @@
from litgpt.utils import (
CLI,
CycleIterator,
+    capture_hparams,
choose_logger,
chunked_cross_entropy,
copy_config_files,
@@ -97,7 +98,7 @@ def setup(
executors: If using Thunder, the executors to enable.
strategy: If desired, the strategy to use.
"""
-    hparams = locals()
+    hparams = capture_hparams()
data = TinyLlama() if data is None else data
if model_config is not None and model_name is not None:
raise ValueError("Only one of `model_name` or `model_config` can be set.")
2 changes: 1 addition & 1 deletion litgpt/data/tinystories.py
@@ -106,7 +106,7 @@ def val_dataloader(self) -> DataLoader:


def tokenize(filename: str, tokenizer: Tokenizer):
with open(filename, "r") as f:
with open(filename, "r", encoding="utf-8") as f:
data = json.load(f)
global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"])
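
A recurring theme in this commit is adding an explicit `encoding="utf-8"` to every `open()` call. Without it, Python falls back to the platform's locale encoding (often cp1252 on Windows), so files written on Linux can fail to read elsewhere. A minimal sketch of the difference (the file name is hypothetical):

```python
import json

data = {"story": "Héllo, snowman ☃"}  # training text is often non-ASCII

# Portable: the file is UTF-8 regardless of the host platform.
with open("shard.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)

# Portable read; omitting encoding= would fall back to the locale
# default and can raise UnicodeDecodeError on Windows.
with open("shard.json", encoding="utf-8") as f:
    assert json.load(f) == data
```
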
2 changes: 1 addition & 1 deletion litgpt/eval/evaluate.py
@@ -92,7 +92,7 @@ def convert_and_evaluate(
save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
config_filepath = checkpoint_dir/"model_config.yaml"

-    with open(config_filepath) as f:
+    with open(config_filepath, encoding="utf-8") as f:
config_dict = yaml.safe_load(f)
repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"

3 changes: 2 additions & 1 deletion litgpt/pretrain.py
@@ -26,6 +26,7 @@
from litgpt.utils import (
CLI,
CycleIterator,
+    capture_hparams,
choose_logger,
chunked_cross_entropy,
copy_config_files,
@@ -87,7 +88,7 @@ def setup(
logger_name: The name of the logger to send metrics to.
seed: The random seed to use for reproducibility.
"""
-    hparams = locals()
+    hparams = capture_hparams()
data = TinyLlama() if data is None else data
if model_config is not None and model_name is not None:
raise ValueError("Only one of `model_name` or `model_config` can be set.")
6 changes: 3 additions & 3 deletions litgpt/prompts.py
@@ -251,7 +251,7 @@ def stop_tokens(self, tokenizer: "Tokenizer") -> Tuple[List[int], ...]:

class Phi2(PromptStyle):
def apply(self, prompt: str, **kwargs: str) -> str:
return f"Instruct:{prompt}\nOutput:"
return f"Instruct: {prompt}\nOutput:"


class TinyLlama(PromptStyle):
@@ -340,12 +340,12 @@ def save_prompt_style(style: Union[str, PromptStyle], checkpoint_dir: Path) -> None:
cls = type(style)
# Allow saving the full module path for user-defined prompt classes
config = {"class_path": f"{cls.__module__}.{cls.__name__}"}
with open(checkpoint_dir / "prompt_style.yaml", "w") as file:
with open(checkpoint_dir / "prompt_style.yaml", "w", encoding="utf-8") as file:
yaml.dump(config, file)


def load_prompt_style(checkpoint_dir: Path) -> PromptStyle:
with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
config = yaml.safe_load(file)
# Support loading the full module path for user-defined prompt classes
full_module_path, cls_name = config["class_path"].rsplit(".", 1)
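
The `Phi2` fix above only inserts a space after `Instruct:`, matching the prompt format Phi-2 was trained with. A quick sketch of what the corrected style renders (the sample question is made up):

```python
from litgpt.prompts import Phi2

style = Phi2()
print(style.apply("What is the capital of France?"))
# Instruct: What is the capital of France?
# Output:
```
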
2 changes: 1 addition & 1 deletion litgpt/scripts/convert_hf_checkpoint.py
@@ -329,7 +329,7 @@ def convert_hf_checkpoint(
# Load the json file containing weight mapping
pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file
-        with open(pytorch_bin_map_json_path) as json_map:
+        with open(pytorch_bin_map_json_path, encoding="utf-8") as json_map:
bin_index = json.load(json_map)
bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
else:
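
For context on the `convert_hf_checkpoint` change: `pytorch_model.bin.index.json` is the Hugging Face shard index mapping each parameter name to the shard file that stores it, and the surrounding code reduces it to the set of shard paths. A sketch with hypothetical tensor names:

```python
# Hypothetical shard index in the Hugging Face format.
bin_index = {
    "metadata": {"total_size": 524288},
    "weight_map": {
        "transformer.wte.weight": "pytorch_model-00001-of-00002.bin",
        "lm_head.weight": "pytorch_model-00002-of-00002.bin",
    },
}

# Same reduction as in the diff: the unique shard files to load.
bin_files = set(bin_index["weight_map"].values())
print(sorted(bin_files))
# ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']
```
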
2 changes: 1 addition & 1 deletion litgpt/scripts/merge_lora.py
@@ -72,7 +72,7 @@ def load_lora_metadata(checkpoint_dir: Path) -> Tuple[Dict[str, Any], Path, Opti
f" the `litgpt/finetune/lora.py` script."
)

-    with open(hparams_file, "r") as file:
+    with open(hparams_file, "r", encoding="utf-8") as file:
hparams = yaml.safe_load(file)

lora_params = {k: v for k, v in hparams.items() if k.startswith("lora_")}
6 changes: 3 additions & 3 deletions litgpt/tokenizer.py
@@ -33,14 +33,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
self.backend = "huggingface"

if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
-            with open(special_tokens_path) as fp:
+            with open(special_tokens_path, encoding="utf-8") as fp:
config = json.load(fp)
bos_token = config.get("bos_token")
self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
eos_token = config.get("eos_token")
self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
-            with open(special_tokens_path) as fp:
+            with open(special_tokens_path, encoding="utf-8") as fp:
config = json.load(fp)
if self.bos_id is None:
self.bos_id = config.get("bos_token_id")
@@ -71,7 +71,7 @@ def token_to_id(self, token: str) -> int:
def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
return False
-        with open(tokenizer_config_path) as fp:
+        with open(tokenizer_config_path, encoding="utf-8") as fp:
config = json.load(fp)
if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
return True
20 changes: 18 additions & 2 deletions litgpt/utils.py
@@ -1,11 +1,12 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.

"""Utility functions for training and inference."""
+import inspect
import math
import pickle
import shutil
import sys
-from dataclasses import asdict
+from dataclasses import asdict, is_dataclass
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Mapping, Optional, TypeVar, Union
@@ -404,6 +405,21 @@ def CLI(*args: Any, **kwargs: Any) -> Any:
return CLI(*args, **kwargs)


+def capture_hparams() -> Dict[str, Any]:
+    """Captures the local variables ('hyperparameters') from where this function gets called."""
+    caller_frame = inspect.currentframe().f_back
+    locals_of_caller = caller_frame.f_locals
+    hparams = {}
+    for name, value in locals_of_caller.items():
+        if value is None or isinstance(value, (int, float, str, bool, Path)):
+            hparams[name] = value
+        elif is_dataclass(value):
+            hparams[name] = asdict(value)
+        else:
+            hparams[name] = str(value)
+    return hparams


def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:
"""Captures the CLI parameters passed to `function` without running `function` and saves them to the checkpoint."""
from jsonargparse import capture_parser
@@ -430,7 +446,7 @@ def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:

def save_config(config: "Config", checkpoint_dir: Path) -> None:
config_dict = asdict(config)
with open(checkpoint_dir / "model_config.yaml", "w") as fp:
with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
yaml.dump(config_dict, fp)


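
To see why `capture_hparams()` replaces `locals()` in the two `setup()` functions: it must run before any other local variables are defined, and it normalizes values (dataclasses to dicts, everything else unserializable to strings) so they save cleanly to YAML. A minimal sketch of calling it (this `setup` signature is abbreviated, not the real one):

```python
from pathlib import Path
from litgpt.utils import capture_hparams

def setup(model_name: str = "pythia-160m", out_dir: Path = Path("out/custom-model")) -> None:
    # Call first, while the caller's locals are still exactly its arguments.
    hparams = capture_hparams()
    print(hparams)
    # e.g. {'model_name': 'pythia-160m', 'out_dir': PosixPath('out/custom-model')}

setup()
```
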
2 changes: 1 addition & 1 deletion tests/data/test_tinystories.py
@@ -43,7 +43,7 @@ def test_tokenize(tmp_path, monkeypatch):
story1, story2 = "foo bar", " fun "
data = [{"story": story1}, {"story": story2}]
shard_path = tmp_path / "data.json"
with open(shard_path, "w") as f:
with open(shard_path, "w", encoding="utf-8") as f:
json.dump(data, f)

class Tokenizer:
4 changes: 2 additions & 2 deletions tests/test_config.py
@@ -60,7 +60,7 @@ def test_from_checkpoint(tmp_path):

# 3. If only `lit_config.py` exists.
config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2}
with open(tmp_path / "model_config.yaml", "w") as file:
with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file:
yaml.dump(config_data, file)
config = Config.from_checkpoint(tmp_path)
assert config.name == "pythia-14m"
@@ -69,7 +69,7 @@ def test_from_checkpoint(tmp_path):

# 4. Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config
(tmp_path / "pythia-14m").mkdir()
with open(tmp_path / "pythia-14m/model_config.yaml", "w") as file:
with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file:
yaml.dump(config_data, file)
config = Config.from_checkpoint(tmp_path / "pythia-14m")
assert config.name == "pythia-14m"
2 changes: 1 addition & 1 deletion tests/test_convert_lit_checkpoint.py
@@ -35,7 +35,7 @@ def test_convert_lit_checkpoint(tmp_path):
checkpoint_path = tmp_path / "lit_model.pth"
config_path = tmp_path / "model_config.yaml"
torch.save(ours_model.state_dict(), checkpoint_path)
with open(config_path, "w") as fp:
with open(config_path, "w", encoding="utf-8") as fp:
yaml.dump(asdict(ours_config), fp)
output_dir = tmp_path / "out_dir"

2 changes: 1 addition & 1 deletion tests/test_evaluate.py
@@ -35,7 +35,7 @@ def test_evaluate_script(tmp_path, monkeypatch):
checkpoint_path = tmp_path / "lit_model.pth"
torch.save(ours_model.state_dict(), checkpoint_path)
config_path = tmp_path / "model_config.yaml"
with open(config_path, "w") as fp:
with open(config_path, "w", encoding="utf-8") as fp:
yaml.dump(asdict(ours_config), fp)

fn_kwargs = dict(