From 7bb498fb1700ee7c0415b352d6e689edcb918da5 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Apr 2024 16:07:59 -0400
Subject: [PATCH 1/8] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 34706620f9..ef39c4fd4f 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ litgpt chat \
   --checkpoint_dir out/phi-2-lora/final
 ```

+### Pretrain an LLM
 Train an LLM from scratch on your own data via pretraining:

 ```bash
@@ -131,7 +132,8 @@ litgpt chat \
   --checkpoint_dir out/custom-model/final
 ```

-Specialize an already pretrained model by training on custom data:
+### Continue pretraining an LLM
+This is another form of finetuning that specializes an already pretrained model by training it on custom data:

 ```
 mkdir -p custom_texts

From c81800f455dd997f786cbe2e110eff1f5c0d2d3b Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Apr 2024 16:10:57 -0400
Subject: [PATCH 2/8] Update CODEOWNERS

---
 .github/CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 8349e1320f..af64423bb7 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1,2 @@
 * @awaelchli @carmocca @lantiga
+/README.md @williamfalcon @lantiga

From d84c5c1a6f871b9dd4f3fec8715ba4a72d035156 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 15 Apr 2024 12:36:35 -0400
Subject: [PATCH 3/8] Update README.md (#1291)

Co-authored-by: Sebastian Raschka

---
 README.md | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ef39c4fd4f..17cb709a08 100644
--- a/README.md
+++ b/README.md
@@ -25,14 +25,24 @@ Uses the latest state-of-the-art techniques:
 Finetune, pretrain LLMs •
 Models •
 Features •
-Training recipes (YAML) •
-Design principles
+Training recipes (YAML)
 &nbsp;
+## Finetune, pretrain and deploy AI models Lightning fast ⚡⚡
+LitGPT is a command-line tool to use, pretrain, finetune and deploy LLMs. It is based on configs with highly optimized recipes for training the world's largest, most powerful open-source LLMs.
+
+We've reimplemented all the model architectures and training recipes for 3 reasons:
+
+1. Remove all abstraction layers and have single-file implementations.
+2. Guarantee Apache 2.0 compliance to enable enterprise use without limits.
+3. Optimize every detail of every model to get the fastest performance possible, lowering cost and training time.
+
+&nbsp;
+
 ## Install LitGPT

 Install LitGPT with all dependencies (including CLI, quantization, tokenizers for all models, etc.):
@@ -60,8 +70,7 @@ pip install -e '.[all]'
 ---

 # Get started
-LitGPT is a command-line tool to use, pretrain, finetune and deploy LLMs.
-
+LitGPT is CLI- and config-based. Select the model and the action you want to take on that model (finetune, pretrain, evaluate, deploy, etc.):

 &nbsp;

From c4831983ee488b89bdcb502e0763c496d8dee593 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 15 Apr 2024 12:36:47 -0400
Subject: [PATCH 4/8] Fix badge size (#1293)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 17cb709a08..0082d426a8 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Uses the latest state-of-the-art techniques:

 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-lightning)
-![cpu-tests](https://github.com/lightning-AI/lit-stablelm/actions/workflows/cpu-tests.yml/badge.svg) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lit-stablelm/blob/master/LICENSE) [![Discord](https://img.shields.io/discord/1077906959069626439?style=plastic)](https://discord.gg/VptPCZkGNa)
+![cpu-tests](https://github.com/lightning-AI/lit-stablelm/actions/workflows/cpu-tests.yml/badge.svg) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lit-stablelm/blob/master/LICENSE) [![Discord](https://img.shields.io/discord/1077906959069626439)](https://discord.gg/VptPCZkGNa)
 Lightning.ai •

From af8347be3192c4deb5b55059442ca7eacf07f341 Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com>
Date: Mon, 15 Apr 2024 20:15:46 +0300
Subject: [PATCH 5/8] Replace UV with regular pip (#1292)

---
 .github/workflows/cpu-tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml
index 59a3ce3075..cbb8d2805c 100644
--- a/.github/workflows/cpu-tests.yml
+++ b/.github/workflows/cpu-tests.yml
@@ -39,14 +39,14 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-
-      - name: Install uv
-        run: pip install uv
+          cache: 'pip'
+          cache-dependency-path: |
+            pyproject.toml

       - name: Install minimal dependencies
         run: |
-          uv pip install --system .
-          uv pip list
+          pip install .
+          pip list

           # make sure all modules are still importable with only the minimal dependencies available
           modules=$(
             find litgpt -type f -name "*.py" | \
@@ -58,8 +58,8 @@

       - name: Install all dependencies
         run: |
-          uv pip install --system '.[all,test]'
-          uv pip list
+          pip install '.[all,test]'
+          pip list

       - name: Run tests
         run: |

From de1e1c76b973a474ce3894d02456457b53c5fc7f Mon Sep 17 00:00:00 2001
From: Mikhail Gerassimov
Date: Mon, 15 Apr 2024 22:36:02 +0500
Subject: [PATCH 6/8] error: Unrecognized arguments: --val_split_fraction 0.1
 (#1295)

---
 README.md                    | 2 +-
 tutorials/prepare_dataset.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0082d426a8..6c55a1ee5c 100644
--- a/README.md
+++ b/README.md
@@ -106,7 +106,7 @@ litgpt finetune lora \
   --checkpoint_dir checkpoints/microsoft/phi-2 \
   --data JSON \
   --data.json_path my_custom_dataset.json \
-  --val_split_fraction 0.1 \
+  --data.val_split_fraction 0.1 \
   --out_dir out/phi-2-lora

 # 3) Chat with the model
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md
index 867b612f91..055b769bac 100644
--- a/tutorials/prepare_dataset.md
+++ b/tutorials/prepare_dataset.md
@@ -334,7 +334,7 @@ Then simply run any of the finetuning scripts with this input:
 litgpt finetune lora \
   --data JSON \
   --data.json_path path/to/your/data.json \
-  --val_split_fraction 0.1 \
+  --data.val_split_fraction 0.1 \
   --checkpoint_dir "checkpoints/tiiuae/falcon-7b"
 ```

From 67d0b0c6cbbcf10e63b81a9963807e0a3e1854bd Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 15 Apr 2024 13:36:36 -0400
Subject: [PATCH 7/8] Add Pre-training Small Base LMs with Fewer Tokens paper
 to community projects (#1290)

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 6c55a1ee5c..eaa5fbf7f5 100644
--- a/README.md
+++ b/README.md
@@ -483,6 +483,11 @@ LitGPT powered the [TinyLlama project](https://github.com/jzhang38/TinyLlama) an
 [MicroLlama](https://github.com/keeeeenw/MicroLlama) is a 300M Llama model pretrained on 50B tokens powered by TinyLlama and LitGPT.

+&nbsp;
+
+**🔬 Pre-training Small Base LMs with Fewer Tokens**
+
+The research paper ["Pre-training Small Base LMs with Fewer Tokens"](https://arxiv.org/abs/2404.08634), which utilizes LitGPT, develops smaller base language models by inheriting a few transformer blocks from larger models and training on a tiny fraction of the data used by the larger models. It demonstrates that these smaller models can perform comparably to larger models despite using significantly less training data and resources.

 &nbsp;
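The `--data.val_split_fraction` fix in patch 6 works because `val_split_fraction` is a field of the `JSON` data class rather than a parameter of the finetune command itself, so the CLI only exposes it as a dotted flag under `--data.`. Below is a minimal sketch of that behavior, assuming jsonargparse (the library behind the `CLI(...)` calls in the test files later in this series); the `finetune` signature and the `JSON` defaults here are hypothetical, for illustration only:

```python
# A minimal sketch (not LitGPT's actual code) of the dotted-flag behavior that
# patch 6 relies on, using jsonargparse. The `finetune` signature and the
# `JSON` defaults are hypothetical.
from dataclasses import dataclass

from jsonargparse import CLI


@dataclass
class JSON:
    json_path: str = "data.json"
    # The split fraction is a field of the data class, not of finetune(),
    # which is why the top-level flag `--val_split_fraction` is rejected.
    val_split_fraction: float = 0.05


def finetune(data: JSON, checkpoint_dir: str = "checkpoints") -> None:
    print(f"data={data.json_path} split={data.val_split_fraction} ckpt={checkpoint_dir}")


if __name__ == "__main__":
    # Works:  python sketch.py --data.json_path my.json --data.val_split_fraction 0.1
    # Fails:  python sketch.py --val_split_fraction 0.1
    #         -> error: Unrecognized arguments: --val_split_fraction 0.1
    CLI(finetune)
```

Passing the flag at the top level reproduces the "Unrecognized arguments" error quoted in the patch subject.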
From 410a7126f82ea550d4a43dab89367547b073b5a3 Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com>
Date: Mon, 15 Apr 2024 21:22:32 +0300
Subject: [PATCH 8/8] Specify `UTF-8` encoding for every `open` function
 (#1283)

Co-authored-by: William Falcon

---
 litgpt/data/tinystories.py              | 2 +-
 litgpt/eval/evaluate.py                 | 2 +-
 litgpt/prompts.py                       | 4 ++--
 litgpt/scripts/convert_hf_checkpoint.py | 2 +-
 litgpt/scripts/merge_lora.py            | 2 +-
 litgpt/tokenizer.py                     | 6 +++---
 litgpt/utils.py                         | 2 +-
 tests/data/test_tinystories.py          | 2 +-
 tests/test_config.py                    | 4 ++--
 tests/test_convert_lit_checkpoint.py    | 2 +-
 tests/test_evaluate.py                  | 2 +-
 tests/test_merge_lora.py                | 6 +++---
 tests/test_prompts.py                   | 4 ++--
 tests/test_utils.py                     | 4 ++--
 14 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/litgpt/data/tinystories.py b/litgpt/data/tinystories.py
index ec8f5d6cec..632a015e44 100644
--- a/litgpt/data/tinystories.py
+++ b/litgpt/data/tinystories.py
@@ -106,7 +106,7 @@ def val_dataloader(self) -> DataLoader:


 def tokenize(filename: str, tokenizer: Tokenizer):
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf-8") as f:
         data = json.load(f)
     global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
     num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"])
diff --git a/litgpt/eval/evaluate.py b/litgpt/eval/evaluate.py
index 097c02f4d2..12aa3179e9 100644
--- a/litgpt/eval/evaluate.py
+++ b/litgpt/eval/evaluate.py
@@ -92,7 +92,7 @@ def convert_and_evaluate(
     save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)

     config_filepath = checkpoint_dir/"model_config.yaml"
-    with open(config_filepath) as f:
+    with open(config_filepath, encoding="utf-8") as f:
         config_dict = yaml.safe_load(f)
     repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"
diff --git a/litgpt/prompts.py b/litgpt/prompts.py
index df1a7150b6..d827413913 100644
--- a/litgpt/prompts.py
+++ b/litgpt/prompts.py
@@ -340,12 +340,12 @@ def save_prompt_style(style: Union[str, PromptStyle], checkpoint_dir: Path) -> N
     cls = type(style)
     # Allow saving the full module path for user-defined prompt classes
     config = {"class_path": f"{cls.__module__}.{cls.__name__}"}
-    with open(checkpoint_dir / "prompt_style.yaml", "w") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config, file)


 def load_prompt_style(checkpoint_dir: Path) -> PromptStyle:
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         config = yaml.safe_load(file)
     # Support loading the full module path for user-defined prompt classes
     full_module_path, cls_name = config["class_path"].rsplit(".", 1)
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index 98f8d4e01f..9fdb337ee4 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -329,7 +329,7 @@ def convert_hf_checkpoint(
     # Load the json file containing weight mapping
     pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
     if pytorch_bin_map_json_path.is_file():  # not all checkpoints have this file
-        with open(pytorch_bin_map_json_path) as json_map:
+        with open(pytorch_bin_map_json_path, encoding="utf-8") as json_map:
             bin_index = json.load(json_map)
         bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
     else:
diff --git a/litgpt/scripts/merge_lora.py b/litgpt/scripts/merge_lora.py
index 1e1120d214..aff59daef4 100644
--- a/litgpt/scripts/merge_lora.py
+++ b/litgpt/scripts/merge_lora.py
@@ -72,7 +72,7 @@ def load_lora_metadata(checkpoint_dir: Path) -> Tuple[Dict[str, Any], Path, Opti
             f" the `litgpt/finetune/lora.py` script."
         )

-    with open(hparams_file, "r") as file:
+    with open(hparams_file, "r", encoding="utf-8") as file:
         hparams = yaml.safe_load(file)

     lora_params = {k: v for k, v in hparams.items() if k.startswith("lora_")}
diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py
index 3a6758eb62..55c972e69a 100644
--- a/litgpt/tokenizer.py
+++ b/litgpt/tokenizer.py
@@ -33,14 +33,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
             self.backend = "huggingface"

             if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 bos_token = config.get("bos_token")
                 self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
                 eos_token = config.get("eos_token")
                 self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
             if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 if self.bos_id is None:
                     self.bos_id = config.get("bos_token_id")
@@ -71,7 +71,7 @@ def token_to_id(self, token: str) -> int:
     def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
         if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
             return False
-        with open(tokenizer_config_path) as fp:
+        with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
         if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
             return True
diff --git a/litgpt/utils.py b/litgpt/utils.py
index 37ebdfd6f9..0e40d336a1 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -446,7 +446,7 @@ def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:


 def save_config(config: "Config", checkpoint_dir: Path) -> None:
     config_dict = asdict(config)
-    with open(checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config_dict, fp)
diff --git a/tests/data/test_tinystories.py b/tests/data/test_tinystories.py
index bfb009c1a9..e1c15d67db 100644
--- a/tests/data/test_tinystories.py
+++ b/tests/data/test_tinystories.py
@@ -43,7 +43,7 @@ def test_tokenize(tmp_path, monkeypatch):
     story1, story2 = "foo bar", " fun "
     data = [{"story": story1}, {"story": story2}]
     shard_path = tmp_path / "data.json"
-    with open(shard_path, "w") as f:
+    with open(shard_path, "w", encoding="utf-8") as f:
         json.dump(data, f)

     class Tokenizer:
diff --git a/tests/test_config.py b/tests/test_config.py
index d1b0c57bd2..e1b6b28850 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -60,7 +60,7 @@ def test_from_checkpoint(tmp_path):

     # 3. If only `lit_config.py` exists.
     config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2}
-    with open(tmp_path / "model_config.yaml", "w") as file:
+    with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path)
     assert config.name == "pythia-14m"
@@ -69,7 +69,7 @@ def test_from_checkpoint(tmp_path):

     # 4. Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config
     (tmp_path / "pythia-14m").mkdir()
-    with open(tmp_path / "pythia-14m/model_config.yaml", "w") as file:
+    with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path / "pythia-14m")
     assert config.name == "pythia-14m"
diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py
index 8464a53d8d..f44609a4f1 100644
--- a/tests/test_convert_lit_checkpoint.py
+++ b/tests/test_convert_lit_checkpoint.py
@@ -35,7 +35,7 @@ def test_convert_lit_checkpoint(tmp_path):
     checkpoint_path = tmp_path / "lit_model.pth"
     config_path = tmp_path / "model_config.yaml"
     torch.save(ours_model.state_dict(), checkpoint_path)
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)
     output_dir = tmp_path / "out_dir"
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 94b90b4551..3ea6994532 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -35,7 +35,7 @@ def test_evaluate_script(tmp_path, monkeypatch):
     checkpoint_path = tmp_path / "lit_model.pth"
     torch.save(ours_model.state_dict(), checkpoint_path)
     config_path = tmp_path / "model_config.yaml"
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)

     fn_kwargs = dict(
diff --git a/tests/test_merge_lora.py b/tests/test_merge_lora.py
index 4e505c09b5..2c8458f02b 100644
--- a/tests/test_merge_lora.py
+++ b/tests/test_merge_lora.py
@@ -31,7 +31,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)

     # Create a fake pretrained checkpoint
     config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16)
-    with open(pretrained_checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config, fp)
     base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype)
     state_dict = base_model.state_dict()
@@ -45,7 +45,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
     assert len(state_dict) == 6
     torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora")
     hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs)
-    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)
     shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml")

@@ -80,7 +80,7 @@ def test_load_lora_metadata(fake_checkpoint_dir):
         load_lora_metadata(fake_checkpoint_dir)

     hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16)
-    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)

     lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir)
diff --git a/tests/test_prompts.py b/tests/test_prompts.py
index 3a4ef8dcc2..3250ce4801 100644
--- a/tests/test_prompts.py
+++ b/tests/test_prompts.py
@@ -98,7 +98,7 @@ def test_save_load_prompt_style(tmp_path):
     assert not has_prompt_style(checkpoint_dir)
     save_prompt_style("alpaca", checkpoint_dir)
     assert has_prompt_style(checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "litgpt.prompts.Alpaca"}
     loaded = load_prompt_style(checkpoint_dir)
@@ -108,7 +108,7 @@ def test_save_load_prompt_style(tmp_path):
     checkpoint_dir = tmp_path / "custom"
     checkpoint_dir.mkdir()
     save_prompt_style(CustomPromptStyle(), checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "test_prompts.CustomPromptStyle"}
     loaded = load_prompt_style(checkpoint_dir)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d76ae98056..99d883a3f2 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -250,7 +250,7 @@ def test_save_hyperparameters(tmp_path):
     with mock.patch("sys.argv", ["any.py", "--out_dir", str(tmp_path), "--foo", "True"]):
         CLI(_test_function)

-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)

     assert hparams["out_dir"] == str(tmp_path)
@@ -277,7 +277,7 @@ def test_save_hyperparameters_known_commands(command, tmp_path):
     with mock.patch("sys.argv", [*command.split(" "), "--out_dir", str(tmp_path), "--foo", "True"]):
         save_hyperparameters(_test_function2, tmp_path)

-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)

     assert hparams["out_dir"] == str(tmp_path)
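A note on why patch 8 pins the encoding: without an explicit `encoding` argument, Python's `open()` falls back to `locale.getpreferredencoding(False)`, which is UTF-8 on most Linux and macOS systems but commonly cp1252 on Windows, so checkpoint configs and JSON datasets containing non-ASCII text may fail to read back. A minimal, self-contained illustration (the file name is arbitrary):

```python
# Why pinning encoding="utf-8" matters: the default depends on the platform locale.
import locale

# e.g. 'UTF-8' on Linux/macOS, often 'cp1252' on Windows
print(locale.getpreferredencoding(False))

path = "story.json"  # arbitrary example file
with open(path, "w", encoding="utf-8") as f:
    f.write('{"story": "Ein Märchen über Blitze"}')  # non-ASCII content

# Reading this back without an explicit encoding can raise UnicodeDecodeError
# or silently produce mojibake wherever the locale default is not UTF-8:
with open(path, encoding="utf-8") as f:  # the safe, patched form
    print(f.read())
```

On Python 3.10+, running with `python -X warn_default_encoding` emits an `EncodingWarning` at each `open()` call that relies on the locale default, which is a quick way to locate the call sites this patch fixes.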