Add litgpt evaluate command #1177

Merged 49 commits on Apr 4, 2024

Changes from 10 commits

Commits
375e99e
`litgpt evaluate` command
rasbt Mar 21, 2024
0c53da1
update package dependench
rasbt Mar 21, 2024
669ce22
add llm-eval dependency
rasbt Mar 21, 2024
d161d12
move imports
rasbt Mar 21, 2024
e7ebfbc
update cli test
rasbt Mar 21, 2024
4660507
cleanup
rasbt Mar 21, 2024
018cc89
eval unit test
rasbt Mar 22, 2024
98130f9
run tests on cpu
rasbt Mar 22, 2024
a549535
Add lm-eval to test dependencies
rasbt Mar 22, 2024
7ff3ff2
bump version
rasbt Mar 22, 2024
c47e764
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
042d2a5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
359dad5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
9bbc5cc
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
f7147c4
make args required
rasbt Mar 25, 2024
0786285
automatically infer repo_id
rasbt Mar 25, 2024
b54095d
check out_dir defaults
rasbt Mar 25, 2024
4c77a6a
move evaluate.py
rasbt Mar 25, 2024
223eb95
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
96d8229
Deps
carmocca Mar 26, 2024
9d9ef7c
Extra file
carmocca Mar 26, 2024
5abec5a
fix import
awaelchli Mar 27, 2024
966ff3e
fix evaluate reference
rasbt Mar 27, 2024
9b2ae7d
fix doc formatting
rasbt Mar 27, 2024
bb4ea30
prototype
rasbt Mar 27, 2024
8988dda
Add batch size
rasbt Mar 28, 2024
f7a46f1
Merge branch 'main' into litgpt-eval
rasbt Mar 28, 2024
45968da
revert to saving temp file and fix output print
rasbt Mar 29, 2024
4c712a2
Merge branch 'main' into litgpt-eval
rasbt Mar 29, 2024
b3b693e
run test on cpu
rasbt Mar 29, 2024
17d4aa2
update tests and docs
rasbt Mar 29, 2024
296101d
update
rasbt Mar 29, 2024
bacd1d6
fix test
rasbt Mar 30, 2024
afaee75
fix test
rasbt Mar 30, 2024
687a382
fix test
rasbt Mar 30, 2024
cdb06c6
Merge branch 'main' into litgpt-eval
rasbt Mar 30, 2024
5faa293
fix tests
rasbt Mar 30, 2024
1c3686c
extend tests
rasbt Mar 30, 2024
a881630
finally fixed
rasbt Mar 30, 2024
1ca218b
Merge branch 'main' into litgpt-eval
rasbt Apr 1, 2024
012ad9b
add new pretrain image
rasbt Apr 2, 2024
8c55ca1
Merge branch 'main' into litgpt-eval
rasbt Apr 2, 2024
9b381c1
Parametrize CLI test
carmocca Apr 3, 2024
b53b688
Minor fixes
carmocca Apr 3, 2024
6cc84ab
Merge branch 'main' into litgpt-eval
carmocca Apr 3, 2024
887ff61
Update evaluation.md
rasbt Apr 3, 2024
6e9e238
Merge branch 'main' into litgpt-eval
rasbt Apr 3, 2024
5a944d2
Apply suggestions from code review
carmocca Apr 4, 2024
efb6ca4
Update tutorials/evaluation.md
carmocca Apr 4, 2024
2 changes: 2 additions & 0 deletions litgpt/__main__.py
@@ -23,6 +23,7 @@
)
from litgpt.scripts.download import download_from_hub as download_fn
from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn
from litgpt.scripts.evaluate import convert_and_evaluate as evaluate_fn

if TYPE_CHECKING:
    from jsonargparse import ArgumentParser
@@ -78,6 +79,7 @@ def main() -> None:
            },
        },
        "merge_lora": {"help": "Merges the LoRA weights with the base model.", "fn": merge_lora_fn},
        "evaluate": {"help": "Evaluate a model with the LM Evaluation Harness.", "fn": evaluate_fn},
    }

    from jsonargparse import set_config_read_mode, set_docstring_parse_options
116 changes: 116 additions & 0 deletions litgpt/scripts/evaluate.py
@@ -0,0 +1,116 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.

import json
import os
from pathlib import Path
from typing import Optional

import torch

from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint
from litgpt.utils import CLI, copy_config_files


def safe_safetensors(out_dir, repo_id):
    from transformers import AutoModel

    state_dict = torch.load(out_dir / "model.pth")
    model = AutoModel.from_pretrained(repo_id, state_dict=state_dict)
    model.save_pretrained(out_dir)


def prepare_results(results, save_filepath, print_results=True):
    from lm_eval.utils import make_table

    if print_results:
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    save_filepath.open("w", encoding="utf-8").write(json_result)


def convert_and_evaluate(
    checkpoint_dir: Optional[str] = None,
    out_dir: Optional[str] = None,
    repo_id: Optional[str] = None,
    skip_conversion: bool = False,
    tasks: Optional[str] = "hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge",
    num_fewshot: Optional[int] = None,
    batch_size: int = 1,
    device: Optional[str] = None,
    limit: Optional[float] = None,
    seed: int = 1234,
    save_filepath: Optional[str] = None,
) -> None:
    """Convert a LitGPT model and run the LM Evaluation Harness.

    Arguments:
        checkpoint_dir: Directory where the `lit_model.pth` and tokenizer files are located.
        out_dir: Directory in which to save the converted checkpoints for evaluation.
        repo_id: The original repo ID the model was derived from.
        skip_conversion: Set to `True` to skip the model conversion,
            assuming the model has already been converted and the
            model.pth and .safetensor files exist.
        tasks: Comma-separated list of task names to evaluate.
            By default, the Open LM Leaderboard tasks are used:
            "hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"
        num_fewshot: Number of examples in the few-shot context.
        batch_size: Batch size to use for evaluation.
        device: Device to use for evaluation, for example, "cuda" or "cuda:0".
        limit: Limit on the number of examples per task.
        seed: Random seed.
        save_filepath: The file where the results will be saved.
            Saves to `out_dir`/results.json by default.
    """

    from lm_eval import evaluator

    if checkpoint_dir is None:
        raise ValueError("Provide a checkpoint_dir argument.")
    if out_dir is None:
        raise ValueError("Provide an out_dir argument.")
    if repo_id is None:
        raise ValueError("Provide a repo_id argument.")

    checkpoint_dir, out_dir = Path(checkpoint_dir), Path(out_dir)

    if save_filepath is None:
        save_filepath = "results.json"
        save_filepath = out_dir / Path(save_filepath)
    else:
        save_filepath = Path(save_filepath)

    out_dir.mkdir(parents=True, exist_ok=True)

    copy_config_files(source_dir=checkpoint_dir, out_dir=out_dir)

    if not skip_conversion:
        convert_lit_checkpoint(checkpoint_dir=checkpoint_dir, output_dir=out_dir)
        safe_safetensors(out_dir, repo_id)

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={out_dir}",
        tasks=tasks.split(","),
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        limit=limit,
        random_seed=seed,
        numpy_random_seed=seed,
        torch_random_seed=seed,
    )

    print("results", results)
    prepare_results(results, save_filepath)


if __name__ == "__main__":
    CLI(convert_and_evaluate)
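
For reference, the new `evaluate` subcommand maps to `convert_and_evaluate` above, so the same run can also be started from Python. The sketch below is a minimal example based only on the signature shown in this diff; the checkpoint and output paths are placeholders, and the reduced task list and `limit` are only there to keep a test run short.

```python
# Minimal sketch (assumed paths); mirrors `litgpt evaluate` but called from Python.
from litgpt.scripts.evaluate import convert_and_evaluate

convert_and_evaluate(
    checkpoint_dir="checkpoints/microsoft/phi-2",  # directory with lit_model.pth and tokenizer files
    out_dir="evaluate_model",                      # converted checkpoint and results.json land here
    repo_id="microsoft/phi-2",                     # original Hugging Face repo the checkpoint derives from
    tasks="hellaswag",                             # smaller than the default Open LM Leaderboard task set
    limit=10,                                      # cap examples per task for a quick smoke test
    device="cpu",                                  # or "cuda" / "cuda:0"
)
```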
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -28,6 +28,7 @@ test = [
"pytest-timeout",
"transformers>=4.38.0",
"einops",
"lm-eval>=0.42.0",
"protobuf",
"lightning-thunder; python_version >= '3.10'",
]
@@ -43,6 +44,8 @@ all = [
"pyarrow", # litgpt.data.prepare_starcoder.py
"tensorboard", # litgpt.pretrain
"torchmetrics", # litgpt.pretrain
"transformers>=4.38.0", # litgpt.evaluate
"lm-eval>=0.42.0", # litgpt.evaluate
"safetensors", # download
"huggingface_hub[hf_transfer]>=0.21.0" # download
]
3 changes: 2 additions & 1 deletion tests/test_cli.py
@@ -15,14 +15,15 @@ def test_cli():
        main()
    out = out.getvalue()
    assert "usage: litgpt" in out
-    assert "{download,chat,finetune,pretrain,generate,convert,merge_lora}" in out
+    assert "{download,chat,finetune,pretrain,generate,convert,merge_lora,evaluate}" in out
    assert (
        """Available subcommands:
download Download weights or tokenizer data from the Hugging
Face Hub.
chat Chat with a model."""
        in out
    )
+    assert ("""evaluate Evaluate a model with the LM Evaluation Harness.""") in out

    out = StringIO()
    with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune", "-h"]):
62 changes: 62 additions & 0 deletions tests/test_evaluate.py
@@ -0,0 +1,62 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.

import sys
from pathlib import Path

import datasets
import pytest

from litgpt.scripts.download import download_from_hub
from litgpt.scripts.evaluate import safe_safetensors, prepare_results
from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint
from lm_eval import evaluator

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))


@pytest.mark.xfail(
    raises=(datasets.builder.DatasetGenerationError, NotImplementedError),
    strict=False,
    match="Loading a dataset cached in a LocalFileSystem is not supported",
)
def test_run_eval(tmp_path, float_like):
    repo_id = "EleutherAI/pythia-14m"
    download_from_hub(repo_id=repo_id, checkpoint_dir=tmp_path)

    checkpoint_path = Path(tmp_path) / Path(repo_id)

    convert_lit_checkpoint(checkpoint_dir=checkpoint_path, output_dir=checkpoint_path)
    safe_safetensors(out_dir=checkpoint_path, repo_id=repo_id)

    eval_tasks = "coqa,hellaswag"
    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={checkpoint_path}",
        tasks=eval_tasks.split(","),
        limit=2,
        device="cpu"
    )

    save_path = checkpoint_path / "results.json"
    prepare_results(results, save_path, print_results=False)

    print(checkpoint_path / "dump.txt")
    assert save_path.is_file()
    assert results["results"] == {
        'coqa': {
            'alias': 'coqa',
            'em,none': 0.0,
            'em_stderr,none': 0.0,
            'f1,none': 0.0,
            'f1_stderr,none': 0.0
        },
        'hellaswag': {
            'acc,none': 0.0,
            'acc_stderr,none': 0.0,
            'acc_norm,none': 0.5,
            'acc_norm_stderr,none': 0.5,
            'alias': 'hellaswag'
        }
    }
77 changes: 46 additions & 31 deletions tutorials/evaluation.md
@@ -9,59 +9,74 @@ You can evaluate LitGPT using [EleutherAI's lm-eval](https://github.com/Eleuther
You need to install the `lm-eval` framework first:

```bash
-pip install 'lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8b'
+pip install lm_eval
```

&nbsp;

### Evaluating LitGPT base models

-Use the following command to evaluate LitGPT models on all tasks in Eleuther AI's Evaluation Harness.
+Suppose you downloaded a base model that we want to evaluate. Here, we use the `microsoft/phi-2` model:

```bash
-python eval/lm_eval_harness.py \
-    --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \
-    --precision "bf16-true" \
-    --save_filepath "results.json"
+litgpt download --repo_id microsoft/phi-2
```

-To evaluate on LLMs on specific tasks, for example, TruthfulQA and HellaSwag, you can use the `--eval_task` flag as follows:
+The download command above will save the model to the `checkpoints/microsoft/phi-2` directory, which we can
+specify in the following evaluation command:

```bash
-python eval/lm_eval_harness.py \
-    --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \
-    --eval_tasks "[truthfulqa_mc,hellaswag]" \
-    --precision "bf16-true" \
-    --save_filepath "results.json"
+litgpt evaluate \
+    --checkpoint_dir checkpoints/microsoft/phi-2/ \
+    --out_dir evaluate_model/ \
+    --repo_id microsoft/phi-2
```

-A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).
+Please note that the `litgpt evaluate` command runs an internal model conversion.
+This conversion is only necessary the first time you evaluate a model. To skip it when
+you evaluate the same model a second time, you can pass the `--skip_conversion true` argument:

+```bash
+litgpt evaluate \
+    --checkpoint_dir checkpoints/microsoft/phi-2/ \
+    --out_dir evaluate_model/ \
+    --repo_id microsoft/phi-2 \
+    --skip_conversion true
+```

&nbsp;

-### Evaluating LoRA-finetuned LLMs
+> [!TIP]
+> By default, `litgpt evaluate` will evaluate a model on all Open LM Leaderboard tasks, which corresponds
+> to the setting `--tasks "hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge"`.

-The above command can be used to evaluate models that are saved via a single checkpoint file. This includes downloaded checkpoints and base models finetuned via the full and adapter finetuning scripts.
+> [!TIP]
+> The evaluation may take a long time, and for testing purposes, you may want to reduce the number of tasks
+> or set a limit for the number of examples per task, for example, `--limit 10`.

-For LoRA-finetuned models, you need to first merge the LoRA weights with the original checkpoint file as described in the [Merging LoRA Weights](finetune_lora.md#merging-lora-weights) section of the LoRA finetuning documentation.
+A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).

&nbsp;

-## FAQs
-
-* **How do I evaluate on MMLU?**
-
-MMLU is available as with lm-eval harness but the task name is not MMLU. You can use `hendrycksTest*` as regex to evaluate on MMLU.
-
-```shell
-python eval/lm_eval_harness.py \
-    --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \
-    --precision "bf16-true" \
-    --eval_tasks "[hendrycksTest*]" \
-    --num_fewshot 5 \
-    --save_filepath "results.json"
-```
-
-* **Is Truthful MC is not available in lm-eval?**
-
-It is available as `truthfulqa_mc`.
+### Evaluating LoRA-finetuned LLMs
+
+No further conversion is necessary when evaluating LoRA-finetuned models as the `litgpt finetune lora` command already prepares the necessary merged model files:
+
+```bash
+litgpt finetune lora \
+    --checkpoint_dir checkpoints/microsoft/phi-2 \
+    --out_dir lora_model
+```

&nbsp;

+```bash
+litgpt evaluate \
+    --checkpoint_dir lora_model/final \
+    --out_dir evaluate_model/ \
+    --repo_id microsoft/phi-2
+```
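
After `litgpt evaluate` finishes, the metrics are written to `results.json` in the output directory, per the `save_filepath` default in `convert_and_evaluate`. A minimal sketch for reading the results back, assuming the `evaluate_model/` output directory used in the examples above:

```python
# Sketch: inspect the metrics saved by `litgpt evaluate` (assumed out_dir and default filename).
import json
from pathlib import Path

results = json.loads(Path("evaluate_model/results.json").read_text(encoding="utf-8"))
for task, metrics in results["results"].items():  # per-task metric dicts, as in tests/test_evaluate.py
    print(task, metrics)
```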