Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add litgpt evaluate command #1177

Merged
merged 49 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
375e99e
`litgpt evaluate` command
rasbt Mar 21, 2024
0c53da1
update package dependency
rasbt Mar 21, 2024
669ce22
add llm-eval dependency
rasbt Mar 21, 2024
d161d12
move imports
rasbt Mar 21, 2024
e7ebfbc
update cli test
rasbt Mar 21, 2024
4660507
cleanup
rasbt Mar 21, 2024
018cc89
eval unit test
rasbt Mar 22, 2024
98130f9
run tests on cpu
rasbt Mar 22, 2024
a549535
Add lm-eval to test dependencies
rasbt Mar 22, 2024
7ff3ff2
bump version
rasbt Mar 22, 2024
c47e764
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
042d2a5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
359dad5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
9bbc5cc
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
f7147c4
make args required
rasbt Mar 25, 2024
0786285
automatically infer repo_id
rasbt Mar 25, 2024
b54095d
check out_dir defaults
rasbt Mar 25, 2024
4c77a6a
move evaluate.py
rasbt Mar 25, 2024
223eb95
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
96d8229
Deps
carmocca Mar 26, 2024
9d9ef7c
Extra file
carmocca Mar 26, 2024
5abec5a
fix import
awaelchli Mar 27, 2024
966ff3e
fix evaluate reference
rasbt Mar 27, 2024
9b2ae7d
fix doc formatting
rasbt Mar 27, 2024
bb4ea30
prototype
rasbt Mar 27, 2024
8988dda
Add batch size
rasbt Mar 28, 2024
f7a46f1
Merge branch 'main' into litgpt-eval
rasbt Mar 28, 2024
45968da
revert to saving temp file and fix output print
rasbt Mar 29, 2024
4c712a2
Merge branch 'main' into litgpt-eval
rasbt Mar 29, 2024
b3b693e
run test on cpu
rasbt Mar 29, 2024
17d4aa2
update tests and docs
rasbt Mar 29, 2024
296101d
update
rasbt Mar 29, 2024
bacd1d6
fix test
rasbt Mar 30, 2024
afaee75
fix test
rasbt Mar 30, 2024
687a382
fix test
rasbt Mar 30, 2024
cdb06c6
Merge branch 'main' into litgpt-eval
rasbt Mar 30, 2024
5faa293
fix tests
rasbt Mar 30, 2024
1c3686c
extend tests
rasbt Mar 30, 2024
a881630
finally fixed
rasbt Mar 30, 2024
1ca218b
Merge branch 'main' into litgpt-eval
rasbt Apr 1, 2024
012ad9b
add new pretrain image
rasbt Apr 2, 2024
8c55ca1
Merge branch 'main' into litgpt-eval
rasbt Apr 2, 2024
9b381c1
Parametrize CLI test
carmocca Apr 3, 2024
b53b688
Minor fixes
carmocca Apr 3, 2024
6cc84ab
Merge branch 'main' into litgpt-eval
carmocca Apr 3, 2024
887ff61
Update evaluation.md
rasbt Apr 3, 2024
6e9e238
Merge branch 'main' into litgpt-eval
rasbt Apr 3, 2024
5a944d2
Apply suggestions from code review
carmocca Apr 4, 2024
efb6ca4
Update tutorials/evaluation.md
carmocca Apr 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/azure-gpu-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
displayName: "Image info & NVIDIA"

- script: |
pip install '.[all,test]' 'lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8b'
pip install '.[all,test]'
displayName: 'Install dependencies'

- script: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:

- name: Install all dependencies
run: |
uv pip install --system -e '.[all,test]' 'lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8b'
uv pip install --system -e '.[all,test]'
uv pip list

- name: Run tests
Expand Down
189 changes: 0 additions & 189 deletions eval/lm_eval_harness.py

This file was deleted.

2 changes: 2 additions & 0 deletions litgpt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from litgpt.scripts.download import download_from_hub as download_fn
from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn
from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn

if TYPE_CHECKING:
from jsonargparse import ArgumentParser
Expand Down Expand Up @@ -78,6 +79,7 @@ def main() -> None:
},
},
"merge_lora": {"help": "Merges the LoRA weights with the base model.", "fn": merge_lora_fn},
"evaluate": {"help": "Evaluate a model with the LM Evaluation Harness.", "fn": evaluate_fn},
}

from jsonargparse import set_config_read_mode, set_docstring_parse_options
Expand Down
115 changes: 115 additions & 0 deletions litgpt/eval/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.

import json
import os
from pathlib import Path
from typing import Optional
import yaml
import torch

from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint
from litgpt.utils import CLI, copy_config_files


def safe_safetensors(out_dir, repo_id):
    """Re-save the converted ``model.pth`` checkpoint in Hugging Face format.

    Loads the raw PyTorch state dict from ``out_dir/model.pth``, attaches it to
    the matching Hugging Face architecture resolved from ``repo_id``, and writes
    the model back into ``out_dir`` via ``save_pretrained`` (which produces the
    ``model.safetensors`` file the caller checks for).

    Arguments:
        out_dir: ``Path`` to the directory containing ``model.pth``; also the
            directory the Hugging Face checkpoint is saved into.
        repo_id: Hugging Face repo id in ``org/name`` form used to pick the
            model architecture.
    """
    # Imported lazily: transformers is only needed for this conversion step.
    from transformers import AutoModel

    state_dict = torch.load(out_dir / "model.pth")
    model = AutoModel.from_pretrained(repo_id, state_dict=state_dict)
    model.save_pretrained(out_dir)


def prepare_results(results, save_filepath, print_results=True):
    """Pretty-print evaluation results and persist them as JSON.

    Arguments:
        results: Results dict as returned by ``lm_eval.evaluator.simple_evaluate``.
        save_filepath: ``pathlib.Path`` the JSON-serialized results are written to.
        print_results: Whether to print the harness result tables to stdout.
    """
    if print_results:
        # Imported lazily so merely saving results does not require lm_eval.
        from lm_eval.utils import make_table

        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    # write_text opens and closes the file; the previous
    # `save_filepath.open("w").write(...)` leaked the file handle.
    save_filepath.write_text(json_result, encoding="utf-8")


def convert_and_evaluate(
    checkpoint_dir: str,
    out_dir: Optional[str] = None,
    force_conversion: bool = False,
    tasks: Optional[str] = "hellaswag,truthfulqa_mc2,mmlu",
    num_fewshot: Optional[int] = None,
    batch_size: int = 1,
    device: Optional[str] = None,
    limit: Optional[float] = None,
    seed: int = 1234,
    save_filepath: Optional[str] = None,
) -> None:
    """Convert a LitGPT model and run the LM Evaluation Harness

    Arguments:
        checkpoint_dir: Directory where the `lit_model.pth` and tokenizer files are located.
        out_dir: Directory in which to save the converted checkpoints for evaluation.
            Saves to `checkpoint_dir`/evaluate by default.
        force_conversion: Set to `True` to reconvert the model and override
            an existing model.pth from a previous evaluation call.
        tasks: CSV of task names to evaluate.
            By default, the following tasks are used:
            "hellaswag,truthfulqa_mc2,mmlu"
        num_fewshot: Number of examples in few-shot context.
        batch_size: Batch size configuration.
        device: Device to use for evaluation, for example, "cuda" or "cuda:0".
        limit: Limit on number of examples per task.
        seed: Random seed.
        save_filepath: The file where the results will be saved.
            Saves to `out_dir/results.json` by default.
    """
    # Imported lazily so the litgpt CLI loads fast when evaluation is not used.
    from lm_eval import evaluator

    checkpoint_dir = Path(checkpoint_dir)

    # Default the output next to the source checkpoint so repeated evaluate
    # calls can reuse the previously converted weights.
    if out_dir is None:
        out_dir = checkpoint_dir / "evaluate"
    else:
        out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
    config_filepath = checkpoint_dir / "model_config.yaml"

    # Infer the Hugging Face repo id from the checkpoint's stored config so
    # the user does not have to pass it explicitly.
    with open(config_filepath) as f:
        config_dict = yaml.safe_load(f)
    repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"

    copy_config_files(source_dir=checkpoint_dir, out_dir=out_dir)

    # Skip the slow conversion steps when their outputs already exist, unless
    # the caller explicitly requested a reconversion.
    model_path = out_dir / "model.pth"
    if not model_path.exists() or force_conversion:
        convert_lit_checkpoint(checkpoint_dir=checkpoint_dir, output_dir=out_dir)

    safetensors_path = out_dir / "model.safetensors"
    if not safetensors_path.exists() or force_conversion:
        safe_safetensors(out_dir, repo_id)

    # Prevent the tokenizers fork warning/deadlock inside the harness workers.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={out_dir}",
        tasks=tasks.split(","),
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        limit=limit,
        random_seed=seed,
        numpy_random_seed=seed,
        torch_random_seed=seed,
    )
    prepare_results(results, save_filepath)


if __name__ == "__main__":
    CLI(convert_and_evaluate)
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ test = [
"pytest",
"pytest-rerunfailures",
"pytest-timeout",
"transformers>=4.38.0",
"transformers>=4.38.0", # numerical comparisons
"einops",
"protobuf",
"lightning-thunder; python_version >= '3.10'",
Expand All @@ -35,14 +35,16 @@ all = [
"bitsandbytes==0.42.0", # quantization
"sentencepiece", # llama-based models
"tokenizers", # pythia, falcon, redpajama
"datasets", # eval
"requests", # litgpt.data
"litdata", # litgpt.data
"zstandard", # litgpt.data.prepare_slimpajama.py
"pandas", # litgpt.data.prepare_starcoder.py
"pyarrow", # litgpt.data.prepare_starcoder.py
"tensorboard", # litgpt.pretrain
"torchmetrics", # litgpt.pretrain
"datasets", # litgpt.evaluate
"transformers>=4.38.0", # litgpt.evaluate
"lm-eval>=0.4.2", # litgpt.evaluate
"safetensors", # download
"huggingface_hub[hf_transfer]>=0.21.0" # download
]
Expand Down
3 changes: 2 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ def test_cli():
main()
out = out.getvalue()
assert "usage: litgpt" in out
assert "{download,chat,finetune,pretrain,generate,convert,merge_lora}" in out
assert "{download,chat,finetune,pretrain,generate,convert,merge_lora,evaluate}" in out
assert (
"""Available subcommands:
download Download weights or tokenizer data from the Hugging
Face Hub.
chat Chat with a model."""
in out
)
assert ("""evaluate Evaluate a model with the LM Evaluation Harness.""") in out

out = StringIO()
with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune", "-h"]):
Expand Down
Loading
Loading