Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add litgpt evaluate command #1177

Merged
merged 49 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
375e99e
`litgpt evaluate` command
rasbt Mar 21, 2024
0c53da1
update package dependency
rasbt Mar 21, 2024
669ce22
add llm-eval dependency
rasbt Mar 21, 2024
d161d12
move imports
rasbt Mar 21, 2024
e7ebfbc
update cli test
rasbt Mar 21, 2024
4660507
cleanup
rasbt Mar 21, 2024
018cc89
eval unit test
rasbt Mar 22, 2024
98130f9
run tests on cpu
rasbt Mar 22, 2024
a549535
Add lm-eval to test dependencies
rasbt Mar 22, 2024
7ff3ff2
bump version
rasbt Mar 22, 2024
c47e764
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
042d2a5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
359dad5
Update litgpt/scripts/evaluate.py
rasbt Mar 25, 2024
9bbc5cc
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
f7147c4
make args required
rasbt Mar 25, 2024
0786285
automatically infer repo_id
rasbt Mar 25, 2024
b54095d
check out_dir defaults
rasbt Mar 25, 2024
4c77a6a
move evaluate.py
rasbt Mar 25, 2024
223eb95
Merge branch 'main' into litgpt-eval
rasbt Mar 25, 2024
96d8229
Deps
carmocca Mar 26, 2024
9d9ef7c
Extra file
carmocca Mar 26, 2024
5abec5a
fix import
awaelchli Mar 27, 2024
966ff3e
fix evaluate reference
rasbt Mar 27, 2024
9b2ae7d
fix doc formatting
rasbt Mar 27, 2024
bb4ea30
prototype
rasbt Mar 27, 2024
8988dda
Add batch size
rasbt Mar 28, 2024
f7a46f1
Merge branch 'main' into litgpt-eval
rasbt Mar 28, 2024
45968da
revert to saving temp file and fix output print
rasbt Mar 29, 2024
4c712a2
Merge branch 'main' into litgpt-eval
rasbt Mar 29, 2024
b3b693e
run test on cpu
rasbt Mar 29, 2024
17d4aa2
update tests and docs
rasbt Mar 29, 2024
296101d
update
rasbt Mar 29, 2024
bacd1d6
fix test
rasbt Mar 30, 2024
afaee75
fix test
rasbt Mar 30, 2024
687a382
fix test
rasbt Mar 30, 2024
cdb06c6
Merge branch 'main' into litgpt-eval
rasbt Mar 30, 2024
5faa293
fix tests
rasbt Mar 30, 2024
1c3686c
extend tests
rasbt Mar 30, 2024
a881630
finally fixed
rasbt Mar 30, 2024
1ca218b
Merge branch 'main' into litgpt-eval
rasbt Apr 1, 2024
012ad9b
add new pretrain image
rasbt Apr 2, 2024
8c55ca1
Merge branch 'main' into litgpt-eval
rasbt Apr 2, 2024
9b381c1
Parametrize CLI test
carmocca Apr 3, 2024
b53b688
Minor fixes
carmocca Apr 3, 2024
6cc84ab
Merge branch 'main' into litgpt-eval
carmocca Apr 3, 2024
887ff61
Update evaluation.md
rasbt Apr 3, 2024
6e9e238
Merge branch 'main' into litgpt-eval
rasbt Apr 3, 2024
5a944d2
Apply suggestions from code review
carmocca Apr 4, 2024
efb6ca4
Update tutorials/evaluation.md
carmocca Apr 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/azure-gpu-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
displayName: "Image info & NVIDIA"

- script: |
pip install '.[all,test]' 'lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8b'
pip install '.[all,test]'
displayName: 'Install dependencies'

- script: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:

- name: Install all dependencies
run: |
uv pip install --system -e '.[all,test]' 'lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8b'
uv pip install --system -e '.[all,test]'
uv pip list

- name: Run tests
Expand Down
189 changes: 0 additions & 189 deletions eval/lm_eval_harness.py

This file was deleted.

2 changes: 2 additions & 0 deletions litgpt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from litgpt.scripts.download import download_from_hub as download_fn
from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn
from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn

if TYPE_CHECKING:
from jsonargparse import ArgumentParser
Expand Down Expand Up @@ -78,6 +79,7 @@ def main() -> None:
},
},
"merge_lora": {"help": "Merges the LoRA weights with the base model.", "fn": merge_lora_fn},
"evaluate": {"help": "Evaluate a model with the LM Evaluation Harness.", "fn": evaluate_fn},
}

from jsonargparse import set_config_read_mode, set_docstring_parse_options
Expand Down
115 changes: 115 additions & 0 deletions litgpt/eval/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.

import json
import os
from pathlib import Path
from typing import Optional
import yaml
import torch

from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint
from litgpt.utils import CLI, copy_config_files


def safe_safetensors(out_dir, repo_id):
    """Re-save the converted ``model.pth`` checkpoint in Hugging Face format.

    Loads the raw PyTorch state dict from ``out_dir/model.pth``, attaches it to
    the matching Hugging Face architecture resolved from ``repo_id``, and writes
    the model back into ``out_dir`` via ``save_pretrained`` (which produces the
    ``model.safetensors`` file the caller checks for).

    Arguments:
        out_dir: ``Path`` to the directory containing ``model.pth``; also the
            directory the Hugging Face checkpoint is saved into.
        repo_id: Hugging Face repo id in ``org/name`` form used to pick the
            model architecture.
    """
    # Imported lazily: transformers is only needed for this conversion step.
    from transformers import AutoModel

    state_dict = torch.load(out_dir / "model.pth")
    model = AutoModel.from_pretrained(repo_id, state_dict=state_dict)
    model.save_pretrained(out_dir)


def prepare_results(results, save_filepath, print_results=True):
    """Pretty-print evaluation results and persist them as JSON.

    Arguments:
        results: Results dict as returned by ``lm_eval.evaluator.simple_evaluate``.
        save_filepath: ``pathlib.Path`` the JSON-serialized results are written to.
        print_results: Whether to print the harness result tables to stdout.
    """
    if print_results:
        # Imported lazily so merely saving results does not require lm_eval.
        from lm_eval.utils import make_table

        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    # write_text opens and closes the file; the previous
    # `save_filepath.open("w").write(...)` leaked the file handle.
    save_filepath.write_text(json_result, encoding="utf-8")


def convert_and_evaluate(
    checkpoint_dir: str,
    out_dir: Optional[str] = None,
    force_conversion: bool = False,
    tasks: Optional[str] = "hellaswag,truthfulqa_mc2,mmlu",
    num_fewshot: Optional[int] = None,
    batch_size: int = 1,
    device: Optional[str] = None,
    limit: Optional[float] = None,
    seed: int = 1234,
    save_filepath: Optional[str] = None,
) -> None:
    """Convert a LitGPT model and run the LM Evaluation Harness

    Arguments:
        checkpoint_dir: Directory where the `lit_model.pth` and tokenizer files are located.
        out_dir: Directory in which to save the converted checkpoints for evaluation.
            Saves to `checkpoint_dir`/evaluate by default.
        force_conversion: Set to `True` to reconvert the model and override
            an existing model.pth from a previous evaluation call.
        tasks: CSV of task names to evaluate.
            By default, the following tasks are used:
            "hellaswag,truthfulqa_mc2,mmlu"
        num_fewshot: Number of examples in few-shot context.
        batch_size: Batch size configuration.
        device: Device to use for evaluation, for example, "cuda" or "cuda:0".
        limit: Limit on number of examples per task.
        seed: Random seed.
        save_filepath: The file where the results will be saved.
            Saves to `out_dir/results.json` by default.
    """
    # Imported lazily so the litgpt CLI loads fast when evaluation is not used.
    from lm_eval import evaluator

    checkpoint_dir = Path(checkpoint_dir)

    # Default the output next to the source checkpoint so repeated evaluate
    # calls can reuse the previously converted weights.
    if out_dir is None:
        out_dir = checkpoint_dir / "evaluate"
    else:
        out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
    config_filepath = checkpoint_dir / "model_config.yaml"

    # Infer the Hugging Face repo id from the checkpoint's stored config so
    # the user does not have to pass it explicitly.
    with open(config_filepath) as f:
        config_dict = yaml.safe_load(f)
    repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"

    copy_config_files(source_dir=checkpoint_dir, out_dir=out_dir)

    # Skip the slow conversion steps when their outputs already exist, unless
    # the caller explicitly requested a reconversion.
    model_path = out_dir / "model.pth"
    if not model_path.exists() or force_conversion:
        convert_lit_checkpoint(checkpoint_dir=checkpoint_dir, output_dir=out_dir)

    safetensors_path = out_dir / "model.safetensors"
    if not safetensors_path.exists() or force_conversion:
        safe_safetensors(out_dir, repo_id)

    # Prevent the tokenizers fork warning/deadlock inside the harness workers.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={out_dir}",
        tasks=tasks.split(","),
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        limit=limit,
        random_seed=seed,
        numpy_random_seed=seed,
        torch_random_seed=seed,
    )
    prepare_results(results, save_filepath)


if __name__ == "__main__":
    CLI(convert_and_evaluate)
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ test = [
"pytest",
"pytest-rerunfailures",
"pytest-timeout",
"transformers>=4.38.0",
"transformers>=4.38.0", # numerical comparisons
"einops",
"protobuf",
"lightning-thunder; python_version >= '3.10'",
Expand All @@ -35,14 +35,16 @@ all = [
"bitsandbytes==0.42.0", # quantization
"sentencepiece", # llama-based models
"tokenizers", # pythia, falcon, redpajama
"datasets", # eval
"requests", # litgpt.data
"litdata", # litgpt.data
"zstandard", # litgpt.data.prepare_slimpajama.py
"pandas", # litgpt.data.prepare_starcoder.py
"pyarrow", # litgpt.data.prepare_starcoder.py
"tensorboard", # litgpt.pretrain
"torchmetrics", # litgpt.pretrain
"datasets", # litgpt.evaluate
"transformers>=4.38.0", # litgpt.evaluate
"lm-eval>=0.4.2", # litgpt.evaluate
"safetensors", # download
"huggingface_hub[hf_transfer]>=0.21.0" # download
]
Expand Down
3 changes: 2 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ def test_cli():
main()
out = out.getvalue()
assert "usage: litgpt" in out
assert "{download,chat,finetune,pretrain,generate,convert,merge_lora}" in out
assert "{download,chat,finetune,pretrain,generate,convert,merge_lora,evaluate}" in out
assert (
"""Available subcommands:
download Download weights or tokenizer data from the Hugging
Face Hub.
chat Chat with a model."""
in out
)
assert ("""evaluate Evaluate a model with the LM Evaluation Harness.""") in out

out = StringIO()
with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune", "-h"]):
Expand Down
Loading
Loading