Commit: Test filtering

carmocca committed Nov 17, 2023
1 parent f475bd5 commit edc4ed5
Showing 11 changed files with 183 additions and 46 deletions.
12 changes: 11 additions & 1 deletion .github/azure-gpu-test.yml
@@ -53,4 +53,14 @@ jobs:
displayName: "Env details"
- bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes
displayName: 'Testing'
displayName: 'Ordinary tests'
env:
PL_RUN_CUDA_TESTS: "1"
timeoutInMinutes: "5"

- bash: bash run_standalone_tests.sh
workingDirectory: tests
env:
PL_RUN_CUDA_TESTS: "1"
displayName: "Standalone tests"
timeoutInMinutes: "5"
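
The two new steps select tests purely through environment variables: "Ordinary tests" exports PL_RUN_CUDA_TESTS=1 so the collection hook added in tests/conftest.py keeps only CUDA-gated tests, while "Standalone tests" defers to run_standalone_tests.sh, which additionally sets PL_RUN_STANDALONE_TESTS=1. A rough local equivalent of the first step (a sketch, not part of this commit):

# Rough local equivalent of the "Ordinary tests" step above (a sketch, not part of this
# commit). With PL_RUN_CUDA_TESTS=1, the collection hook added in tests/conftest.py keeps
# only tests carrying a CUDA-gated @RunIf marker (e.g. @RunIf(min_cuda_gpus=1)).
import os
import sys

import pytest

os.environ["PL_RUN_CUDA_TESTS"] = "1"
sys.exit(pytest.main(["-v", "--disable-pytest-warnings", "--strict-markers", "--color=yes"]))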
6 changes: 3 additions & 3 deletions .github/workflows/cpu-tests.yml
@@ -58,13 +58,13 @@ jobs:
- name: Run tests without the package installed
run: |
pip install -r requirements-all.txt pytest pytest-rerunfailures transformers einops protobuf
pip install -r requirements-all.txt pytest pytest-rerunfailures pytest-timeout transformers einops protobuf
pip list
pytest -v --disable-pytest-warnings --strict-markers --color=yes
pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 60
- name: Run tests
run: |
pip install . --no-deps
pytest -v --disable-pytest-warnings --strict-markers --color=yes
pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 60
15 changes: 7 additions & 8 deletions pretrain/tinyllama.py
@@ -161,14 +161,13 @@ def train(fabric, state, train_dataloader, val_dataloader, resume):
            if curr_iter < initial_iter:
                curr_iter += 1
                continue
            else:
                resume = False
                curr_iter = -1
                fabric.barrier()
                fabric.print(
                    "Resuming data loader finished."
                    f"Took {time.perf_counter() - total_t0:.1f} seconds to reach iteration {initial_iter}."
                )
            resume = False
            curr_iter = -1
            fabric.barrier()
            fabric.print(
                "Resuming data loader finished."
                f"Took {time.perf_counter() - total_t0:.1f} seconds to reach iteration {initial_iter}."
            )

        if state["iter_num"] >= max_iters:
            break
52 changes: 52 additions & 0 deletions tests/conftest.py
@@ -1,9 +1,11 @@
import os
import sys
from pathlib import Path
from typing import List

import pytest
import torch
from lightning.fabric.utilities.testing import _runif_reasons

wd = Path(__file__).parent.parent.absolute()

@@ -40,3 +42,53 @@ def tensor_like():
def restore_default_dtype():
    # just in case
    torch.set_default_dtype(torch.float32)


def RunIf(**kwargs):
    reasons, marker_kwargs = _runif_reasons(**kwargs)
    return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs)


# https://github.com/Lightning-AI/lightning/blob/6e517bd55b50166138ce6ab915abd4547702994b/tests/tests_fabric/conftest.py#L140
def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None:
    initial_size = len(items)
    conditions = []
    filtered, skipped = 0, 0

    options = {"standalone": "PL_RUN_STANDALONE_TESTS", "min_cuda_gpus": "PL_RUN_CUDA_TESTS"}
    if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":
        # special case: we don't have a CPU job for standalone tests, so we shouldn't run only cuda tests.
        # by deleting the key, we avoid filtering out the CPU tests
        del options["min_cuda_gpus"]

    for kwarg, env_var in options.items():
        # this will compute the intersection of all tests selected per environment variable
        if os.getenv(env_var, "0") == "1":
            conditions.append(env_var)
            for i, test in reversed(list(enumerate(items))):  # loop in reverse, since we are going to pop items
                already_skipped = any(marker.name == "skip" for marker in test.own_markers)
                if already_skipped:
                    # the test was going to be skipped anyway, filter it out
                    items.pop(i)
                    skipped += 1
                    continue
                has_runif_with_kwarg = any(
                    marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
                )
                if not has_runif_with_kwarg:
                    # the test does not have `@RunIf(kwarg=True)` for this condition, so filter it out
                    items.pop(i)
                    filtered += 1

    if config.option.verbose >= 0 and (filtered or skipped):
        writer = config.get_terminal_writer()
        writer.write(
            (
                f"\nThe number of tests has been filtered from {initial_size} to {initial_size - filtered} after the"
                f" filters {conditions}.\n{skipped} tests are marked as unconditional skips.\nIn total,"
                f" {len(items)} tests will run.\n"
            ),
            flush=True,
            bold=True,
            purple=True,  # oh yeah, branded pytest messages
        )
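
The net effect of the hook: with PL_RUN_CUDA_TESTS=1 only tests whose RunIf marker requests min_cuda_gpus survive collection, with PL_RUN_STANDALONE_TESTS=1 (exported by run_standalone_tests.sh) only standalone-marked tests survive, and when both variables are set the min_cuda_gpus filter is dropped so CPU-only standalone tests are not discarded. A hypothetical test module illustrating the markers (names are illustrative, not part of this commit):

# Hypothetical usage sketch, not part of this commit.
import torch

from tests.conftest import RunIf


@RunIf(min_cuda_gpus=1)
def test_runs_on_one_gpu():
    # kept when PL_RUN_CUDA_TESTS=1, filtered out otherwise
    assert torch.ones(1, device="cuda").item() == 1.0


@RunIf(min_cuda_gpus=2, standalone=True)
def test_runs_in_its_own_process():
    # kept when PL_RUN_STANDALONE_TESTS=1 (set by run_standalone_tests.sh)
    assert torch.cuda.device_count() >= 2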
78 changes: 78 additions & 0 deletions tests/run_standalone_tests.sh
@@ -0,0 +1,78 @@
#!/bin/bash
set -e

# Batch size for testing: Determines how many standalone test invocations run in parallel
# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE
test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-1}"

# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1
# python arguments
defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120"
echo "Using defaults: ${defaults}"

# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')

# file paths, remove duplicates
files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq)

# get the list of parametrizations. we need to call them separately. the last two lines are removed.
# note: if there's a syntax error, this will fail with some garbled output
if [[ "$OSTYPE" == "darwin"* ]]; then
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r)
else
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | head -n -2)
fi
# remove the "tests/" path suffix
path_suffix=$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
parametrizations=${parametrizations//$path_suffix/}
parametrizations_arr=($parametrizations)

report=''

rm -f standalone_test_output.txt # in case it exists, remove it
function show_batched_output {
  if [ -f standalone_test_output.txt ]; then  # if exists
    cat standalone_test_output.txt
    # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then
      echo "Potential error! Stopping."
      rm standalone_test_output.txt
      exit 1
    fi
    rm standalone_test_output.txt
  fi
}
trap show_batched_output EXIT  # show the output on exit

for i in "${!parametrizations_arr[@]}"; do
  parametrization=${parametrizations_arr[$i]}
  prefix="$((i+1))/${#parametrizations_arr[@]}"

  echo "$prefix: Running $parametrization"
  # execute the test in the background
  # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
  # output to std{out,err} because the outputs would be garbled together
  python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
  # save the PID in an array
  pids[${i}]=$!
  # add row to the final report
  report+="Ran\t$parametrization\n"

  if ((($i + 1) % $test_batch_size == 0)); then
    # wait for running tests
    for pid in ${pids[*]}; do wait $pid; done
    unset pids  # empty the array
    show_batched_output
  fi
done
# wait for leftover tests
for pid in ${pids[*]}; do wait $pid; done
show_batched_output

# echo test report
printf '=%.s' {1..80}
printf "\n$report"
printf '=%.s' {1..80}
printf '\n'
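
For reference, a minimal standalone test this script would pick up (hypothetical, not part of the commit): the grep above matches the literal text standalone=True in the marker, pytest --collect-only expands the parametrizations, and each one then runs in its own pytest process so that multi-process or state-heavy tests cannot interfere with one another.

# Hypothetical standalone test, not part of this commit.
import torch

from tests.conftest import RunIf


@RunIf(min_cuda_gpus=2, standalone=True)  # "standalone=True" is what the grep matches
def test_each_gpu_is_usable():
    # a real standalone test would typically spawn processes (e.g. through Fabric);
    # only the marker matters for discovery by the script
    for index in range(2):
        assert torch.ones(1, device=f"cuda:{index}").item() == 1.0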
6 changes: 3 additions & 3 deletions tests/test_adapter.py
@@ -1,13 +1,13 @@
import sys
from contextlib import redirect_stdout
from dataclasses import asdict
from io import StringIO
from unittest.mock import Mock

import pytest
import torch
from lightning import Fabric

from tests.conftest import RunIf


def test_config_identical():
import lit_gpt.adapter as gpt_adapter
@@ -106,7 +106,7 @@ def test_adapter_gpt_init_weights():
assert (param == 0).all()


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_adapter_compile():
from lit_gpt.adapter import GPT
6 changes: 3 additions & 3 deletions tests/test_adapter_v2.py
@@ -1,12 +1,12 @@
import sys
from contextlib import redirect_stdout
from io import StringIO
from unittest.mock import Mock

import pytest
import torch
from lightning import Fabric

from tests.conftest import RunIf


def test_config_identical():
import lit_gpt.adapter_v2 as gpt_adapter
@@ -135,7 +135,7 @@ def test_base_model_can_be_adapter_v2_loaded():
assert adapter_filter(k, None)


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_adapter_v2_compile():
from lit_gpt.adapter_v2 import GPT
5 changes: 3 additions & 2 deletions tests/test_gptq.py
@@ -1,10 +1,11 @@
import lightning as L
import pytest
import torch
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2

from tests.conftest import RunIf

@pytest.mark.skipif(_TORCH_GREATER_EQUAL_2_2, reason="Core dumped")

@RunIf(max_torch="2.2") # TODO: core dumped
def test_gptq_blockwise_quantization():
from quantize.gptq import _TRITON_AVAILABLE

7 changes: 4 additions & 3 deletions tests/test_lora.py
@@ -1,4 +1,3 @@
import sys
from contextlib import redirect_stdout
from io import StringIO
from itertools import product
@@ -8,6 +7,8 @@
import torch
from lightning import Fabric

from tests.conftest import RunIf


def test_lora_layer_replacement():
from lit_gpt.lora import GPT, Config, LoRALinear
@@ -351,7 +352,7 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merge
assert layer.merged == expected_merged


@pytest.mark.skipif(not torch.cuda.is_available(), reason="8bit requires CUDA")
@RunIf(min_cuda_gpus=1)
# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4
@pytest.mark.xfail(raises=AttributeError, strict=False)
# https://github.com/Lightning-AI/lit-gpt/issues/513
@@ -456,7 +457,7 @@ def test_base_model_can_be_lora_loaded():
assert lora_filter(k, None)


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_lora_compile():
from lit_gpt.lora import GPT
36 changes: 15 additions & 21 deletions tests/test_model.py
@@ -9,6 +9,8 @@
from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_2
from lightning_utilities.core.imports import compare_version

from tests.conftest import RunIf

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
@@ -341,7 +343,7 @@ def test_against_hf_phi(device, dtype):
# the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
# is slightly different
pytest.mark.xfail(raises=AssertionError, strict=False),
pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"),
RunIf(min_cuda_gpus=1),
],
),
],
@@ -393,7 +395,7 @@ def test_against_hf_mistral(device, dtype):
torch.testing.assert_close(ours_y, theirs_y)


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_model_compile():
from lit_gpt import GPT
@@ -468,14 +470,10 @@ def test_model_kv_cache_amp():


# https://github.com/pytorch/pytorch/blob/ad3572a5d/torch/testing/_internal/common_cuda.py#L31-L34
SUPPORTS_FLASH_ATTENTION = (
torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0) and not _IS_WINDOWS
)
SUPPORTS_MEM_EFF_ATTENTION = torch.cuda.is_available()
SUPPORTS_FUSED_ATTENTION = SUPPORTS_FLASH_ATTENTION or SUPPORTS_MEM_EFF_ATTENTION
SUPPORTS_FLASH_ATTENTION = torch.cuda.get_device_capability() >= (8, 0) and not _IS_WINDOWS


@pytest.mark.skipif(not SUPPORTS_FUSED_ATTENTION, reason="Unsupported")
@RunIf(min_cuda_gpus=1)
@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs])
@torch.inference_mode()
def test_sdpa_choice(config):
@@ -515,13 +513,12 @@ def assert_sdpa_uses_flash(original_fn, q, k, v, mask):
with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False):
model(x)

if SUPPORTS_MEM_EFF_ATTENTION:
expected = SDPBackend.EFFICIENT_ATTENTION if config.head_size % 8 == 0 else SDPBackend.MATH
with torch.backends.cuda.sdp_kernel(enable_flash=False):
model(x)
expected = SDPBackend.EFFICIENT_ATTENTION if config.head_size % 8 == 0 else SDPBackend.MATH
with torch.backends.cuda.sdp_kernel(enable_flash=False):
model(x)


@pytest.mark.skipif(not SUPPORTS_FUSED_ATTENTION, reason="Unsupported")
@RunIf(min_cuda_gpus=1)
@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs])
@torch.inference_mode()
def test_sdpa_choice_kv_cache(config):
@@ -559,11 +556,8 @@ def assert_sdpa_uses_flash(original_fn, q, k, v, mask):
with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False):
model(x, input_pos)

if SUPPORTS_MEM_EFF_ATTENTION:
expected = (
SDPBackend.EFFICIENT_ATTENTION
if config.head_size % 8 == 0 and config.n_query_groups != 1
else SDPBackend.MATH
)
with torch.backends.cuda.sdp_kernel(enable_flash=False):
model(x, input_pos)
expected = (
SDPBackend.EFFICIENT_ATTENTION if config.head_size % 8 == 0 and config.n_query_groups != 1 else SDPBackend.MATH
)
with torch.backends.cuda.sdp_kernel(enable_flash=False):
model(x, input_pos)
6 changes: 4 additions & 2 deletions tests/test_utils.py
@@ -1,12 +1,13 @@
import os
import sys
from contextlib import redirect_stderr
from io import StringIO

import pytest
import torch
import torch.nn.functional as F

from tests.conftest import RunIf


def test_find_multiple():
from lit_gpt.utils import find_multiple
@@ -20,7 +21,8 @@ def test_find_multiple():
assert find_multiple(50254, 512) == 50688


@pytest.mark.skipif(sys.platform == "win32", reason="match fails on windows. why did they have to use backslashes?")
# match fails on windows. why did they have to use backslashes?
@RunIf(skip_windows=True)
def test_check_valid_checkpoint_dir(tmp_path):
from lit_gpt.utils import check_valid_checkpoint_dir

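
Across the test files above, ad-hoc pytest.mark.skipif conditions are replaced with RunIf markers so the new collection filtering can recognize them. A quick reference of the conditions used in this changeset (hypothetical stubs, not part of the commit):

# Hypothetical stubs summarizing the RunIf conditions this commit switches to.
from tests.conftest import RunIf


@RunIf(dynamo=True)  # replaces the win32/darwin skipif guarding torch.compile tests
def test_compile_example(): ...


@RunIf(min_cuda_gpus=1)  # replaces skipif on torch.cuda.is_available()
def test_gpu_example(): ...


@RunIf(max_torch="2.2")  # replaces skipif on _TORCH_GREATER_EQUAL_2_2
def test_pre_torch_2_2_example(): ...


@RunIf(skip_windows=True)  # replaces skipif on sys.platform == "win32"
def test_non_windows_example(): ...


@RunIf(standalone=True)  # selected only when run through run_standalone_tests.sh
def test_standalone_example(): ...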
