CI sanity check test for env vars (pytorch#120519)

Make a test that fails on purpose to trigger retries. The test asserts the opposite of the success condition (success being that the env vars exist). It's a bit hacky because I want it to fail on the normal flow in order to trigger reruns, but I don't want to expose the failures to users since that would be confusing. Pull Request resolved: pytorch#120519 Approved by: https://github.com/huydhn
kurtamohler · Mar 8, 2024 · f43b9c5 · f43b9c5
1 parent 75bb049
commit f43b9c5
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 1 deletion.
diff --git a/test/run_test.py b/test/run_test.py
@@ -2,6 +2,7 @@
 
 import argparse
 import copy
+import glob
 import json
 import os
 import pathlib
@@ -381,6 +382,7 @@ def run_test(
     launcher_cmd=None,
     extra_unittest_args=None,
     env=None,
+    print_log=True,
 ) -> int:
     env = env or os.environ.copy()
     maybe_set_hip_visible_devies()
@@ -542,7 +544,7 @@ def run_test(
             # comes up in the future.
             ret_code = 0 if ret_code == 5 or ret_code == 4 else ret_code
 
-    if options.pipe_logs:
+    if options.pipe_logs and print_log:
         handle_log_file(
             test_module, log_path, failed=(ret_code != 0), was_rerun=was_rerun
         )
@@ -1002,6 +1004,23 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
     return pytest_args
 
 
+def run_ci_sanity_check(test: ShardedTest, test_directory, options):
+    assert (
+        test.name == "test_ci_sanity_check_fail"
+    ), f"This handler only works for test_ci_sanity_check_fail, got {test.name}"
+    ret_code = run_test(test, test_directory, options, print_log=False)
+    # The test is meant to fail: any exit code other than 1 makes this handler report failure
+    if ret_code != 1:
+        return 1
+    test_reports_dir = str(REPO_ROOT / "test/test-reports")
+    # Expected failure seen: delete the test's log files and report directories (xmls) so the deliberate failure leaves no artifacts
+    for file in glob.glob(f"{test_reports_dir}/{test.name}*.log"):
+        os.remove(file)
+    for dirname in glob.glob(f"{test_reports_dir}/**/{test.name}"):
+        shutil.rmtree(dirname)
+    return 0
+
+
 CUSTOM_HANDLERS = {
     "test_cuda_primary_ctx": run_test_with_subprocess,
     "test_cuda_nvml_based_avail": run_test_with_subprocess,
@@ -1024,6 +1043,7 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
     "distributed/rpc/test_share_memory": run_test_with_subprocess,
     "distributed/rpc/cuda/test_tensorpipe_agent": run_test_with_subprocess,
     "doctests": run_doctests,
+    "test_ci_sanity_check_fail": run_ci_sanity_check,
 }
 
 

diff --git a/test/test_ci_sanity_check_fail.py b/test/test_ci_sanity_check_fail.py
@@ -0,0 +1,16 @@
+# Owner(s): ["module: ci"]
+# Sanity check for CI setup in GHA.  This file is expected to fail so it can trigger reruns
+
+import os
+
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+
+class TestCISanityCheck(TestCase):
+    def test_env_vars_exist(self):
+        # Inverted on purpose: fails (triggering reruns) whenever the CI env var is set.  If it passes, CI is unset and something is wrong
+        self.assertTrue(os.environ.get("CI") is None)
+
+
+if __name__ == "__main__":
+    run_tests()