From 42c3067fa78c2ff82a5301849e2e46b8601ead15 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Mon, 9 Dec 2024 15:52:16 +0100 Subject: [PATCH 1/2] Add cth_validate_nodes The hook would check if MIM is running before init_per_suite for each big test It would check it for nodes requested by setting TEST_HOSTS variable --one-node flag still works as expected To disable this check use new --skip-validate-nodes flag It is compatible with cth_surefire --- big_tests/default.spec | 12 +++++----- big_tests/dynamic_domains.spec | 10 ++++---- big_tests/src/ct_mongoose_hook.erl | 6 +++-- big_tests/src/cth_validate_nodes.erl | 35 ++++++++++++++++++++++++++++ test/common/distributed_helper.erl | 32 ++++++++++++++++++++++++- tools/test-runner.sh | 12 ++++++++++ 6 files changed, 93 insertions(+), 14 deletions(-) create mode 100644 big_tests/src/cth_validate_nodes.erl diff --git a/big_tests/default.spec b/big_tests/default.spec index 6df21abda9c..5cf76ca2c71 100644 --- a/big_tests/default.spec +++ b/big_tests/default.spec @@ -130,13 +130,13 @@ %% ct_mongoose_hook will: %% * ensure preset & mim_data_dir values are passed to ct Config %% * check server's purity after SUITE -{ct_hooks, [ct_groups_summary_hook, ct_tty_hook, +{ct_hooks, [cth_validate_nodes, + ct_groups_summary_hook, + ct_tty_hook, ct_mongoose_hook, + ct_mongoose_log_hook, {ct_mongoose_log_hook, [{host, mim2}, {print_init_and_done_for_testcases, false}]}, {ct_mongoose_log_hook, [{host, mim3}, {print_init_and_done_for_testcases, false}]}, ct_progress_hook, - ct_markdown_errors_hook, ct_mongoose_log_hook]}. - -%% since test-runner.sh can be executed with the --one-node option, -%% log collection is enabled by default for host mim1 only. -% {ct_hooks, [{ct_mongoose_log_hook,[{host, mim2}, {log, []}]}]}. + ct_markdown_errors_hook +]}. 
diff --git a/big_tests/dynamic_domains.spec b/big_tests/dynamic_domains.spec index 7ed9ad920b0..0df35ae5d33 100644 --- a/big_tests/dynamic_domains.spec +++ b/big_tests/dynamic_domains.spec @@ -173,12 +173,12 @@ %% ct_mongoose_hook will: %% * ensure preset & mim_data_dir values are passed to ct Config %% * check server's purity after SUITE -{ct_hooks, [ct_groups_summary_hook, ct_tty_hook, ct_mongoose_hook, ct_progress_hook, +{ct_hooks, [cth_validate_nodes, + ct_groups_summary_hook, + ct_tty_hook, + ct_mongoose_hook, + ct_progress_hook, ct_markdown_errors_hook, ct_mongoose_log_hook, {ct_mongoose_log_hook, [{host, mim2}, {print_init_and_done_for_testcases, false}]}, {ct_mongoose_log_hook, [{host, mim3}, {print_init_and_done_for_testcases, false}]}]}. - -%% since test-runner.sh can be executed with the --one-node option, -%% log collection is enabled by default for host mim1 only. -% {ct_hooks, [{ct_mongoose_log_hook,[{host, mim2}, {log, []}]}]}. diff --git a/big_tests/src/ct_mongoose_hook.erl b/big_tests/src/ct_mongoose_hook.erl index 921adfe111b..e02fc68b2aa 100644 --- a/big_tests/src/ct_mongoose_hook.erl +++ b/big_tests/src/ct_mongoose_hook.erl @@ -41,14 +41,16 @@ init(_Id, _Opts) -> {ok, #{}}. %% @doc Called before init_per_suite is called. -pre_init_per_suite(Suite,Config,State) -> +pre_init_per_suite(Suite, [_|_] = Config, State) -> Preset = case application:get_env(common_test, test_label) of {ok, Value} -> Value; _ -> undefined end, DataDir = path_helper:data_dir(Suite, Config), NewConfig = [{preset, Preset}, {mim_data_dir, DataDir} | Config], - {NewConfig, State}. + {NewConfig, State}; +pre_init_per_suite(_Suite, SkipOrFail, State) -> + {SkipOrFail, State}. %% @doc Called after init_per_suite. 
post_init_per_suite(_Suite, _Config, Return, State) -> diff --git a/big_tests/src/cth_validate_nodes.erl b/big_tests/src/cth_validate_nodes.erl new file mode 100644 index 00000000000..aa4f5ce8f0d --- /dev/null +++ b/big_tests/src/cth_validate_nodes.erl @@ -0,0 +1,35 @@ +-module(cth_validate_nodes). + +%% Callbacks +-export([id/1]). +-export([init/2]). + +-export([pre_init_per_suite/3]). +-export([terminate/1]). + +-record(state, {}). + +%% CT callbacks + +id(_Opts) -> + "cth_validate_nodes_001". + +init(_Id, _Opts) -> + {ok, #state{}}. + +pre_init_per_suite(_Suite, Config, State) -> + case distributed_helper:validate_nodes() of + ok -> + {Config, State}; + {error, Reason} -> + case os:getenv("SKIP_VALIDATE_NODES") of + "true" -> + ct:pal("Skip failing with ~p in ct_check_rpc_nodes", [Reason]), + {Config, State}; + _ -> + {{fail, Reason}, State} + end + end. + +terminate(_State) -> + ok. diff --git a/test/common/distributed_helper.erl b/test/common/distributed_helper.erl index 7f1ffe86726..5bb856b5e97 100644 --- a/test/common/distributed_helper.erl +++ b/test/common/distributed_helper.erl @@ -39,7 +39,7 @@ add_node_to_cluster(Node, Config) -> end, Config. -add_node_to_mnesia_cluster(Node, Config) -> +add_node_to_mnesia_cluster(Node, _Config) -> ClusterMemberNode = maps:get(node, mim()), ok = rpc(Node#{timeout => cluster_op_timeout()}, mongoose_cluster, join, [ClusterMemberNode]), @@ -205,3 +205,33 @@ subhost_pattern(SubhostTemplate) -> lookup_config_opt(Key) -> rpc(mim(), mongoose_config, lookup_opt, [Key]). + +%% @doc Checks if MongooseIM nodes are running +validate_nodes() -> + Results = [validate_node(Node) || Node <- get_node_keys()], + Errors = [Res || Res <- Results, Res =/= ok], + case Errors of + [] -> ok; + _ -> {error, Errors} + end. 
+ +get_node_keys() -> + case os:getenv("TEST_HOSTS") of + false -> + [NodeKey || {NodeKey, _Opts} <- ct:get_config(hosts)]; + EnvValue -> %% EnvValue examples are "mim" or "mim mim2" + BinHosts = binary:split(iolist_to_binary(EnvValue), <<" ">>, [global]), + [binary_to_atom(Node, utf8) || Node <- BinHosts] + end. + +validate_node(NodeKey) -> + Spec = #{node := Node} = rpc_spec(NodeKey), + try rpc(Spec, application, which_applications, []) of + Loaded -> + case lists:keymember(mongooseim, 1, Loaded) of + true -> ok; + false -> {validate_node_failed, mongooseim_not_running, Node} + end + catch error:{badrpc, Reason} -> + {validate_node_failed, {badrpc, Reason}, Node} + end. diff --git a/tools/test-runner.sh b/tools/test-runner.sh index 1828d829e67..46730446f21 100755 --- a/tools/test-runner.sh +++ b/tools/test-runner.sh @@ -37,6 +37,7 @@ Options: --skip-start-nodes -- do not start nodes before big tests --skip-stop-nodes -- do not stop nodes after big tests --skip-setup-db -- do not start any databases, the same as "--db --" option +--skip-validate-nodes -- do not check if MongooseIM is running --no-parallel -- run most commands in sequence --tls-dist -- enable encryption between nodes in big tests --verbose -- print script output @@ -115,6 +116,9 @@ Script examples: Sets dev-nodes and test-hosts to empty lists Reruns mam_SUITE +./tools/test-runner.sh --skip-small-tests --skip-start-nodes --skip-validate-nodes -- connect + Continues test execution even if MongooseIM is not running on all nodes + ./tools/test-runner.sh --rerun-big-tests -- mam The same command as above @@ -309,6 +313,7 @@ SELECTED_TESTS=() STOP_SCRIPT=false SKIP_DB_SETUP=false DB_FROM_PRESETS=true +SKIP_VALIDATE_NODES=false # Parse command line arguments # Prefer arguments to env variables @@ -353,6 +358,11 @@ case $key in DB_FROM_PRESETS=false ;; + --skip-validate-nodes) + shift # past argument + SKIP_VALIDATE_NODES=true + ;; + # Similar how we parse --db option --preset) shift # past argument @@ 
-635,6 +645,7 @@ export TESTSPEC="auto_big_tests.spec" export START_NODES="$START_NODES" export STOP_NODES="$STOP_NODES" export PAUSE_BEFORE_BIG_TESTS="$PAUSE_BEFORE_BIG_TESTS" +export SKIP_VALIDATE_NODES="$SKIP_VALIDATE_NODES" # Debug printing echo "Variables:" @@ -653,6 +664,7 @@ echo " TESTSPEC=$TESTSPEC" echo " TLS_DIST=$TLS_DIST" echo " START_NODES=$START_NODES" echo " STOP_NODES=$STOP_NODES" +echo " SKIP_VALIDATE_NODES=$SKIP_VALIDATE_NODES" echo "" From fa9a11e40f03744a9091d07319b6a71523f819ce Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Mon, 9 Dec 2024 16:58:24 +0100 Subject: [PATCH 2/2] Handle race-condition of node not being validated If it is restarting at the end of the test suite If MIM is restarted asynchronously, simple checking if application is running is not enough. We first need to wait for MIM to actually fully restart. The best place to do it is in end_per_suite. So, if some suite crashes MIM, the rest of suites would quickly fail in init_per_suite (and we need to wait only once) --- big_tests/src/cth_validate_nodes.erl | 13 ++++++++++--- test/common/distributed_helper.erl | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/big_tests/src/cth_validate_nodes.erl b/big_tests/src/cth_validate_nodes.erl index aa4f5ce8f0d..3e91fe6d9da 100644 --- a/big_tests/src/cth_validate_nodes.erl +++ b/big_tests/src/cth_validate_nodes.erl @@ -5,9 +5,10 @@ -export([init/2]). -export([pre_init_per_suite/3]). +-export([post_end_per_suite/4]). -export([terminate/1]). --record(state, {}). +-record(state, {node_keys = []}). %% CT callbacks @@ -19,8 +20,8 @@ init(_Id, _Opts) -> pre_init_per_suite(_Suite, Config, State) -> case distributed_helper:validate_nodes() of - ok -> - {Config, State}; + {ok, NodeKeys} -> + {Config, State#state{node_keys = NodeKeys}}; {error, Reason} -> case os:getenv("SKIP_VALIDATE_NODES") of "true" ->
+post_end_per_suite(_SuiteName, _Config, Return, State = #state{node_keys = NodeKeys}) -> + %% In case a suite is restarting the node at the end of the suite execution + %% ensure we wait long enough for it to actually start + distributed_helper:wait_for_nodes_to_start(NodeKeys), + {Return, State#state{node_keys = []}}. + terminate(_State) -> ok. diff --git a/test/common/distributed_helper.erl b/test/common/distributed_helper.erl index 5bb856b5e97..9ace65deb7d 100644 --- a/test/common/distributed_helper.erl +++ b/test/common/distributed_helper.erl @@ -208,13 +208,23 @@ lookup_config_opt(Key) -> %% @doc Checks if MongooseIM nodes are running validate_nodes() -> - Results = [validate_node(Node) || Node <- get_node_keys()], + validate_nodes(get_node_keys()). + +validate_nodes(NodeKeys) -> + Results = [validate_node(Node) || Node <- NodeKeys], Errors = [Res || Res <- Results, Res =/= ok], case Errors of - [] -> ok; + [] -> {ok, NodeKeys}; _ -> {error, Errors} end. +wait_for_nodes_to_start([]) -> ok; +wait_for_nodes_to_start(NodeKeys) -> + wait_helper:wait_until(fun() -> validate_nodes(NodeKeys) end, {ok, NodeKeys}, + #{time_left => timer:seconds(20), + sleep_time => 1000, + name => wait_for_nodes_to_start}). + get_node_keys() -> case os:getenv("TEST_HOSTS") of false ->