From fdd023ff8239ded21998fc711362759e0ca68bcc Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 9 May 2024 10:14:47 +0800
Subject: [PATCH] Migrate cluster mode tests to normal framework (#442)

We currently have two disjoint TCL test frameworks:

1. The normal test framework, triggered by runtest, which launches
   nodes individually for each test.
2. The cluster framework, triggered by runtest-cluster, which
   pre-allocates N nodes and reuses them to test large configurations.

The normal TCL framework is much easier to exercise and is also run
automatically as part of the CI for new PRs. Since runtest-cluster runs
very slowly (it cannot be parallelized), it currently runs only in the
daily CI; as a result, some cluster changes are not exposed in PR CI in
time.

This PR migrates the cluster mode tests to the normal framework. Some
cluster tests are kept in runtest-cluster because of timing issues or
features that are not yet supported; we can handle them later.

Signed-off-by: Binbin
---
 tests/cluster/tests/01-faildet.tcl            |  38 ----
 tests/cluster/tests/11-manual-takeover.tcl    |  71 --------
 tests/cluster/tests/13-no-failover-option.tcl |  61 -------
 tests/support/cluster_util.tcl                | 162 ++++++++++++++++--
 tests/support/util.tcl                        |   5 +
 tests/test_helper.tcl                         |  10 ++
 .../00-base.tcl => unit/cluster/base.tcl}     |  65 +++++--
 .../cluster/cluster-nodes-slots.tcl}          |  15 +-
 .../cluster/cluster-slots.tcl}                |  42 ++---
 .../cluster/consistency-check.tcl}            |  32 ++--
 .../cluster/diskless-load-swapdb.tcl}         |  30 ++--
 tests/unit/cluster/faildet.tcl                |  64 +++++++
 .../cluster/failover.tcl}                     |  36 ++--
 .../cluster/half-migrated-slot.tcl}           |  16 +-
 .../18-info.tcl => unit/cluster/info.tcl}     |  14 +-
 .../cluster/manual-failover.tcl}              |  67 ++++----
 tests/unit/cluster/manual-takeover.tcl        |  90 ++++++++++
 .../cluster/many-slot-migration.tcl}          |  13 +-
 tests/unit/cluster/multi-slot-operations.tcl  |   2 +-
 tests/unit/cluster/no-failover-option.tcl     |  62 +++++++
 .../09-pubsub.tcl => unit/cluster/pubsub.tcl} |   8 +-
 .../cluster/pubsubshard-slot-migration.tcl}   |  16 +-
 .../cluster/pubsubshard.tcl}                  |  14 +-
 .../cluster/replica-in-sync.tcl}              |  20 +--
 .../cluster/slave-selection.tcl}              |  70 ++++----
 .../cluster/slave-stop-cond.tcl}              |  28 +--
 .../cluster/slot-migration-response.tcl}      |  15 +-
 .../cluster/transactions-on-replica.tcl}      |  22 ++-
 .../cluster/update-msg.tcl}                   |  53 ++++--
 29 files changed, 690 insertions(+), 451 deletions(-)
 delete mode 100644 tests/cluster/tests/01-faildet.tcl
 delete mode 100644 tests/cluster/tests/11-manual-takeover.tcl
 delete mode 100644 tests/cluster/tests/13-no-failover-option.tcl
 rename tests/{cluster/tests/00-base.tcl => unit/cluster/base.tcl} (52%)
 rename tests/{cluster/tests/19-cluster-nodes-slots.tcl => unit/cluster/cluster-nodes-slots.tcl} (89%)
 rename tests/{cluster/tests/15-cluster-slots.tcl => unit/cluster/cluster-slots.tcl} (77%)
 rename tests/{cluster/tests/14-consistency-check.tcl => unit/cluster/consistency-check.tcl} (82%)
 rename tests/{cluster/tests/17-diskless-load-swapdb.tcl => unit/cluster/diskless-load-swapdb.tcl} (80%)
 create mode 100644 tests/unit/cluster/faildet.tcl
 rename tests/{cluster/tests/02-failover.tcl => unit/cluster/failover.tcl} (56%)
 rename tests/{cluster/tests/20-half-migrated-slot.tcl => unit/cluster/half-migrated-slot.tcl} (90%)
 rename tests/{cluster/tests/18-info.tcl => unit/cluster/info.tcl} (85%)
 rename tests/{cluster/tests/10-manual-failover.tcl => unit/cluster/manual-failover.tcl} (76%)
 create mode 100644 tests/unit/cluster/manual-takeover.tcl
 rename tests/{cluster/tests/21-many-slot-migration.tcl => unit/cluster/many-slot-migration.tcl} (84%)
 create mode 100644 tests/unit/cluster/no-failover-option.tcl
 rename tests/{cluster/tests/09-pubsub.tcl => unit/cluster/pubsub.tcl} (90%)
 rename tests/{cluster/tests/25-pubsubshard-slot-migration.tcl => unit/cluster/pubsubshard-slot-migration.tcl} (96%)
 rename tests/{cluster/tests/26-pubsubshard.tcl => unit/cluster/pubsubshard.tcl} (95%)
 rename tests/{cluster/tests/22-replica-in-sync.tcl => unit/cluster/replica-in-sync.tcl} (91%)
 rename tests/{cluster/tests/05-slave-selection.tcl => unit/cluster/slave-selection.tcl} (73%)
 rename tests/{cluster/tests/06-slave-stop-cond.tcl => unit/cluster/slave-stop-cond.tcl} (76%)
 rename tests/{cluster/tests/29-slot-migration-response.tcl => unit/cluster/slot-migration-response.tcl} (77%)
 rename tests/{cluster/tests/16-transactions-on-replica.tcl => unit/cluster/transactions-on-replica.tcl} (86%)
 rename tests/{cluster/tests/08-update-msg.tcl => unit/cluster/update-msg.tcl} (55%)

diff --git a/tests/cluster/tests/01-faildet.tcl b/tests/cluster/tests/01-faildet.tcl
deleted file mode 100644
index 5d40aad7da..0000000000
--- a/tests/cluster/tests/01-faildet.tcl
+++ /dev/null
@@ -1,38 +0,0 @@
-# Check the basic monitoring and failover capabilities.
-
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
-
-test "Cluster should start ok" {
-    assert_cluster_state ok
-}
-
-test "Killing two slave nodes" {
-    kill_instance valkey 5
-    kill_instance valkey 6
-}
-
-test "Cluster should be still up" {
-    assert_cluster_state ok
-}
-
-test "Killing one master node" {
-    kill_instance valkey 0
-}
-
-# Note: the only slave of instance 0 is already down so no
-# failover is possible, that would change the state back to ok.
-test "Cluster should be down now" {
-    assert_cluster_state fail
-}
-
-test "Restarting master node" {
-    restart_instance valkey 0
-}
-
-test "Cluster should be up again" {
-    assert_cluster_state ok
-}
diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl
deleted file mode 100644
index ebc95960f3..0000000000
--- a/tests/cluster/tests/11-manual-takeover.tcl
+++ /dev/null
@@ -1,71 +0,0 @@
-# Manual takeover test
-
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Cluster is writable" {
-    cluster_write_test 0
-}
-
-# For this test, disable replica failover until
-# all of the primaries are confirmed killed. Otherwise
-# there might be enough time to elect a replica.
-set replica_ids { 5 6 7 }
-foreach id $replica_ids {
-    R $id config set cluster-replica-no-failover yes
-}
-
-test "Killing majority of master nodes" {
-    kill_instance valkey 0
-    kill_instance valkey 1
-    kill_instance valkey 2
-}
-
-foreach id $replica_ids {
-    R $id config set cluster-replica-no-failover no
-}
-
-test "Cluster should eventually be down" {
-    assert_cluster_state fail
-}
-
-test "Use takeover to bring slaves back" {
-    foreach id $replica_ids {
-        R $id cluster failover takeover
-    }
-}
-
-test "Cluster should eventually be up again" {
-    assert_cluster_state ok
-}
-
-test "Cluster is writable" {
-    cluster_write_test 4
-}
-
-test "Instance #5, #6, #7 are now masters" {
-    foreach id $replica_ids {
-        assert {[RI $id role] eq {master}}
-    }
-}
-
-test "Restarting the previously killed master nodes" {
-    restart_instance valkey 0
-    restart_instance valkey 1
-    restart_instance valkey 2
-}
-
-test "Instance #0, #1, #2 gets converted into a slaves" {
-    wait_for_condition 1000 50 {
-        [RI 0 role] eq {slave} && [RI 1 role] eq {slave} && [RI 2 role] eq {slave}
-    } else {
-        fail "Old masters not converted into slaves"
-    }
-}
diff --git a/tests/cluster/tests/13-no-failover-option.tcl b/tests/cluster/tests/13-no-failover-option.tcl
deleted file mode 100644
index c11a502f8a..0000000000
--- a/tests/cluster/tests/13-no-failover-option.tcl
+++ /dev/null
@@ -1,61 +0,0 @@
-# Check that the no-failover option works
-
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Cluster is writable" {
-    cluster_write_test 0
-}
-
-test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
-
-    # Configure it to never failover the master
-    R 5 CONFIG SET cluster-slave-no-failover yes
-}
-
-test "Instance #5 synced with the master" {
-    wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up}
-    } else {
-        fail "Instance #5 master link status is not up"
-    }
-}
-
-test "The nofailover flag is propagated" {
-    set slave5_id [dict get [get_myself 5] id]
-
-    foreach_valkey_id id {
-        wait_for_condition 1000 50 {
-            [has_flag [get_node_by_id $id $slave5_id] nofailover]
-        } else {
-            fail "Instance $id can't see the nofailover flag of slave"
-        }
-    }
-}
-
-set current_epoch [CI 1 cluster_current_epoch]
-
-test "Killing one master node" {
-    kill_instance valkey 0
-}
-
-test "Cluster should be still down after some time" {
-    after 10000
-    assert_cluster_state fail
-}
-
-test "Instance #5 is still a slave" {
-    assert {[RI 5 role] eq {slave}}
-}
-
-test "Restarting the previously killed master node" {
-    restart_instance valkey 0
-}
diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl
index d89a5a384d..ebca69d9ca 100644
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@@ -1,5 +1,91 @@
 # Cluster helper functions
 
+source tests/support/cli.tcl
+source tests/support/cluster.tcl
+
+proc config_set_all_nodes {keyword value} {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        R $j config set $keyword $value
+    }
+}
+
+proc get_instance_id_by_port {type port} {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[srv [expr -1*$j] port] == $port} {
+            return $j
+        }
+    }
+    fail "Instance port $port not found."
+}
+
+# Check if the cluster is writable and readable. Use node "port"
+# as a starting point to talk with the cluster.
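+#
+# Typical call, as used by the migrated tests below:
+#   cluster_write_test [srv 0 port]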
+proc cluster_write_test {port} {
+    set prefix [randstring 20 20 alpha]
+    set cluster [valkey_cluster 127.0.0.1:$port]
+    for {set j 0} {$j < 100} {incr j} {
+        $cluster set key.$j $prefix.$j
+    }
+    for {set j 0} {$j < 100} {incr j} {
+        assert {[$cluster get key.$j] eq "$prefix.$j"}
+    }
+    $cluster close
+}
+
+# Helper function to attempt to have each node in a cluster
+# meet each other.
+proc join_nodes_in_cluster {} {
+    # Join node 0 with 1, 1 with 2, ... and so forth.
+    # If auto-discovery works all nodes will know every other node
+    # eventually.
+    set ids {}
+    for {set id 0} {$id < [llength $::servers]} {incr id} {lappend ids $id}
+    for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} {
+        set a [lindex $ids $j]
+        set b [lindex $ids [expr $j+1]]
+        set b_port [srv -$b port]
+        R $a cluster meet 127.0.0.1 $b_port
+    }
+
+    for {set id 0} {$id < [llength $::servers]} {incr id} {
+        wait_for_condition 1000 50 {
+            [llength [get_cluster_nodes $id connected]] == [llength $ids]
+        } else {
+            return 0
+        }
+    }
+    return 1
+}
+
+# Search the first node starting from ID $first that is not
+# already configured as a replica.
+proc cluster_find_available_replica {first} {
+    for {set id 0} {$id < [llength $::servers]} {incr id} {
+        if {$id < $first} continue
+        set me [cluster_get_myself $id]
+        if {[dict get $me slaveof] eq {-}} {return $id}
+    }
+    fail "No available replicas"
+}
+
+proc fix_cluster {addr} {
+    set code [catch {
+        exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster fix $addr << yes
+    } result]
+    if {$code != 0} {
+        puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result"
+    }
+    # Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree,
+    # but we can ignore that and rely on the check below.
+    wait_for_cluster_state ok
+    wait_for_condition 100 100 {
+        [catch {exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster check $addr} result] == 0
+    } else {
+        puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result"
+        fail "Cluster could not settle with configuration"
+    }
+}
+
 # Check if cluster configuration is consistent.
 # All the nodes in the cluster should show same slots configuration and have health
 # state "online" to be considered as consistent.
@@ -59,7 +145,7 @@ proc wait_for_cluster_size {cluster_size} {
 # Check that cluster nodes agree about "state", or raise an error.
 proc wait_for_cluster_state {state} {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
-        wait_for_condition 100 50 {
+        wait_for_condition 1000 50 {
             [CI $j cluster_state] eq $state
         } else {
             fail "Cluster node $j cluster_state:[CI $j cluster_state]"
@@ -69,7 +155,7 @@ proc wait_for_cluster_state {state} {
 
 # Default slot allocation for clusters, each master has a continuous block
 # and approximately equal number of slots.
-proc continuous_slot_allocation {masters} {
+proc continuous_slot_allocation {masters replicas} {
     set avg [expr double(16384) / $masters]
     set slot_start 0
     for {set j 0} {$j < $masters} {incr j} {
@@ -79,9 +165,47 @@
 }
 
+# Assuming nodes are reset, this function performs slots allocation.
+# Only the first 'masters' nodes are used.
+proc cluster_allocate_slots {masters replicas} {
+    set slot 16383
+    while {$slot >= 0} {
+        # Allocate successive slots to random nodes.
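+        # (randomInt picks a node uniformly in [0, masters), so unlike
+        # continuous_slot_allocation the resulting slot ownership map is
+        # deliberately non-contiguous.)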
+        set node [randomInt $masters]
+        lappend slots_$node $slot
+        incr slot -1
+    }
+    for {set j 0} {$j < $masters} {incr j} {
+        R $j cluster addslots {*}[set slots_${j}]
+    }
+}
+
+proc default_replica_allocation {masters replicas} {
+    # Setup master/replica relationships
+    set node_count [expr $masters + $replicas]
+    for {set i 0} {$i < $masters} {incr i} {
+        set nodeid [R $i CLUSTER MYID]
+        for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} {
+            R $j CLUSTER REPLICATE $nodeid
+        }
+    }
+}
+
+# Add 'replicas' replicas to a cluster composed of 'masters' masters.
+# It assumes that masters are allocated sequentially from instance ID 0
+# to N-1.
+proc cluster_allocate_replicas {masters replicas} {
+    for {set j 0} {$j < $replicas} {incr j} {
+        set master_id [expr {$j % $masters}]
+        set replica_id [cluster_find_available_replica $masters]
+        set master_myself [cluster_get_myself $master_id]
+        R $replica_id cluster replicate [dict get $master_myself id]
+    }
+}
+
 # Setup method to be executed to configure the cluster before the
 # tests run.
-proc cluster_setup {masters node_count slot_allocator code} {
+proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} {
     # Have all nodes meet
     if {$::tls} {
         set tls_cluster [lindex [R 0 CONFIG GET tls-cluster] 1]
@@ -96,17 +220,12 @@ proc cluster_setup {masters node_count slot_allocator code} {
         }
     }
 
-    $slot_allocator $masters
+    $slot_allocator $masters $replicas
 
     wait_for_cluster_propagation
 
     # Setup master/replica relationships
-    for {set i 0} {$i < $masters} {incr i} {
-        set nodeid [R $i CLUSTER MYID]
-        for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} {
-            R $j CLUSTER REPLICATE $nodeid
-        }
-    }
+    $replica_allocator $masters $replicas
 
     wait_for_cluster_propagation
     wait_for_cluster_state "ok"
@@ -116,11 +235,11 @@
 
 # Start a cluster with the given number of masters and replicas. Replicas
 # will be allocated to masters by round robin.
-proc start_cluster {masters replicas options code {slot_allocator continuous_slot_allocation}} {
+proc start_cluster {masters replicas options code {slot_allocator continuous_slot_allocation} {replica_allocator default_replica_allocation}} {
     set node_count [expr $masters + $replicas]
 
     # Set the final code to be the tests + cluster setup
-    set code [list cluster_setup $masters $node_count $slot_allocator $code]
+    set code [list cluster_setup $masters $replicas $node_count $slot_allocator $replica_allocator $code]
 
     # Configure the starting of multiple servers. Set cluster node timeout
     # aggressively since many tests depend on ping/pong messages.
@@ -149,8 +268,19 @@
     return {}
 }
 
-# Returns a parsed CLUSTER NODES output as a list of dictionaries.
-proc get_cluster_nodes id {
+# Get a specific node by ID by parsing the CLUSTER NODES output
+# of the instance number 'instance_id'
+proc cluster_get_node_by_id {instance_id node_id} {
+    set nodes [get_cluster_nodes $instance_id]
+    foreach n $nodes {
+        if {[dict get $n id] eq $node_id} {return $n}
+    }
+    return {}
+}
+
+# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field
+# can be specified to return only entries that match the provided status.
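+#
+# For example, "get_cluster_nodes 0 connected" returns only the nodes whose
+# link state is "connected", as seen by instance 0.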
+proc get_cluster_nodes {id {status "*"}} {
     set lines [split [R $id cluster nodes] "\r\n"]
     set nodes {}
     foreach l $lines {
@@ -168,7 +298,9 @@
             linkstate [lindex $args 7] \
             slots [lrange $args 8 end] \
         ]
-        lappend nodes $node
+        if {[string match $status [lindex $args 7]]} {
+            lappend nodes $node
+        }
     }
     return $nodes
 }
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index d5af536b52..9d69e44232 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -653,6 +653,11 @@ proc process_is_alive pid {
     }
 }
 
+# Return true if the specified process is paused by pause_process.
+proc process_is_paused pid {
+    return [string match {*T*} [lindex [exec ps j $pid] 16]]
+}
+
 proc pause_process pid {
     exec kill -SIGSTOP $pid
     wait_for_condition 50 100 {
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 4df9110552..57fb2beb13 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -221,6 +221,16 @@ proc valkey_client {args} {
     return $client
 }
 
+proc valkey_deferring_client_by_addr {host port} {
+    set client [valkey $host $port 1 $::tls]
+    return $client
+}
+
+proc valkey_client_by_addr {host port} {
+    set client [valkey $host $port 0 $::tls]
+    return $client
+}
+
 # Provide easy access to INFO properties. Same semantic as "proc r".
 proc s {args} {
     set level 0
diff --git a/tests/cluster/tests/00-base.tcl b/tests/unit/cluster/base.tcl
similarity index 52%
rename from tests/cluster/tests/00-base.tcl
rename to tests/unit/cluster/base.tcl
index cfb458fee0..78e5b897e9 100644
--- a/tests/cluster/tests/00-base.tcl
+++ b/tests/unit/cluster/base.tcl
@@ -1,20 +1,51 @@
 # Check the basic monitoring and failover capabilities.
 
-source "../tests/includes/init-tests.tcl"
+# make sure the test infra won't use SELECT
+set old_singledb $::singledb
+set ::singledb 1
 
-if {$::simulate_error} {
-    test "This test will fail" {
-        fail "Simulated error"
+tags {tls:skip external:skip cluster} {
+
+set base_conf [list cluster-enabled yes]
+start_multiple_servers 5 [list overrides $base_conf] {
+
+test "Cluster nodes are reachable" {
+    for {set id 0} {$id < [llength $::servers]} {incr id} {
+        # Every node should be reachable.
+        wait_for_condition 1000 50 {
+            ([catch {R $id ping} ping_reply] == 0) &&
+            ($ping_reply eq {PONG})
+        } else {
+            catch {R $id ping} err
+            fail "Node #$id keeps replying '$err' to PING."
+        }
     }
 }
 
+test "Cluster Join and auto-discovery test" {
+    # Use multiple attempts since sometimes nodes timeout
+    # while attempting to connect.
+    for {set attempts 3} {$attempts > 0} {incr attempts -1} {
+        if {[join_nodes_in_cluster] == 1} {
+            break
+        }
+    }
+    if {$attempts == 0} {
+        fail "Cluster failed to form full mesh"
+    }
+}
+
+test "Before slots allocation, all nodes report cluster failure" {
+    wait_for_cluster_state fail
+}
+
 test "Different nodes have different IDs" {
     set ids {}
     set numnodes 0
-    foreach_valkey_id id {
+    for {set id 0} {$id < [llength $::servers]} {incr id} {
         incr numnodes
         # Every node should just know itself.
-        set nodeid [dict get [get_myself $id] id]
+        set nodeid [dict get [cluster_get_myself $id] id]
         assert {$nodeid ne {}}
         lappend ids $nodeid
     }
@@ -23,7 +54,7 @@
 }
 
 test "It is possible to perform slot allocation" {
-    cluster_allocate_slots 5
+    cluster_allocate_slots 5 0
 }
 
 test "After the join, every node gets a different config epoch" {
@@ -31,7 +62,7 @@
     while {[incr trynum -1] != 0} {
         # We check that this condition is true for *all* the nodes.
        set ok 1 ; # Will be set to 0 every time a node is not ok.
-        foreach_valkey_id id {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
            set epochs {}
            foreach n [get_cluster_nodes $id] {
                lappend epochs [dict get $n config_epoch]
@@ -51,7 +82,7 @@
 }
 
 test "Nodes should report cluster_state is ok now" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Sanity for CLUSTER COUNTKEYSINSLOT" {
@@ -60,19 +91,19 @@
 }
 
 test "It is possible to write and read from the cluster" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "CLUSTER RESET SOFT test" {
-    set last_epoch_node0 [get_info_field [R 0 cluster info] cluster_current_epoch]
+    set last_epoch_node0 [CI 0 cluster_current_epoch]
     R 0 FLUSHALL
     R 0 CLUSTER RESET
-    assert {[get_info_field [R 0 cluster info] cluster_current_epoch] eq $last_epoch_node0}
+    assert {[CI 0 cluster_current_epoch] eq $last_epoch_node0}
 
-    set last_epoch_node1 [get_info_field [R 1 cluster info] cluster_current_epoch]
+    set last_epoch_node1 [CI 1 cluster_current_epoch]
     R 1 FLUSHALL
     R 1 CLUSTER RESET SOFT
-    assert {[get_info_field [R 1 cluster info] cluster_current_epoch] eq $last_epoch_node1}
+    assert {[CI 1 cluster_current_epoch] eq $last_epoch_node1}
 }
 
 test "Coverage: CLUSTER HELP" {
@@ -87,3 +118,9 @@ test "CLUSTER SLAVES and CLUSTER REPLICAS with zero replicas" {
     assert_equal {} [R 0 cluster slaves [R 0 CLUSTER MYID]]
     assert_equal {} [R 0 cluster replicas [R 0 CLUSTER MYID]]
 }
+
+} ;# stop servers
+
+} ;# tags
+
+set ::singledb $old_singledb
diff --git a/tests/cluster/tests/19-cluster-nodes-slots.tcl b/tests/unit/cluster/cluster-nodes-slots.tcl
similarity index 89%
rename from tests/cluster/tests/19-cluster-nodes-slots.tcl
rename to tests/unit/cluster/cluster-nodes-slots.tcl
index 77faec9128..e584ed1e0b 100644
--- a/tests/cluster/tests/19-cluster-nodes-slots.tcl
+++ b/tests/unit/cluster/cluster-nodes-slots.tcl
@@ -1,17 +1,13 @@
 # Optimize CLUSTER NODES command by generating all nodes slot topology firstly
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 2 nodes cluster" {
-    cluster_create_with_continuous_slots 2 2
-}
+start_cluster 2 2 {tags {external:skip cluster}} {
 
 test "Cluster should start ok" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
-set master1 [Rn 0]
-set master2 [Rn 1]
+set master1 [srv 0 "client"]
+set master2 [srv -1 "client"]
 
 test "Continuous slots distribution" {
     assert_match "* 0-8191*" [$master1 CLUSTER NODES]
@@ -23,7 +19,6 @@
     assert_match "* 0-4095 4097-8191*" [$master1 CLUSTER NODES]
     assert_match "*0 4095*4097 8191*" [$master1 CLUSTER SLOTS]
 
-
     $master2 CLUSTER DELSLOTS 12288
     assert_match "* 8192-12287 12289-16383*" [$master2 CLUSTER NODES]
     assert_match "*8192 12287*12289 16383*" [$master2 CLUSTER SLOTS]
@@ -48,3 +43,5 @@
     assert_match "* 8192-12283 12285 12287 12289-16379 16381*" [$master2 CLUSTER NODES]
     assert_match "*8192 12283*12285 12285*12287 12287*12289 16379*16381 16381*" [$master2 CLUSTER SLOTS]
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/15-cluster-slots.tcl b/tests/unit/cluster/cluster-slots.tcl
similarity index 77%
rename from tests/cluster/tests/15-cluster-slots.tcl
rename to tests/unit/cluster/cluster-slots.tcl
index 927c1ff0dd..441e5644e4 100644
--- a/tests/cluster/tests/15-cluster-slots.tcl
+++ b/tests/unit/cluster/cluster-slots.tcl
@@ -1,39 +1,27 @@
-source "../tests/includes/init-tests.tcl"
-
-proc cluster_allocate_mixedSlots {n} {
+proc cluster_allocate_mixedSlots {masters replicas} {
     set slot 16383
     while {$slot >= 0} {
-        set node [expr {$slot % $n}]
+        set node [expr {$slot % $masters}]
         lappend slots_$node $slot
         incr slot -1
     }
-    for {set j 0} {$j < $n} {incr j} {
+    for {set j 0} {$j < $masters} {incr j} {
         R $j cluster addslots {*}[set slots_${j}]
     }
 }
 
-proc create_cluster_with_mixedSlot {masters slaves} {
-    cluster_allocate_mixedSlots $masters
-    if {$slaves} {
-        cluster_allocate_slaves $masters $slaves
-    }
-    assert_cluster_state ok
-}
-
-test "Create a 5 nodes cluster" {
-    create_cluster_with_mixedSlot 5 15
-}
+start_cluster 5 10 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "client do not break when cluster slot" {
@@ -44,13 +32,13 @@
 }
 
 test "client can handle keys with hash tag" {
-    set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+    set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
     $cluster set foo{tag} bar
     $cluster close
 }
 
 test "slot migration is valid from primary to another primary" {
-    set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+    set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
     set key order1
     set slot [$cluster cluster keyslot $key]
     array set nodefrom [$cluster masternode_for_slot $slot]
@@ -61,17 +49,15 @@
 }
 
 test "slot migration is invalid from primary to replica" {
-    set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+    set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
     set key order1
     set slot [$cluster cluster keyslot $key]
     array set nodefrom [$cluster masternode_for_slot $slot]
 
     # Get replica node serving slot.
     set replicanodeinfo [$cluster cluster replicas $nodefrom(id)]
-    puts $replicanodeinfo
     set args [split $replicanodeinfo " "]
     set replicaid [lindex [split [lindex $args 0] \{] 1]
-    puts $replicaid
 
     catch {[$nodefrom(link) cluster setslot $slot node $replicaid]} err
     assert_match "*Target node is not a master" $err
@@ -117,12 +103,14 @@ proc count_bound_slots {n} {
 if {$::tls} {
     test {CLUSTER SLOTS from non-TLS client in TLS cluster} {
         set slots_tls [R 0 cluster slots]
-        set host [get_instance_attrib valkey 0 host]
-        set plaintext_port [get_instance_attrib valkey 0 plaintext-port]
+        set host [srv 0 host]
+        set plaintext_port [srv 0 pport]
         set client_plain [valkey $host $plaintext_port 0 0]
         set slots_plain [$client_plain cluster slots]
         $client_plain close
         # Compare the ports in the first row
         assert_no_match [lindex $slots_tls 0 3 1] [lindex $slots_plain 0 3 1]
     }
-}
\ No newline at end of file
+}
+
+} cluster_allocate_mixedSlots cluster_allocate_replicas ;# start_cluster
diff --git a/tests/cluster/tests/14-consistency-check.tcl b/tests/unit/cluster/consistency-check.tcl
similarity index 82%
rename from tests/cluster/tests/14-consistency-check.tcl
rename to tests/unit/cluster/consistency-check.tcl
index bc7c893214..14e5407cde 100644
--- a/tests/cluster/tests/14-consistency-check.tcl
+++ b/tests/unit/cluster/consistency-check.tcl
@@ -1,22 +1,18 @@
-source "../tests/includes/init-tests.tcl"
-source "../../../tests/support/cli.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 test "Cluster should start ok" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 proc find_non_empty_master {} {
     set master_id_no {}
-    foreach_valkey_id id {
-        if {[RI $id role] eq {master} && [R $id dbsize] > 0} {
+
+    for {set id 0} {$id < [llength $::servers]} {incr id} {
+        if {[s -$id role] eq {master} && [R $id dbsize] > 0} {
             set master_id_no $id
             break
         }
@@ -37,7 +33,7 @@ proc get_one_of_my_replica {id} {
 
 proc cluster_write_keys_with_expire {id ttl} {
     set prefix [randstring 20 20 alpha]
-    set port [get_instance_attrib valkey $id port]
+    set port [srv -$id port]
     set cluster [valkey_cluster 127.0.0.1:$port]
     for {set j 100} {$j < 200} {incr j} {
         $cluster setex key_expire.$j $ttl $prefix.$j
@@ -80,11 +76,10 @@ proc test_slave_load_expired_keys {aof} {
 
     # make replica create persistence file
     if {$aof == "yes"} {
-        # we need to wait for the initial AOFRW to be done, otherwise
-        # kill_instance (which now uses SIGTERM will fail ("Writing initial AOF, can't exit")
+        # we need to wait for the initial AOFRW to be done
         wait_for_condition 100 10 {
-            [RI $replica_id aof_rewrite_scheduled] eq 0 &&
-            [RI $replica_id aof_rewrite_in_progress] eq 0
+            [s -$replica_id aof_rewrite_scheduled] eq 0 &&
+            [s -$replica_id aof_rewrite_in_progress] eq 0
         } else {
             fail "AOFRW didn't finish"
         }
@@ -93,7 +88,8 @@ proc test_slave_load_expired_keys {aof} {
     }
 
     # kill the replica (would stay down until re-started)
-    kill_instance valkey $replica_id
+    set paused_pid [srv -$replica_id pid]
+    pause_process $paused_pid
 
     # Make sure the master doesn't do active expire (sending DELs to the replica)
     R $master_id DEBUG SET-ACTIVE-EXPIRE 0
@@ -102,7 +98,7 @@ proc test_slave_load_expired_keys {aof} {
     after [expr $data_ttl*1000]
 
     # start the replica again (loading an RDB or AOF file)
-    restart_instance valkey $replica_id
+    resume_process $paused_pid
 
     # make sure the keys are still there
     set replica_dbsize_3 [R $replica_id dbsize]
@@ -122,3 +118,5 @@
 
 test_slave_load_expired_keys no
 test_slave_load_expired_keys yes
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/17-diskless-load-swapdb.tcl b/tests/unit/cluster/diskless-load-swapdb.tcl
similarity index 80%
rename from tests/cluster/tests/17-diskless-load-swapdb.tcl
rename to tests/unit/cluster/diskless-load-swapdb.tcl
index e7b69d71b8..68c2135493 100644
--- a/tests/cluster/tests/17-diskless-load-swapdb.tcl
+++ b/tests/unit/cluster/diskless-load-swapdb.tcl
@@ -1,24 +1,20 @@
 # Check that replica keys and keys to slots map are right after failing to diskless load using SWAPDB.
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a primary with a replica" {
-    create_cluster 1 1
-}
+start_cluster 1 1 {tags {external:skip cluster}} {
 
 test "Cluster should start ok" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Main db not affected when fail to diskless load" {
-    set master [Rn 0]
-    set replica [Rn 1]
+    set master [srv 0 "client"]
+    set replica [srv -1 "client"]
     set master_id 0
-    set replica_id 1
+    set replica_id -1
 
     $replica READONLY
     $replica config set repl-diskless-load swapdb
@@ -42,7 +38,7 @@ test "Main db not affected when fail to diskless load" {
 
     # Save an RDB and kill the replica
     $replica save
-    kill_instance valkey $replica_id
+    pause_process [srv $replica_id pid]
 
     # Delete the key from master
     $master del $slot0_key
@@ -60,7 +56,9 @@ test "Main db not affected when fail to diskless load" {
     }
 
     # Start the replica again
-    restart_instance valkey $replica_id
+    resume_process [srv $replica_id pid]
+    restart_server $replica_id true false
+    set replica [srv -1 "client"]
     $replica READONLY
 
     # Start full sync, wait till after db started loading in background
@@ -71,16 +69,20 @@ test "Main db not affected when fail to diskless load" {
     }
 
     # Kill master, abort full sync
-    kill_instance valkey $master_id
+    pause_process [srv $master_id pid]
 
     # Start full sync, wait till the replica detects the disconnection
     wait_for_condition 500 10 {
         [s $replica_id async_loading] eq 0
     } else {
-        fail "Fail to full sync"
+        fail "Fail to stop the full sync"
    }
 
     # Replica keys and keys to slots map still both are right
     assert_equal {1} [$replica get $slot0_key]
     assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1]
+
+    resume_process [srv $master_id pid]
 }
+
+} ;# start_cluster
diff --git a/tests/unit/cluster/faildet.tcl b/tests/unit/cluster/faildet.tcl
new file mode 100644
index 0000000000..1a0b888392
--- /dev/null
+++ b/tests/unit/cluster/faildet.tcl
@@ -0,0 +1,64 @@
+# Check the basic monitoring and failover capabilities.
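+#
+# Note: in this framework nodes are "killed" by pausing the process with
+# SIGSTOP (pause_process) and "restarted" by resuming it (resume_process),
+# so a node can come back without losing its configuration.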
+
+start_cluster 5 5 {tags {external:skip cluster}} {
+
+test "Cluster should start ok" {
+    wait_for_cluster_state ok
+}
+
+set paused_pid5 [srv -5 pid]
+set paused_pid6 [srv -6 pid]
+test "Killing two slave nodes" {
+    pause_process $paused_pid5
+    pause_process $paused_pid6
+}
+
+test "Cluster should be still up" {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "ok"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
+}
+
+set paused_pid [srv 0 pid]
+test "Killing one master node" {
+    pause_process $paused_pid
+}
+
+# Note: the only slave of instance 0 is already down so no
+# failover is possible, that would change the state back to ok.
+test "Cluster should be down now" {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "fail"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
+}
+
+test "Restarting master node" {
+    resume_process $paused_pid
+}
+
+test "Cluster should be up again" {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "ok"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
+}
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/02-failover.tcl b/tests/unit/cluster/failover.tcl
similarity index 56%
rename from tests/cluster/tests/02-failover.tcl
rename to tests/unit/cluster/failover.tcl
index f5b83a6665..b2c68db3d2 100644
--- a/tests/cluster/tests/02-failover.tcl
+++ b/tests/unit/cluster/failover.tcl
@@ -1,26 +1,22 @@
 # Check the basic monitoring and failover capabilities.
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "Instance #5 synced with the master" {
     wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up}
+        [s -5 master_link_status] eq {up}
     } else {
         fail "Instance #5 master link status is not up"
     }
@@ -28,8 +24,9 @@ test "Instance #5 synced with the master" {
 
 set current_epoch [CI 1 cluster_current_epoch]
 
+set paused_pid [srv 0 pid]
 test "Killing one master node" {
-    kill_instance valkey 0
+    pause_process $paused_pid
 }
 
 test "Wait for failover" {
@@ -41,25 +38,34 @@ test "Wait for failover" {
 }
 
 test "Cluster should eventually be up again" {
-    assert_cluster_state ok
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "ok"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
 }
 
 test "Cluster is writable" {
-    cluster_write_test 1
+    cluster_write_test [srv -1 port]
 }
 
 test "Instance #5 is now a master" {
-    assert {[RI 5 role] eq {master}}
+    assert {[s -5 role] eq {master}}
 }
 
 test "Restarting the previously killed master node" {
-    restart_instance valkey 0
+    resume_process $paused_pid
 }
 
 test "Instance #0 gets converted into a slave" {
     wait_for_condition 1000 50 {
-        [RI 0 role] eq {slave}
+        [s 0 role] eq {slave}
     } else {
         fail "Old master was not converted into slave"
     }
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/20-half-migrated-slot.tcl b/tests/unit/cluster/half-migrated-slot.tcl
similarity index 90%
rename from tests/cluster/tests/20-half-migrated-slot.tcl
rename to tests/unit/cluster/half-migrated-slot.tcl
index ede42613be..5374ba4bae 100644
--- a/tests/cluster/tests/20-half-migrated-slot.tcl
+++ b/tests/unit/cluster/half-migrated-slot.tcl
@@ -5,19 +5,17 @@
 # 4. migration is half finished on "migrating" node
 # 5. migration is half finished on "importing" node
 
-source "../tests/includes/init-tests.tcl"
-source "../tests/includes/utils.tcl"
+source tests/support/cluster_util.tcl
 
-test "Create a 2 nodes cluster" {
-    create_cluster 2 0
-    config_set_all_nodes cluster-allow-replica-migration no
-}
+start_cluster 2 0 {tags {external:skip cluster}} {
+
+config_set_all_nodes cluster-allow-replica-migration no
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 catch {unset nodefrom}
 catch {unset nodeto}
 
@@ -91,3 +89,5 @@ test "Half-finish importing" {
 }
 
 config_set_all_nodes cluster-allow-replica-migration yes
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/18-info.tcl b/tests/unit/cluster/info.tcl
similarity index 85%
rename from tests/cluster/tests/18-info.tcl
rename to tests/unit/cluster/info.tcl
index 68c62d3576..0d7b249899 100644
--- a/tests/cluster/tests/18-info.tcl
+++ b/tests/unit/cluster/info.tcl
@@ -1,17 +1,13 @@
 # Check cluster info stats
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a primary with a replica" {
-    create_cluster 2 0
-}
+start_cluster 2 0 {tags {external:skip cluster}} {
 
 test "Cluster should start ok" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
-set primary1 [Rn 0]
-set primary2 [Rn 1]
+set primary1 [srv 0 "client"]
+set primary2 [srv -1 "client"]
 
 proc cmdstat {instance cmd} {
     return [cmdrstat $cmd $instance]
@@ -43,3 +39,5 @@ test "errorstats: rejected call due to MOVED Redirection" {
     assert_match {*count=1*} [errorstat $perr MOVED]
     assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $perr set]
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/10-manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl
similarity index 76%
rename from tests/cluster/tests/10-manual-failover.tcl
rename to tests/unit/cluster/manual-failover.tcl
index 4af4148cee..2d0a8921cb 100644
--- a/tests/cluster/tests/10-manual-failover.tcl
+++ b/tests/unit/cluster/manual-failover.tcl
@@ -1,26 +1,21 @@
 # Check the manual failover
-
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "Instance #5 synced with the master" {
     wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up}
+        [s -5 master_link_status] eq {up}
     } else {
         fail "Instance #5 master link status is not up"
     }
@@ -30,7 +25,7 @@ set current_epoch [CI 1 cluster_current_epoch]
 set numkeys 50000
 set numops 10000
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 catch {unset content}
 array set content {}
 
@@ -47,7 +42,7 @@ test "Send CLUSTER FAILOVER to #5, during load" {
         if {$listid % 2} {
             $cluster rpush $key $ele
         } else {
-            $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
+            $cluster eval {server.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
         }
         lappend content($key) $ele
 
@@ -68,15 +63,15 @@ test "Wait for failover" {
 }
 
 test "Cluster should eventually be up again" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 1
+    cluster_write_test [srv -1 port]
 }
 
 test "Instance #5 is now a master" {
-    assert {[RI 5 role] eq {master}}
+    assert {[s -5 role] eq {master}}
 }
 
 test "Verify $numkeys keys for consistency with logical content" {
@@ -88,35 +83,32 @@ test "Verify $numkeys keys for consistency with logical content" {
 
 test "Instance #0 gets converted into a slave" {
     wait_for_condition 1000 50 {
-        [RI 0 role] eq {slave}
+        [s 0 role] eq {slave}
     } else {
         fail "Old master was not converted into slave"
     }
 }
 
-## Check that manual failover does not happen if we can't talk with the master.
-
-source "../tests/includes/init-tests.tcl"
+} ;# start_cluster
 
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+## Check that manual failover does not happen if we can't talk with the master.
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "Instance #5 synced with the master" {
     wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up}
+        [s -5 master_link_status] eq {up}
     } else {
         fail "Instance #5 master link status is not up"
     }
@@ -133,7 +125,7 @@ test "Send CLUSTER FAILOVER to instance #5" {
 
 test "Instance #5 is still a slave after some time (no failover)" {
     after 5000
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "Wait for instance #0 to return back alive" {
@@ -141,29 +133,26 @@ test "Wait for instance #0 to return back alive" {
     assert {[R 0 read] eq {OK}}
 }
 
-## Check with "force" failover happens anyway.
-
-source "../tests/includes/init-tests.tcl"
+} ;# start_cluster
 
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+## Check that with "force" the failover happens anyway.
+start_cluster 5 10 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 test "Instance #5 is a slave" {
-    assert {[RI 5 role] eq {slave}}
+    assert {[s -5 role] eq {slave}}
 }
 
 test "Instance #5 synced with the master" {
     wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up}
+        [s -5 master_link_status] eq {up}
     } else {
         fail "Instance #5 master link status is not up"
     }
@@ -180,7 +169,7 @@ test "Send CLUSTER FAILOVER to instance #5" {
 
 test "Instance #5 is a master after some time" {
     wait_for_condition 1000 50 {
-        [RI 5 role] eq {master}
+        [s -5 role] eq {master}
     } else {
         fail "Instance #5 is not a master after some time regardless of FORCE"
     }
@@ -190,3 +179,5 @@ test "Wait for instance #0 to return back alive" {
     R 0 deferred 0
     assert {[R 0 read] eq {OK}}
 }
+
+} ;# start_cluster
diff --git a/tests/unit/cluster/manual-takeover.tcl b/tests/unit/cluster/manual-takeover.tcl
new file mode 100644
index 0000000000..8a4509b397
--- /dev/null
+++ b/tests/unit/cluster/manual-takeover.tcl
@@ -0,0 +1,90 @@
+# Manual takeover test
+
+start_cluster 5 5 {tags {external:skip cluster}} {
+
+test "Cluster is up" {
+    wait_for_cluster_state ok
+}
+
+test "Cluster is writable" {
+    cluster_write_test [srv 0 port]
+}
+
+# For this test, disable replica failover until
+# all of the primaries are confirmed killed. Otherwise
+# there might be enough time to elect a replica.
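+# (With the default round-robin allocation used by start_cluster,
+# replicas #5, #6 and #7 replicate masters #0, #1 and #2 respectively.)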
+
+set replica_ids { 5 6 7 }
+foreach id $replica_ids {
+    R $id config set cluster-replica-no-failover yes
+}
+
+set paused_pid [srv 0 pid]
+set paused_pid1 [srv -1 pid]
+set paused_pid2 [srv -2 pid]
+test "Killing majority of master nodes" {
+    pause_process $paused_pid
+    pause_process $paused_pid1
+    pause_process $paused_pid2
+}
+
+foreach id $replica_ids {
+    R $id config set cluster-replica-no-failover no
+}
+
+test "Cluster should eventually be down" {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "fail"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
+}
+
+test "Use takeover to bring slaves back" {
+    foreach id $replica_ids {
+        R $id cluster failover takeover
+    }
+}
+
+test "Cluster should eventually be up again" {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "ok"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
+}
+
+test "Cluster is writable" {
+    cluster_write_test [srv -4 port]
+}
+
+test "Instance #5, #6, #7 are now masters" {
+    assert {[s -5 role] eq {master}}
+    assert {[s -6 role] eq {master}}
+    assert {[s -7 role] eq {master}}
+}
+
+test "Restarting the previously killed master nodes" {
+    resume_process $paused_pid
+    resume_process $paused_pid1
+    resume_process $paused_pid2
+}
+
+test "Instance #0, #1, #2 gets converted into a slaves" {
+    wait_for_condition 1000 50 {
+        [s 0 role] eq {slave} && [s -1 role] eq {slave} && [s -2 role] eq {slave}
+    } else {
+        fail "Old masters not converted into slaves"
+    }
+}
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/21-many-slot-migration.tcl b/tests/unit/cluster/many-slot-migration.tcl
similarity index 84%
rename from tests/cluster/tests/21-many-slot-migration.tcl
rename to tests/unit/cluster/many-slot-migration.tcl
index 40dc498126..d269beb0e4 100644
--- a/tests/cluster/tests/21-many-slot-migration.tcl
+++ b/tests/unit/cluster/many-slot-migration.tcl
@@ -1,22 +1,19 @@
 # Tests for many simultaneous migrations.
 
-source "../tests/includes/init-tests.tcl"
-source "../tests/includes/utils.tcl"
+source tests/support/cluster_util.tcl
 
 # TODO: This test currently runs without replicas, as failovers (which may
 # happen on lower-end CI platforms) are still not handled properly by the
 # cluster during slot migration (related to #6339).
 
-test "Create a 10 nodes cluster" {
-    create_cluster 10 0
+start_cluster 10 0 {tags {external:skip cluster}} {
     config_set_all_nodes cluster-allow-replica-migration no
-}
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 catch {unset nodefrom}
 catch {unset nodeto}
 
@@ -56,3 +53,5 @@ test "Keys are accessible" {
 }
 
 config_set_all_nodes cluster-allow-replica-migration yes
+
+} ;# start_cluster
diff --git a/tests/unit/cluster/multi-slot-operations.tcl b/tests/unit/cluster/multi-slot-operations.tcl
index cc7bb7ae0f..fe3246a3fa 100644
--- a/tests/unit/cluster/multi-slot-operations.tcl
+++ b/tests/unit/cluster/multi-slot-operations.tcl
@@ -1,5 +1,5 @@
 # This test uses a custom slot allocation for testing
-proc cluster_allocate_with_continuous_slots_local {n} {
+proc cluster_allocate_with_continuous_slots_local {masters replicas} {
     R 0 cluster ADDSLOTSRANGE 0 3276
     R 1 cluster ADDSLOTSRANGE 3277 6552
     R 2 cluster ADDSLOTSRANGE 6553 9828
diff --git a/tests/unit/cluster/no-failover-option.tcl b/tests/unit/cluster/no-failover-option.tcl
new file mode 100644
index 0000000000..af13dfa15b
--- /dev/null
+++ b/tests/unit/cluster/no-failover-option.tcl
@@ -0,0 +1,62 @@
+# Check that the no-failover option works
+
+source tests/support/cluster.tcl
+
+start_cluster 3 3 {tags {external:skip cluster}} {
+
+test "Cluster is up" {
+    wait_for_cluster_state ok
+}
+
+test "Instance #3 is a replica" {
+    assert {[s -3 role] eq {slave}}
+
+    # Configure it to never failover the master
+    R 3 CONFIG SET cluster-replica-no-failover yes
+}
+
+test "Instance #3 synced with the master" {
+    wait_for_condition 1000 50 {
+        [s -3 master_link_status] eq {up}
+    } else {
+        fail "Instance #3 master link status is not up"
+    }
+}
+
+test "The nofailover flag is propagated" {
+    set replica3_id [dict get [cluster_get_myself 3] id]
+
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        wait_for_condition 1000 50 {
+            [cluster_has_flag [cluster_get_node_by_id $j $replica3_id] nofailover]
+        } else {
+            fail "Instance $j can't see the nofailover flag of replica"
+        }
+    }
+}
+
+test "Killing one master node" {
+    pause_process [srv 0 pid]
+}
+
+test "Cluster should be still down after some time" {
+    wait_for_condition 1000 50 {
+        [CI 1 cluster_state] eq {fail} &&
+        [CI 2 cluster_state] eq {fail} &&
+        [CI 3 cluster_state] eq {fail} &&
+        [CI 4 cluster_state] eq {fail} &&
+        [CI 5 cluster_state] eq {fail}
+    } else {
+        fail "Cluster doesn't fail"
+    }
+}
+
+test "Instance #3 is still a replica" {
+    assert {[s -3 role] eq {slave}}
+}
+
+test "Restarting the previously killed master node" {
+    resume_process [srv 0 pid]
+}
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/09-pubsub.tcl b/tests/unit/cluster/pubsub.tcl
similarity index 90%
rename from tests/cluster/tests/09-pubsub.tcl
rename to tests/unit/cluster/pubsub.tcl
index e62b91c4b5..12cb409fd9 100644
--- a/tests/cluster/tests/09-pubsub.tcl
+++ b/tests/unit/cluster/pubsub.tcl
@@ -1,10 +1,6 @@
 # Test PUBLISH propagation across the cluster.
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 proc test_cluster_publish {instance instances} {
     # Subscribe all the instances but the one we use to send.
@@ -38,3 +34,5 @@
 test "Test publishing to master" {
     test_cluster_publish 0 10
 }
 
 test "Test publishing to slave" {
     test_cluster_publish 5 10
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/25-pubsubshard-slot-migration.tcl b/tests/unit/cluster/pubsubshard-slot-migration.tcl
similarity index 96%
rename from tests/cluster/tests/25-pubsubshard-slot-migration.tcl
rename to tests/unit/cluster/pubsubshard-slot-migration.tcl
index 45ec500eb7..c5a324f094 100644
--- a/tests/cluster/tests/25-pubsubshard-slot-migration.tcl
+++ b/tests/unit/cluster/pubsubshard-slot-migration.tcl
@@ -1,17 +1,15 @@
-source "../tests/includes/init-tests.tcl"
+source tests/support/cluster.tcl
 
-test "Create a 3 nodes cluster" {
-    cluster_create_with_continuous_slots 3 3
-}
+start_cluster 3 3 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 
 proc get_addr_replica_serving_slot slot {
-    set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+    set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
     array set node [$cluster masternode_for_slot $slot]
 
     set replicanodeinfo [$cluster cluster replicas $node(id)]
@@ -209,4 +207,6 @@ test "Reset cluster, verify sunsubscribe message" {
 
     $cluster close
     $subscribeclient close
-}
\ No newline at end of file
+}
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/26-pubsubshard.tcl b/tests/unit/cluster/pubsubshard.tcl
similarity index 95%
rename from tests/cluster/tests/26-pubsubshard.tcl
rename to tests/unit/cluster/pubsubshard.tcl
index fe0e7d39ab..e32b6a3a0e 100644
--- a/tests/cluster/tests/26-pubsubshard.tcl
+++ b/tests/unit/cluster/pubsubshard.tcl
@@ -1,14 +1,13 @@
 # Test PUBSUB shard propagation in a cluster slot.
 
-source "../tests/includes/init-tests.tcl"
+source tests/support/cluster.tcl
 
-test "Create a 3 nodes cluster" {
-    cluster_create_with_continuous_slots 3 3
-}
+# Start a cluster with 3 masters and 3 replicas.
+start_cluster 3 3 {tags {external:skip cluster}} {
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
-test "Pub/Sub shard basics" {
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 
+test "Pub/Sub shard basics" {
     set slot [$cluster cluster keyslot "channel.0"]
     array set publishnode [$cluster masternode_for_slot $slot]
     array set notshardnode [$cluster masternode_notfor_slot $slot]
@@ -123,8 +122,11 @@ test "PUBSUB channels/shardchannels" {
     assert_equal {3} [llength [$publishclient pubsub shardchannels]]
 
     sunsubscribe $subscribeclient
+    $subscribeclient read
     set channel_list [$publishclient pubsub shardchannels]
     assert_equal {2} [llength $channel_list]
     assert {[lsearch -exact $channel_list "\{channel.0\}2"] >= 0}
     assert {[lsearch -exact $channel_list "\{channel.0\}3"] >= 0}
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/22-replica-in-sync.tcl b/tests/unit/cluster/replica-in-sync.tcl
similarity index 91%
rename from tests/cluster/tests/22-replica-in-sync.tcl
rename to tests/unit/cluster/replica-in-sync.tcl
index b5645aa75f..776c3ca73c 100644
--- a/tests/cluster/tests/22-replica-in-sync.tcl
+++ b/tests/unit/cluster/replica-in-sync.tcl
@@ -1,15 +1,13 @@
-source "../tests/includes/init-tests.tcl"
+source tests/support/cluster.tcl
 
-test "Create a 1 node cluster" {
-    create_cluster 1 0
-}
+start_cluster 1 1 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "Cluster is writable" {
-    cluster_write_test 0
+    cluster_write_test [srv 0 port]
 }
 
 proc is_in_slots {master_id replica} {
@@ -90,17 +88,17 @@ test "Replica in loading state is hidden" {
 
     # The master will be the last to know the replica
     # is loading, so we will wait on that and assert
-    # the replica is loading afterwards. 
+    # the replica is loading afterwards.
     wait_for_condition 100 50 {
         ![is_in_slots $master_id $replica]
     } else {
         fail "Replica was always present in cluster slots"
     }
-    assert_equal 1 [s $replica_id loading]
+    assert_equal 1 [s [expr {-1*$replica_id}] loading]
 
     # Wait for the replica to finish full-sync and become online
     wait_for_condition 200 50 {
-        [s $replica_id master_link_status] eq "up"
+        [s [expr {-1*$replica_id}] master_link_status] eq "up"
     } else {
         fail "Replica didn't finish loading"
     }
@@ -115,7 +113,7 @@ test "Replica in loading state is hidden" {
     } else {
         fail "Replica is not back to slots"
     }
-    assert_equal 1 [is_in_slots $replica_id $replica] 
+    assert_equal 1 [is_in_slots $replica_id $replica]
 }
 
 test "Check disconnected replica not hidden from slots" {
@@ -144,3 +142,5 @@ test "Check disconnected replica not hidden from slots" {
     # undo config
     R $master_id config set requirepass ""
 }
+
+} ;# start_cluster
diff --git a/tests/cluster/tests/05-slave-selection.tcl b/tests/unit/cluster/slave-selection.tcl
similarity index 73%
rename from tests/cluster/tests/05-slave-selection.tcl
rename to tests/unit/cluster/slave-selection.tcl
index bb3a06134e..9c047a0d2d 100644
--- a/tests/cluster/tests/05-slave-selection.tcl
+++ b/tests/unit/cluster/slave-selection.tcl
@@ -1,16 +1,12 @@
 # Slave selection test
 # Check the algorithm trying to pick the slave with the most complete history.
 
-source "../tests/includes/init-tests.tcl"
-
 # Create a cluster with 5 master and 10 slaves, so that we have 2
 # slaves for each master.
-test "Create a 5 nodes cluster" {
-    create_cluster 5 10
-}
+start_cluster 5 10 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 test "The first master has actually two slaves" {
@@ -34,21 +30,21 @@ test "CLUSTER SLAVES and CLUSTER REPLICAS output is consistent" {
 }
 
 test {Slaves of #0 are instance #5 and #10 as expected} {
-    set port0 [get_instance_attrib valkey 0 port]
+    set port0 [srv 0 port]
     assert {[lindex [R 5 role] 2] == $port0}
     assert {[lindex [R 10 role] 2] == $port0}
 }
 
 test "Instance #5 and #10 synced with the master" {
     wait_for_condition 1000 50 {
-        [RI 5 master_link_status] eq {up} &&
-        [RI 10 master_link_status] eq {up}
+        [s -5 master_link_status] eq {up} &&
+        [s -10 master_link_status] eq {up}
     } else {
         fail "Instance #5 or #10 master link status is not up"
     }
 }
 
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 
 test "Slaves are both able to receive and acknowledge writes" {
     for {set j 0} {$j < 100} {incr j} {
@@ -57,6 +53,7 @@ test "Slaves are both able to receive and acknowledge writes" {
     assert {[R 0 wait 2 60000] == 2}
 }
 
+set paused_pid [srv 0 pid]
 test "Write data while slave #10 is paused and can't receive it" {
     # Stop the slave with a multi/exec transaction so that the master will
     # be killed as soon as it can accept writes again.
@@ -80,12 +77,12 @@ test "Write data while slave #10 is paused and can't receive it" {
     assert {[R 10 read] eq {OK OK}}
 
     # Kill the master so that a reconnection will not be possible.
-    kill_instance valkey 0
+    pause_process $paused_pid
 }
 
 test "Wait for instance #5 (and not #10) to turn into a master" {
     wait_for_condition 1000 50 {
-        [RI 5 role] eq {master}
+        [s -5 role] eq {master}
     } else {
         fail "No failover detected"
     }
@@ -96,11 +93,18 @@ test "Wait for the node #10 to return alive before ending the test" {
 }
 
 test "Cluster should eventually be up again" {
-    assert_cluster_state ok
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv [expr -1*$j] pid]]} continue
+        wait_for_condition 1000 50 {
+            [CI $j cluster_state] eq "ok"
+        } else {
+            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+        }
+    }
 }
 
 test "Node #10 should eventually replicate node #5" {
-    set port5 [get_instance_attrib valkey 5 port]
+    set port5 [srv -5 port]
     wait_for_condition 1000 50 {
         ([lindex [R 10 role] 2] == $port5) &&
         ([lindex [R 10 role] 3] eq {connected})
@@ -109,16 +113,14 @@ test "Node #10 should eventually replicate node #5" {
     }
 }
 
-source "../tests/includes/init-tests.tcl"
+} ;# start_cluster
 
 # Create a cluster with 3 master and 15 slaves, so that we have 5
 # slaves for eatch master.
-test "Create a 3 nodes cluster" { - create_cluster 3 15 -} +start_cluster 3 15 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "The first master has actually 5 slaves" { @@ -130,7 +132,7 @@ test "The first master has actually 5 slaves" { } test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} { - set port0 [get_instance_attrib valkey 0 port] + set port0 [srv 0 port] assert {[lindex [R 3 role] 2] == $port0} assert {[lindex [R 6 role] 2] == $port0} assert {[lindex [R 9 role] 2] == $port0} @@ -140,11 +142,11 @@ test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} { test {Instance #3, #6, #9, #12 and #15 synced with the master} { wait_for_condition 1000 50 { - [RI 3 master_link_status] eq {up} && - [RI 6 master_link_status] eq {up} && - [RI 9 master_link_status] eq {up} && - [RI 12 master_link_status] eq {up} && - [RI 15 master_link_status] eq {up} + [s -3 master_link_status] eq {up} && + [s -6 master_link_status] eq {up} && + [s -9 master_link_status] eq {up} && + [s -12 master_link_status] eq {up} && + [s -15 master_link_status] eq {up} } else { fail "Instance #3 or #6 or #9 or #12 or #15 master link status is not up" } @@ -152,7 +154,7 @@ test {Instance #3, #6, #9, #12 and #15 synced with the master} { proc master_detected {instances} { foreach instance [dict keys $instances] { - if {[RI $instance role] eq {master}} { + if {[s -$instance role] eq {master}} { return true } } @@ -167,7 +169,7 @@ test "New Master down consecutively" { for {set i 0} {$i < $loops} {incr i} { set master_id -1 foreach instance [dict keys $instances] { - if {[RI $instance role] eq {master}} { + if {[s -$instance role] eq {master}} { set master_id $instance break; } @@ -179,13 +181,23 @@ test "New Master down consecutively" { set instances [dict remove $instances $master_id] - kill_instance valkey $master_id + set paused_pid [srv [expr $master_id * -1] pid] + pause_process $paused_pid wait_for_condition 1000 50 { [master_detected $instances] } else { fail "No failover detected when master $master_id fails" } - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } } + +} ;# start_cluster diff --git a/tests/cluster/tests/06-slave-stop-cond.tcl b/tests/unit/cluster/slave-stop-cond.tcl similarity index 76% rename from tests/cluster/tests/06-slave-stop-cond.tcl rename to tests/unit/cluster/slave-stop-cond.tcl index 3813f37365..b97c7b6907 100644 --- a/tests/cluster/tests/06-slave-stop-cond.tcl +++ b/tests/unit/cluster/slave-stop-cond.tcl @@ -2,15 +2,11 @@ # Check that if there is a disconnection time limit, the slave will not try # to failover its master. -source "../tests/includes/init-tests.tcl" - # Create a cluster with 5 master and 5 slaves. 
-test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "The first master has actually one slave" { @@ -22,13 +18,13 @@ test "The first master has actually one slave" { } test {Slaves of #0 is instance #5 as expected} { - set port0 [get_instance_attrib valkey 0 port] + set port0 [srv 0 port] assert {[lindex [R 5 role] 2] == $port0} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -38,6 +34,7 @@ test "Lower the slave validity factor of #5 to the value of 2" { assert {[R 5 config set cluster-slave-validity-factor 2] eq {OK}} } +set paused_pid [srv 0 pid] test "Break master-slave link and prevent further reconnections" { # Stop the slave with a multi/exec transaction so that the master will # be killed as soon as it can accept writes again. @@ -60,7 +57,7 @@ test "Break master-slave link and prevent further reconnections" { assert {[R 5 read] eq {OK OK}} # Kill the master so that a reconnection will not be possible. - kill_instance valkey 0 + pause_process $paused_pid } test "Slave #5 is reachable and alive" { @@ -69,9 +66,18 @@ test "Slave #5 is reachable and alive" { test "Slave #5 should not be able to failover" { after 10000 - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Cluster should be down" { - assert_cluster_state fail + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 100 50 { + [CI $j cluster_state] eq "fail" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } + +} ;# start_cluster diff --git a/tests/cluster/tests/29-slot-migration-response.tcl b/tests/unit/cluster/slot-migration-response.tcl similarity index 77% rename from tests/cluster/tests/29-slot-migration-response.tcl rename to tests/unit/cluster/slot-migration-response.tcl index bc76735591..e1db7041c2 100644 --- a/tests/cluster/tests/29-slot-migration-response.tcl +++ b/tests/unit/cluster/slot-migration-response.tcl @@ -1,18 +1,15 @@ # Tests for the response of slot migrations. +source tests/support/cluster.tcl -source "../tests/includes/init-tests.tcl" -source "../tests/includes/utils.tcl" +start_cluster 2 0 {tags {external:skip cluster}} { -test "Create a 2 nodes cluster" { - create_cluster 2 0 - config_set_all_nodes cluster-allow-replica-migration no -} +config_set_all_nodes cluster-allow-replica-migration no test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] catch {unset nodefrom} catch {unset nodeto} @@ -48,3 +45,5 @@ test "Test cluster responses during migration of slot x" { } config_set_all_nodes cluster-allow-replica-migration yes + +} ;# start_cluster diff --git a/tests/cluster/tests/16-transactions-on-replica.tcl b/tests/unit/cluster/transactions-on-replica.tcl similarity index 86% rename from tests/cluster/tests/16-transactions-on-replica.tcl rename to tests/unit/cluster/transactions-on-replica.tcl index b509892f54..b53af58cac 100644 --- a/tests/cluster/tests/16-transactions-on-replica.tcl +++ b/tests/unit/cluster/transactions-on-replica.tcl @@ -1,17 +1,13 @@ # Check basic transactions on a replica. 
-source "../tests/includes/init-tests.tcl" - -test "Create a primary with a replica" { - create_cluster 1 1 -} +start_cluster 1 1 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } -set primary [Rn 0] -set replica [Rn 1] +set primary [srv 0 "client"] +set replica [srv -1 "client"] test "Can't read from replica without READONLY" { $primary SET a 1 @@ -58,13 +54,13 @@ test "MULTI-EXEC with write operations is MOVED" { } test "read-only blocking operations from replica" { - set rd [valkey_deferring_client valkey 1] + set rd [valkey_deferring_client -1] $rd readonly $rd read $rd XREAD BLOCK 0 STREAMS k 0 wait_for_condition 1000 50 { - [RI 1 blocked_clients] eq {1} + [s -1 blocked_clients] eq {1} } else { fail "client wasn't blocked" } @@ -78,8 +74,10 @@ test "read-only blocking operations from replica" { test "reply MOVED when eval from replica for update" { catch {[$replica eval {#!lua - return redis.call('del','a') + return server.call('del','a') } 1 a ]} err assert {[string range $err 0 4] eq {MOVED}} -} \ No newline at end of file +} + +} ;# start_cluster diff --git a/tests/cluster/tests/08-update-msg.tcl b/tests/unit/cluster/update-msg.tcl similarity index 55% rename from tests/cluster/tests/08-update-msg.tcl rename to tests/unit/cluster/update-msg.tcl index bff3d0a862..2bec2de27c 100644 --- a/tests/cluster/tests/08-update-msg.tcl +++ b/tests/unit/cluster/update-msg.tcl @@ -9,27 +9,23 @@ # of the UPDATE messages it will receive from the other nodes when its # configuration will be found to be outdated. -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -37,8 +33,9 @@ test "Instance #5 synced with the master" { set current_epoch [CI 1 cluster_current_epoch] +set paused_pid [srv 0 pid] test "Killing one master node" { - kill_instance valkey 0 + pause_process $paused_pid } test "Wait for failover" { @@ -50,41 +47,59 @@ test "Wait for failover" { } test "Cluster should eventually be up again" { - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Cluster is writable" { - cluster_write_test 1 + cluster_write_test [srv -1 port] } test "Instance #5 is now a master" { - assert {[RI 5 role] eq {master}} + assert {[s -5 role] eq {master}} } +set paused_pid5 [srv -5 pid] test "Killing the new master #5" { - kill_instance valkey 5 + pause_process $paused_pid5 } test "Cluster should be down now" { - assert_cluster_state fail + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + if {[process_is_paused $paused_pid5]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "fail" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Restarting the 
old master node" { - restart_instance valkey 0 + resume_process $paused_pid } test "Instance #0 gets converted into a slave" { wait_for_condition 1000 50 { - [RI 0 role] eq {slave} + [s 0 role] eq {slave} } else { fail "Old master was not converted into slave" } } test "Restarting the new master node" { - restart_instance valkey 5 + resume_process $paused_pid5 } test "Cluster is up again" { - assert_cluster_state ok + wait_for_cluster_state ok } + +} ;# start_cluster