
Commit

Test Verify hub restore to passive hub following failover and relocate of app

Signed-off-by: prsurve <[email protected]>
prsurve committed Jun 7, 2024
1 parent 9482269 commit c5496c9
Showing 3 changed files with 195 additions and 4 deletions.
2 changes: 1 addition & 1 deletion ocs_ci/helpers/dr_helpers.py
@@ -997,7 +997,7 @@ def create_backup_schedule():
"""
old_ctx = config.cur_index
config.switch_ctx(get_active_acm_index())
- backup_schedule = templating.load_yaml(constants.MDR_BACKUP_SCHEDULE_YAML)
+ backup_schedule = templating.load_yaml(constants.BACKUP_SCHEDULE_YAML)
backup_schedule_yaml = tempfile.NamedTemporaryFile(
mode="w+", prefix="bkp", delete=False
)
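For context, a minimal sketch of how create_backup_schedule presumably continues after this hunk: the loaded template is written to the temporary file and applied on the active hub, then the original context is restored. The dump_data_to_temp_yaml and run_cmd helpers are assumptions based on common ocs_ci patterns and are not shown in this diff:

# Continuation sketch (assumed, not part of this diff)
templating.dump_data_to_temp_yaml(backup_schedule, backup_schedule_yaml.name)  # assumed helper
run_cmd(f"oc create -f {backup_schedule_yaml.name}")  # assumed helper
config.switch_ctx(old_ctx)  # switch back to the previously selected context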
4 changes: 1 addition & 3 deletions ocs_ci/ocs/constants.py
@@ -1061,9 +1061,7 @@
MDR_VELERO_POD_COUNT = 1
MDR_DPA = "dpa-1"
MDR_MULTICLUSTER_ENGINE = "multiclusterengine"
- MDR_BACKUP_SCHEDULE_YAML = os.path.join(
-     TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml"
- )
+ BACKUP_SCHEDULE_YAML = os.path.join(TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml")
MDR_BACKUP_SCHEDULE_RESOURCE = "schedule-acm"


@@ -0,0 +1,193 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor

from ocs_ci.framework.pytest_customization.marks import tier4a, turquoise_squad
from ocs_ci.framework import config
from ocs_ci.ocs.acm.acm import validate_cluster_import
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_node_objs
from ocs_ci.helpers.dr_helpers import (
failover,
relocate,
restore_backup,
create_backup_schedule,
get_current_primary_cluster_name,
get_current_secondary_cluster_name,
get_passive_acm_index,
wait_for_all_resources_creation,
wait_for_all_resources_deletion,
verify_drpolicy_cli,
verify_restore_is_completed,
get_scheduling_interval,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running
from ocs_ci.ocs.utils import get_active_acm_index
from ocs_ci.utility.utils import TimeoutSampler


logger = logging.getLogger(__name__)


@tier4a
@turquoise_squad
class TestActiveHubDownAndRestore:
"""
Test failover and relocate of all apps after the active hub of an RDR setup goes down and is restored on the passive hub
"""

def test_hub_recovery_failover_and_relocate(self, nodes_multicluster, dr_workload):

"""
Verify failover and relocate of all apps after the active hub goes down and its backups are restored on the passive hub
"""

# acm_obj = AcmAddClusters()
# Deploy Subscription and Appset based application
rdr_workload = dr_workload(
num_of_subscription=1, num_of_appset=1, switch_ctx=get_passive_acm_index()
)
logger.info(type(rdr_workload))
primary_cluster_name = get_current_primary_cluster_name(
rdr_workload[0].workload_namespace
)
secondary_cluster_name = get_current_secondary_cluster_name(
rdr_workload[0].workload_namespace
)
scheduling_interval = get_scheduling_interval(
rdr_workload[0].workload_namespace, rdr_workload[0].workload_type
)
# Create backup-schedule on active hub
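# The BackupSchedule resource (schedule-acm) periodically backs up hub resources so they can later be restored on the passive hub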
create_backup_schedule()
two_times_scheduling_interval = 2 * scheduling_interval # Time in minutes
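# Two scheduling intervals are waited before relocating back, presumably so at least one full sync cycle completes on the failover cluster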
wait_time = 300
logger.info(f"Wait {wait_time} until backup is taken ")
time.sleep(wait_time)

# Get the active hub nodes
logger.info("Getting Active cluster node details")
config.switch_ctx(get_active_acm_index())
active_hub_index = config.cur_index
active_hub_cluster_node_objs = get_node_objs()
# ToDo Add verification for dpa and policy

# Shutdown active hub nodes
logger.info("Shutting down all the nodes of active hub")
nodes_multicluster[active_hub_index].stop_nodes(active_hub_cluster_node_objs)
logger.info(
"All nodes of the active hub are powered off, "
f"waiting {wait_time} seconds before restoring on the passive hub"
)
# Wait before triggering the restore, as stated in the log message above
time.sleep(wait_time)

# Restore new hub
restore_backup()
logger.info(f"Wait {wait_time} until restores are taken ")
time.sleep(wait_time)

# Verify the restore is completed
verify_restore_is_completed()

# Validate the clusters are imported
clusters = [primary_cluster_name, secondary_cluster_name]
for cluster in clusters:
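# Poll validate_cluster_import via TimeoutSampler (every 60 s, up to 30 minutes) and break on the first successful import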
for sample in TimeoutSampler(
timeout=1800,
sleep=60,
func=validate_cluster_import,
cluster_name=cluster,
switch_ctx=get_passive_acm_index(),
):
if sample:
logger.info(
f"Cluster: {cluster} successfully imported post hub recovery"
)
# Validate klusterlet addons are running on managed cluster
config.switch_to_cluster_by_name(cluster)
wait_for_pods_to_be_running(
namespace=constants.ACM_ADDONS_NAMESPACE, timeout=300, sleep=15
)
break
else:
logger.error(
f"import of cluster: {cluster} failed post hub recovery"
)
raise UnexpectedBehaviour(
f"import of cluster: {cluster} failed post hub recovery"
)
# Wait for the DRPolicy to reach the Validated state on the new hub
verify_drpolicy_cli(switch_ctx=get_passive_acm_index())

# Failover action via CLI
failover_results = []
with ThreadPoolExecutor() as executor:
for wl in rdr_workload:
failover_results.append(
executor.submit(
failover,
failover_cluster=secondary_cluster_name,
namespace=wl.workload_namespace,
workload_type=wl.workload_type,
workload_placement_name=wl.appset_placement_name
if wl.workload_type != constants.SUBSCRIPTION
else None,
switch_ctx=get_passive_acm_index(),
)
)
time.sleep(60)

# Wait for failover results
for fl in failover_results:
fl.result()

# Verify resources creation on secondary cluster (failoverCluster)
config.switch_to_cluster_by_name(secondary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_creation(
wl.workload_pvc_count,
wl.workload_pod_count,
wl.workload_namespace,
)
# Verify applications are deleted from the old primary cluster
config.switch_to_cluster_by_name(primary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_deletion(wl.workload_namespace)

logger.info(f"Waiting for {two_times_scheduling_interval} minutes to run IOs")
time.sleep(two_times_scheduling_interval * 60)

relocate_results = []
with ThreadPoolExecutor() as executor:
for wl in rdr_workload:
relocate_results.append(
executor.submit(
relocate,
preferred_cluster=primary_cluster_name,
namespace=wl.workload_namespace,
workload_type=wl.workload_type,
workload_placement_name=wl.appset_placement_name
if wl.workload_type != constants.SUBSCRIPTION
else None,
switch_ctx=get_passive_acm_index(),
)
)
time.sleep(60)

# Wait for relocate results
for rl in relocate_results:
rl.result()

# Verify resources creation on preferredCluster
config.switch_to_cluster_by_name(primary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_creation(
wl.workload_pvc_count,
wl.workload_pod_count,
wl.workload_namespace,
)

# Verify resources deletion from previous primary or current secondary cluster
config.switch_to_cluster_by_name(secondary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_deletion(wl.workload_namespace)
