From c5496c9c3324f6e9e06c2c7b8fd3e9557cacbc7b Mon Sep 17 00:00:00 2001
From: prsurve
Date: Mon, 1 Apr 2024 10:00:45 +0530
Subject: [PATCH] Test: Verify hub restore to the passive hub followed by failover and relocate of apps

Signed-off-by: prsurve
---
 ocs_ci/helpers/dr_helpers.py                 |   2 +-
 ocs_ci/ocs/constants.py                      |   4 +-
 .../test_active_hub_down_and_restore.py      | 193 ++++++++++++++++++
 3 files changed, 195 insertions(+), 4 deletions(-)
 create mode 100644 tests/functional/disaster-recovery/regional-dr/test_active_hub_down_and_restore.py

diff --git a/ocs_ci/helpers/dr_helpers.py b/ocs_ci/helpers/dr_helpers.py
index 47ee0fb60149..fc2f918c7159 100644
--- a/ocs_ci/helpers/dr_helpers.py
+++ b/ocs_ci/helpers/dr_helpers.py
@@ -997,7 +997,7 @@ def create_backup_schedule():
     """
     old_ctx = config.cur_index
     config.switch_ctx(get_active_acm_index())
-    backup_schedule = templating.load_yaml(constants.MDR_BACKUP_SCHEDULE_YAML)
+    backup_schedule = templating.load_yaml(constants.BACKUP_SCHEDULE_YAML)
     backup_schedule_yaml = tempfile.NamedTemporaryFile(
         mode="w+", prefix="bkp", delete=False
     )
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index 806f0c97ff86..2ef377dcff6e 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -1061,9 +1061,7 @@
 MDR_VELERO_POD_COUNT = 1
 MDR_DPA = "dpa-1"
 MDR_MULTICLUSTER_ENGINE = "multiclusterengine"
-MDR_BACKUP_SCHEDULE_YAML = os.path.join(
-    TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml"
-)
+BACKUP_SCHEDULE_YAML = os.path.join(TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml")
 MDR_BACKUP_SCHEDULE_RESOURCE = "schedule-acm"
diff --git a/tests/functional/disaster-recovery/regional-dr/test_active_hub_down_and_restore.py b/tests/functional/disaster-recovery/regional-dr/test_active_hub_down_and_restore.py
new file mode 100644
index 000000000000..459cf9c7394e
--- /dev/null
+++ b/tests/functional/disaster-recovery/regional-dr/test_active_hub_down_and_restore.py
@@ -0,0 +1,193 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor

from ocs_ci.framework.pytest_customization.marks import tier4a, turquoise_squad
from ocs_ci.framework import config
from ocs_ci.ocs.acm.acm import validate_cluster_import
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_node_objs
from ocs_ci.helpers.dr_helpers import (
    failover,
    relocate,
    restore_backup,
    create_backup_schedule,
    get_current_primary_cluster_name,
    get_current_secondary_cluster_name,
    get_passive_acm_index,
    wait_for_all_resources_creation,
    wait_for_all_resources_deletion,
    verify_drpolicy_cli,
    verify_restore_is_completed,
    get_scheduling_interval,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running
from ocs_ci.ocs.utils import get_active_acm_index
from ocs_ci.utility.utils import TimeoutSampler


logger = logging.getLogger(__name__)


@tier4a
@turquoise_squad
class TestActiveHubDownAndRestore:
    """
    Verify failover and relocate of all apps after the active hub goes down
    and its backups are restored on the passive hub (Regional-DR).
    """

    def test_hub_recovery_failover_and_relocate(self, nodes_multicluster, dr_workload):
        """
        Test failover and relocate of all apps after the active hub is brought
        down and restored on the passive hub (Regional-DR).
        """
        # Deploy Subscription- and ApplicationSet-based applications
        rdr_workload = dr_workload(
            num_of_subscription=1, num_of_appset=1, switch_ctx=get_passive_acm_index()
        )
        logger.info(
            f"Deployed workload namespaces: {[wl.workload_namespace for wl in rdr_workload]}"
        )
        primary_cluster_name = get_current_primary_cluster_name(
            rdr_workload[0].workload_namespace
        )
        secondary_cluster_name = get_current_secondary_cluster_name(
            rdr_workload[0].workload_namespace
        )
        scheduling_interval = get_scheduling_interval(
            rdr_workload[0].workload_namespace, rdr_workload[0].workload_type
        )

        # Create a backup schedule on the active hub
        create_backup_schedule()
        two_times_scheduling_interval = 2 * scheduling_interval  # Time in minutes
        wait_time = 300
        logger.info(f"Waiting {wait_time} seconds for the first backup to be taken")
        time.sleep(wait_time)

        # Get the active hub nodes
        logger.info("Getting active hub cluster node details")
        config.switch_ctx(get_active_acm_index())
        active_hub_index = config.cur_index
        active_hub_cluster_node_objs = get_node_objs()
        # TODO: Add verification for DPA and policy

        # Shut down all active hub nodes
        logger.info("Shutting down all the nodes of the active hub")
        nodes_multicluster[active_hub_index].stop_nodes(active_hub_cluster_node_objs)
        logger.info(
            "All nodes of the active hub are powered off, "
            f"waiting {wait_time} seconds before restoring on the passive hub"
        )
        time.sleep(wait_time)

        # Restore the backups on the passive hub
        restore_backup()
        logger.info(f"Waiting {wait_time} seconds until the restores are taken")
        time.sleep(wait_time)

        # Verify the restore is completed
        verify_restore_is_completed()

        # Validate that both managed clusters are imported on the passive hub
        clusters = [primary_cluster_name, secondary_cluster_name]
        for cluster in clusters:
            for sample in TimeoutSampler(
                timeout=1800,
                sleep=60,
                func=validate_cluster_import,
                cluster_name=cluster,
                switch_ctx=get_passive_acm_index(),
            ):
                if sample:
                    logger.info(
                        f"Cluster: {cluster} successfully imported post hub recovery"
                    )
                    # Validate klusterlet add-ons are running on the managed cluster
                    config.switch_to_cluster_by_name(cluster)
                    wait_for_pods_to_be_running(
                        namespace=constants.ACM_ADDONS_NAMESPACE, timeout=300, sleep=15
                    )
                    break
                else:
                    logger.error(
                        f"Import of cluster: {cluster} failed post hub recovery"
                    )
                    raise UnexpectedBehaviour(
                        f"Import of cluster: {cluster} failed post hub recovery"
                    )

        # Verify the DRPolicy is in Validated state on the passive hub
        verify_drpolicy_cli(switch_ctx=get_passive_acm_index())

        # Failover action via CLI
        failover_results = []
        with ThreadPoolExecutor() as executor:
            for wl in rdr_workload:
                failover_results.append(
                    executor.submit(
                        failover,
                        failover_cluster=secondary_cluster_name,
                        namespace=wl.workload_namespace,
                        workload_type=wl.workload_type,
                        workload_placement_name=wl.appset_placement_name
                        if wl.workload_type != constants.SUBSCRIPTION
                        else None,
                        switch_ctx=get_passive_acm_index(),
                    )
                )
                # Stagger failover of the next workload
                time.sleep(60)

        # Wait for failover results
        for fl in failover_results:
            fl.result()

        # Verify resource creation on the secondary cluster (failoverCluster)
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )

        # Verify applications are deleted from the old primary cluster
        config.switch_to_cluster_by_name(primary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_deletion(wl.workload_namespace)

        logger.info(f"Waiting for {two_times_scheduling_interval} minutes to run IOs")
        time.sleep(two_times_scheduling_interval * 60)

        # Relocate action via CLI
        relocate_results = []
        with ThreadPoolExecutor() as executor:
            for wl in rdr_workload:
                relocate_results.append(
                    executor.submit(
                        relocate,
                        preferred_cluster=primary_cluster_name,
                        namespace=wl.workload_namespace,
                        workload_type=wl.workload_type,
                        workload_placement_name=wl.appset_placement_name
                        if wl.workload_type != constants.SUBSCRIPTION
                        else None,
                        switch_ctx=get_passive_acm_index(),
                    )
                )
                # Stagger relocate of the next workload
                time.sleep(60)

        # Wait for relocate results
        for rl in relocate_results:
            rl.result()

        # Verify resource creation on the preferred cluster (relocate target)
        config.switch_to_cluster_by_name(primary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )

        # Verify resource deletion from the secondary cluster (previous failover target)
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_deletion(wl.workload_namespace)
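
Reviewer note: the workload_placement_name argument passed to failover()/relocate() above is chosen per workload. The sketch below only illustrates that conditional, assuming each workload object exposes workload_type and appset_placement_name exactly as used in the test; the helper name placement_name_for is hypothetical and not part of ocs-ci.

from ocs_ci.ocs import constants


def placement_name_for(workload):
    """Illustrative helper (not part of ocs-ci): pick the placement name
    passed to failover()/relocate() for a given DR workload.

    Subscription-based workloads pass None (the DR helpers are assumed to
    resolve their placement internally), while ApplicationSet-based
    workloads pass their explicit placement name.
    """
    if workload.workload_type == constants.SUBSCRIPTION:
        return None
    return workload.appset_placement_name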