
Commit

Test Verify hub restore to passive hub following failover and relocate of app

Signed-off-by: prsurve <[email protected]>
prsurve committed Jun 7, 2024
1 parent 9482269 commit c5496c9
Showing 3 changed files with 195 additions and 4 deletions.
2 changes: 1 addition & 1 deletion ocs_ci/helpers/dr_helpers.py
@@ -997,7 +997,7 @@ def create_backup_schedule():
"""
old_ctx = config.cur_index
config.switch_ctx(get_active_acm_index())
- backup_schedule = templating.load_yaml(constants.MDR_BACKUP_SCHEDULE_YAML)
+ backup_schedule = templating.load_yaml(constants.BACKUP_SCHEDULE_YAML)
backup_schedule_yaml = tempfile.NamedTemporaryFile(
mode="w+", prefix="bkp", delete=False
)
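For context, a minimal sketch of how create_backup_schedule presumably continues after this hunk: the loaded template is written to the temporary file and applied on the active hub, then the original context is restored. The dump_data_to_temp_yaml and run_cmd helpers are assumptions based on common ocs_ci patterns and are not shown in this diff:

# Continuation sketch (assumed, not part of this diff)
templating.dump_data_to_temp_yaml(backup_schedule, backup_schedule_yaml.name)  # assumed helper
run_cmd(f"oc create -f {backup_schedule_yaml.name}")  # assumed helper
config.switch_ctx(old_ctx)  # switch back to the previously selected context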
4 changes: 1 addition & 3 deletions ocs_ci/ocs/constants.py
@@ -1061,9 +1061,7 @@
MDR_VELERO_POD_COUNT = 1
MDR_DPA = "dpa-1"
MDR_MULTICLUSTER_ENGINE = "multiclusterengine"
- MDR_BACKUP_SCHEDULE_YAML = os.path.join(
-     TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml"
- )
+ BACKUP_SCHEDULE_YAML = os.path.join(TEMPLATE_MULTICLUSTER_DIR, "backupschedule.yaml")
MDR_BACKUP_SCHEDULE_RESOURCE = "schedule-acm"


@@ -0,0 +1,193 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor

from ocs_ci.framework.pytest_customization.marks import tier4a, turquoise_squad
from ocs_ci.framework import config
from ocs_ci.ocs.acm.acm import validate_cluster_import
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_node_objs
from ocs_ci.helpers.dr_helpers import (
failover,
relocate,
restore_backup,
create_backup_schedule,
get_current_primary_cluster_name,
get_current_secondary_cluster_name,
get_passive_acm_index,
wait_for_all_resources_creation,
wait_for_all_resources_deletion,
verify_drpolicy_cli,
verify_restore_is_completed,
get_scheduling_interval,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running
from ocs_ci.ocs.utils import get_active_acm_index
from ocs_ci.utility.utils import TimeoutSampler


logger = logging.getLogger(__name__)


@tier4a
@turquoise_squad
class TestActiveHubDownAndRestore:
"""
Test failover and relocate of all apps after the active hub of an RDR setup goes down and is restored on the passive hub
"""

def test_hub_recovery_failover_and_relocate(self, nodes_multicluster, dr_workload):

"""
Verify failover and relocate of all apps after the active hub goes down and its backups are restored on the passive hub
"""

# acm_obj = AcmAddClusters()
# Deploy Subscription and Appset based application
rdr_workload = dr_workload(
num_of_subscription=1, num_of_appset=1, switch_ctx=get_passive_acm_index()
)
logger.info(type(rdr_workload))
primary_cluster_name = get_current_primary_cluster_name(
rdr_workload[0].workload_namespace
)
secondary_cluster_name = get_current_secondary_cluster_name(
rdr_workload[0].workload_namespace
)
scheduling_interval = get_scheduling_interval(
rdr_workload[0].workload_namespace, rdr_workload[0].workload_type
)
# Create backup-schedule on active hub
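# The BackupSchedule resource (schedule-acm) periodically backs up hub resources so they can later be restored on the passive hub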
create_backup_schedule()
two_times_scheduling_interval = 2 * scheduling_interval # Time in minutes
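# Two scheduling intervals are waited before relocating back, presumably so at least one full sync cycle completes on the failover cluster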
wait_time = 300
logger.info(f"Wait {wait_time} until backup is taken ")
time.sleep(wait_time)

# Get the active hub nodes
logger.info("Getting Active cluster node details")
config.switch_ctx(get_active_acm_index())
active_hub_index = config.cur_index
active_hub_cluster_node_objs = get_node_objs()
# ToDo Add verification for dpa and policy

# Shutdown active hub nodes
logger.info("Shutting down all the nodes of active hub")
nodes_multicluster[active_hub_index].stop_nodes(active_hub_cluster_node_objs)
logger.info(
"All nodes of the active hub are powered off, "
f"waiting {wait_time} seconds before restoring on the passive hub"
)
# Wait before triggering the restore, as stated in the log message above
time.sleep(wait_time)

# Restore new hub
restore_backup()
logger.info(f"Wait {wait_time} until restores are taken ")
time.sleep(wait_time)

# Verify the restore is completed
verify_restore_is_completed()

# Validate the clusters are imported
clusters = [primary_cluster_name, secondary_cluster_name]
for cluster in clusters:
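# Poll validate_cluster_import via TimeoutSampler (every 60 s, up to 30 minutes) and break on the first successful import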
for sample in TimeoutSampler(
timeout=1800,
sleep=60,
func=validate_cluster_import,
cluster_name=cluster,
switch_ctx=get_passive_acm_index(),
):
if sample:
logger.info(
f"Cluster: {cluster} successfully imported post hub recovery"
)
# Validate klusterlet addons are running on managed cluster
config.switch_to_cluster_by_name(cluster)
wait_for_pods_to_be_running(
namespace=constants.ACM_ADDONS_NAMESPACE, timeout=300, sleep=15
)
break
else:
logger.error(
f"import of cluster: {cluster} failed post hub recovery"
)
raise UnexpectedBehaviour(
f"import of cluster: {cluster} failed post hub recovery"
)
# Wait for the DRPolicy to reach the Validated state on the new hub
verify_drpolicy_cli(switch_ctx=get_passive_acm_index())

# Failover action via CLI
failover_results = []
with ThreadPoolExecutor() as executor:
for wl in rdr_workload:
failover_results.append(
executor.submit(
failover,
failover_cluster=secondary_cluster_name,
namespace=wl.workload_namespace,
workload_type=wl.workload_type,
workload_placement_name=wl.appset_placement_name
if wl.workload_type != constants.SUBSCRIPTION
else None,
switch_ctx=get_passive_acm_index(),
)
)
time.sleep(60)

# Wait for failover results
for fl in failover_results:
fl.result()

# Verify resources creation on secondary cluster (failoverCluster)
config.switch_to_cluster_by_name(secondary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_creation(
wl.workload_pvc_count,
wl.workload_pod_count,
wl.workload_namespace,
)
# Verify applications are deleted from the old primary cluster
config.switch_to_cluster_by_name(primary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_deletion(wl.workload_namespace)

logger.info(f"Waiting for {two_times_scheduling_interval} minutes to run IOs")
time.sleep(two_times_scheduling_interval * 60)

relocate_results = []
with ThreadPoolExecutor() as executor:
for wl in rdr_workload:
relocate_results.append(
executor.submit(
relocate,
preferred_cluster=primary_cluster_name,
namespace=wl.workload_namespace,
workload_type=wl.workload_type,
workload_placement_name=wl.appset_placement_name
if wl.workload_type != constants.SUBSCRIPTION
else None,
switch_ctx=get_passive_acm_index(),
)
)
time.sleep(60)

# Wait for relocate results
for rl in relocate_results:
rl.result()

# Verify resources creation on preferredCluster
config.switch_to_cluster_by_name(primary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_creation(
wl.workload_pvc_count,
wl.workload_pod_count,
wl.workload_namespace,
)

# Verify resources deletion from previous primary or current secondary cluster
config.switch_to_cluster_by_name(secondary_cluster_name)
for wl in rdr_workload:
wait_for_all_resources_deletion(wl.workload_namespace)
