Test to verify the hub restore when the active hub is down, following failover and relocate of app (#9094)

* Test failover and relocate all apps in a single zone after a zone disruption

Signed-off-by: Akarsha-rai <[email protected]>
red-hat-storage · Feb 29, 2024 · 6bd7075 · 6bd7075
1 parent c090acc
commit 6bd7075
Show file tree

Hide file tree

Showing 10 changed files with 537 additions and 78 deletions.
diff --git a/ocs_ci/deployment/deployment.py b/ocs_ci/deployment/deployment.py
@@ -220,6 +220,52 @@ def do_deploy_submariner(self):
             submariner = Submariner()
             submariner.deploy()
 
+    def deploy_gitops_operator(self, switch_ctx=None):
+        """
+        Deploy the GitOps operator and wait until its CSV has succeeded.
+
+        Creates the operator Subscription from the GitOps subscription
+        template, setting ``startingCSV`` to the current CSV of the "latest"
+        channel, then waits for the subscription and for the CSV named in the
+        subscription status to reach the "Succeeded" phase.
+
+        Args:
+            switch_ctx (int): Index of the cluster context to deploy on. When
+                None (default), ``config.switch_acm_ctx()`` is used instead.
+
+        """
+        # Compare against None so that cluster index 0 is honoured instead of
+        # being treated as "not provided".
+        if switch_ctx is not None:
+            config.switch_ctx(switch_ctx)
+        else:
+            config.switch_acm_ctx()
+
+        logger.info("Creating GitOps Operator Subscription")
+        gitops_subscription_yaml_data = templating.load_yaml(
+            constants.GITOPS_SUBSCRIPTION_YAML
+        )
+        package_manifest = PackageManifest(
+            resource_name=constants.GITOPS_OPERATOR_NAME,
+        )
+        gitops_subscription_yaml_data["spec"][
+            "startingCSV"
+        ] = package_manifest.get_current_csv(
+            channel="latest", csv_pattern=constants.GITOPS_OPERATOR_NAME
+        )
+
+        # Only the path is needed: close the handle immediately so the file
+        # descriptor is not leaked. delete=False keeps the file on disk.
+        with tempfile.NamedTemporaryFile(
+            mode="w+", prefix="gitops_subscription_manifest", delete=False
+        ) as gitops_subscription_manifest:
+            manifest_path = gitops_subscription_manifest.name
+        templating.dump_data_to_temp_yaml(
+            gitops_subscription_yaml_data, manifest_path
+        )
+        run_cmd(f"oc create -f {manifest_path}")
+
+        self.wait_for_subscription(
+            constants.GITOPS_OPERATOR_NAME, namespace=constants.OPENSHIFT_OPERATORS
+        )
+        logger.info("Sleeping for 90 seconds after subscribing to GitOps Operator")
+        time.sleep(90)
+        subscriptions = ocp.OCP(
+            kind=constants.SUBSCRIPTION_WITH_ACM,
+            resource_name=constants.GITOPS_OPERATOR_NAME,
+            namespace=constants.OPENSHIFT_OPERATORS,
+        ).get()
+        # The subscription is read from OPENSHIFT_OPERATORS while the CSV is
+        # awaited in GITOPS_NAMESPACE, exactly as in the original flow.
+        gitops_csv_name = subscriptions["status"]["currentCSV"]
+        csv = CSV(resource_name=gitops_csv_name, namespace=constants.GITOPS_NAMESPACE)
+        csv.wait_for_phase("Succeeded", timeout=720)
+        logger.info("GitOps Operator Deployment Succeeded")
+
+
     def do_gitops_deploy(self):
         """
         Deploy GitOps operator
@@ -233,43 +279,7 @@ def do_gitops_deploy(self):
         # Multicluster operations
         if config.multicluster:
             config.switch_acm_ctx()
-            logger.info("Creating GitOps Operator Subscription")
-            gitops_subscription_yaml_data = templating.load_yaml(
-                constants.GITOPS_SUBSCRIPTION_YAML
-            )
-            package_manifest = PackageManifest(
-                resource_name=constants.GITOPS_OPERATOR_NAME,
-            )
-            gitops_subscription_yaml_data["spec"][
-                "startingCSV"
-            ] = package_manifest.get_current_csv(
-                channel="latest", csv_pattern=constants.GITOPS_OPERATOR_NAME
-            )
-
-            gitops_subscription_manifest = tempfile.NamedTemporaryFile(
-                mode="w+", prefix="gitops_subscription_manifest", delete=False
-            )
-            templating.dump_data_to_temp_yaml(
-                gitops_subscription_yaml_data, gitops_subscription_manifest.name
-            )
-            run_cmd(f"oc create -f {gitops_subscription_manifest.name}")
-
-            self.wait_for_subscription(
-                constants.GITOPS_OPERATOR_NAME, namespace=constants.OPENSHIFT_OPERATORS
-            )
-            logger.info("Sleeping for 90 seconds after subscribing to GitOps Operator")
-            time.sleep(90)
-            subscriptions = ocp.OCP(
-                kind=constants.SUBSCRIPTION_WITH_ACM,
-                resource_name=constants.GITOPS_OPERATOR_NAME,
-                namespace=constants.OPENSHIFT_OPERATORS,
-            ).get()
-            gitops_csv_name = subscriptions["status"]["currentCSV"]
-            csv = CSV(
-                resource_name=gitops_csv_name, namespace=constants.GITOPS_NAMESPACE
-            )
-            csv.wait_for_phase("Succeeded", timeout=720)
-            logger.info("GitOps Operator Deployment Succeeded")
+            self.deploy_gitops_operator()
 
             logger.info("Creating GitOps CLuster Resource")
             run_cmd(f"oc create -f {constants.GITOPS_CLUSTER_YAML}")

diff --git a/ocs_ci/helpers/dr_helpers.py b/ocs_ci/helpers/dr_helpers.py
@@ -7,7 +7,7 @@
 
 from ocs_ci.framework import config
 from ocs_ci.ocs import constants, ocp
-from ocs_ci.ocs.exceptions import TimeoutExpiredError
+from ocs_ci.ocs.exceptions import TimeoutExpiredError, UnexpectedBehaviour
 from ocs_ci.ocs.resources.drpc import DRPC
 from ocs_ci.ocs.resources.pod import get_all_pods
 from ocs_ci.ocs.resources.pv import get_all_pvs
@@ -17,8 +17,10 @@
     get_non_acm_cluster_config,
     get_active_acm_index,
     get_primary_cluster_config,
+    get_passive_acm_index,
 )
 from ocs_ci.utility import version, templating
+from ocs_ci.utility.retry import retry
 from ocs_ci.utility.utils import TimeoutSampler, CommandFailed, run_cmd
 
 logger = logging.getLogger(__name__)
@@ -131,6 +133,7 @@ def failover(
     namespace,
     workload_type=constants.SUBSCRIPTION,
     workload_placement_name=None,
+    switch_ctx=None,
 ):
     """
     Initiates Failover action to the specified cluster
@@ -140,10 +143,11 @@ def failover(
         namespace (str): Namespace where workload is running
         workload_type (str): Type of workload, i.e., Subscription or ApplicationSet
         workload_placement_name (str): Placement name
+        switch_ctx (int): The cluster index by the cluster name
 
     """
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     failover_params = f'{{"spec":{{"action":"{constants.ACTION_FAILOVER}","failoverCluster":"{failover_cluster}"}}}}'
     if workload_type == constants.APPLICATION_SET:
         namespace = constants.GITOPS_CLUSTER_NAMESPACE
@@ -171,6 +175,7 @@ def relocate(
     namespace,
     workload_type=constants.SUBSCRIPTION,
     workload_placement_name=None,
+    switch_ctx=None,
 ):
     """
     Initiates Relocate action to the specified cluster
@@ -180,10 +185,11 @@ def relocate(
         namespace (str): Namespace where workload is running
         workload_type (str): Type of workload, i.e., Subscription or ApplicationSet
         workload_placement_name (str): Placement name
+        switch_ctx (int): The cluster index by the cluster name
 
     """
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     relocate_params = f'{{"spec":{{"action":"{constants.ACTION_RELOCATE}","preferredCluster":"{preferred_cluster}"}}}}'
     if workload_type == constants.APPLICATION_SET:
         namespace = constants.GITOPS_CLUSTER_NAMESPACE
@@ -835,21 +841,22 @@ def get_managed_cluster_node_ips():
     return cluster_data
 
 
-def enable_fence(drcluster_name):
+def enable_fence(drcluster_name, switch_ctx=None):
     """
     Once the managed cluster is fenced, all communication
     from applications to the ODF external storage cluster will fail
 
     Args:
         drcluster_name (str): Name of the DRcluster which needs to be fenced
+        switch_ctx (int): The cluster index by the cluster name
 
     """
 
     logger.info(
         f"Edit the DRCluster resource for {drcluster_name} cluster on the Hub cluster"
     )
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     fence_params = f'{{"spec":{{"clusterFence":"{constants.ACTION_FENCE}"}}}}'
     drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER)
     if not drcluster_obj.patch(params=fence_params, format_type="merge"):
@@ -888,21 +895,22 @@ def configure_drcluster_for_fencing():
     config.switch_ctx(old_ctx)
 
 
-def enable_unfence(drcluster_name):
+def enable_unfence(drcluster_name, switch_ctx=None):
     """
     The OpenShift cluster to be Unfenced is the one where applications
     are not currently running and the cluster that was Fenced earlier.
 
     Args:
         drcluster_name (str): Name of the DRcluster which needs to be fenced
+        switch_ctx (int): The cluster index by the cluster name
 
     """
 
     logger.info(
         f"Edit the DRCluster resource for {drcluster_name} cluster on the Hub cluster"
     )
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     unfence_params = f'{{"spec":{{"clusterFence":"{constants.ACTION_UNFENCE}"}}}}'
     drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER)
     if not drcluster_obj.patch(params=unfence_params, format_type="merge"):
@@ -911,21 +919,22 @@ def enable_unfence(drcluster_name):
     config.switch_ctx(restore_index)
 
 
-def fence_state(drcluster_name, fence_state):
+def fence_state(drcluster_name, fence_state, switch_ctx=None):
     """
     Sets the specified clusterFence state
 
     Args:
        drcluster_name (str): Name of the DRcluster which needs to be fenced
        fence_state (str): Specify the clusterfence state either constants.ACTION_UNFENCE and ACTION_FENCE
+       switch_ctx (int): The cluster index by the cluster name
 
     """
 
     logger.info(
         f"Edit the DRCluster {drcluster_name} cluster clusterfence state {fence_state}  "
     )
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     params = f'{{"spec":{{"clusterFence":"{fence_state}"}}}}'
     drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER)
     if not drcluster_obj.patch(params=params, format_type="merge"):
@@ -936,25 +945,51 @@ def fence_state(drcluster_name, fence_state):
     config.switch_ctx(restore_index)
 
 
-def get_fence_state(drcluster_name):
+def get_fence_state(drcluster_name, switch_ctx=None):
     """
     Returns the clusterfence state of given drcluster
 
     Args:
         drcluster_name (str): Name of the DRcluster
+        switch_ctx (int): The cluster index by the cluster name
 
     Returns:
         state (str): If drcluster are fenced: Fenced or Unfenced, else None if not defined
 
     """
     restore_index = config.cur_index
-    config.switch_acm_ctx()
+    config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx()
     drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER)
-    state = drcluster_obj.get().get("spec").get("clusterFence")
+    state = drcluster_obj.get().get("status").get("phase")
     config.switch_ctx(restore_index)
     return state
 
 
+# NOTE(review): if ocs_ci.utility.retry multiplies the delay by `backoff` on
+# each attempt, delay=5 with backoff=5 over 40 tries grows very quickly -
+# confirm the intended total wait.
+@retry(UnexpectedBehaviour, tries=40, delay=5, backoff=5)
+def verify_fence_state(drcluster_name, state, switch_ctx=None):
+    """
+    Check that the given DRCluster currently reports the expected fence state
+
+    Args:
+        drcluster_name (str): Name of the DRcluster
+        state (str): Expected state, as returned by get_fence_state (either
+            constants.ACTION_FENCE or constants.ACTION_UNFENCE)
+        switch_ctx (int): The cluster index by the cluster name
+
+    Raises:
+        UnexpectedBehaviour: If the DRCluster is not in the expected state
+
+    """
+    current_state = get_fence_state(
+        drcluster_name=drcluster_name, switch_ctx=switch_ctx
+    )
+    # Guard clause: report and raise on a mismatch, fall through on a match
+    if current_state != state:
+        mismatch_msg = (
+            f"Primary managed cluster {drcluster_name} not reached {state} state"
+        )
+        logger.error(mismatch_msg)
+        raise UnexpectedBehaviour(mismatch_msg)
+    logger.info(f"Primary managed cluster {drcluster_name} reached {state} state")
+
+
+
+
 def create_backup_schedule():
     """
     Create backupschedule resource only on active hub
@@ -971,25 +1006,79 @@ def create_backup_schedule():
     config.switch_ctx(old_ctx)
 
 
-def gracefully_reboot_ocp_nodes(
-    namespace, drcluster_name, workload_type=constants.SUBSCRIPTION
-):
+def gracefully_reboot_ocp_nodes(drcluster_name):
     """
     Gracefully reboot OpenShift Container Platform
     nodes which was fenced before
 
     Args:
-        namespace (str): Name of the namespace
         drcluster_name (str): Name of the drcluster which needs to be rebooted
-        workload_type (str): Type of workload. ie Subscription(Default) or ApplicationSet
 
     """
+    config.switch_to_cluster_by_name(drcluster_name)
+    gracefully_reboot_nodes()
 
-    primary_cluster_name = get_current_primary_cluster_name(
-        namespace=namespace, workload_type=workload_type
+
+def restore_backup():
+    """
+    Restores the backup on the passive hub so that it can become the active hub
+
+    Switches to the passive ACM hub context, writes the DR_RESTORE_YAML
+    template to a temporary file and creates it with ``oc create``. The
+    context that was active on entry is always restored, even when loading
+    the template or creating the resource fails.
+
+    """
+
+    restore_index = config.cur_index
+    config.switch_ctx(get_passive_acm_index())
+    try:
+        restore_data = templating.load_yaml(constants.DR_RESTORE_YAML)
+        # Only the path is needed: close the handle immediately so the file
+        # descriptor is not leaked. delete=False keeps the file on disk.
+        with tempfile.NamedTemporaryFile(
+            mode="w+", prefix="restore", delete=False
+        ) as restore_yaml:
+            restore_yaml_path = restore_yaml.name
+        templating.dump_data_to_temp_yaml(restore_data, restore_yaml_path)
+        run_cmd(f"oc create -f {restore_yaml_path}")
+    finally:
+        # Never leave the caller on the passive hub context
+        config.switch_ctx(restore_index)
-    )
-    if primary_cluster_name == drcluster_name:
-        set_current_primary_cluster_context(namespace, workload_type)
+
+
+# NOTE(review): if ocs_ci.utility.retry multiplies the delay by `backoff` on
+# each attempt, delay=5 with backoff=5 over 40 tries grows very quickly -
+# confirm the intended total wait.
+@retry(UnexpectedBehaviour, tries=40, delay=5, backoff=5)
+def verify_restore_is_completed():
+    """
+    Function to verify restore is completed or finished
+
+    Reads the restore resources on the passive ACM hub and checks that the
+    phase of the first one is "Finished". The context that was active on
+    entry is restored on every exit path, so a retried attempt does not
+    record the passive hub as the context to return to.
+
+    Raises:
+        UnexpectedBehaviour: If the first restore is not in "Finished" phase
+
+    """
+    restore_index = config.cur_index
+    config.switch_ctx(get_passive_acm_index())
+    try:
+        restore_obj = ocp.OCP(
+            kind=constants.ACM_HUB_RESTORE,
+            namespace=constants.ACM_HUB_BACKUP_NAMESPACE,
+        )
+        cmd_output = restore_obj.exec_oc_cmd(command="get restore -oyaml")
+        status = cmd_output["items"][0]["status"]["phase"]
+        if status != "Finished":
+            logger.error(f"Restore failed with some errors: {cmd_output}")
+            raise UnexpectedBehaviour("Restore failed with some errors")
+        logger.info("Restore completed successfully")
+    finally:
+        # Runs on success, on the raise above and on any lookup error
+        config.switch_ctx(restore_index)
-    else:
-        set_current_secondary_cluster_context(namespace, workload_type)
-    gracefully_reboot_nodes()
+
+
+@retry(UnexpectedBehaviour, tries=60, delay=5, backoff=2)
+def verify_drpolicy_cli(switch_ctx=None):
+    """
+    Function to verify DRPolicy status
+
+    Checks the reason of the first condition of the first DRPolicy returned
+    by the cluster. The context that was active on entry is restored on every
+    exit path.
+
+    Args:
+        switch_ctx (int): The cluster index by the cluster name. When None
+            (default), ``config.switch_acm_ctx()`` is used instead.
+
+    Returns:
+        bool: True if the status is in succeed state, else raise exception
+
+    Raises:
+        UnexpectedBehaviour: If the condition reason is not "Succeeded"
+
+    """
+
+    restore_index = config.cur_index
+    # Compare against None so that cluster index 0 is honoured instead of
+    # being treated as "not provided".
+    if switch_ctx is not None:
+        config.switch_ctx(switch_ctx)
+    else:
+        config.switch_acm_ctx()
+    try:
+        drpolicy_obj = ocp.OCP(kind=constants.DRPOLICY)
+        first_policy = drpolicy_obj.get().get("items")[0]
+        status = first_policy.get("status").get("conditions")[0]
+        if status.get("reason") != "Succeeded":
+            message = f"DRPolicy is not in succeeded or validated state: {status}"
+            logger.warning(message)
+            raise UnexpectedBehaviour(message)
+        logger.info("DRPolicy validation succeeded")
+        return True
+    finally:
+        # Runs on success, on the raise above and on any lookup error
+        config.switch_ctx(restore_index)