From 6bd7075e3bf3c9c1e39806985c7e24fadac2afca Mon Sep 17 00:00:00 2001 From: Akarsha Rai Date: Thu, 29 Feb 2024 10:12:27 +0530 Subject: [PATCH] Test to verify the hub restore when active hub down following failover and relocate of app (#9094) * Test failover and relocate all apps in a single zone after a zone disruption Signed-off-by: Akarsha-rai --- ocs_ci/deployment/deployment.py | 84 ++--- ocs_ci/helpers/dr_helpers.py | 135 ++++++-- ocs_ci/ocs/acm/acm.py | 5 +- ocs_ci/ocs/constants.py | 2 + ocs_ci/ocs/dr/dr_workload.py | 12 +- ocs_ci/ocs/node.py | 31 +- ocs_ci/ocs/utils.py | 12 + .../ocs-deployment/multicluster/restore.yaml | 10 + tests/conftest.py | 10 +- .../test_active_hub_down_and_restore.py | 314 ++++++++++++++++++ 10 files changed, 537 insertions(+), 78 deletions(-) create mode 100644 ocs_ci/templates/ocs-deployment/multicluster/restore.yaml create mode 100644 tests/functional/disaster-recovery/metro-dr/test_active_hub_down_and_restore.py diff --git a/ocs_ci/deployment/deployment.py b/ocs_ci/deployment/deployment.py index 5b57ad50c84..e0fd71ec034 100644 --- a/ocs_ci/deployment/deployment.py +++ b/ocs_ci/deployment/deployment.py @@ -220,6 +220,52 @@ def do_deploy_submariner(self): submariner = Submariner() submariner.deploy() + def deploy_gitops_operator(self, switch_ctx=None): + """ + Deploy GitOps operator + + Args: + switch_ctx (int): The cluster index by the cluster name + + """ + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() + + logger.info("Creating GitOps Operator Subscription") + gitops_subscription_yaml_data = templating.load_yaml( + constants.GITOPS_SUBSCRIPTION_YAML + ) + package_manifest = PackageManifest( + resource_name=constants.GITOPS_OPERATOR_NAME, + ) + gitops_subscription_yaml_data["spec"][ + "startingCSV" + ] = package_manifest.get_current_csv( + channel="latest", csv_pattern=constants.GITOPS_OPERATOR_NAME + ) + + gitops_subscription_manifest = tempfile.NamedTemporaryFile( + mode="w+", prefix="gitops_subscription_manifest", delete=False + ) + templating.dump_data_to_temp_yaml( + gitops_subscription_yaml_data, gitops_subscription_manifest.name + ) + run_cmd(f"oc create -f {gitops_subscription_manifest.name}") + + self.wait_for_subscription( + constants.GITOPS_OPERATOR_NAME, namespace=constants.OPENSHIFT_OPERATORS + ) + logger.info("Sleeping for 90 seconds after subscribing to GitOps Operator") + time.sleep(90) + subscriptions = ocp.OCP( + kind=constants.SUBSCRIPTION_WITH_ACM, + resource_name=constants.GITOPS_OPERATOR_NAME, + namespace=constants.OPENSHIFT_OPERATORS, + ).get() + gitops_csv_name = subscriptions["status"]["currentCSV"] + csv = CSV(resource_name=gitops_csv_name, namespace=constants.GITOPS_NAMESPACE) + csv.wait_for_phase("Succeeded", timeout=720) + logger.info("GitOps Operator Deployment Succeeded") + def do_gitops_deploy(self): """ Deploy GitOps operator @@ -233,43 +279,7 @@ def do_gitops_deploy(self): # Multicluster operations if config.multicluster: config.switch_acm_ctx() - logger.info("Creating GitOps Operator Subscription") - gitops_subscription_yaml_data = templating.load_yaml( - constants.GITOPS_SUBSCRIPTION_YAML - ) - package_manifest = PackageManifest( - resource_name=constants.GITOPS_OPERATOR_NAME, - ) - gitops_subscription_yaml_data["spec"][ - "startingCSV" - ] = package_manifest.get_current_csv( - channel="latest", csv_pattern=constants.GITOPS_OPERATOR_NAME - ) - - gitops_subscription_manifest = tempfile.NamedTemporaryFile( - mode="w+", prefix="gitops_subscription_manifest", delete=False - ) - templating.dump_data_to_temp_yaml( - gitops_subscription_yaml_data, gitops_subscription_manifest.name - ) - run_cmd(f"oc create -f {gitops_subscription_manifest.name}") - - self.wait_for_subscription( - constants.GITOPS_OPERATOR_NAME, namespace=constants.OPENSHIFT_OPERATORS - ) - logger.info("Sleeping for 90 seconds after subscribing to GitOps Operator") - time.sleep(90) - subscriptions = ocp.OCP( - kind=constants.SUBSCRIPTION_WITH_ACM, - resource_name=constants.GITOPS_OPERATOR_NAME, - namespace=constants.OPENSHIFT_OPERATORS, - ).get() - gitops_csv_name = subscriptions["status"]["currentCSV"] - csv = CSV( - resource_name=gitops_csv_name, namespace=constants.GITOPS_NAMESPACE - ) - csv.wait_for_phase("Succeeded", timeout=720) - logger.info("GitOps Operator Deployment Succeeded") + self.deploy_gitops_operator() logger.info("Creating GitOps CLuster Resource") run_cmd(f"oc create -f {constants.GITOPS_CLUSTER_YAML}") diff --git a/ocs_ci/helpers/dr_helpers.py b/ocs_ci/helpers/dr_helpers.py index 87dff81d684..b164917493b 100644 --- a/ocs_ci/helpers/dr_helpers.py +++ b/ocs_ci/helpers/dr_helpers.py @@ -7,7 +7,7 @@ from ocs_ci.framework import config from ocs_ci.ocs import constants, ocp -from ocs_ci.ocs.exceptions import TimeoutExpiredError +from ocs_ci.ocs.exceptions import TimeoutExpiredError, UnexpectedBehaviour from ocs_ci.ocs.resources.drpc import DRPC from ocs_ci.ocs.resources.pod import get_all_pods from ocs_ci.ocs.resources.pv import get_all_pvs @@ -17,8 +17,10 @@ get_non_acm_cluster_config, get_active_acm_index, get_primary_cluster_config, + get_passive_acm_index, ) from ocs_ci.utility import version, templating +from ocs_ci.utility.retry import retry from ocs_ci.utility.utils import TimeoutSampler, CommandFailed, run_cmd logger = logging.getLogger(__name__) @@ -131,6 +133,7 @@ def failover( namespace, workload_type=constants.SUBSCRIPTION, workload_placement_name=None, + switch_ctx=None, ): """ Initiates Failover action to the specified cluster @@ -140,10 +143,11 @@ def failover( namespace (str): Namespace where workload is running workload_type (str): Type of workload, i.e., Subscription or ApplicationSet workload_placement_name (str): Placement name + switch_ctx (int): The cluster index by the cluster name """ restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() failover_params = f'{{"spec":{{"action":"{constants.ACTION_FAILOVER}","failoverCluster":"{failover_cluster}"}}}}' if workload_type == constants.APPLICATION_SET: namespace = constants.GITOPS_CLUSTER_NAMESPACE @@ -171,6 +175,7 @@ def relocate( namespace, workload_type=constants.SUBSCRIPTION, workload_placement_name=None, + switch_ctx=None, ): """ Initiates Relocate action to the specified cluster @@ -180,10 +185,11 @@ def relocate( namespace (str): Namespace where workload is running workload_type (str): Type of workload, i.e., Subscription or ApplicationSet workload_placement_name (str): Placement name + switch_ctx (int): The cluster index by the cluster name """ restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() relocate_params = f'{{"spec":{{"action":"{constants.ACTION_RELOCATE}","preferredCluster":"{preferred_cluster}"}}}}' if workload_type == constants.APPLICATION_SET: namespace = constants.GITOPS_CLUSTER_NAMESPACE @@ -835,13 +841,14 @@ def get_managed_cluster_node_ips(): return cluster_data -def enable_fence(drcluster_name): +def enable_fence(drcluster_name, switch_ctx=None): """ Once the managed cluster is fenced, all communication from applications to the ODF external storage cluster will fail Args: drcluster_name (str): Name of the DRcluster which needs to be fenced + switch_ctx (int): The cluster index by the cluster name """ @@ -849,7 +856,7 @@ def enable_fence(drcluster_name): f"Edit the DRCluster resource for {drcluster_name} cluster on the Hub cluster" ) restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() fence_params = f'{{"spec":{{"clusterFence":"{constants.ACTION_FENCE}"}}}}' drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER) if not drcluster_obj.patch(params=fence_params, format_type="merge"): @@ -888,13 +895,14 @@ def configure_drcluster_for_fencing(): config.switch_ctx(old_ctx) -def enable_unfence(drcluster_name): +def enable_unfence(drcluster_name, switch_ctx=None): """ The OpenShift cluster to be Unfenced is the one where applications are not currently running and the cluster that was Fenced earlier. Args: drcluster_name (str): Name of the DRcluster which needs to be fenced + switch_ctx (int): The cluster index by the cluster name """ @@ -902,7 +910,7 @@ def enable_unfence(drcluster_name): f"Edit the DRCluster resource for {drcluster_name} cluster on the Hub cluster" ) restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() unfence_params = f'{{"spec":{{"clusterFence":"{constants.ACTION_UNFENCE}"}}}}' drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER) if not drcluster_obj.patch(params=unfence_params, format_type="merge"): @@ -911,13 +919,14 @@ def enable_unfence(drcluster_name): config.switch_ctx(restore_index) -def fence_state(drcluster_name, fence_state): +def fence_state(drcluster_name, fence_state, switch_ctx=None): """ Sets the specified clusterFence state Args: drcluster_name (str): Name of the DRcluster which needs to be fenced fence_state (str): Specify the clusterfence state either constants.ACTION_UNFENCE and ACTION_FENCE + switch_ctx (int): The cluster index by the cluster name """ @@ -925,7 +934,7 @@ def fence_state(drcluster_name, fence_state): f"Edit the DRCluster {drcluster_name} cluster clusterfence state {fence_state} " ) restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() params = f'{{"spec":{{"clusterFence":"{fence_state}"}}}}' drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER) if not drcluster_obj.patch(params=params, format_type="merge"): @@ -936,25 +945,51 @@ def fence_state(drcluster_name, fence_state): config.switch_ctx(restore_index) -def get_fence_state(drcluster_name): +def get_fence_state(drcluster_name, switch_ctx=None): """ Returns the clusterfence state of given drcluster Args: drcluster_name (str): Name of the DRcluster + switch_ctx (int): The cluster index by the cluster name Returns: state (str): If drcluster are fenced: Fenced or Unfenced, else None if not defined """ restore_index = config.cur_index - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() drcluster_obj = ocp.OCP(resource_name=drcluster_name, kind=constants.DRCLUSTER) - state = drcluster_obj.get().get("spec").get("clusterFence") + state = drcluster_obj.get().get("status").get("phase") config.switch_ctx(restore_index) return state +@retry(UnexpectedBehaviour, tries=40, delay=5, backoff=5) +def verify_fence_state(drcluster_name, state, switch_ctx=None): + """ + Verify the specified drcluster is in expected state + + Args: + drcluster_name (str): Name of the DRcluster + state (str): The fence state it is either constants.ACTION_FENCE or constants.ACTION_UNFENCE + switch_ctx (int): The cluster index by the cluster name + + Raises: + Raises exception Unexpected-behaviour if the specified drcluster is not in the given state condition + """ + sample = get_fence_state(drcluster_name=drcluster_name, switch_ctx=switch_ctx) + if sample == state: + logger.info(f"Primary managed cluster {drcluster_name} reached {state} state") + else: + logger.error( + f"Primary managed cluster {drcluster_name} not reached {state} state" + ) + raise UnexpectedBehaviour( + f"Primary managed cluster {drcluster_name} not reached {state} state" + ) + + def create_backup_schedule(): """ Create backupschedule resource only on active hub @@ -971,25 +1006,79 @@ def create_backup_schedule(): config.switch_ctx(old_ctx) -def gracefully_reboot_ocp_nodes( - namespace, drcluster_name, workload_type=constants.SUBSCRIPTION -): +def gracefully_reboot_ocp_nodes(drcluster_name): """ Gracefully reboot OpenShift Container Platform nodes which was fenced before Args: - namespace (str): Name of the namespace drcluster_name (str): Name of the drcluster which needs to be rebooted - workload_type (str): Type of workload. ie Subscription(Default) or ApplicationSet """ + config.switch_to_cluster_by_name(drcluster_name) + gracefully_reboot_nodes() - primary_cluster_name = get_current_primary_cluster_name( - namespace=namespace, workload_type=workload_type + +def restore_backup(): + """ + Restores the backup in new hub and make it as active + + """ + + restore_index = config.cur_index + config.switch_ctx(get_passive_acm_index()) + backup_schedule = templating.load_yaml(constants.DR_RESTORE_YAML) + backup_schedule_yaml = tempfile.NamedTemporaryFile( + mode="w+", prefix="restore", delete=False ) - if primary_cluster_name == drcluster_name: - set_current_primary_cluster_context(namespace, workload_type) + templating.dump_data_to_temp_yaml(backup_schedule, backup_schedule_yaml.name) + run_cmd(f"oc create -f {backup_schedule_yaml.name}") + config.switch_ctx(restore_index) + + +@retry(UnexpectedBehaviour, tries=40, delay=5, backoff=5) +def verify_restore_is_completed(): + """ + Function to verify restore is completed or finished + + """ + restore_index = config.cur_index + config.switch_ctx(get_passive_acm_index()) + restore_obj = ocp.OCP( + kind=constants.ACM_HUB_RESTORE, namespace=constants.ACM_HUB_BACKUP_NAMESPACE + ) + cmd_output = restore_obj.exec_oc_cmd(command="get restore -oyaml") + status = cmd_output["items"][0]["status"]["phase"] + if status == "Finished": + logger.info("Restore completed successfully") else: - set_current_secondary_cluster_context(namespace, workload_type) - gracefully_reboot_nodes() + logger.error(f"Restore failed with some errors: {cmd_output}") + raise UnexpectedBehaviour("Restore failed with some errors") + config.switch_ctx(restore_index) + + +@retry(UnexpectedBehaviour, tries=60, delay=5, backoff=2) +def verify_drpolicy_cli(switch_ctx=None): + """ + Function to verify DRPolicy status + + Returns: + bool: True if the status is in succeed state, else raise exception + switch_ctx (int): The cluster index by the cluster name + + """ + + restore_index = config.cur_index + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() + drpolicy_obj = ocp.OCP(kind=constants.DRPOLICY) + status = drpolicy_obj.get().get("items")[0].get("status").get("conditions")[0] + if status.get("reason") == "Succeeded": + logger.info("DRPolicy validation succeeded") + config.switch_ctx(restore_index) + return True + else: + logger.warning(f"DRPolicy is not in succeeded or validated state: {status}") + config.switch_ctx(restore_index) + raise UnexpectedBehaviour( + f"DRPolicy is not in succeeded or validated state: {status}" + ) diff --git a/ocs_ci/ocs/acm/acm.py b/ocs_ci/ocs/acm/acm.py index 565ca908a7f..516002218f5 100644 --- a/ocs_ci/ocs/acm/acm.py +++ b/ocs_ci/ocs/acm/acm.py @@ -466,12 +466,13 @@ def verify_running_acm(): log.info(f"ACM Version Detected: {acm_version}") -def validate_cluster_import(cluster_name): +def validate_cluster_import(cluster_name, switch_ctx=None): """ Validate ACM status of managed cluster Args: cluster_name: (str): cluster name to validate + switch_ctx (int): The cluster index by the cluster name Assert: All conditions of selected managed cluster should be "True", Failed otherwise @@ -479,7 +480,7 @@ def validate_cluster_import(cluster_name): Return: True, if not AssertionError """ - config.switch_ctx(0) + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_ctx(0) oc_obj = OCP(kind=ACM_MANAGED_CLUSTERS) conditions = oc_obj.exec_oc_cmd( f"get managedclusters {cluster_name} -ojsonpath='{{.status.conditions}}'" diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index e8a817fef4f..db415179f28 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -917,6 +917,7 @@ DR_RAMEN_HUB_OPERATOR_CONFIG = "ramen-hub-operator-config" DR_RAMEN_CLUSTER_OPERATOR_CONFIG = "ramen-dr-cluster-operator-config" ODF_MULTICLUSTER_ORCHESTRATOR_CONTROLLER_MANAGER = "odfmo-controller-manager" +DR_RESTORE_YAML = os.path.join(TEMPLATE_MULTICLUSTER_DIR, "restore.yaml") RDR_MODE = "regional-dr" MDR_MODE = "metro-dr" MDR_DR_POLICY = "odr-policy-mdr" @@ -2271,6 +2272,7 @@ TEMPLATE_DIR, "gitops-deployment", "subscription.yaml" ) ACM_HUB_BACKUP_NAMESPACE = "open-cluster-management-backup" +ACM_HUB_RESTORE = "Restore" # Vault encryption KMS types for PV encryption VAULT_TOKEN = "vaulttokens" diff --git a/ocs_ci/ocs/dr/dr_workload.py b/ocs_ci/ocs/dr/dr_workload.py index 69a183eaeb4..613b7fa43c1 100644 --- a/ocs_ci/ocs/dr/dr_workload.py +++ b/ocs_ci/ocs/dr/dr_workload.py @@ -213,13 +213,14 @@ def verify_workload_deployment(self): self.workload_pvc_count, self.workload_pod_count, self.workload_namespace ) - def delete_workload(self, force=False, rbd_name="rbd"): + def delete_workload(self, force=False, rbd_name="rbd", switch_ctx=None): """ Delete busybox workload Args: force (bool): If True, force remove the stuck resources, default False rbd_name (str): Name of the pool + switch_ctx (int): The cluster index by the cluster name Raises: ResourceNotDeleted: In case workload resources not deleted properly @@ -238,7 +239,7 @@ def delete_workload(self, force=False, rbd_name="rbd"): ) try: - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() run_cmd( f"oc delete -k {self.workload_subscription_dir}/{self.workload_name}" ) @@ -288,7 +289,7 @@ def delete_workload(self, force=False, rbd_name="rbd"): raise ResourceNotDeleted(err_msg) finally: - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() run_cmd(f"oc delete -k {self.workload_subscription_dir}") @@ -444,13 +445,14 @@ def check_pod_pvc_status(self, skip_replication_resources=False): skip_replication_resources=skip_replication_resources, ) - def delete_workload(self, force=False, rbd_name="rbd"): + def delete_workload(self, force=False, rbd_name="rbd", switch_ctx=None): """ Delete busybox workload Args: force (bool): If True, force remove the stuck resources, default False rbd_name (str): Name of the pool, default "rbd" + switch_ctx (int): The cluster index by the cluster name Raises: ResourceNotDeleted: In case workload resources not deleted properly @@ -458,7 +460,7 @@ def delete_workload(self, force=False, rbd_name="rbd"): """ image_uuids = dr_helpers.get_image_uuids(self.workload_namespace) try: - config.switch_acm_ctx() + config.switch_ctx(switch_ctx) if switch_ctx else config.switch_acm_ctx() run_cmd(cmd=f"oc delete -f {self.appset_yaml_file}", timeout=900) for cluster in get_non_acm_cluster_config(): diff --git a/ocs_ci/ocs/node.py b/ocs_ci/ocs/node.py index a339062bebe..8c549befe04 100644 --- a/ocs_ci/ocs/node.py +++ b/ocs_ci/ocs/node.py @@ -231,13 +231,14 @@ def schedule_nodes(node_names): wait_for_nodes_status(node_names) -def drain_nodes(node_names, timeout=1800): +def drain_nodes(node_names, timeout=1800, disable_eviction=False): """ Drain nodes Args: node_names (list): The names of the nodes timeout (int): Time to wait for the drain nodes 'oc' command + disable_eviction (bool): On True will delete pod that is protected by PDB, False by default Raises: TimeoutExpired: in case drain command fails to complete in time @@ -253,11 +254,18 @@ def drain_nodes(node_names, timeout=1800): >= version.VERSION_4_7 else "--delete-local-data" ) - ocp.exec_oc_cmd( - f"adm drain {node_names_str} --force=true --ignore-daemonsets " - f"{drain_deletion_flag}", - timeout=timeout, - ) + if disable_eviction: + ocp.exec_oc_cmd( + f"adm drain {node_names_str} --force=true --ignore-daemonsets " + f"{drain_deletion_flag} --disable-eviction", + timeout=timeout, + ) + else: + ocp.exec_oc_cmd( + f"adm drain {node_names_str} --force=true --ignore-daemonsets " + f"{drain_deletion_flag}", + timeout=timeout, + ) except TimeoutExpired: ct_pod = pod.get_ceph_tools_pod() ceph_status = ct_pod.exec_cmd_on_pod("ceph status", out_yaml_format=False) @@ -2756,11 +2764,14 @@ def generate_nodes_for_provider_worker_node_tests(): return generated_nodes -def gracefully_reboot_nodes(): +def gracefully_reboot_nodes(disable_eviction=False): """ Gracefully reboot OpenShift Container Platform nodes + Args: + disable_eviction (bool): On True will delete pod that is protected by PDB, False by default + """ from ocs_ci.ocs import platform_nodes @@ -2771,12 +2782,14 @@ def gracefully_reboot_nodes(): for node in node_objs: node_name = node.name unschedule_nodes([node_name]) - drain_nodes([node_name]) + drain_nodes(node_names=[node_name], disable_eviction=disable_eviction) nodes.restart_nodes([node], wait=False) log.info(f"Waiting for {waiting_time} seconds") time.sleep(waiting_time) schedule_nodes([node_name]) - wait_for_nodes_status(status=constants.NODE_READY, timeout=180) + wait_for_nodes_status( + node_names=[node], status=constants.NODE_READY, timeout=1800 + ) def get_num_of_racks(): diff --git a/ocs_ci/ocs/utils.py b/ocs_ci/ocs/utils.py index 839cd31ffbc..d9231d7bb45 100644 --- a/ocs_ci/ocs/utils.py +++ b/ocs_ci/ocs/utils.py @@ -1537,6 +1537,18 @@ def get_active_acm_index(): return cluster.MULTICLUSTER["multicluster_index"] +def get_passive_acm_index(): + """ + Get index of passive acm cluster + """ + for cluster in ocsci_config.clusters: + if ( + cluster.MULTICLUSTER["acm_cluster"] + and not cluster.MULTICLUSTER["active_acm_cluster"] + ): + return cluster.MULTICLUSTER["multicluster_index"] + + def get_primary_cluster_config(): """ Get the primary cluster config object in a DR scenario diff --git a/ocs_ci/templates/ocs-deployment/multicluster/restore.yaml b/ocs_ci/templates/ocs-deployment/multicluster/restore.yaml new file mode 100644 index 00000000000..a4d21c1fa6e --- /dev/null +++ b/ocs_ci/templates/ocs-deployment/multicluster/restore.yaml @@ -0,0 +1,10 @@ +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Restore +metadata: + name: restore-acm + namespace: open-cluster-management-backup +spec: + cleanupBeforeRestore: None + veleroManagedClustersBackupName: latest + veleroCredentialsBackupName: latest + veleroResourcesBackupName: latest diff --git a/tests/conftest.py b/tests/conftest.py index fd60070e55f..d541b740bb4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6429,9 +6429,13 @@ def dr_workload(request): """ instances = [] + ctx = [] def factory( - num_of_subscription=1, num_of_appset=0, pvc_interface=constants.CEPHBLOCKPOOL + num_of_subscription=1, + num_of_appset=0, + pvc_interface=constants.CEPHBLOCKPOOL, + switch_ctx=None, ): """ Args: @@ -6439,6 +6443,7 @@ def factory( num_of_appset (int): Number of ApplicationSet type workload to be created pvc_interface (str): 'CephBlockPool' or 'CephFileSystem'. This decides whether a RBD based or CephFS based resource is created. RBD is default. + switch_ctx (int): The cluster index by the cluster name Raises: ResourceNotDeleted: In case workload resources not deleted properly @@ -6482,13 +6487,14 @@ def factory( dr_helpers.wait_for_mirroring_status_ok( replaying_images=total_pvc_count ) + ctx.append(switch_ctx) return instances def teardown(): failed_to_delete = False for instance in instances: try: - instance.delete_workload(force=True) + instance.delete_workload(switch_ctx=ctx[0], force=True) except ResourceNotDeleted: failed_to_delete = True diff --git a/tests/functional/disaster-recovery/metro-dr/test_active_hub_down_and_restore.py b/tests/functional/disaster-recovery/metro-dr/test_active_hub_down_and_restore.py new file mode 100644 index 00000000000..ee3774ba80a --- /dev/null +++ b/tests/functional/disaster-recovery/metro-dr/test_active_hub_down_and_restore.py @@ -0,0 +1,314 @@ +import logging +import time +from concurrent.futures import ThreadPoolExecutor + +import pytest + +from ocs_ci.framework.pytest_customization.marks import tier4a, turquoise_squad +from ocs_ci.framework import config +from ocs_ci.ocs.acm.acm import AcmAddClusters, validate_cluster_import +from ocs_ci.ocs.dr.dr_workload import validate_data_integrity +from ocs_ci.ocs import constants +from ocs_ci.deployment.deployment import Deployment +from ocs_ci.ocs.node import get_node_objs +from ocs_ci.helpers.dr_helpers import ( + enable_fence, + enable_unfence, + get_fence_state, + failover, + relocate, + restore_backup, + create_backup_schedule, + set_current_primary_cluster_context, + get_current_primary_cluster_name, + get_current_secondary_cluster_name, + get_passive_acm_index, + wait_for_all_resources_creation, + wait_for_all_resources_deletion, + gracefully_reboot_ocp_nodes, + verify_drpolicy_cli, + verify_restore_is_completed, + verify_fence_state, +) +from ocs_ci.helpers.dr_helpers_ui import ( + check_cluster_status_on_acm_console, + failover_relocate_ui, +) +from ocs_ci.ocs.exceptions import UnexpectedBehaviour +from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running +from ocs_ci.ocs.utils import get_active_acm_index +from ocs_ci.utility import version +from ocs_ci.utility.utils import TimeoutSampler + + +logger = logging.getLogger(__name__) + + +@tier4a +@turquoise_squad +class TestActiveHubDownAndRestore: + """ + Test failover and relocate all apps when active hub down and restored + """ + + @pytest.fixture(autouse=True) + def teardown(self, request, dr_workload): + """ + If fenced, unfence the cluster and reboot nodes + """ + + def finalizer(): + if ( + self.primary_cluster_name + and get_fence_state( + drcluster_name=self.primary_cluster_name, + switch_ctx=get_passive_acm_index(), + ) + == "Fenced" + ): + enable_unfence( + drcluster_name=self.primary_cluster_name, + switch_ctx=get_passive_acm_index(), + ) + gracefully_reboot_ocp_nodes(self.primary_cluster_name) + + request.addfinalizer(finalizer) + + @pytest.mark.polarion_id("OCS-XXXX") + def test_application_failover_and_relocate( + self, setup_acm_ui, nodes_multicluster, dr_workload + ): + + """ + Tests to verify failover and relocate all appswhen active hub down and restored + + """ + + if config.RUN.get("mdr_failover_via_ui"): + ocs_version = version.get_semantic_ocs_version_from_config() + if ocs_version <= version.VERSION_4_12: + logger.error( + "ODF/ACM version isn't supported for Failover/Relocate operation" + ) + raise NotImplementedError + + acm_obj = AcmAddClusters() + # Deploy Subscription based application + sub = dr_workload(num_of_subscription=1, switch_ctx=get_passive_acm_index())[0] + self.namespace = sub.workload_namespace + self.workload_type = sub.workload_type + + # Deploy AppSet based application + appset = dr_workload( + num_of_subscription=0, num_of_appset=1, switch_ctx=get_passive_acm_index() + )[0] + + # Workloads list + workloads = [sub, appset] + + # Create application on Primary managed cluster + set_current_primary_cluster_context(self.namespace) + self.primary_cluster_name = get_current_primary_cluster_name( + namespace=self.namespace + ) + secondary_cluster_name = get_current_secondary_cluster_name(self.namespace) + + # Create backup-schedule on active hub + create_backup_schedule() + # ToDo: To verify all the backups are taken Need to find a exact way to verify backups + wait_time = 300 + logger.info(f"Wait {wait_time} until backup is taken ") + time.sleep(wait_time) + + # Install gitops operator on passive hub + dep_obj = Deployment() + dep_obj.deploy_gitops_operator(switch_ctx=get_passive_acm_index()) + + # Get the active hub nodes + config.switch_ctx(get_active_acm_index()) + active_hub_index = config.cur_index + active_hub_cluster_node_objs = get_node_objs() + + # Shutdown active hub nodes + logger.info("Shutting down all the nodes of active hub") + nodes_multicluster[active_hub_index].stop_nodes(active_hub_cluster_node_objs) + logger.info( + "All nodes of active hub zone are powered off, " + f"wait {wait_time} seconds before restoring in passive hub" + ) + + # Restore new hub + restore_backup() + logger.info(f"Wait {wait_time} until restores are taken ") + time.sleep(wait_time) + # Verify the restore is completed + verify_restore_is_completed() + + # Validate the clusters are imported + clusters = [self.primary_cluster_name, secondary_cluster_name] + for cluster in clusters: + for sample in TimeoutSampler( + timeout=1800, + sleep=60, + func=validate_cluster_import, + cluster_name=cluster, + switch_ctx=get_passive_acm_index(), + ): + if sample: + logger.info( + f"Cluster: {cluster} successfully imported post hub recovery" + ) + # Validate klusterlet addons are running on managed cluster + config.switch_to_cluster_by_name(cluster) + wait_for_pods_to_be_running( + namespace=constants.ACM_ADDONS_NAMESPACE, timeout=300, sleep=15 + ) + break + else: + logger.error( + f"import of cluster: {cluster} failed post hub recovery" + ) + raise UnexpectedBehaviour( + f"import of cluster: {cluster} failed post hub recovery" + ) + + # Wait or verify the drpolicy is in validated state + verify_drpolicy_cli(switch_ctx=get_passive_acm_index()) + + # ToDo: Deploy application in both managed cluster and + # to verify the applications are present in secondary cluster + + # Fenced the primary managed cluster + enable_fence( + drcluster_name=self.primary_cluster_name, + switch_ctx=get_passive_acm_index(), + ) + # Verify the primary managed cluster is in Fenced state + verify_fence_state( + drcluster_name=self.primary_cluster_name, + state=constants.ACTION_FENCE, + switch_ctx=get_passive_acm_index(), + ) + + # Application Failover to Secondary managed cluster + if ( + config.RUN.get("mdr_failover_via_ui") + and self.workload_type == constants.SUBSCRIPTION + ): + logger.info("Start the process of Failover from ACM UI") + config.switch_ctx(get_passive_acm_index()) + failover_relocate_ui( + acm_obj, + workload_to_move=f"{workloads[0].workload_name}-1", + policy_name=workloads[0].dr_policy_name, + failover_or_preferred_cluster=secondary_cluster_name, + ) + else: + failover_results = [] + with ThreadPoolExecutor() as executor: + for wl in workloads: + failover_results.append( + executor.submit( + failover, + failover_cluster=secondary_cluster_name, + namespace=wl.workload_namespace, + switch_ctx=get_passive_acm_index(), + ) + ) + time.sleep(5) + + # Wait for failover results + for fl in failover_results: + fl.result() + + # Verify resources creation on secondary cluster (failoverCluster) + config.switch_to_cluster_by_name(secondary_cluster_name) + for wl in workloads: + wait_for_all_resources_creation( + wl.workload_pvc_count, + wl.workload_pod_count, + wl.workload_namespace, + ) + + # Verify application are deleted from old cluster + config.switch_to_cluster_by_name(self.primary_cluster_name) + for wl in workloads: + wait_for_all_resources_deletion(wl.workload_namespace) + + # Validate data integrity + config.switch_to_cluster_by_name(secondary_cluster_name) + for wl in workloads: + validate_data_integrity(wl.workload_namespace) + + # Unfenced the managed cluster which was Fenced earlier + enable_unfence( + drcluster_name=self.primary_cluster_name, + switch_ctx=get_passive_acm_index(), + ) + # Verify the primary managed cluster is in Unfenced state + verify_fence_state( + drcluster_name=self.primary_cluster_name, + state=constants.ACTION_UNFENCE, + switch_ctx=get_passive_acm_index(), + ) + + # Reboot the nodes which unfenced + gracefully_reboot_ocp_nodes(self.primary_cluster_name) + + # Application Relocate to Primary managed cluster + secondary_cluster_name = get_current_secondary_cluster_name(self.namespace) + if ( + config.RUN.get("mdr_relocate_via_ui") + and self.workload_type == constants.SUBSCRIPTION + ): + logger.info("Start the process of Relocate from ACM UI") + # Relocate via ACM UI + config.switch_ctx(get_passive_acm_index()) + check_cluster_status_on_acm_console(acm_obj) + failover_relocate_ui( + acm_obj, + workload_to_move=f"{workloads[0].workload_name}-1", + policy_name=workloads[0].dr_policy_name, + failover_or_preferred_cluster=secondary_cluster_name, + action=constants.ACTION_RELOCATE, + ) + else: + relocate_results = [] + with ThreadPoolExecutor() as executor: + for wl in workloads: + relocate_results.append( + executor.submit( + relocate, + preferred_cluster=secondary_cluster_name, + namespace=wl.workload_namespace, + switch_ctx=get_passive_acm_index(), + ) + ) + time.sleep(5) + + # Wait for relocate results + for rl in relocate_results: + rl.result() + + # Verify resources deletion from previous primary or current secondary cluster + config.switch_to_cluster_by_name(secondary_cluster_name) + for wl in workloads: + wait_for_all_resources_creation( + wl.workload_pvc_count, + wl.workload_pod_count, + wl.workload_namespace, + ) + + # Verify resources creation on preferredCluster + config.switch_to_cluster_by_name(self.primary_cluster_name) + for wl in workloads: + wait_for_all_resources_creation( + wl.workload_pvc_count, + wl.workload_pod_count, + wl.workload_namespace, + ) + + # Validate data integrity + config.switch_to_cluster_by_name(self.primary_cluster_name) + for wl in workloads: + validate_data_integrity(wl.workload_namespace)