From b172a1618aa80c1f8729de0ee14b2c939558833d Mon Sep 17 00:00:00 2001
From: Parikshith
Date: Tue, 6 Feb 2024 13:19:08 +0530
Subject: [PATCH] MDR+CNV app deployment and failover/relocate (#9121)

Signed-off-by: Parikshith
---
 conf/ocsci/mdr_workload.yaml          |  12 +
 ocs_ci/helpers/dr_helpers.py          |  23 ++
 ocs_ci/ocs/constants.py               |   3 +
 ocs_ci/ocs/dr/dr_workload.py          | 217 ++++++++++++++++++
 tests/conftest.py                     |  77 ++++++-
 .../test_cnv_app_failover_relocate.py | 210 +++++++++++++++++
 6 files changed, 541 insertions(+), 1 deletion(-)
 create mode 100644 tests/functional/disaster-recovery/metro-dr/test_cnv_app_failover_relocate.py

diff --git a/conf/ocsci/mdr_workload.yaml b/conf/ocsci/mdr_workload.yaml
index 719922772f9..2ace9ca02fc 100644
--- a/conf/ocsci/mdr_workload.yaml
+++ b/conf/ocsci/mdr_workload.yaml
@@ -30,3 +30,15 @@ ENV_DATA:
       dr_workload_app_pvc_selector: {'appname': 'busybox_app5'}, pod_count: 2, pvc_count: 2
     },
   ]
+  dr_cnv_workload_appset: [
+    { name: "vm-appset-1", workload_dir: "mdr/cnv-workload/appset/vm-appset-1",
+      dr_workload_app_placement_name: "vm-appset-1-placement", vm_name: "vm-workload-1",
+      dr_workload_app_pvc_selector: { 'appname': 'kubevirt' }, pod_count: 1, pvc_count: 1
+    },
+  ]
+  dr_cnv_workload_sub: [
+    { name: "vm-sub-1", workload_dir: "mdr/cnv-workload/subscription/vm-sub-1",
+      dr_workload_app_placement_name: "vm-sub-1-placement", vm_name: "vm-workload-1",
+      dr_workload_app_pvc_selector: { 'appname': 'kubevirt' }, pod_count: 1, pvc_count: 1
+    },
+  ]
diff --git a/ocs_ci/helpers/dr_helpers.py b/ocs_ci/helpers/dr_helpers.py
index 642282e98c9..83beed7f222 100644
--- a/ocs_ci/helpers/dr_helpers.py
+++ b/ocs_ci/helpers/dr_helpers.py
@@ -688,6 +688,29 @@ def wait_for_all_resources_deletion(
         sample.wait_for_func_value(0)


+def wait_for_cnv_workload(
+    vm_name, namespace, phase=constants.STATUS_RUNNING, timeout=600
+):
+    """
+    Wait for a VM to reach the given phase
+
+    Args:
+        vm_name (str): Name of the VM
+        namespace (str): Namespace of the VM workload
+        phase (str): Phase of the VM resource to wait for, e.g. Running, Stopped
+        timeout (int): Time in seconds to wait for the VM to reach the given phase
+
+    """
+    logger.info(f"Wait for VM: {vm_name} to reach {phase} state")
+    vm_obj = ocp.OCP(
+        kind=constants.VIRTUAL_MACHINE_INSTANCES,
+        resource_name=vm_name,
+        namespace=namespace,
+    )
+    vm_obj._has_phase = True
+    vm_obj.wait_for_phase(phase=phase, timeout=timeout)
+
+
 def wait_for_replication_destinations_creation(rep_dest_count, namespace, timeout=900):
     """
     Wait for ReplicationDestination resources to be created
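A minimal usage sketch of the new helper (editor's illustration, not part of the patch): the VM name matches the dr_cnv_workload_* entries above, while the namespace value is a hypothetical placeholder.

    from ocs_ci.helpers import dr_helpers
    from ocs_ci.ocs import constants

    # Block until the VMI named "vm-workload-1" reports the Running phase,
    # assuming the workload was deployed into the hypothetical namespace below.
    dr_helpers.wait_for_cnv_workload(
        vm_name="vm-workload-1",
        namespace="cnv-workload-ns",  # hypothetical namespace
        phase=constants.STATUS_RUNNING,
        timeout=600,
    )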
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index d9b5b44f6a5..9d9b2e313cc 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -2360,6 +2360,9 @@
 APPLICATION_ARGOCD = "applications.argoproj.io"
 PLACEMENT_KIND = "placements.cluster.open-cluster-management.io"

+# CNV
+VIRTUAL_MACHINE_INSTANCES = "vmi"
+
 # Stretch cluster
 STRETCH_CLUSTER_NAMESPACE = "sc-project"
diff --git a/ocs_ci/ocs/dr/dr_workload.py b/ocs_ci/ocs/dr/dr_workload.py
index c99a5546d29..c04ad488300 100644
--- a/ocs_ci/ocs/dr/dr_workload.py
+++ b/ocs_ci/ocs/dr/dr_workload.py
@@ -489,6 +489,223 @@ def delete_workload(self, force=False, rbd_name="rbd"):
             raise ResourceNotDeleted(err_msg)


+class CnvWorkload(DRWorkload):
+    """
+    Class handling everything related to CNV workloads; covers both Subscription and AppSet apps
+
+    """
+
+    def __init__(self, **kwargs):
+        workload_repo_url = config.ENV_DATA["dr_workload_repo_url"]
+        workload_repo_branch = config.ENV_DATA["dr_workload_repo_branch"]
+        super().__init__("cnv", workload_repo_url, workload_repo_branch)
+
+        self.workload_name = kwargs.get("workload_name")
+        self.vm_name = kwargs.get("vm_name")
+        self.workload_type = kwargs.get("workload_type")
+        self.workload_namespace = kwargs.get("workload_namespace", None)
+        self.workload_pod_count = kwargs.get("workload_pod_count")
+        self.workload_pvc_count = kwargs.get("workload_pvc_count")
+        self.dr_policy_name = kwargs.get(
+            "dr_policy_name", config.ENV_DATA.get("dr_policy_name")
+        ) or (dr_helpers.get_all_drpolicy()[0]["metadata"]["name"])
+        self.preferred_primary_cluster = config.ENV_DATA.get(
+            "preferred_primary_cluster"
+        ) or (get_primary_cluster_config().ENV_DATA["cluster_name"])
+        self.target_clone_dir = config.ENV_DATA.get(
+            "target_clone_dir", constants.DR_WORKLOAD_REPO_BASE_DIR
+        )
+        self.cnv_workload_dir = os.path.join(
+            self.target_clone_dir, kwargs.get("workload_dir")
+        )
+        self.cnv_workload_yaml_file = os.path.join(
+            self.cnv_workload_dir, self.workload_name + ".yaml"
+        )
+        self.drpc_yaml_file = os.path.join(constants.DRPC_PATH)
+        self.cnv_workload_placement_name = kwargs.get("workload_placement_name")
+        self.cnv_workload_pvc_selector = kwargs.get("workload_pvc_selector")
+
+    def deploy_workload(self):
+        """
+        Deployment specific to CNV workloads
+
+        """
+        self._deploy_prereqs()
+        self.workload_namespace = self._get_workload_namespace()
+
+        # Load DRPC
+        drpc_yaml_data = templating.load_yaml(self.drpc_yaml_file)
+        drpc_yaml_data["metadata"]["name"] = f"{self.cnv_workload_placement_name}-drpc"
+        drpc_yaml_data["spec"]["preferredCluster"] = self.preferred_primary_cluster
+        drpc_yaml_data["spec"]["drPolicyRef"]["name"] = self.dr_policy_name
+        drpc_yaml_data["spec"]["placementRef"][
+            "name"
+        ] = self.cnv_workload_placement_name
+        if self.workload_type == constants.SUBSCRIPTION:
+            drpc_yaml_data["metadata"]["namespace"] = self.workload_namespace
+            drpc_yaml_data["spec"]["placementRef"][
+                "namespace"
+            ] = self.workload_namespace
+        drpc_yaml_data["spec"]["pvcSelector"][
+            "matchLabels"
+        ] = self.cnv_workload_pvc_selector
+        drcp_data_yaml = tempfile.NamedTemporaryFile(
+            mode="w+", prefix="drpc", delete=False
+        )
+        templating.dump_data_to_temp_yaml(drpc_yaml_data, drcp_data_yaml.name)
+
+        cnv_workload_yaml_data_load = list(
+            templating.load_yaml(self.cnv_workload_yaml_file, multi_document=True)
+        )
+        log.info(cnv_workload_yaml_data_load)
+        for cnv_workload_yaml_data in cnv_workload_yaml_data_load:
+            # Update channel for Subscription apps
+            if self.workload_type == constants.SUBSCRIPTION:
+                if cnv_workload_yaml_data["kind"] == "Channel":
+                    cnv_workload_yaml_data["spec"]["pathname"] = self.workload_repo_url
+
+            if cnv_workload_yaml_data["kind"] == constants.PLACEMENT:
+                # Update preferred cluster name
+                cnv_workload_yaml_data["spec"]["predicates"][0][
+                    "requiredClusterSelector"
+                ]["labelSelector"]["matchExpressions"][0]["values"][
+                    0
+                ] = self.preferred_primary_cluster
+
+        templating.dump_data_to_temp_yaml(
+            cnv_workload_yaml_data_load, self.cnv_workload_yaml_file
+        )
+        config.switch_acm_ctx()
+        run_cmd(f"oc create -f {self.cnv_workload_yaml_file}")
+        self.add_annotation_to_placement()
+        run_cmd(f"oc create -f {drcp_data_yaml.name}")
+        self.verify_workload_deployment()
+
+    def _deploy_prereqs(self):
+        """
+        Perform prerequisites
+
+        """
+        # Clone workload repo
+        clone_repo(
+            url=self.workload_repo_url,
+            location=self.target_clone_dir,
+            branch=self.workload_repo_branch,
+        )
+
+    def add_annotation_to_placement(self):
+        """
+        Add annotation to AppSet and Subscription placements
+
+        """
+        config.switch_acm_ctx()
+        placement_obj = ocp.OCP(
+            kind=constants.PLACEMENT_KIND,
+            resource_name=self.cnv_workload_placement_name,
+            namespace=constants.GITOPS_CLUSTER_NAMESPACE
+            if self.workload_type == constants.APPLICATION_SET
+            else self.workload_namespace,
+        )
+        placement_obj.annotate(
+            annotation="cluster.open-cluster-management.io/experimental-scheduling-disable='true'"
+        )
+
+    def _get_workload_namespace(self):
+        """
+        Get the workload namespace
+
+        """
+        cnv_workload_data = list(
+            templating.load_yaml(self.cnv_workload_yaml_file, multi_document=True)
+        )
+
+        for _wl_data in cnv_workload_data:
+            if self.workload_type == constants.APPLICATION_SET:
+                if _wl_data["kind"] == constants.APPLICATION_SET:
+                    return _wl_data["spec"]["template"]["spec"]["destination"][
+                        "namespace"
+                    ]
+            else:
+                if _wl_data["kind"] == constants.SUBSCRIPTION:
+                    return _wl_data["metadata"]["namespace"]
+
+    def _get_workload_name(self):
+        """
+        Get the CNV workload name
+
+        """
+        cnv_workload_data = list(
+            templating.load_yaml(self.cnv_workload_yaml_file, multi_document=True)
+        )
+
+        for _wl_data in cnv_workload_data:
+            if self.workload_type == constants.APPLICATION_SET:
+                if _wl_data["kind"] == constants.APPLICATION_SET:
+                    return _wl_data["metadata"]["name"]
+            else:
+                if _wl_data["kind"] == constants.SUBSCRIPTION:
+                    return _wl_data["metadata"]["name"]
+
+    def verify_workload_deployment(self):
+        """
+        Verify CNV workload deployment
+
+        """
+        config.switch_to_cluster_by_name(self.preferred_primary_cluster)
+        dr_helpers.wait_for_all_resources_creation(
+            self.workload_pvc_count,
+            self.workload_pod_count,
+            self.workload_namespace,
+        )
+        dr_helpers.wait_for_cnv_workload(
+            vm_name=self.vm_name,
+            namespace=self.workload_namespace,
+            phase=constants.STATUS_RUNNING,
+        )
+
+    def delete_workload(self, force=False):
+        """
+        Deletes the CNV workload
+
+        Raises:
+            ResourceNotDeleted: In case the workload resources are not deleted properly
+
+        """
+        try:
+            config.switch_acm_ctx()
+            run_cmd(cmd=f"oc delete -f {self.cnv_workload_yaml_file}", timeout=900)
+
+            for cluster in get_non_acm_cluster_config():
+                config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
+                dr_helpers.wait_for_all_resources_deletion(
+                    namespace=self.workload_namespace,
+                    check_replication_resources_state=False,
+                )
+                log.info(f"Verify VM: {self.vm_name} is deleted")
+                vm_obj = ocp.OCP(
+                    kind=constants.VIRTUAL_MACHINE_INSTANCES,
+                    resource_name=self.vm_name,
+                    namespace=self.workload_namespace,
+                )
+                vm_obj.wait_for_delete(timeout=300)
+
+        except (
+            TimeoutExpired,
+            TimeoutExpiredError,
+            TimeoutError,
+            UnexpectedBehaviour,
+        ) as ex:
+            err_msg = f"Failed to delete the workload: {ex}"
+            raise ResourceNotDeleted(err_msg)
+
+
 def validate_data_integrity(namespace, path="/mnt/test/hashfile", timeout=600):
     """
     Verifies the md5sum values of files are OK
diff --git a/tests/conftest.py b/tests/conftest.py
index a40d5a2256b..6b178e7e690 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,7 +36,7 @@
     craft_s3_command,
     put_bucket_policy,
 )
-from ocs_ci.ocs.dr.dr_workload import BusyBox, BusyBox_AppSet
+from ocs_ci.ocs.dr.dr_workload import BusyBox, BusyBox_AppSet, CnvWorkload
 from ocs_ci.ocs.exceptions import (
     CommandFailed,
     TimeoutExpiredError,
@@ -6517,6 +6517,81 @@ def teardown():
     return factory


+@pytest.fixture()
+def cnv_dr_workload(request):
+    """
+    Deploys CNV based workload for DR setup
+
+    """
+    instances = []
+
+    def factory(
+        num_of_vm_subscription=1,
+        num_of_vm_appset=0,
+    ):
+        """
+        Args:
+            num_of_vm_subscription (int): Number of Subscription type workloads to be created
+            num_of_vm_appset (int): Number of ApplicationSet type workloads to be created
+
+        Raises:
+            ResourceNotDeleted: In case the workload resources are not deleted properly
+
+        Returns:
+            list: Objects of the workload class
+
+        """
+        total_pvc_count = 0
+
+        for index in range(num_of_vm_subscription):
+            workload_details = ocsci_config.ENV_DATA["dr_cnv_workload_sub"][index]
+            workload = CnvWorkload(
+                workload_type=constants.SUBSCRIPTION,
+                workload_dir=workload_details["workload_dir"],
+                vm_name=workload_details["vm_name"],
+                workload_name=workload_details["name"],
+                workload_pod_count=workload_details["pod_count"],
+                workload_pvc_count=workload_details["pvc_count"],
+                workload_placement_name=workload_details[
+                    "dr_workload_app_placement_name"
+                ],
+                workload_pvc_selector=workload_details["dr_workload_app_pvc_selector"],
+            )
+            instances.append(workload)
+            total_pvc_count += workload_details["pvc_count"]
+            workload.deploy_workload()
+
+        for index in range(num_of_vm_appset):
+            workload_details = ocsci_config.ENV_DATA["dr_cnv_workload_appset"][index]
+            workload = CnvWorkload(
+                workload_type=constants.APPLICATION_SET,
+                workload_dir=workload_details["workload_dir"],
+                vm_name=workload_details["vm_name"],
+                workload_name=workload_details["name"],
+                workload_pod_count=workload_details["pod_count"],
+                workload_pvc_count=workload_details["pvc_count"],
+                workload_placement_name=workload_details[
+                    "dr_workload_app_placement_name"
+                ],
+                workload_pvc_selector=workload_details["dr_workload_app_pvc_selector"],
+            )
+            instances.append(workload)
+            total_pvc_count += workload_details["pvc_count"]
+            workload.deploy_workload()
+
+        return instances
+
+    def teardown():
+        for instance in instances:
+            try:
+                instance.delete_workload(force=True)
+            except ResourceNotDeleted:
+                raise ResourceNotDeleted("Workload deletion was unsuccessful")
+
+    request.addfinalizer(teardown)
+    return factory
+
+
 @pytest.fixture(scope="class")
 def lvm_storageclass_factory_class(request, storageclass_factory_class):
     return lvm_storageclass_factory_fixture(request, storageclass_factory_class)
diff --git a/tests/functional/disaster-recovery/metro-dr/test_cnv_app_failover_relocate.py b/tests/functional/disaster-recovery/metro-dr/test_cnv_app_failover_relocate.py
new file mode 100644
index 00000000000..aec756a1de0
--- /dev/null
+++ b/tests/functional/disaster-recovery/metro-dr/test_cnv_app_failover_relocate.py
@@ -0,0 +1,210 @@
+import logging
+import pytest
+import time
+
+
+from ocs_ci.framework.pytest_customization.marks import tier2
+from ocs_ci.framework import config
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.node import wait_for_nodes_status, get_node_objs
+from ocs_ci.helpers.dr_helpers import (
+    enable_fence,
+    enable_unfence,
+    get_fence_state,
+    failover,
+    relocate,
+    set_current_primary_cluster_context,
+    set_current_secondary_cluster_context,
+    get_current_primary_cluster_name,
+    get_current_secondary_cluster_name,
+    wait_for_all_resources_creation,
+    wait_for_all_resources_deletion,
+    gracefully_reboot_ocp_nodes,
+    wait_for_cnv_workload,
+)
+
+from ocs_ci.framework.pytest_customization.marks import turquoise_squad
+
+logger = logging.getLogger(__name__)
+
+polarion_id_cnv_primary_up = "OCS-5413"
+polarion_id_cnv_primary_down = "OCS-5414"
+
+
+@tier2
+@turquoise_squad
+class TestCnvApplicationMDR:
+    """
+    Includes tests related to CNV workloads on MDR environment.
+    """
+
+    @pytest.fixture(autouse=True)
+    def teardown(self, request, cnv_dr_workload):
+        """
+        Teardown function: If fenced, un-fence the cluster and reboot nodes
+        """
+
+        def finalizer():
+            if (
+                self.primary_cluster_name
+                and get_fence_state(self.primary_cluster_name) == "Fenced"
+            ):
+                enable_unfence(self.primary_cluster_name)
+                gracefully_reboot_ocp_nodes(
+                    self.wl_namespace, self.primary_cluster_name
+                )
+
+        request.addfinalizer(finalizer)
+
+    @pytest.mark.parametrize(
+        argnames=["primary_cluster_down"],
+        argvalues=[
+            pytest.param(
+                False,
+                marks=pytest.mark.polarion_id(polarion_id_cnv_primary_up),
+                id="primary_up",
+            ),
+            pytest.param(
+                True,
+                marks=pytest.mark.polarion_id(polarion_id_cnv_primary_down),
+                id="primary_down",
+            ),
+        ],
+    )
+    def test_cnv_app_failover_relocate(
+        self,
+        primary_cluster_down,
+        nodes_multicluster,
+        cnv_dr_workload,
+        node_restart_teardown,
+    ):
+        """
+        Tests to verify CNV-based Subscription and AppSet application deployment and
+        failover/relocate between managed clusters.
+
+        """
+        # Create CNV applications (AppSet + Subscription)
+        cnv_workloads = cnv_dr_workload(num_of_vm_subscription=1, num_of_vm_appset=1)
+        self.wl_namespace = cnv_workloads[0].workload_namespace
+
+        set_current_primary_cluster_context(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+        primary_cluster_index = config.cur_index
+
+        self.primary_cluster_name = get_current_primary_cluster_name(
+            namespace=self.wl_namespace, workload_type=cnv_workloads[0].workload_type
+        )
+
+        # Shut down primary cluster nodes
+        node_objs = get_node_objs()
+        if primary_cluster_down:
+            logger.info("Stopping primary cluster nodes")
+            nodes_multicluster[primary_cluster_index].stop_nodes(node_objs)
+
+        # Fence the primary managed cluster
+        enable_fence(drcluster_name=self.primary_cluster_name)
+
+        secondary_cluster_name = get_current_secondary_cluster_name(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+
+        # TODO: Write a file or any IO inside VM
+
+        # Fail over the apps to the secondary managed cluster
+        for cnv_wl in cnv_workloads:
+            failover(
+                failover_cluster=secondary_cluster_name,
+                namespace=cnv_wl.workload_namespace,
+                workload_type=cnv_wl.workload_type,
+                workload_placement_name=cnv_wl.cnv_workload_placement_name
+                if cnv_wl.workload_type != constants.SUBSCRIPTION
+                else None,
+            )
+
+        # Verify VM and its resources on the failover cluster (new primary)
+        set_current_primary_cluster_context(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+        for cnv_wl in cnv_workloads:
+            wait_for_all_resources_creation(
+                cnv_wl.workload_pvc_count,
+                cnv_wl.workload_pod_count,
+                cnv_wl.workload_namespace,
+            )
+            wait_for_cnv_workload(
+                vm_name=cnv_wl.vm_name,
+                namespace=cnv_wl.workload_namespace,
+                phase=constants.STATUS_RUNNING,
+            )
+
+        # Start nodes if the cluster is down
+        wait_time = 120
+        if primary_cluster_down:
+            logger.info(
+                f"Waiting for {wait_time} seconds before starting nodes of previous primary cluster"
+            )
+            time.sleep(wait_time)
+            nodes_multicluster[primary_cluster_index].start_nodes(node_objs)
+            logger.info(
+                f"Waiting for {wait_time} seconds after starting nodes of previous primary cluster"
+            )
+            time.sleep(wait_time)
+            wait_for_nodes_status([node.name for node in node_objs])
+
+        # Verify applications are deleted from the old primary managed cluster
+        set_current_secondary_cluster_context(
+            cnv_workloads[0].workload_namespace, cnv_workloads[0].workload_type
+        )
+        for cnv_wl in cnv_workloads:
+            wait_for_all_resources_deletion(cnv_wl.workload_namespace)
+
+        # TODO: Validate Data integrity
+
+        # Un-fence the primary managed cluster
+        enable_unfence(drcluster_name=self.primary_cluster_name)
+
+        # Reboot the nodes after unfencing
+        gracefully_reboot_ocp_nodes(
+            self.wl_namespace, self.primary_cluster_name, cnv_workloads[0].workload_type
+        )
+
+        secondary_cluster_name = get_current_secondary_cluster_name(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+
+        # Relocate CNV apps back to the preferred primary managed cluster
+        for cnv_wl in cnv_workloads:
+            relocate(
+                preferred_cluster=secondary_cluster_name,
+                namespace=cnv_wl.workload_namespace,
+                workload_type=cnv_wl.workload_type,
+                workload_placement_name=cnv_wl.cnv_workload_placement_name
+                if cnv_wl.workload_type != constants.SUBSCRIPTION
+                else None,
+            )
+
+        set_current_secondary_cluster_context(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+        # Verify resource deletion from the previous primary (current secondary) cluster
+        for cnv_wl in cnv_workloads:
+            wait_for_all_resources_deletion(cnv_wl.workload_namespace)
+
+        # Verify resource creation and VM status on the relocated cluster
+        set_current_primary_cluster_context(
+            self.wl_namespace, cnv_workloads[0].workload_type
+        )
+        for cnv_wl in cnv_workloads:
+            wait_for_all_resources_creation(
+                cnv_wl.workload_pvc_count,
+                cnv_wl.workload_pod_count,
+                cnv_wl.workload_namespace,
+            )
+            wait_for_cnv_workload(
+                vm_name=cnv_wl.vm_name,
+                namespace=cnv_wl.workload_namespace,
+                phase=constants.STATUS_RUNNING,
+            )
+
+        # TODO: Validate Data integrity
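A minimal sketch of how the pieces added in this patch fit together (editor's illustration, not part of the patch): a hypothetical test that requests the cnv_dr_workload fixture and then re-checks the VMs with wait_for_cnv_workload; the real failover/relocate coverage lives in TestCnvApplicationMDR above.

    from ocs_ci.ocs import constants
    from ocs_ci.helpers.dr_helpers import wait_for_cnv_workload


    def test_cnv_dr_workload_deployment(cnv_dr_workload):
        # Deploy one Subscription-based and one ApplicationSet-based VM workload,
        # as defined under dr_cnv_workload_sub / dr_cnv_workload_appset in
        # conf/ocsci/mdr_workload.yaml; deployment is verified inside
        # CnvWorkload.deploy_workload().
        cnv_workloads = cnv_dr_workload(num_of_vm_subscription=1, num_of_vm_appset=1)

        # Re-assert that every deployed VM is in the Running phase
        for cnv_wl in cnv_workloads:
            wait_for_cnv_workload(
                vm_name=cnv_wl.vm_name,
                namespace=cnv_wl.workload_namespace,
                phase=constants.STATUS_RUNNING,
            )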