From 670d9b787125c6528e42f97ab063613646977cda Mon Sep 17 00:00:00 2001 From: Oded Viner <61982127+OdedViner@users.noreply.github.com> Date: Mon, 13 May 2024 18:05:58 +0300 Subject: [PATCH] Test CLI tool for disk replacement procedure (#9655) * Test CLI tool for disk replacement procedure Signed-off-by: oviner --- ocs_ci/helpers/helpers.py | 2 +- ocs_ci/ocs/osd_operations.py | 176 ++++++++++-------- ocs_ci/utility/utils.py | 17 +- .../z_cluster/nodes/test_disk_failures.py | 20 ++ 4 files changed, 130 insertions(+), 85 deletions(-) diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py index 6e918fabc49..ea08a2602b6 100644 --- a/ocs_ci/helpers/helpers.py +++ b/ocs_ci/helpers/helpers.py @@ -4586,7 +4586,7 @@ def retrieve_cli_binary(cli_type="mcg"): if cli_type == "mcg": local_cli_path = constants.NOOBAA_OPERATOR_LOCAL_CLI_PATH elif cli_type == "odf": - local_cli_path = constants.CLI_TOOL_LOCAL_PATH + local_cli_path = os.path.join(config.RUN["bin_dir"], "odf-cli") local_cli_dir = os.path.dirname(local_cli_path) live_deployment = config.DEPLOYMENT["live_deployment"] if live_deployment and semantic_version >= version.VERSION_4_13: diff --git a/ocs_ci/ocs/osd_operations.py b/ocs_ci/ocs/osd_operations.py index 8c351ec25a5..e6c0962b1c5 100644 --- a/ocs_ci/ocs/osd_operations.py +++ b/ocs_ci/ocs/osd_operations.py @@ -19,16 +19,20 @@ delete_osd_removal_job, ) from ocs_ci.helpers.sanity_helpers import Sanity +from ocs_ci.helpers.helpers import retrieve_cli_binary +from ocs_ci.utility.utils import run_cmd_interactive logger = logging.getLogger(__name__) -def osd_device_replacement(nodes): +def osd_device_replacement(nodes, cli_tool=False): """ Replacing randomly picked osd device Args: - node (OCS): The OCS object representing the node + nodes (OCS): The OCS object representing the node + cli_tool (bool): use the odf CLI tool to replace the disk if True, otherwise use "oc" commands + """ logger.info("Picking a PV which to be deleted from the platform side") 
osd_pvs = get_deviceset_pvs() @@ -101,91 +105,105 @@ def osd_device_replacement(nodes): == claim_name ][0] osd_deployment_name = osd_deployment.name + osd_pod_name = osd_pod.name # Delete the volume from the platform side logger.info(f"Deleting {volume_path} from the platform side") nodes.detach_volume(volume_path, osd_node) - # Scale down OSD deployment - logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0") - ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"]) - ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}") - - # Force delete OSD pod if necessary - osd_pod_name = osd_pod.name - logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted") - try: - osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name) - except TimeoutError: - osd_pod.delete(force=True) - osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name) - - # Run ocs-osd-removal job - osd_removal_job = run_osd_removal_job([osd_id]) - assert osd_removal_job, "ocs-osd-removal failed to create" - is_completed = verify_osd_removal_job_completed_successfully(osd_id) - assert is_completed, "ocs-osd-removal-job is not in status 'completed'" - logger.info("ocs-osd-removal-job completed successfully") - - osd_pvc_name = osd_pvc.name - - if ocp_version < version.VERSION_4_6: - # Delete the OSD prepare job - logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}") - osd_prepare_job.delete() - osd_prepare_job.ocp.wait_for_delete( - resource_name=osd_prepare_job_name, timeout=120 - ) - - # Delete the OSD PVC - logger.info(f"Deleting OSD PVC {osd_pvc_name}") - osd_pvc.delete() - osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name) - - # Delete the OSD deployment - logger.info(f"Deleting OSD deployment {osd_deployment_name}") - osd_deployment.delete() - osd_deployment.ocp.wait_for_delete( - resource_name=osd_deployment_name, timeout=120 + if cli_tool: + retrieve_cli_binary(cli_type="odf") + run_cmd_interactive( + cmd=f"odf-cli purge-osd {osd_id}", 
+ prompts_answers={ + "yes-force-destroy-osd": "yes-force-destroy-osd", + "completed removal of OSD": "", + }, + string_answer=True, + raise_exception=False, ) else: - # If ocp version is '4.6' and above the osd removal job should - # delete the OSD prepare job, OSD PVC, OSD deployment - # We just need to verify the old PV is in the expected status - logger.info(f"Verify that the old PV '{osd_pv_name}' is in the expected status") - if cluster.is_lso_cluster(): - expected_old_pv_statuses = [constants.STATUS_RELEASED] + # Scale down OSD deployment + logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0") + ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"]) + ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}") + + # Force delete OSD pod if necessary + logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted") + try: + osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name) + except TimeoutError: + osd_pod.delete(force=True) + osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name) + + # Run ocs-osd-removal job + osd_removal_job = run_osd_removal_job([osd_id]) + assert osd_removal_job, "ocs-osd-removal failed to create" + is_completed = verify_osd_removal_job_completed_successfully(osd_id) + assert is_completed, "ocs-osd-removal-job is not in status 'completed'" + logger.info("ocs-osd-removal-job completed successfully") + + osd_pvc_name = osd_pvc.name + + if ocp_version < version.VERSION_4_6: + # Delete the OSD prepare job + logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}") + osd_prepare_job.delete() + osd_prepare_job.ocp.wait_for_delete( + resource_name=osd_prepare_job_name, timeout=120 + ) + + # Delete the OSD PVC + logger.info(f"Deleting OSD PVC {osd_pvc_name}") + osd_pvc.delete() + osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name) + + # Delete the OSD deployment + logger.info(f"Deleting OSD deployment {osd_deployment_name}") + osd_deployment.delete() + 
osd_deployment.ocp.wait_for_delete( + resource_name=osd_deployment_name, timeout=120 + ) else: - expected_old_pv_statuses = [ - constants.STATUS_RELEASED, - constants.STATUS_FAILED, - ] - try: - if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses: - try: - logger.info(f"Verifying deletion of PV {osd_pv_name}") - osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name) - except TimeoutError: - osd_pv.delete() - osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name) - except Exception as e: - logger.error(f"Old PV does not exist {e}") - - # If we use LSO, we need to create and attach a new disk manually - if cluster.is_lso_cluster(): - node.add_disk_to_node(osd_node) - - if ocp_version < version.VERSION_4_6: - # Delete the rook ceph operator pod to trigger reconciliation - rook_operator_pod = get_operator_pods()[0] - logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}") - rook_operator_pod.delete() - - # Delete the OSD removal job - logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}") - is_deleted = delete_osd_removal_job(osd_id) - assert is_deleted, "Failed to delete ocs-osd-removal-job" - logger.info("ocs-osd-removal-job deleted successfully") + # If ocp version is '4.6' and above the osd removal job should + # delete the OSD prepare job, OSD PVC, OSD deployment + # We just need to verify the old PV is in the expected status + logger.info( + f"Verify that the old PV '{osd_pv_name}' is in the expected status" + ) + if cluster.is_lso_cluster(): + expected_old_pv_statuses = [constants.STATUS_RELEASED] + else: + expected_old_pv_statuses = [ + constants.STATUS_RELEASED, + constants.STATUS_FAILED, + ] + try: + if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses: + try: + logger.info(f"Verifying deletion of PV {osd_pv_name}") + osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name) + except TimeoutError: + osd_pv.delete() + osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name) + except Exception as 
e: + logger.error(f"Old PV does not exist {e}") + + # If we use LSO, we need to create and attach a new disk manually + if cluster.is_lso_cluster(): + node.add_disk_to_node(osd_node) + + if ocp_version < version.VERSION_4_6: + # Delete the rook ceph operator pod to trigger reconciliation + rook_operator_pod = get_operator_pods()[0] + logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}") + rook_operator_pod.delete() + + # Delete the OSD removal job + logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}") + is_deleted = delete_osd_removal_job(osd_id) + assert is_deleted, "Failed to delete ocs-osd-removal-job" + logger.info("ocs-osd-removal-job deleted successfully") timeout = 600 # Wait for OSD PVC to get created and reach Bound state diff --git a/ocs_ci/utility/utils.py b/ocs_ci/utility/utils.py index 238d560d0de..5de45f872ea 100644 --- a/ocs_ci/utility/utils.py +++ b/ocs_ci/utility/utils.py @@ -497,7 +497,9 @@ def run_cmd( return mask_secrets(completed_process.stdout.decode(), secrets) -def run_cmd_interactive(cmd, prompts_answers, timeout=300): +def run_cmd_interactive( + cmd, prompts_answers, timeout=300, string_answer=False, raise_exception=True +): """ Handle interactive prompts with answers during subctl command @@ -505,7 +507,8 @@ def run_cmd_interactive(cmd, prompts_answers, timeout=300): cmd(str): Command to be executed prompts_answers(dict): Prompts as keys and answers as values timeout(int): Timeout in seconds, for pexpect to wait for prompt - + string_answer (bool): if True, send the answer as-is without appending the ENTER key + raise_exception (bool): raise an exception on an unexpected prompt Raises: InteractivePromptException: in case something goes wrong @@ -513,9 +516,13 @@ def run_cmd_interactive(cmd, prompts_answers, timeout=300): child = pexpect.spawn(cmd) for prompt, answer in prompts_answers.items(): if child.expect(prompt, timeout=timeout): - raise InteractivePromptException("Unexpected Prompt") - - if not child.sendline("".join([answer, constants.ENTER_KEY])): + if raise_exception: + 
raise InteractivePromptException("Unexpected Prompt") + if string_answer: + send_line = answer + else: + send_line = "".join([answer, constants.ENTER_KEY]) + if not child.sendline(send_line): raise InteractivePromptException("Failed to provide answer to the prompt") diff --git a/tests/functional/z_cluster/nodes/test_disk_failures.py b/tests/functional/z_cluster/nodes/test_disk_failures.py index b2823b86dc3..1d980674cec 100644 --- a/tests/functional/z_cluster/nodes/test_disk_failures.py +++ b/tests/functional/z_cluster/nodes/test_disk_failures.py @@ -15,6 +15,7 @@ skipif_external_mode, skipif_managed_service, skipif_hci_provider_and_client, + skipif_ocs_version, ) from ocs_ci.helpers.sanity_helpers import Sanity from ocs_ci.helpers.helpers import ( @@ -236,3 +237,22 @@ def test_recovery_from_volume_deletion( self.sanity_helpers.create_resources( pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory ) + + @bugzilla("2234479") + @vsphere_platform_required + @skipif_ocs_version("<4.15") + @pytest.mark.polarion_id("OCS-5502") + @skipif_external_mode + def test_recovery_from_volume_deletion_cli_tool( + self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory + ): + """ + Test cluster recovery from disk deletion from the platform side. + Based on documented procedure detailed in + https://bugzilla.redhat.com/show_bug.cgi?id=1823183 + + """ + osd_operations.osd_device_replacement(nodes, cli_tool=True) + self.sanity_helpers.create_resources( + pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory + )