diff --git a/ocs_ci/utility/aws.py b/ocs_ci/utility/aws.py
index 63bc44e38ba..9d0b1e200a1 100644
--- a/ocs_ci/utility/aws.py
+++ b/ocs_ci/utility/aws.py
@@ -1822,13 +1822,7 @@ def get_data_volumes(deviceset_pvs):
     aws = AWS()
     volume_ids = [
-        "vol-"
-        + pv.get()
-        .get("spec")
-        .get("awsElasticBlockStore")
-        .get("volumeID")
-        .partition("vol-")[-1]
-        for pv in deviceset_pvs
+        pv.get().get("spec").get("csi").get("volumeHandle") for pv in deviceset_pvs
     ]
     return [aws.ec2_resource.Volume(vol_id) for vol_id in volume_ids]
@@ -2081,7 +2075,7 @@ def create_and_attach_ebs_volumes(
             instance_id=worker["id"],
             name=f"{worker['name']}_extra_volume_{number}",
             size=size,
-            device=f"/dev/{device_names[number-1]}",
+            device=f"/dev/{device_names[number - 1]}",
         )
diff --git a/tests/manage/z_cluster/nodes/test_disk_failures.py b/tests/manage/z_cluster/nodes/test_disk_failures.py
index e938a284ad5..7670fc7106a 100644
--- a/tests/manage/z_cluster/nodes/test_disk_failures.py
+++ b/tests/manage/z_cluster/nodes/test_disk_failures.py
@@ -18,12 +18,14 @@
 from ocs_ci.helpers.helpers import (
     wait_for_ct_pod_recovery,
     clear_crash_warning_and_osd_removal_leftovers,
+    run_cmd_verify_cli_output,
 )
 from ocs_ci.ocs.resources.pod import (
     get_osd_pods,
     get_pod_node,
     delete_pods,
     get_pod_objs,
+    wait_for_pods_to_be_running,
 )
 from ocs_ci.utility.aws import AWSTimeoutException
 from ocs_ci.ocs.resources.storage_cluster import osd_encryption_verification
@@ -59,7 +61,7 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
         except AWSTimeoutException as e:
             if "Volume state: in-use" in e:
                 logger.info(
-                    f"Volume {data_volume} re-attached successfully to worker"
+                    f"Volume {data_volume} did not detach and is still attached to worker"
                     f" node {worker_node}"
                 )
             else:
@@ -69,10 +71,17 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
             Wait for worker volume to be re-attached automatically to the node

             """
-            assert nodes.wait_for_volume_attach(data_volume), (
-                f"Volume {data_volume} failed to be re-attached to worker "
-                f"node {worker_node}"
-            )
+            logger.info(f"Volume {data_volume} was detached successfully")
+            if config.ENV_DATA.get("platform", "").lower() == constants.AWS_PLATFORM:
+                logger.info(
+                    f"For {constants.AWS_PLATFORM} platform, attaching volume manually"
+                )
+                nodes.attach_volume(volume=data_volume, node=worker_node)
+            else:
+                assert nodes.wait_for_volume_attach(data_volume), (
+                    f"Volume {data_volume} failed to be re-attached to worker "
+                    f"node {worker_node}"
+                )

     @pytest.fixture(autouse=True)
     def teardown(self, request, nodes):
@@ -173,6 +182,27 @@ def test_detach_attach_worker_volume(
         # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
         # becomes healthy eventually
         # TODO: Remove 'tries=100'
+
+        logger.info("Wait for all the pods in openshift-storage to be in running state")
+        assert wait_for_pods_to_be_running(
+            timeout=720
+        ), "Not all the pods reached running state"
+
+        logger.info("Archive any OSD crash caused by the volume detach and attach")
+        is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
+            cmd="ceph health detail",
+            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
+            cephtool_cmd=True,
+        )
+        if is_daemon_recently_crash_warnings:
+            logger.info("Clear all ceph crash warnings")
+            # Local import to avoid the name being shadowed by a loop variable
+            from ocs_ci.ocs.resources import pod
+
+            ct_pod = pod.get_ceph_tools_pod()
+            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
+        else:
logger.info("There are no daemon crash warnings") self.sanity_helpers.health_check(tries=100) @skipif_managed_service @@ -208,6 +238,27 @@ def test_detach_attach_2_data_volumes( [worker_and_volume["worker"] for worker_and_volume in workers_and_volumes] ) + logger.info("Wait for all the pods in openshift-storage to be in running state") + assert wait_for_pods_to_be_running( + timeout=720 + ), "Not all the pods reached running state" + + logger.info("Archive OSD crash if occurred due to detach and attach of volume") + is_daemon_recently_crash_warnings = run_cmd_verify_cli_output( + cmd="ceph health detail", + expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"}, + cephtool_cmd=True, + ) + if is_daemon_recently_crash_warnings: + logger.info("Clear all ceph crash warnings") + # Importing here to avoid shadow by loop variable + from ocs_ci.ocs.resources import pod + + ct_pod = pod.get_ceph_tools_pod() + ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all") + else: + logger.info("There are no daemon crash warnings") + # Validate cluster is still functional self.sanity_helpers.health_check() self.sanity_helpers.create_resources(