PR9671 backport (#9907)
Signed-off-by: am-agrawa <[email protected]>
am-agrawa authored Jun 19, 2024
1 parent b090e9d commit 8fc469a
Showing 2 changed files with 58 additions and 13 deletions.
10 changes: 2 additions & 8 deletions ocs_ci/utility/aws.py
@@ -1822,13 +1822,7 @@ def get_data_volumes(deviceset_pvs):
     aws = AWS()

     volume_ids = [
-        "vol-"
-        + pv.get()
-        .get("spec")
-        .get("awsElasticBlockStore")
-        .get("volumeID")
-        .partition("vol-")[-1]
-        for pv in deviceset_pvs
+        pv.get().get("spec").get("csi").get("volumeHandle") for pv in deviceset_pvs
     ]
     return [aws.ec2_resource.Volume(vol_id) for vol_id in volume_ids]
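
The replacement reads spec.csi.volumeHandle, which for the AWS EBS CSI driver already carries the full EBS volume ID, so the old "vol-" prefixing and partition step becomes unnecessary. A minimal sketch of the assumed PV shape (driver name and volume ID below are illustrative, not taken from this commit):

    # Assumed layout of a CSI-provisioned PV on AWS EBS (illustrative values only)
    pv_body = {
        "spec": {
            "csi": {
                "driver": "ebs.csi.aws.com",
                "volumeHandle": "vol-0123456789abcdef0",  # already a complete EBS volume ID
            }
        }
    }
    # The handle can be used directly as the EC2 volume ID, with no re-prefixing
    volume_id = pv_body["spec"]["csi"]["volumeHandle"]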

@@ -2081,7 +2075,7 @@ def create_and_attach_ebs_volumes(
                 instance_id=worker["id"],
                 name=f"{worker['name']}_extra_volume_{number}",
                 size=size,
-                device=f"/dev/{device_names[number-1]}",
+                device=f"/dev/{device_names[number - 1]}",
             )


61 changes: 56 additions & 5 deletions tests/manage/z_cluster/nodes/test_disk_failures.py
@@ -18,12 +18,14 @@
 from ocs_ci.helpers.helpers import (
     wait_for_ct_pod_recovery,
     clear_crash_warning_and_osd_removal_leftovers,
+    run_cmd_verify_cli_output,
 )
 from ocs_ci.ocs.resources.pod import (
     get_osd_pods,
     get_pod_node,
     delete_pods,
     get_pod_objs,
+    wait_for_pods_to_be_running,
 )
 from ocs_ci.utility.aws import AWSTimeoutException
 from ocs_ci.ocs.resources.storage_cluster import osd_encryption_verification
@@ -59,7 +61,7 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
         except AWSTimeoutException as e:
             if "Volume state: in-use" in e:
                 logger.info(
-                    f"Volume {data_volume} re-attached successfully to worker"
+                    f"Volume {data_volume} detach did not complete; it is still attached to worker"
                     f" node {worker_node}"
                 )
             else:
@@ -69,10 +71,17 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
         Wait for worker volume to be re-attached automatically
         to the node
         """
-        assert nodes.wait_for_volume_attach(data_volume), (
-            f"Volume {data_volume} failed to be re-attached to worker "
-            f"node {worker_node}"
-        )
+        logger.info(f"Volume {data_volume} detached successfully")
+        if config.ENV_DATA.get("platform", "").lower() == constants.AWS_PLATFORM:
+            logger.info(
+                f"For {constants.AWS_PLATFORM} platform, attaching volume manually"
+            )
+            nodes.attach_volume(volume=data_volume, node=worker_node)
+        else:
+            assert nodes.wait_for_volume_attach(data_volume), (
+                f"Volume {data_volume} failed to be re-attached to worker "
+                f"node {worker_node}"
+            )

     @pytest.fixture(autouse=True)
     def teardown(self, request, nodes):
@@ -173,6 +182,27 @@ def test_detach_attach_worker_volume(
         # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
         # becomes healthy eventually
         # TODO: Remove 'tries=100'
+
+        logger.info("Wait for all the pods in openshift-storage to be in running state")
+        assert wait_for_pods_to_be_running(
+            timeout=720
+        ), "Not all the pods reached running state"
+
+        logger.info("Archive OSD crash if occurred due to detach and attach of volume")
+        is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
+            cmd="ceph health detail",
+            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
+            cephtool_cmd=True,
+        )
+        if is_daemon_recently_crash_warnings:
+            logger.info("Clear all ceph crash warnings")
+            # Importing here to avoid shadow by loop variable
+            from ocs_ci.ocs.resources import pod
+
+            ct_pod = pod.get_ceph_tools_pod()
+            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
+        else:
+            logger.info("There are no daemon crash warnings")
         self.sanity_helpers.health_check(tries=100)

     @skipif_managed_service
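
The crash-check-and-archive sequence added above is repeated verbatim in test_detach_attach_2_data_volumes below. A minimal sketch of how the same calls could be wrapped in a shared helper (the helper name archive_recent_ceph_crashes is hypothetical and not part of this commit; it reuses only the calls shown in the diff):

    import logging

    from ocs_ci.helpers.helpers import run_cmd_verify_cli_output
    from ocs_ci.ocs.resources import pod

    logger = logging.getLogger(__name__)


    def archive_recent_ceph_crashes():
        """Archive ceph daemon crash reports if 'daemons have recently crashed' is reported."""
        has_crash_warning = run_cmd_verify_cli_output(
            cmd="ceph health detail",
            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
            cephtool_cmd=True,
        )
        if has_crash_warning:
            logger.info("Clear all ceph crash warnings")
            ct_pod = pod.get_ceph_tools_pod()
            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
        else:
            logger.info("There are no daemon crash warnings")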
@@ -208,6 +238,27 @@ def test_detach_attach_2_data_volumes(
             [worker_and_volume["worker"] for worker_and_volume in workers_and_volumes]
         )

+        logger.info("Wait for all the pods in openshift-storage to be in running state")
+        assert wait_for_pods_to_be_running(
+            timeout=720
+        ), "Not all the pods reached running state"
+
+        logger.info("Archive OSD crash if occurred due to detach and attach of volume")
+        is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
+            cmd="ceph health detail",
+            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
+            cephtool_cmd=True,
+        )
+        if is_daemon_recently_crash_warnings:
+            logger.info("Clear all ceph crash warnings")
+            # Importing here to avoid shadow by loop variable
+            from ocs_ci.ocs.resources import pod
+
+            ct_pod = pod.get_ceph_tools_pod()
+            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
+        else:
+            logger.info("There are no daemon crash warnings")
+
         # Validate cluster is still functional
         self.sanity_helpers.health_check()
         self.sanity_helpers.create_resources(
