PR9671 backport (#9907)
Signed-off-by: am-agrawa <[email protected]>
am-agrawa authored Jun 19, 2024
1 parent b090e9d commit 8fc469a
Showing 2 changed files with 58 additions and 13 deletions.
10 changes: 2 additions & 8 deletions ocs_ci/utility/aws.py
@@ -1822,13 +1822,7 @@ def get_data_volumes(deviceset_pvs):
     aws = AWS()

     volume_ids = [
-        "vol-"
-        + pv.get()
-        .get("spec")
-        .get("awsElasticBlockStore")
-        .get("volumeID")
-        .partition("vol-")[-1]
-        for pv in deviceset_pvs
+        pv.get().get("spec").get("csi").get("volumeHandle") for pv in deviceset_pvs
     ]
     return [aws.ec2_resource.Volume(vol_id) for vol_id in volume_ids]
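
The replacement reads spec.csi.volumeHandle, which for the AWS EBS CSI driver already carries the full EBS volume ID, so the old "vol-" prefixing and partition step becomes unnecessary. A minimal sketch of the assumed PV shape (driver name and volume ID below are illustrative, not taken from this commit):

    # Assumed layout of a CSI-provisioned PV on AWS EBS (illustrative values only)
    pv_body = {
        "spec": {
            "csi": {
                "driver": "ebs.csi.aws.com",
                "volumeHandle": "vol-0123456789abcdef0",  # already a complete EBS volume ID
            }
        }
    }
    # The handle can be used directly as the EC2 volume ID, with no re-prefixing
    volume_id = pv_body["spec"]["csi"]["volumeHandle"]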

@@ -2081,7 +2075,7 @@ def create_and_attach_ebs_volumes(
                 instance_id=worker["id"],
                 name=f"{worker['name']}_extra_volume_{number}",
                 size=size,
-                device=f"/dev/{device_names[number-1]}",
+                device=f"/dev/{device_names[number - 1]}",
             )


61 changes: 56 additions & 5 deletions tests/manage/z_cluster/nodes/test_disk_failures.py
@@ -18,12 +18,14 @@
 from ocs_ci.helpers.helpers import (
     wait_for_ct_pod_recovery,
     clear_crash_warning_and_osd_removal_leftovers,
+    run_cmd_verify_cli_output,
 )
 from ocs_ci.ocs.resources.pod import (
     get_osd_pods,
     get_pod_node,
     delete_pods,
     get_pod_objs,
+    wait_for_pods_to_be_running,
 )
 from ocs_ci.utility.aws import AWSTimeoutException
 from ocs_ci.ocs.resources.storage_cluster import osd_encryption_verification
@@ -59,7 +61,7 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
         except AWSTimeoutException as e:
             if "Volume state: in-use" in e:
                 logger.info(
-                    f"Volume {data_volume} re-attached successfully to worker"
+                    f"Volume {data_volume} detach did not complete; it is still attached to worker"
                     f" node {worker_node}"
                 )
             else:
@@ -69,10 +71,17 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
         Wait for worker volume to be re-attached automatically
         to the node
         """
-        assert nodes.wait_for_volume_attach(data_volume), (
-            f"Volume {data_volume} failed to be re-attached to worker "
-            f"node {worker_node}"
-        )
+        logger.info(f"Volume {data_volume} detached successfully")
+        if config.ENV_DATA.get("platform", "").lower() == constants.AWS_PLATFORM:
+            logger.info(
+                f"For {constants.AWS_PLATFORM} platform, attaching volume manually"
+            )
+            nodes.attach_volume(volume=data_volume, node=worker_node)
+        else:
+            assert nodes.wait_for_volume_attach(data_volume), (
+                f"Volume {data_volume} failed to be re-attached to worker "
+                f"node {worker_node}"
+            )

     @pytest.fixture(autouse=True)
     def teardown(self, request, nodes):
@@ -173,6 +182,27 @@ def test_detach_attach_worker_volume(
         # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
         # becomes healthy eventually
         # TODO: Remove 'tries=100'
+
+        logger.info("Wait for all the pods in openshift-storage to be in running state")
+        assert wait_for_pods_to_be_running(
+            timeout=720
+        ), "Not all the pods reached running state"
+
+        logger.info("Archive OSD crash if occurred due to detach and attach of volume")
+        is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
+            cmd="ceph health detail",
+            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
+            cephtool_cmd=True,
+        )
+        if is_daemon_recently_crash_warnings:
+            logger.info("Clear all ceph crash warnings")
+            # Importing here to avoid shadow by loop variable
+            from ocs_ci.ocs.resources import pod
+
+            ct_pod = pod.get_ceph_tools_pod()
+            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
+        else:
+            logger.info("There are no daemon crash warnings")
         self.sanity_helpers.health_check(tries=100)

     @skipif_managed_service
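
The crash-check-and-archive sequence added above is repeated verbatim in test_detach_attach_2_data_volumes below. A minimal sketch of how the same calls could be wrapped in a shared helper (the helper name archive_recent_ceph_crashes is hypothetical and not part of this commit; it reuses only the calls shown in the diff):

    import logging

    from ocs_ci.helpers.helpers import run_cmd_verify_cli_output
    from ocs_ci.ocs.resources import pod

    logger = logging.getLogger(__name__)


    def archive_recent_ceph_crashes():
        """Archive ceph daemon crash reports if 'daemons have recently crashed' is reported."""
        has_crash_warning = run_cmd_verify_cli_output(
            cmd="ceph health detail",
            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
            cephtool_cmd=True,
        )
        if has_crash_warning:
            logger.info("Clear all ceph crash warnings")
            ct_pod = pod.get_ceph_tools_pod()
            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
        else:
            logger.info("There are no daemon crash warnings")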
@@ -208,6 +238,27 @@ def test_detach_attach_2_data_volumes(
             [worker_and_volume["worker"] for worker_and_volume in workers_and_volumes]
         )

+        logger.info("Wait for all the pods in openshift-storage to be in running state")
+        assert wait_for_pods_to_be_running(
+            timeout=720
+        ), "Not all the pods reached running state"
+
+        logger.info("Archive OSD crash if occurred due to detach and attach of volume")
+        is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
+            cmd="ceph health detail",
+            expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
+            cephtool_cmd=True,
+        )
+        if is_daemon_recently_crash_warnings:
+            logger.info("Clear all ceph crash warnings")
+            # Importing here to avoid shadow by loop variable
+            from ocs_ci.ocs.resources import pod
+
+            ct_pod = pod.get_ceph_tools_pod()
+            ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
+        else:
+            logger.info("There are no daemon crash warnings")
+
         # Validate cluster is still functional
         self.sanity_helpers.health_check()
         self.sanity_helpers.create_resources(
