Skip to content

Commit

Permalink
[release-4.15] Fix fetching volumeHandle and manually attach Volumes …
Browse files Browse the repository at this point in the history
…on AWS post detach (#9825)


Signed-off-by: am-agrawa <[email protected]>
Co-authored-by: am-agrawa <[email protected]>
  • Loading branch information
openshift-cherrypick-robot and am-agrawa authored May 29, 2024
1 parent 8436b38 commit 5afd1d3
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 13 deletions.
1 change: 1 addition & 0 deletions ocs_ci/ocs/platform_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,7 @@ def get_volume_attachments(ebs_volume):
return ebs_volume.attachments

try:

for sample in TimeoutSampler(300, 3, get_volume_attachments, volume):
logger.info(f"EBS volume {volume.id} attachments are: {sample}")
if sample:
Expand Down
8 changes: 1 addition & 7 deletions ocs_ci/utility/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2225,13 +2225,7 @@ def get_data_volumes(deviceset_pvs):
aws = AWS()

volume_ids = [
"vol-"
+ pv.get()
.get("spec")
.get("awsElasticBlockStore")
.get("volumeID")
.partition("vol-")[-1]
for pv in deviceset_pvs
pv.get().get("spec").get("csi").get("volumeHandle") for pv in deviceset_pvs
]
return [aws.ec2_resource.Volume(vol_id) for vol_id in volume_ids]

Expand Down
64 changes: 58 additions & 6 deletions tests/functional/z_cluster/nodes/test_disk_failures.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging

import pytest

from ocs_ci.ocs import node, constants
Expand All @@ -20,12 +21,14 @@
from ocs_ci.helpers.helpers import (
wait_for_ct_pod_recovery,
clear_crash_warning_and_osd_removal_leftovers,
run_cmd_verify_cli_output,
)
from ocs_ci.ocs.resources.pod import (
get_osd_pods,
get_pod_node,
delete_pods,
get_pod_objs,
wait_for_pods_to_be_running,
)
from ocs_ci.utility.aws import AWSTimeoutException
from ocs_ci.ocs.resources.storage_cluster import osd_encryption_verification
Expand Down Expand Up @@ -62,20 +65,26 @@ def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
except AWSTimeoutException as e:
if "Volume state: in-use" in e:
logger.info(
f"Volume {data_volume} re-attached successfully to worker"
f"Volume {data_volume} is still attached to worker, detach did not complete"
f" node {worker_node}"
)
else:
raise
else:
"""
Wait for worker volume to be re-attached automatically
to the node
"""
assert nodes.wait_for_volume_attach(data_volume), (
f"Volume {data_volume} failed to be re-attached to worker "
f"node {worker_node}"
)
logger.info(f"Volume {data_volume} is deattached successfully")
if config.ENV_DATA.get("platform", "").lower() == constants.AWS_PLATFORM:
logger.info(
f"For {constants.AWS_PLATFORM} platform, attaching volume manually"
)
nodes.attach_volume(volume=data_volume, node=worker_node)
else:
assert nodes.wait_for_volume_attach(data_volume), (
f"Volume {data_volume} failed to be re-attached to worker "
f"node {worker_node}"
)

@pytest.fixture(autouse=True)
def teardown(self, request, nodes):
Expand Down Expand Up @@ -151,6 +160,7 @@ def test_detach_attach_worker_volume(
"""
# Get a data volume
data_volume = nodes.get_data_volumes()[0]

# Get the worker node according to the volume attachment
worker = nodes.get_node_by_attached_volume(data_volume)

Expand All @@ -177,6 +187,27 @@ def test_detach_attach_worker_volume(
# Workaround: while BZ 1825675 is being investigated, the timeout is increased to see
# whether the cluster eventually becomes healthy
# TODO: Remove 'tries=100'

logger.info("Wait for all the pods in openshift-storage to be in running state")
assert wait_for_pods_to_be_running(
timeout=720
), "Not all the pods reached running state"

logger.info("Archive OSD crash if occurred due to detach and attach of volume")
is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
cmd="ceph health detail",
expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
cephtool_cmd=True,
)
if is_daemon_recently_crash_warnings:
logger.info("Clear all ceph crash warnings")
# Import locally so the module name "pod" is not shadowed by a loop variable
from ocs_ci.ocs.resources import pod

ct_pod = pod.get_ceph_tools_pod()
ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
else:
logger.info("There are no daemon crash warnings")
self.sanity_helpers.health_check(tries=100)

@skipif_managed_service
Expand Down Expand Up @@ -213,6 +244,27 @@ def test_detach_attach_2_data_volumes(
[worker_and_volume["worker"] for worker_and_volume in workers_and_volumes]
)

logger.info("Wait for all the pods in openshift-storage to be in running state")
assert wait_for_pods_to_be_running(
timeout=720
), "Not all the pods reached running state"

logger.info("Archive OSD crash if occurred due to detach and attach of volume")
is_daemon_recently_crash_warnings = run_cmd_verify_cli_output(
cmd="ceph health detail",
expected_output_lst={"HEALTH_WARN", "daemons have recently crashed"},
cephtool_cmd=True,
)
if is_daemon_recently_crash_warnings:
logger.info("Clear all ceph crash warnings")
# Import locally so the module name "pod" is not shadowed by a loop variable
from ocs_ci.ocs.resources import pod

ct_pod = pod.get_ceph_tools_pod()
ct_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all")
else:
logger.info("There are no daemon crash warnings")

# Validate cluster is still functional
self.sanity_helpers.health_check()
self.sanity_helpers.create_resources(
Expand Down

0 comments on commit 5afd1d3

Please sign in to comment.