Skip to content

Commit

Permalink
remove ceph crashes after upgrade
Browse files Browse the repository at this point in the history
remove ceph crashes after upgrade due to bug
https://bugzilla.redhat.com/show_bug.cgi?id=2249844
and Ceph bug:
https://bugzilla.redhat.com/show_bug.cgi?id=2249814

Signed-off-by: vavuthu <[email protected]>
  • Loading branch information
vavuthu authored and petr-balogh committed Nov 16, 2023
1 parent f6a84a1 commit 5ab5714
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 3 deletions.
11 changes: 9 additions & 2 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,18 @@ def get_labels(self):
"""
return self.pod_data.get("metadata").get("labels")

def exec_ceph_cmd(self, ceph_cmd, format="json-pretty"):
def exec_ceph_cmd(
self, ceph_cmd, format="json-pretty", out_yaml_format=True, timeout=600
):
"""
Execute a Ceph command on the Ceph tools pod
Args:
ceph_cmd (str): The Ceph command to execute on the Ceph tools pod
format (str): The returning output format of the Ceph command
out_yaml_format (bool): whether to return yaml loaded python
object OR to return raw output
timeout (int): timeout for the exec_cmd_on_pod, defaults to 600 seconds
Returns:
dict: Ceph command output
Expand All @@ -335,7 +340,9 @@ def exec_ceph_cmd(self, ceph_cmd, format="json-pretty"):
ceph_cmd = ceph_cmd
if format:
ceph_cmd += f" --format {format}"
out = self.exec_cmd_on_pod(ceph_cmd)
out = self.exec_cmd_on_pod(
ceph_cmd, out_yaml_format=out_yaml_format, timeout=timeout
)

# For some commands, like "ceph fs ls", the returned output is a list
if isinstance(out, list):
Expand Down
23 changes: 22 additions & 1 deletion ocs_ci/ocs/resources/storage_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,14 @@
)
from ocs_ci.utility.retry import retry
from ocs_ci.utility.rgwutils import get_rgw_count
from ocs_ci.utility.utils import run_cmd, TimeoutSampler
from ocs_ci.utility.utils import (
remove_ceph_crashes,
run_ceph_health_cmd,
run_cmd,
TimeoutSampler,
)
from ocs_ci.utility.decorators import switch_to_orig_index_at_last
from time import sleep

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -616,6 +622,21 @@ def ocs_install_verification(
health_check_tries = 20
health_check_delay = 30
if post_upgrade_verification:
# remove ceph crashes after upgrade due to bug
# https://bugzilla.redhat.com/show_bug.cgi?id=2249844
# and Ceph bug:
# https://bugzilla.redhat.com/show_bug.cgi?id=2249814
log.info(
"Sleeping for 600 seconds to allow crash reports to report to ceph health"
)
sleep(600)
ceph_health = run_ceph_health_cmd(
namespace=config.ENV_DATA["cluster_namespace"]
)
if "daemons have recently crashed" in ceph_health:
# remove crashes on ceph
remove_ceph_crashes(ct_pod)

# In case of upgrade with FIO we have to wait longer time to see
# health OK. See discussion in BZ:
# https://bugzilla.redhat.com/show_bug.cgi?id=1817727
Expand Down
65 changes: 65 additions & 0 deletions ocs_ci/utility/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from ocs_ci.ocs import constants, defaults
from ocs_ci.ocs.exceptions import (
CephHealthException,
CephToolBoxNotFoundException,
ClientDownloadError,
CommandFailed,
TagNotFoundException,
Expand Down Expand Up @@ -2323,6 +2324,28 @@ def create_ceph_health_cmd(namespace):
return ceph_health_cmd


def run_ceph_health_cmd(namespace=None):
    """
    Run the ceph health command on the Ceph tools pod

    Args:
        namespace (str): Namespace used to look up the Ceph tools pod.
            When None (default), the tools pod lookup is called without a
            namespace argument.

    Raises:
        CommandFailed: In case looking up the Ceph tools pod raises
            AssertionError or CephToolBoxNotFoundException.

    Returns:
        str: The output of the ceph health command

    """
    # Import here to avoid circular loop
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod

    # Callers pass ``namespace=...``; forward it only when it was provided so
    # that the no-argument call keeps working exactly as before.
    # NOTE(review): assumes get_ceph_tools_pod accepts a ``namespace``
    # keyword - confirm against ocs_ci.ocs.resources.pod.
    lookup_kwargs = {} if namespace is None else {"namespace": namespace}
    try:
        ct_pod = get_ceph_tools_pod(**lookup_kwargs)
    except (AssertionError, CephToolBoxNotFoundException) as ex:
        # Keep the original exception as the cause for easier debugging
        raise CommandFailed(ex) from ex

    # No "--format" flag and no YAML loading: the raw command output is returned
    return ct_pod.exec_ceph_cmd(
        ceph_cmd="ceph health", format=None, out_yaml_format=False, timeout=120
    )


def get_rook_repo(branch="master", to_checkout=None):
"""
Clone and checkout the rook repository to specific branch/commit.
Expand Down Expand Up @@ -4378,3 +4401,45 @@ def filter_out_emojis(plaintext):
# Join the characters back together to form the filtered string
filtered_string = "".join(filtered_chars)
return filtered_string


def remove_ceph_crashes(toolbox_pod):
    """
    Archive all Ceph crashes and then remove each of them by its crash ID

    Args:
        toolbox_pod (obj): Ceph toolbox pod object

    """
    # The crash IDs are collected first, before the archive command runs
    crash_ids = get_ceph_crashes(toolbox_pod)
    archive_ceph_crashes(toolbox_pod)
    log.info(f"Removing all ceph crashes {crash_ids}")
    for crash_id in crash_ids:
        remove_cmd = f"ceph crash rm {crash_id}"
        toolbox_pod.exec_ceph_cmd(remove_cmd)


def get_ceph_crashes(toolbox_pod):
    """
    Collect the IDs of all Ceph crashes reported by "ceph crash ls"

    Args:
        toolbox_pod (obj): Ceph toolbox pod object

    Returns:
        list: List of Ceph crash IDs

    """
    crash_entries = toolbox_pod.exec_ceph_cmd("ceph crash ls")
    crash_ids = []
    for entry in crash_entries:
        # Each entry is indexed by its "crash_id" key
        crash_ids.append(entry["crash_id"])
    return crash_ids


def archive_ceph_crashes(toolbox_pod):
    """
    Run "ceph crash archive-all" through the given toolbox pod

    Args:
        toolbox_pod (obj): Ceph toolbox pod object

    """
    archive_cmd = "ceph crash archive-all"
    log.info("Archiving all ceph crashes")
    toolbox_pod.exec_ceph_cmd(archive_cmd)

0 comments on commit 5ab5714

Please sign in to comment.