From adc570fa6d02525b958b868c4e7ef04015663d80 Mon Sep 17 00:00:00 2001
From: vavuthu
Date: Wed, 6 Sep 2023 17:00:47 +0530
Subject: [PATCH] remove ceph crashes after upgrade

remove ceph crashes after upgrade due to bug
https://bugzilla.redhat.com/show_bug.cgi?id=2249844
and Ceph bug: https://bugzilla.redhat.com/show_bug.cgi?id=2249814

Signed-off-by: vavuthu
---
 ocs_ci/ocs/resources/pod.py             | 11 ++++-
 ocs_ci/ocs/resources/storage_cluster.py | 23 ++++++++-
 ocs_ci/utility/utils.py                 | 70 +++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/ocs_ci/ocs/resources/pod.py b/ocs_ci/ocs/resources/pod.py
index 6b2da29d3a73..9184d8098b83 100644
--- a/ocs_ci/ocs/resources/pod.py
+++ b/ocs_ci/ocs/resources/pod.py
@@ -316,13 +316,18 @@ def get_labels(self):
         """
         return self.pod_data.get("metadata").get("labels")
 
-    def exec_ceph_cmd(self, ceph_cmd, format="json-pretty"):
+    def exec_ceph_cmd(
+        self, ceph_cmd, format="json-pretty", out_yaml_format=True, timeout=600
+    ):
         """
         Execute a Ceph command on the Ceph tools pod
 
         Args:
             ceph_cmd (str): The Ceph command to execute on the Ceph tools pod
             format (str): The returning output format of the Ceph command
+            out_yaml_format (bool): whether to return yaml loaded python
+                object OR to return raw output
+            timeout (int): timeout for the exec_cmd_on_pod, defaults to 600 seconds
 
         Returns:
             dict: Ceph command output
@@ -335,7 +340,9 @@ def exec_ceph_cmd(self, ceph_cmd, format="json-pretty"):
         ceph_cmd = ceph_cmd
         if format:
             ceph_cmd += f" --format {format}"
-        out = self.exec_cmd_on_pod(ceph_cmd)
+        out = self.exec_cmd_on_pod(
+            ceph_cmd, out_yaml_format=out_yaml_format, timeout=timeout
+        )
 
         # For some commands, like "ceph fs ls", the returned output is a list
         if isinstance(out, list):
diff --git a/ocs_ci/ocs/resources/storage_cluster.py b/ocs_ci/ocs/resources/storage_cluster.py
index 118d7187ea8d..e708dd8e2f5f 100644
--- a/ocs_ci/ocs/resources/storage_cluster.py
+++ b/ocs_ci/ocs/resources/storage_cluster.py
@@ -72,8 +72,14 @@
 )
 from ocs_ci.utility.retry import retry
 from ocs_ci.utility.rgwutils import get_rgw_count
-from ocs_ci.utility.utils import run_cmd, TimeoutSampler
+from ocs_ci.utility.utils import (
+    remove_ceph_crashes,
+    run_ceph_health_cmd,
+    run_cmd,
+    TimeoutSampler,
+)
 from ocs_ci.utility.decorators import switch_to_orig_index_at_last
+from time import sleep
 
 log = logging.getLogger(__name__)
 
@@ -616,6 +622,21 @@ def ocs_install_verification(
     health_check_tries = 20
     health_check_delay = 30
     if post_upgrade_verification:
+        # remove ceph crashes after upgrade due to bug
+        # https://bugzilla.redhat.com/show_bug.cgi?id=2249844
+        # and Ceph bug:
+        # https://bugzilla.redhat.com/show_bug.cgi?id=2249814
+        log.info(
+            "Sleeping for 600 seconds to allow crash reports to report to ceph health"
+        )
+        sleep(600)
+        ceph_health = run_ceph_health_cmd(
+            namespace=config.ENV_DATA["cluster_namespace"]
+        )
+        if "daemons have recently crashed" in ceph_health:
+            # remove crashes on ceph
+            remove_ceph_crashes(ct_pod)
+
         # In case of upgrade with FIO we have to wait longer time to see
         # health OK. See discussion in BZ:
         # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
diff --git a/ocs_ci/utility/utils.py b/ocs_ci/utility/utils.py
index 5fc3ad69145c..ff0a43a5b82f 100644
--- a/ocs_ci/utility/utils.py
+++ b/ocs_ci/utility/utils.py
@@ -41,6 +41,7 @@
 from ocs_ci.ocs import constants, defaults
 from ocs_ci.ocs.exceptions import (
     CephHealthException,
+    CephToolBoxNotFoundException,
     ClientDownloadError,
     CommandFailed,
     TagNotFoundException,
@@ -2323,6 +2324,33 @@ def create_ceph_health_cmd(namespace):
     return ceph_health_cmd
 
 
+def run_ceph_health_cmd(namespace=None):
+    """
+    Run the ceph health command
+
+    Args:
+        namespace (str): Namespace forwarded to get_ceph_tools_pod when
+            looking up the Ceph tools pod
+
+    Raises:
+        CommandFailed: In case the rook-ceph-tools pod failed to reach the Ready state.
+
+    Returns:
+        str: The output of the ceph health command
+    """
+    # Import here to avoid circular loop
+    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod
+
+    try:
+        ct_pod = get_ceph_tools_pod(namespace=namespace)
+    except (AssertionError, CephToolBoxNotFoundException) as ex:
+        raise CommandFailed(ex) from ex
+
+    return ct_pod.exec_ceph_cmd(
+        ceph_cmd="ceph health", format=None, out_yaml_format=False, timeout=120
+    )
+
+
 def get_rook_repo(branch="master", to_checkout=None):
     """
     Clone and checkout the rook repository to specific branch/commit.
@@ -4378,3 +4406,45 @@ def filter_out_emojis(plaintext):
     # Join the characters back together to form the filtered string
     filtered_string = "".join(filtered_chars)
     return filtered_string
+
+
+def remove_ceph_crashes(toolbox_pod):
+    """
+    Deletes the Ceph crashes
+
+    Args:
+        toolbox_pod (obj): Ceph toolbox pod object
+
+    """
+    ceph_crash_ids = get_ceph_crashes(toolbox_pod)
+    archive_ceph_crashes(toolbox_pod)
+    log.info(f"Removing all ceph crashes {ceph_crash_ids}")
+    for each_ceph_crash in ceph_crash_ids:
+        toolbox_pod.exec_ceph_cmd(f"ceph crash rm {each_ceph_crash}")
+
+
+def get_ceph_crashes(toolbox_pod):
+    """
+    Gets all Ceph crashes
+
+    Args:
+        toolbox_pod (obj): Ceph toolbox pod object
+
+    Returns:
+        list: List of ceph crash ID's
+
+    """
+    ceph_crashes = toolbox_pod.exec_ceph_cmd("ceph crash ls")
+    return [each_crash["crash_id"] for each_crash in ceph_crashes]
+
+
+def archive_ceph_crashes(toolbox_pod):
+    """
+    Archive all Ceph crashes
+
+    Args:
+        toolbox_pod (obj): Ceph toolbox pod object
+
+    """
+    log.info("Archiving all ceph crashes")
+    toolbox_pod.exec_ceph_cmd("ceph crash archive-all")