From 5389318640f6b8b32266c7170c8238f24922222e Mon Sep 17 00:00:00 2001 From: Filip Balak Date: Mon, 15 Jan 2024 13:34:23 +0100 Subject: [PATCH] Provider client specific alerting tests (#9078) * add provider client specific alerting tests Signed-off-by: fbalak --- ocs_ci/ocs/constants.py | 6 + ocs_ci/ocs/resources/storageconsumer.py | 136 ++++++++++++++++++ tests/functional/monitoring/conftest.py | 80 ++++++++++- .../prometheus/alerts/test_provider_client.py | 84 +++++++++++ 4 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 ocs_ci/ocs/resources/storageconsumer.py create mode 100644 tests/functional/monitoring/prometheus/alerts/test_provider_client.py diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index 396675f3f4d..c80df013df0 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -146,6 +146,7 @@ MACHINESETS = "machinesets" STORAGECLUSTER = "storagecluster" CLUSTER_OPERATOR = "ClusterOperator" +CRONJOB = "CronJob" MONITORING = "monitoring" CLUSTER_SERVICE_VERSION = "csv" JOB = "job" @@ -174,6 +175,7 @@ LVMCLUSTER = "odf-lvmcluster" LVMSCLUSTER = "lvmscluster" STORAGECLASSCLAIM = "StorageClassClaim" +STORAGECONSUMER = "StorageConsumer" MACHINEHEALTHCHECK = "machinehealthcheck" STORAGECLIENT = "StorageClient" MANAGED_FUSION_OFFERING = "ManagedFusionOffering" @@ -980,6 +982,10 @@ ALERT_KUBEHPAREPLICASMISMATCH = "KubeHpaReplicasMismatch" ALERT_KUBEPERSISTENTVOLUMEINODESFILLINGUP = "KubePersistentVolumeInodesFillingUp" ALERT_CEPHOSDSLOWOPS = "CephOSDSlowOps" +ALERT_STORAGECLIENTHEARTBEATMISSED = "StorageClientHeartbeatMissed" +ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION = ( + "StorageClientIncompatibleOperatorVersion" +) # OCS Deployment related constants OPERATOR_NODE_LABEL = "cluster.ocs.openshift.io/openshift-storage=''" diff --git a/ocs_ci/ocs/resources/storageconsumer.py b/ocs_ci/ocs/resources/storageconsumer.py new file mode 100644 index 00000000000..5d63f7f55a9 --- /dev/null +++ 
b/ocs_ci/ocs/resources/storageconsumer.py @@ -0,0 +1,136 @@ +""" +A module for all StorageConsumer functionalities and abstractions. +""" +import logging + +from ocs_ci.framework import config +from ocs_ci.ocs import constants, ocp +from ocs_ci.ocs.resources.ocs import OCS +from ocs_ci.utility.utils import exec_cmd + +log = logging.getLogger(__name__) + + +class StorageConsumer: + """ + Base StorageConsumer class + """ + + def __init__(self, consumer_name, consumer_context=None): + """ + Args: + consumer_name (string): name of the StorageConsumer resource + consumer_context (int): index of cluster context. This is needed for + consumer operations executed on consumer + (e.g. manipulation of heartbeat cronjob) + """ + self.consumer_context = consumer_context + self.name = consumer_name + self.ocp = ocp.OCP( + resource_name=self.name, + kind=constants.STORAGECONSUMER, + namespace=config.cluster_ctx.ENV_DATA["cluster_namespace"], + ) + if self.consumer_context: + self.provider_context = config.cluster_ctx.MULTICLUSTER[ + "multicluster_index" + ] + self.heartbeat_cronjob = self.get_heartbeat_cronjob() + else: + self.provider_context = None + self.heartbeat_cronjob = None + + def get_ocs_version(self): + """ + Get ocs version from storageconsumer resource. + + Returns: + string: consumer ocs version + + """ + return ( + self.ocp.get(resource_name=self.name) + .get("status") + .get("client") + .get("operatorVersion") + ) + + def set_ocs_version(self, version): + """ + Update ocs consumer version in storageconsumer resource. This change assumes + that the heartbeat is stopped so that the version is not overwritten by it. 
+ + Args: + version (str): OCS version to be set + + """ + cmd = [ + "oc", + "patch", + "StorageConsumer", + self.name, + "--type", + "json", + "-p=" + + "'" + + f'[{{"op": "replace", "path": "/status/client/operatorVersion", "value":"{version}"}}]' + + "'", + "--subresource", + "status", + ] + exec_cmd(" ".join(cmd)) + + def stop_heartbeat(self): + """ + Suspend status reporter cron job. + """ + self._switch_consumer_cluster() + patch_param = '{"spec": {"suspend": true}}' + self.heartbeat_cronjob.ocp.patch( + resource_name=self.heartbeat_cronjob.name, params=patch_param + ) + self._switch_provider_cluster() + + def resume_heartbeat(self): + """ + Resume status reporter cron job. + """ + self._switch_consumer_cluster() + patch_param = '{"spec": {"suspend": false}}' + self.heartbeat_cronjob.ocp.patch( + resource_name=self.heartbeat_cronjob.name, params=patch_param + ) + self._switch_provider_cluster() + + def get_heartbeat_cronjob(self): + """ + Returns: + object: status reporter cronjob OCS object + + """ + self._switch_consumer_cluster() + cronjobs_obj = ocp.OCP( + kind=constants.CRONJOB, + namespace=config.cluster_ctx.ENV_DATA["cluster_namespace"], + ) + cronjob = [ + OCS(**job) + for job in cronjobs_obj.get().get("items") + if job["metadata"]["name"].endswith("status-reporter") + ][0] + self._switch_provider_cluster() + return cronjob + + def _switch_provider_cluster(self): + """ + Switch context to provider cluster. + """ + config.switch_ctx(self.provider_context) + log.info(f"Switched to provider cluster with index {self.provider_context}") + + def _switch_consumer_cluster(self): + """ + Switch context to consumer cluster. 
+ """ + config.switch_ctx(self.consumer_context) + log.info(f"Switched to consumer cluster with index {self.consumer_context}") diff --git a/tests/functional/monitoring/conftest.py b/tests/functional/monitoring/conftest.py index d898c8f1372..eebded7b77c 100644 --- a/tests/functional/monitoring/conftest.py +++ b/tests/functional/monitoring/conftest.py @@ -18,13 +18,13 @@ schedule_nodes, ) from ocs_ci.ocs import rados_utils -from ocs_ci.ocs.resources import deployment, pod +from ocs_ci.ocs.resources import deployment, pod, storageconsumer from ocs_ci.ocs.resources.objectbucket import MCGCLIBucket from ocs_ci.ocs.resources.pod import get_mon_pods, get_osd_pods from ocs_ci.utility.kms import get_kms_endpoint, set_kms_endpoint from ocs_ci.utility.pagerduty import get_pagerduty_service_id from ocs_ci.utility.retry import retry -from ocs_ci.utility.utils import ceph_health_check, TimeoutSampler +from ocs_ci.utility.utils import ceph_health_check, TimeoutSampler, exec_cmd from ocs_ci.utility.workloadfixture import measure_operation, is_measurement_done from ocs_ci.helpers import helpers from ocs_ci.helpers.helpers import create_unique_resource_name @@ -1133,3 +1133,79 @@ def teardown(): teardown() return measured_op + + +@pytest.fixture +def measure_change_client_ocs_version_and_stop_heartbeat( + request, measurement_dir, threading_lock +): + """ + Change ocs version of client to a different number, measure the time when it was + rewritten and alerts that were triggered during this event. To achieve the change + will be also stopped heartbeat cron job on the client to ensure that the version + is not rewritten. 
+ + Returns: + dict: Contains information about `start` and `stop` time for rewriting + the client version + + """ + original_cluster = config.cluster_ctx.MULTICLUSTER["multicluster_index"] + logger.info(f"Provider cluster key: {original_cluster}") + logger.info("Switch to client cluster") + config.switch_to_consumer() + client_cluster = config.cluster_ctx.MULTICLUSTER["multicluster_index"] + logger.info(f"Client cluster key: {client_cluster}") + cluster_id = exec_cmd( + "oc get clusterversion version -o jsonpath='{.spec.clusterID}'" + ).stdout.decode("utf-8") + client_name = f"storageconsumer-{cluster_id}" + logger.info(f"Switch to original cluster ({original_cluster})") + config.switch_ctx(original_cluster) + client = storageconsumer.StorageConsumer( + client_name, consumer_context=client_cluster + ) + current_version = client.get_ocs_version() + logger.info(f"Reported client version: {current_version}") + + def change_client_version(): + """ + Stop heartbeat and change value of ocs version in storage client resource + for 3 minutes. 
+ + """ + nonlocal client + nonlocal original_cluster + # run_time of operation + run_time = 60 * 3 + client.stop_heartbeat() + client.set_ocs_version("4.13.0") + logger.info(f"Waiting for {run_time} seconds") + time.sleep(run_time) + logger.info(f"Switch to original cluster ({original_cluster})") + config.switch_ctx(original_cluster) + return + + def teardown(): + nonlocal client + nonlocal original_cluster + nonlocal client_cluster + logger.info(f"Switch to client cluster ({client_cluster})") + config.switch_ctx(client_cluster) + client.resume_heartbeat() + logger.info(f"Switch to original cluster ({original_cluster})") + config.switch_ctx(original_cluster) + + request.addfinalizer(teardown) + + test_file = os.path.join(measurement_dir, "measure_change_client_version.json") + measured_op = measure_operation( + change_client_version, + test_file, + threading_lock=threading_lock, + metadata={"client_name": client_name}, + ) + + teardown() + + return measured_op diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py new file mode 100644 index 00000000000..d51c65832c7 --- /dev/null +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -0,0 +1,84 @@ +import logging +import pytest + +from ocs_ci.framework.pytest_customization.marks import blue_squad +from ocs_ci.framework.testlib import ( + tier4c, + runs_on_provider, + hci_provider_and_client_required, +) +from ocs_ci.ocs import constants +from ocs_ci.utility import prometheus +from ocs_ci.ocs.ocp import OCP + + +log = logging.getLogger(__name__) + + +@blue_squad +@tier4c +@runs_on_provider +@hci_provider_and_client_required +@pytest.mark.polarion_id("OCS-5392") +def test_change_client_ocs_version_and_stop_heartbeat( + measure_change_client_ocs_version_and_stop_heartbeat, threading_lock +): + """ + Test that there are appropriate alerts raised when ocs version of client + is changed to a 
different version and those alerts are cleared when the + heartbeat is resumed. During the test the heartbeat cronjob is stopped on the + client in order to stop overwriting the version set for testing. When the + heartbeat is resumed, version reporting should also resume, so the + version should contain the previous version. + + """ + api = prometheus.PrometheusAPI(threading_lock=threading_lock) + + # get alerts from the time when the client ocs version was changed + alerts = measure_change_client_ocs_version_and_stop_heartbeat.get( + "prometheus_alerts" + ) + client_name = measure_change_client_ocs_version_and_stop_heartbeat.get( + "metadata" + ).get("client_name") + target_alerts = [ + { + "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, + "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s). " + "Lossy network connectivity might exist", + }, + { + "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, + "msg": f"Storage Client Operator ({client_name}) differs by more " + "than 1 minor version. Client configuration may be incompatible and unsupported", + }, + ] + states = ["firing"] + + for target_alert in target_alerts: + prometheus.check_alert_list( + label=target_alert["label"], + msg=target_alert["msg"], + alerts=alerts, + states=states, + severity="error", + ) + prometheus.check_alert_list( + label=target_alert["label"], + msg=target_alert["msg"], + alerts=alerts, + states=states, + severity="warning", + ) + api.check_alert_cleared( + label=target_alert["label"], + measure_end_time=measure_change_client_ocs_version_and_stop_heartbeat.get( + "stop" + ), + time_min=300, + ) + + +def teardown_module(): + ocs_obj = OCP() + ocs_obj.login_as_sa()