Skip to content

Commit

Permalink
Provider client specific alerting tests (#9078)
Browse files Browse the repository at this point in the history
* add provider client specific alerting tests

Signed-off-by: fbalak <[email protected]>
  • Loading branch information
fbalak authored Jan 15, 2024
1 parent af95cf5 commit 5389318
Show file tree
Hide file tree
Showing 4 changed files with 304 additions and 2 deletions.
6 changes: 6 additions & 0 deletions ocs_ci/ocs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
MACHINESETS = "machinesets"
STORAGECLUSTER = "storagecluster"
CLUSTER_OPERATOR = "ClusterOperator"
# Resource kind passed to OCP(kind=...) when listing cron jobs, e.g. to find
# the client's "status-reporter" (heartbeat) cronjob.
CRONJOB = "CronJob"
MONITORING = "monitoring"
CLUSTER_SERVICE_VERSION = "csv"
JOB = "job"
Expand Down Expand Up @@ -174,6 +175,7 @@
LVMCLUSTER = "odf-lvmcluster"
LVMSCLUSTER = "lvmscluster"
STORAGECLASSCLAIM = "StorageClassClaim"
# Resource kind of the StorageConsumer resource wrapped by
# ocs_ci.ocs.resources.storageconsumer.StorageConsumer.
STORAGECONSUMER = "StorageConsumer"
MACHINEHEALTHCHECK = "machinehealthcheck"
STORAGECLIENT = "StorageClient"
MANAGED_FUSION_OFFERING = "ManagedFusionOffering"
Expand Down Expand Up @@ -980,6 +982,10 @@
ALERT_KUBEHPAREPLICASMISMATCH = "KubeHpaReplicasMismatch"
ALERT_KUBEPERSISTENTVOLUMEINODESFILLINGUP = "KubePersistentVolumeInodesFillingUp"
ALERT_CEPHOSDSLOWOPS = "CephOSDSlowOps"
# Alert names used as labels by the provider/client alerting tests
# (stopped heartbeat and changed client operator version).
ALERT_STORAGECLIENTHEARTBEATMISSED = "StorageClientHeartbeatMissed"
ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION = (
    "StorageClientIncompatibleOperatorVersion"
)

# OCS Deployment related constants
OPERATOR_NODE_LABEL = "cluster.ocs.openshift.io/openshift-storage=''"
Expand Down
136 changes: 136 additions & 0 deletions ocs_ci/ocs/resources/storageconsumer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""
A module for all StorageConsumer functionalities and abstractions.
"""
import logging

from ocs_ci.framework import config
from ocs_ci.ocs import constants, ocp
from ocs_ci.ocs.resources.ocs import OCS
from ocs_ci.utility.utils import exec_cmd

log = logging.getLogger(__name__)


class StorageConsumer:
    """
    Base StorageConsumer class

    Wraps a StorageConsumer resource and, when a consumer cluster context is
    given, the status reporter (heartbeat) cronjob found on the consumer
    cluster.
    """

    def __init__(self, consumer_name, consumer_context=None):
        """
        Args:
            consumer_name (string): name of the StorageConsumer resource
            consumer_context (int): index of cluster context. This is needed for
                consumer operations executed on consumer
                (e.g. manipulation of heartbeat cronjob)
        """
        self.consumer_context = consumer_context
        self.name = consumer_name
        self.ocp = ocp.OCP(
            resource_name=self.name,
            kind=constants.STORAGECONSUMER,
            namespace=config.cluster_ctx.ENV_DATA["cluster_namespace"],
        )
        # Compare with None explicitly: the context is a cluster index and
        # index 0 is a valid value that a plain truthiness test would reject.
        if self.consumer_context is not None:
            self.provider_context = config.cluster_ctx.MULTICLUSTER[
                "multicluster_index"
            ]
            self.heartbeat_cronjob = self.get_heartbeat_cronjob()
        else:
            self.provider_context = None
            self.heartbeat_cronjob = None

    def get_ocs_version(self):
        """
        Get ocs version from storageconsumer resource.

        Returns:
            string: consumer ocs version
        """
        return (
            self.ocp.get(resource_name=self.name)
            .get("status")
            .get("client")
            .get("operatorVersion")
        )

    def set_ocs_version(self, version):
        """
        Update ocs consumer version in storageconsumer resource. This change assumes
        that the heartbeat is stopped so that the version is not overwritten by it.

        Args:
            version (str): OCS version to be set
        """
        cmd = [
            "oc",
            "patch",
            "StorageConsumer",
            self.name,
            "--type",
            "json",
            "-p="
            + "'"
            + f'[{{"op": "replace", "path": "/status/client/operatorVersion", "value":"{version}"}}]'
            + "'",
            "--subresource",
            "status",
        ]
        exec_cmd(" ".join(cmd))

    def stop_heartbeat(self):
        """
        Suspend status reporter cron job.
        """
        self._set_heartbeat_suspended(True)

    def resume_heartbeat(self):
        """
        Resume status reporter cron job.
        """
        self._set_heartbeat_suspended(False)

    def _set_heartbeat_suspended(self, suspended):
        """
        Patch ``spec.suspend`` of the status reporter cron job on the consumer
        cluster and switch back to the provider cluster afterwards.

        Args:
            suspended (bool): True to suspend the cron job, False to resume it

        Raises:
            ValueError: if the object was created without consumer_context and
                therefore has no heartbeat cronjob
        """
        if self.heartbeat_cronjob is None:
            raise ValueError(
                "Heartbeat cronjob is not available: StorageConsumer "
                f"{self.name} was created without consumer_context"
            )
        if suspended:
            patch_param = '{"spec": {"suspend": true}}'
        else:
            patch_param = '{"spec": {"suspend": false}}'
        self._switch_consumer_cluster()
        try:
            self.heartbeat_cronjob.ocp.patch(
                resource_name=self.heartbeat_cronjob.name, params=patch_param
            )
        finally:
            # Always return to the provider context, even when the patch
            # fails, so later code does not run against the wrong cluster.
            self._switch_provider_cluster()

    def get_heartbeat_cronjob(self):
        """
        Find the cronjob whose name ends with ``status-reporter`` on the
        consumer cluster.

        Returns:
            object: status reporter cronjob OCS object

        Raises:
            IndexError: if no such cronjob is found
        """
        self._switch_consumer_cluster()
        try:
            cronjobs_obj = ocp.OCP(
                kind=constants.CRONJOB,
                namespace=config.cluster_ctx.ENV_DATA["cluster_namespace"],
            )
            reporters = [
                OCS(**job)
                for job in cronjobs_obj.get().get("items")
                if job["metadata"]["name"].endswith("status-reporter")
            ]
            if not reporters:
                raise IndexError(
                    "No cronjob with name ending with 'status-reporter' found "
                    f"on cluster with index {self.consumer_context}"
                )
            return reporters[0]
        finally:
            # Restore the provider context on success and on failure alike.
            self._switch_provider_cluster()

    def _switch_provider_cluster(self):
        """
        Switch context to provider cluster.
        """
        config.switch_ctx(self.provider_context)
        log.info(f"Switched to provider cluster with index {self.provider_context}")

    def _switch_consumer_cluster(self):
        """
        Switch context to consumer cluster.
        """
        config.switch_ctx(self.consumer_context)
        log.info(f"Switched to consumer cluster with index {self.consumer_context}")
80 changes: 78 additions & 2 deletions tests/functional/monitoring/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
schedule_nodes,
)
from ocs_ci.ocs import rados_utils
from ocs_ci.ocs.resources import deployment, pod
from ocs_ci.ocs.resources import deployment, pod, storageconsumer
from ocs_ci.ocs.resources.objectbucket import MCGCLIBucket
from ocs_ci.ocs.resources.pod import get_mon_pods, get_osd_pods
from ocs_ci.utility.kms import get_kms_endpoint, set_kms_endpoint
from ocs_ci.utility.pagerduty import get_pagerduty_service_id
from ocs_ci.utility.retry import retry
from ocs_ci.utility.utils import ceph_health_check, TimeoutSampler
from ocs_ci.utility.utils import ceph_health_check, TimeoutSampler, exec_cmd
from ocs_ci.utility.workloadfixture import measure_operation, is_measurement_done
from ocs_ci.helpers import helpers
from ocs_ci.helpers.helpers import create_unique_resource_name
Expand Down Expand Up @@ -1133,3 +1133,79 @@ def teardown():
teardown()

return measured_op


@pytest.fixture
def measure_change_client_ocs_version_and_stop_heartbeat(
    request, measurement_dir, threading_lock
):
    """
    Change ocs version of client to a different number, measure the time when it was
    rewritten and alerts that were triggered during this event. To achieve the change
    the heartbeat cron job on the client is also stopped to ensure that the version
    is not rewritten.

    Returns:
        dict: Contains information about `start` and `stop` time for rewriting
            the client version
    """
    original_cluster = config.cluster_ctx.MULTICLUSTER["multicluster_index"]
    logger.info(f"Provider cluster key: {original_cluster}")
    logger.info("Switch to client cluster")
    config.switch_to_consumer()
    try:
        client_cluster = config.cluster_ctx.MULTICLUSTER["multicluster_index"]
        logger.info(f"Client cluster key: {client_cluster}")
        # Strip the output so stray whitespace cannot end up in the
        # resource name built from it.
        cluster_id = (
            exec_cmd("oc get clusterversion version -o jsonpath='{.spec.clusterID}'")
            .stdout.decode("utf-8")
            .strip()
        )
    finally:
        # Return to the original cluster even if the lookup above fails.
        logger.info(f"Switch to original cluster ({original_cluster})")
        config.switch_ctx(original_cluster)
    client_name = f"storageconsumer-{cluster_id}"
    client = storageconsumer.StorageConsumer(
        client_name, consumer_context=client_cluster
    )
    current_version = client.get_ocs_version()
    logger.info(f"Reported client version: {current_version}")

    def change_client_version():
        """
        Stop heartbeat and change value of ocs version in storage client resource
        for 3 minutes.
        """
        # run_time of operation
        run_time = 60 * 3
        client.stop_heartbeat()
        # NOTE(review): the version is hard-coded; presumably it is meant to
        # differ from the deployed client version - confirm.
        client.set_ocs_version("4.13.0")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        logger.info(f"Switch to original cluster ({original_cluster})")
        config.switch_ctx(original_cluster)

    def teardown():
        """
        Resume the heartbeat cron job and switch back to the original cluster.
        """
        logger.info(f"Switch to client cluster ({client_cluster})")
        config.switch_ctx(client_cluster)
        client.resume_heartbeat()
        logger.info(f"Switch to original cluster ({original_cluster})")
        config.switch_ctx(original_cluster)

    # Registered as finalizer so the heartbeat is resumed even when the
    # measurement below raises.
    request.addfinalizer(teardown)

    test_file = os.path.join(measurement_dir, "measure_change_client_version.json")
    measured_op = measure_operation(
        change_client_version,
        test_file,
        threading_lock=threading_lock,
        metadata={"client_name": client_name},
    )

    # Resume the heartbeat right away, before the test body runs; the
    # finalizer registered above repeats the same call at fixture teardown.
    teardown()

    return measured_op
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import logging
import pytest

from ocs_ci.framework.pytest_customization.marks import blue_squad
from ocs_ci.framework.testlib import (
tier4c,
runs_on_provider,
hci_provider_and_client_required,
)
from ocs_ci.ocs import constants
from ocs_ci.utility import prometheus
from ocs_ci.ocs.ocp import OCP


log = logging.getLogger(__name__)


@blue_squad
@tier4c
@runs_on_provider
@hci_provider_and_client_required
@pytest.mark.polarion_id("OCS-5392")
def test_change_client_ocs_version_and_stop_heartbeat(
    measure_change_client_ocs_version_and_stop_heartbeat, threading_lock
):
    """
    Test that appropriate alerts are raised when the ocs version of a client
    is changed to a different version, and that those alerts are cleared when
    the heartbeat is resumed. The heartbeat cronjob on the client is stopped
    during the measurement so that the version set for testing is not
    overwritten. Once the heartbeat is resumed, version reporting should
    resume as well, so the resource should report the previous version again.
    """
    measurement = measure_change_client_ocs_version_and_stop_heartbeat
    api = prometheus.PrometheusAPI(threading_lock=threading_lock)

    # alerts recorded while the client version was changed and the heartbeat
    # was stopped
    alerts = measurement.get("prometheus_alerts")
    client_name = measurement.get("metadata").get("client_name")

    heartbeat_alert = {
        "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED,
        "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s). "
        "Lossy network connectivity might exist",
    }
    version_alert = {
        "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION,
        "msg": f"Storage Client Operator ({client_name}) differs by more "
        "than 1 minor version. Client configuration may be incompatible and unsupported",
    }
    states = ["firing"]

    for target_alert in (heartbeat_alert, version_alert):
        # each alert is checked with both severities, "error" first
        for severity in ("error", "warning"):
            prometheus.check_alert_list(
                label=target_alert["label"],
                msg=target_alert["msg"],
                alerts=alerts,
                states=states,
                severity=severity,
            )
        api.check_alert_cleared(
            label=target_alert["label"],
            measure_end_time=measurement.get("stop"),
            time_min=300,
        )


def teardown_module():
    """
    Module-level teardown: call ``login_as_sa`` on a fresh OCP object.
    """
    OCP().login_as_sa()

0 comments on commit 5389318

Please sign in to comment.