diff --git a/ocs_ci/framework/pytest_customization/ocscilib.py b/ocs_ci/framework/pytest_customization/ocscilib.py index fdf95128aec..6ce8428c641 100644 --- a/ocs_ci/framework/pytest_customization/ocscilib.py +++ b/ocs_ci/framework/pytest_customization/ocscilib.py @@ -712,8 +712,13 @@ def pytest_runtest_makereport(item, call): ): metrics = item.get_closest_marker("gather_metrics_on_fail").args try: + threading_lock = call.getfixturevalue("threading_lock") collect_prometheus_metrics( - metrics, f"{item.name}-{call.when}", call.start, call.stop + metrics, + f"{item.name}-{call.when}", + call.start, + call.stop, + threading_lock=threading_lock, ) except Exception: log.exception("Failed to collect prometheus metrics") diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py index aa0528ef123..3b0cee83722 100644 --- a/ocs_ci/ocs/cluster.py +++ b/ocs_ci/ocs/cluster.py @@ -2387,7 +2387,9 @@ class for lvm cluster """ - def __init__(self, fstrim=False, fail_on_thin_pool_not_empty=False): + def __init__( + self, fstrim=False, fail_on_thin_pool_not_empty=False, threading_lock=None + ): """ Initiate the class, gets 2 parameters. Args: @@ -2405,6 +2407,7 @@ def __init__(self, fstrim=False, fail_on_thin_pool_not_empty=False): self.vg_data = None self.node_ssh = None self.new_prom = None + self.threading_lock = threading_lock func_list = [ self.cluster_ip(), self.get_lvmcluster(), @@ -2461,7 +2464,7 @@ def __init__(self, fstrim=False, fail_on_thin_pool_not_empty=False): thread_init_class(func_list, shutdown=0) def init_prom(self): - self.new_prom = PrometheusAPI() + self.new_prom = PrometheusAPI(threading_lock=self.threading_lock) def get_lvmcluster(self): """ diff --git a/ocs_ci/ocs/cluster_load.py b/ocs_ci/ocs/cluster_load.py index 8068dd6f6f0..7967fdb037d 100644 --- a/ocs_ci/ocs/cluster_load.py +++ b/ocs_ci/ocs/cluster_load.py @@ -52,6 +52,7 @@ def __init__( sa_factory=None, pod_factory=None, target_percentage=None, + threading_lock=None, ): """ Initializer for ClusterLoad @@ -63,9 +64,9 @@ def __init__( pod_factory (function): A call to pod_factory function target_percentage (float): The percentage of cluster load that is required. 
The value should be greater than 0.1 and smaller than 0.95 - + threading_lock (threading.RLock): A threading.RLock object to be used for threading lock """ - self.prometheus_api = PrometheusAPI() + self.prometheus_api = PrometheusAPI(threading_lock=threading_lock) self.pvc_factory = pvc_factory self.sa_factory = sa_factory self.pod_factory = pod_factory diff --git a/ocs_ci/ocs/exceptions.py b/ocs_ci/ocs/exceptions.py index e230f122256..e88f39062a2 100644 --- a/ocs_ci/ocs/exceptions.py +++ b/ocs_ci/ocs/exceptions.py @@ -647,5 +647,9 @@ class UnableUpgradeConnectionException(Exception): pass +class NoThreadingLockUsedError(Exception): + pass + + class VSLMNotFoundException(Exception): pass diff --git a/ocs_ci/ocs/fiojob.py b/ocs_ci/ocs/fiojob.py index 1a3acea12a4..293e8be577e 100644 --- a/ocs_ci/ocs/fiojob.py +++ b/ocs_ci/ocs/fiojob.py @@ -346,6 +346,7 @@ def workload_fio_storageutilization( keep_fio_data=False, minimal_time=480, throw_skip=True, + threading_lock=None, ): """ This function implements core functionality of fio storage utilization @@ -392,6 +393,7 @@ def workload_fio_storageutilization( (See more details in the function 'measure_operation') throw_skip (bool): if True function will raise pytest.skip.Exception and test will be skipped, otherwise return None + threading_lock (threading.RLock): lock to be used for thread synchronization when calling 'oc' cmd Returns: dict: measurement results with timestamps and other medatada from @@ -537,6 +539,7 @@ def workload_fio_storageutilization( test_file, measure_after=True, minimal_time=minimal_time, + threading_lock=threading_lock, ) # we don't need to delete anything if this fixture has been already diff --git a/ocs_ci/ocs/monitoring.py b/ocs_ci/ocs/monitoring.py index 5c8e4b0ed4d..71745cea560 100644 --- a/ocs_ci/ocs/monitoring.py +++ b/ocs_ci/ocs/monitoring.py @@ -151,16 +151,19 @@ def get_list_pvc_objs_created_on_monitoring_pods(): @retry(ServiceUnavailable, tries=60, delay=3, backoff=1) -def get_metrics_persistentvolumeclaims_info(): +def get_metrics_persistentvolumeclaims_info(threading_lock): """ Returns the created pvc information on prometheus pod + Args: + threading_lock (threading.RLock): A lock to prevent multiple threads calling 'oc' command at the same time + Returns: response.content (dict): The pvc metrics collected on prometheus pod """ - prometheus = ocs_ci.utility.prometheus.PrometheusAPI() + prometheus = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) response = prometheus.get( "query?query=kube_pod_spec_volumes_persistentvolumeclaims_info" ) @@ -170,12 +173,13 @@ def get_metrics_persistentvolumeclaims_info(): @retry(UnexpectedBehaviour, tries=60, delay=3, backoff=1) -def check_pvcdata_collected_on_prometheus(pvc_name): +def check_pvcdata_collected_on_prometheus(pvc_name, threading_lock): """ Checks whether initially pvc related data is collected on pod Args: pvc_name (str): Name of the pvc + threading_lock (threading.RLock): A lock to prevent multiple threads calling 'oc' command at the same time Returns: True on success, raises UnexpectedBehaviour on failures @@ -184,7 +188,7 @@ def check_pvcdata_collected_on_prometheus(pvc_name): logger.info( f"Verify for created pvc {pvc_name} related data is collected on prometheus pod" ) - pvcs_data = get_metrics_persistentvolumeclaims_info() + pvcs_data = get_metrics_persistentvolumeclaims_info(threading_lock=threading_lock) list_pvcs_data = pvcs_data.get("data").get("result") pvc_list = [ pvc @@ -201,18 +205,19 @@ def 
check_pvcdata_collected_on_prometheus(pvc_name): return True -def check_ceph_health_status_metrics_on_prometheus(mgr_pod): +def check_ceph_health_status_metrics_on_prometheus(mgr_pod, threading_lock): """ Check ceph health status metric is collected on prometheus pod Args: mgr_pod (str): Name of the mgr pod + threading_lock (obj): Threading lock object to ensure only one thread is making 'oc' calls Returns: bool: True on success, false otherwise """ - prometheus = ocs_ci.utility.prometheus.PrometheusAPI() + prometheus = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) response = prometheus.get("query?query=ceph_health_status") ceph_health_metric = json.loads(response.content.decode("utf-8")) return bool( @@ -261,17 +266,20 @@ def prometheus_health_check(name=constants.MONITORING, kind=constants.CLUSTER_OP return False -def check_ceph_metrics_available(): +def check_ceph_metrics_available(threading_lock): """ Check that all healthy ceph metrics are available. + Args: + threading_lock (threading.RLock): A lock to use for thread safety 'oc' calls + Returns: bool: True on success, false otherwise """ logger.info("check ceph metrics available") # Check ceph metrics available - prometheus = ocs_ci.utility.prometheus.PrometheusAPI() + prometheus = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) list_of_metrics_without_results = metrics.get_missing_metrics( prometheus, metrics.ceph_metrics_healthy, @@ -319,15 +327,18 @@ def get_prometheus_response(api, query) -> dict: return json.loads(resp.text) -def get_pvc_namespace_metrics(): +def get_pvc_namespace_metrics(threading_lock): """ Get PVC and Namespace metrics from Prometheus. + Args: + threading_lock (threading.RLock): A lock to use for thread safety 'oc' calls + Returns: dict: A dictionary containing the PVC and Namespace metrics data """ - api = ocs_ci.utility.prometheus.PrometheusAPI() + api = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) pvc_namespace = {} @@ -354,7 +365,7 @@ def get_pvc_namespace_metrics(): return pvc_namespace -def get_ceph_capacity_metrics(): +def get_ceph_capacity_metrics(threading_lock): """ Get CEPH capacity breakdown data from Prometheus, return all response texts collected to a dict Use the queries from ceph-storage repo: @@ -366,7 +377,7 @@ def get_ceph_capacity_metrics(): Returns: dict: A dictionary containing the CEPH capacity breakdown data """ - api = ocs_ci.utility.prometheus.PrometheusAPI() + api = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) ceph_capacity = {} logger.info("Get CEPH capacity breakdown data from Prometheus") diff --git a/ocs_ci/ocs/ocp.py b/ocs_ci/ocs/ocp.py index e719017073a..4bfddc730d3 100644 --- a/ocs_ci/ocs/ocp.py +++ b/ocs_ci/ocs/ocp.py @@ -67,7 +67,7 @@ def __init__( field_selector (str): Selector (field query) to filter on, supports '=', '==', and '!='. (e.g. status.phase=Running) cluster_kubeconfig (str): Path to the cluster kubeconfig file. 
Useful in a multicluster configuration - threading_lock (threading.Lock): threading.Lock object that is used + threading_lock (threading.RLock): threading.RLock object that is used for handling concurrent oc commands silent (bool): If True will silent errors from the server, default false skip_tls_verify (bool): Adding '--insecure-skip-tls-verify' to oc command for diff --git a/ocs_ci/ocs/utils.py b/ocs_ci/ocs/utils.py index b1e44d81d69..ed5e8eb6586 100644 --- a/ocs_ci/ocs/utils.py +++ b/ocs_ci/ocs/utils.py @@ -1319,6 +1319,7 @@ def collect_prometheus_metrics( start, stop, step=1.0, + threading_lock=None, ): """ Collects metrics from Prometheus and saves them in file in json format. @@ -1333,8 +1334,9 @@ def collect_prometheus_metrics( start (str): start timestamp of required datapoints stop (str): stop timestamp of required datapoints step (float): step of required datapoints + threading_lock (threading.RLock): Lock to use for thread safety (default: None) """ - api = PrometheusAPI() + api = PrometheusAPI(threading_lock=threading_lock) log_dir_path = os.path.join( os.path.expanduser(ocsci_config.RUN["log_dir"]), f"failed_testcase_ocs_logs_{ocsci_config.RUN['run_id']}", diff --git a/ocs_ci/utility/prometheus.py b/ocs_ci/utility/prometheus.py index 7be2ad9490a..91436a22c9a 100644 --- a/ocs_ci/utility/prometheus.py +++ b/ocs_ci/utility/prometheus.py @@ -10,7 +10,7 @@ from ocs_ci.framework import config from ocs_ci.ocs import constants, defaults -from ocs_ci.ocs.exceptions import AlertingError, AuthError +from ocs_ci.ocs.exceptions import AlertingError, AuthError, NoThreadingLockUsedError from ocs_ci.ocs.ocp import OCP from ocs_ci.utility.ssl_certs import get_root_ca_cert from ocs_ci.utility.utils import TimeoutIterator @@ -330,6 +330,11 @@ def __init__(self, user=None, password=None, threading_lock=None): Args: user (str): OpenShift username used to connect to API """ + if threading_lock is None: + raise NoThreadingLockUsedError( + "using a threading.RLock object is mandatory for the PrometheusAPI class" + ) + if ( config.ENV_DATA["platform"].lower() == "ibm_cloud" and config.ENV_DATA["deployment_type"] == "managed" diff --git a/ocs_ci/utility/utils.py b/ocs_ci/utility/utils.py index b8c1a1dbfdd..a85aead0163 100644 --- a/ocs_ci/utility/utils.py +++ b/ocs_ci/utility/utils.py @@ -471,7 +471,7 @@ def run_cmd( timeout (int): Timeout for the command, defaults to 600 seconds. ignore_error (bool): True if ignore non zero return code and do not raise the exception. - threading_lock (threading.Lock): threading.Lock object that is used + threading_lock (threading.RLock): threading.RLock object that is used for handling concurrent oc commands silent (bool): If True will silent errors from the server, default false @@ -601,7 +601,7 @@ def exec_cmd( timeout (int): Timeout for the command, defaults to 600 seconds. ignore_error (bool): True if ignore non zero return code and do not raise the exception.
- threading_lock (threading.Lock): threading.Lock object that is used + threading_lock (threading.RLock): threading.RLock object that is used for handling concurrent oc commands silent (bool): If True will silent errors from the server, default false use_shell (bool): If True will pass the cmd without splitting diff --git a/ocs_ci/utility/workloadfixture.py b/ocs_ci/utility/workloadfixture.py index 489e84b7ada..96e359e934b 100644 --- a/ocs_ci/utility/workloadfixture.py +++ b/ocs_ci/utility/workloadfixture.py @@ -65,7 +65,7 @@ def measure_operation( and utilized data are measured after the utilization is completed pagerduty_service_ids (list): Service IDs from PagerDuty system used incidents query - threading_lock (threading.Lock): Lock used for synchronization of the threads in Prometheus calls + threading_lock (threading.RLock): Lock used for synchronization of the threads in Prometheus calls Returns: dict: contains information about `start` and `stop` time of given diff --git a/tests/conftest.py b/tests/conftest.py index 2a85be9f77d..379eca5f88d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -430,9 +430,10 @@ def threading_lock(): threading.Lock object that can be used in threads across multiple tests. Returns: - threading.Lock: lock object + threading.RLock: Reentrant lock object. A reentrant lock (or RLock) is a type of lock that allows the same + thread to acquire the lock multiple times without causing a deadlock """ - return threading.Lock() + return threading.RLock() @pytest.fixture(scope="session", autouse=True) @@ -1772,6 +1773,7 @@ def cluster_load( pvc_factory_session, service_account_factory_session, pod_factory_session, + threading_lock, ): """ Run IO during the test execution @@ -1805,6 +1807,7 @@ def cluster_load( pvc_factory=pvc_factory_session, pod_factory=pod_factory_session, target_percentage=io_load, + threading_lock=threading_lock, ) cl_load_obj.reach_cluster_load_percentage() except Exception as ex: @@ -1814,7 +1817,7 @@ def cluster_load( if (log_utilization or io_in_bg) and not deployment_test: if not cl_load_obj: try: - cl_load_obj = ClusterLoad() + cl_load_obj = ClusterLoad(threading_lock=threading_lock) except Exception as ex: log.error(cluster_load_error_msg, ex) cluster_load_error = ex diff --git a/tests/e2e/system_test/test_cluster_full_and_recovery.py b/tests/e2e/system_test/test_cluster_full_and_recovery.py index c0184d6def9..47cbe1ebe8c 100644 --- a/tests/e2e/system_test/test_cluster_full_and_recovery.py +++ b/tests/e2e/system_test/test_cluster_full_and_recovery.py @@ -50,6 +50,7 @@ def test_cluster_full_and_recovery( pvc_factory, pod_factory, project_factory, + threading_lock, ): """ 1.Create PVC1 [FS + RBD] @@ -159,6 +160,7 @@ def test_cluster_full_and_recovery( sleep=50, func=self.verify_alerts_via_prometheus, expected_alerts=expected_alerts, + threading_lock=threading_lock, ) if not sample.wait_for_func_status(result=True): log.error(f"The alerts {expected_alerts} do not exist after 600 sec") @@ -317,18 +319,19 @@ def verify_osd_used_capacity_greater_than_expected(self, expected_used_capacity) return True return False - def verify_alerts_via_prometheus(self, expected_alerts): + def verify_alerts_via_prometheus(self, expected_alerts, threading_lock): """ Verify Alerts on prometheus Args: expected_alerts (list): list of alert names + threading_lock (threading.RLock): Lock object to prevent simultaneous calls to 'oc' Returns: bool: True if expected_alerts exist, False otherwise """ - prometheus = PrometheusAPI() + prometheus = 
PrometheusAPI(threading_lock=threading_lock) log.info("Logging of all prometheus alerts started") alerts_response = prometheus.get( "alerts", payload={"silenced": False, "inhibited": False} diff --git a/tests/e2e/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py b/tests/e2e/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py index 615d8d4cea4..e50ce5eb9bc 100644 --- a/tests/e2e/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py +++ b/tests/e2e/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py @@ -43,10 +43,13 @@ @retry(AssertionError, tries=30, delay=3, backoff=1) -def wait_to_update_mgrpod_info_prometheus_pod(): +def wait_to_update_mgrpod_info_prometheus_pod(threading_lock): """ Validates the ceph health metrics is updated on prometheus pod + Args: + threading_lock (threading.RLock): A lock to ensure only one thread is making the 'oc' calls + """ log.info("Verifying ceph health status metrics is updated after rebooting the node") @@ -60,15 +63,21 @@ def wait_to_update_mgrpod_info_prometheus_pod(): .get("name") ) assert check_ceph_health_status_metrics_on_prometheus( - mgr_pod=mgr_pod + mgr_pod=mgr_pod, threading_lock=threading_lock ), "Ceph health status metrics are not updated after the rebooting node where the mgr running" log.info("Ceph health status metrics is updated") @retry(AssertionError, tries=30, delay=5, backoff=2) -def check_ceph_metrics_available_within_time(): - assert ( - check_ceph_metrics_available() +def check_ceph_metrics_available_within_time(threading_lock): + """ + Validates the ceph metrics are available on prometheus pod + Args: + threading_lock (threading.RLock): A lock to ensure only one thread is accessing the 'oc' command + + """ + assert check_ceph_metrics_available( + threading_lock=threading_lock ), "failed to get results for some metrics after Downscaling and Upscaling deployment mgr" @@ -78,11 +87,15 @@ def check_ceph_metrics_available_within_time(): delay=15, backoff=1, ) -def wait_for_nodes_status_and_prometheus_health_check(pods): +def wait_for_nodes_status_and_prometheus_health_check(pods, threading_lock): """ Waits for the all the nodes to be in running state and also check prometheus health + Args: + pods (list): List of pods + threading_lock (threading.RLock): A lock to ensure only one thread is accessing the 'oc' command + """ # Validate all nodes are in READY state @@ -92,7 +105,7 @@ def wait_for_nodes_status_and_prometheus_health_check(pods): # Check for the created pvc metrics after rebooting the master nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" assert prometheus_health_check(), "Prometheus health is degraded" @@ -159,7 +172,7 @@ def finalizer(): request.addfinalizer(finalizer) @pytest.fixture() - def pods(self, multi_pvc_factory, dc_pod_factory): + def pods(self, multi_pvc_factory, dc_pod_factory, threading_lock): """ Prepare multiple dc pods for the test @@ -183,12 +196,12 @@ def pods(self, multi_pvc_factory, dc_pod_factory): # Check for the created pvc metrics on prometheus pod for pod_obj in pod_objs: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" return pod_objs @pytest.mark.polarion_id("OCS-576") - def test_monitoring_after_restarting_prometheus_pod(self, pods): + def 
test_monitoring_after_restarting_prometheus_pod(self, pods, threading_lock): """ Test case to validate prometheus pod restart should not have any functional impact @@ -225,11 +238,13 @@ def test_monitoring_after_restarting_prometheus_pod(self, pods): for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" @pytest.mark.polarion_id("OCS-579") - def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods): + def test_monitoring_after_draining_node_where_prometheus_hosted( + self, pods, threading_lock + ): """ Test case to validate when node is drained where prometheus is hosted, prometheus pod should re-spin on new healthy node @@ -306,11 +321,11 @@ def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods): # Check for the created pvc metrics after rebooting the master nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" @pytest.mark.polarion_id("OCS-580") - def test_monitoring_after_respinning_ceph_pods(self, pods): + def test_monitoring_after_respinning_ceph_pods(self, pods, threading_lock): """ Test case to validate respinning the ceph pods and its interaction with prometheus pod @@ -327,14 +342,14 @@ def test_monitoring_after_respinning_ceph_pods(self, pods): # Check for the created pvc metrics on prometheus pod for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" # Validate osd is up and ceph health is ok self.sanity_helpers.health_check(tries=40) @pytest.mark.polarion_id("OCS-605") - def test_monitoring_when_osd_down(self, pods): + def test_monitoring_when_osd_down(self, pods, threading_lock): """ Test case to validate monitoring when osd is down @@ -358,7 +373,7 @@ def test_monitoring_when_osd_down(self, pods): # Check for the created pvc metrics when osd is down for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" # Make osd up which was down @@ -368,7 +383,9 @@ def test_monitoring_when_osd_down(self, pods): self.sanity_helpers.health_check(tries=40) @pytest.mark.polarion_id("OCS-606") - def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods): + def test_monitoring_when_one_of_the_prometheus_node_down( + self, nodes, pods, threading_lock + ): """ Test case to validate when the prometheus pod is down and its interaction with prometheus @@ -404,14 +421,14 @@ def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods): # Check for the created pvc metrics after restarting node where prometheus pod is hosted for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" log.info( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected" ) @pytest.mark.polarion_id("OCS-709") - def test_monitoring_after_rebooting_master_node(self, nodes, pods): + def test_monitoring_after_rebooting_master_node(self, nodes, pods, threading_lock): """ Test 
case to validate rebooting master node shouldn't delete the data collected on prometheus pod @@ -430,13 +447,15 @@ def test_monitoring_after_rebooting_master_node(self, nodes, pods): log.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) - wait_for_nodes_status_and_prometheus_health_check(pods) + wait_for_nodes_status_and_prometheus_health_check(pods, threading_lock) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check(tries=40) @pytest.mark.polarion_id("OCS-710") - def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods): + def test_monitoring_after_rebooting_node_where_mgr_is_running( + self, nodes, pods, threading_lock + ): """ Test case to validate rebooting a node where mgr is running should not delete the data collected on prometheus pod @@ -481,18 +500,20 @@ def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods) self.sanity_helpers.health_check(tries=40) # Check for ceph health check metrics is updated with new mgr pod - wait_to_update_mgrpod_info_prometheus_pod() + wait_to_update_mgrpod_info_prometheus_pod(threading_lock) # Check for the created pvc metrics after rebooting the node where mgr pod was running for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" @pytest.mark.polarion_id("OCS-711") @skipif_aws_i3 @skipif_ibm_cloud - def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods): + def test_monitoring_shutdown_and_recovery_prometheus_node( + self, nodes, pods, threading_lock + ): """ Test case to validate whether shutdown and recovery of a node where monitoring pods running has no functional impact @@ -533,7 +554,7 @@ def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods): # Check for the created pvc metrics after shutdown and recovery of prometheus nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( - pod_obj.pvc.name + pod_obj.pvc.name, threading_lock ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" @pytest.mark.polarion_id("OCS-638") @@ -599,7 +620,7 @@ def test_monitoring_delete_pvc(self): assert prometheus_health_check(), "Prometheus cluster health is not OK" @pytest.mark.polarion_id("OCS-1535") - def test_monitoring_shutdown_mgr_pod(self, pods): + def test_monitoring_shutdown_mgr_pod(self, pods, threading_lock): """ Montoring backed by OCS, bring mgr down(replica: 0) for some time and check ceph related metrics @@ -637,4 +658,4 @@ def test_monitoring_shutdown_mgr_pod(self, pods): ) # Check ceph metrics available - check_ceph_metrics_available_within_time() + check_ceph_metrics_available_within_time(threading_lock) diff --git a/tests/e2e/workloads/ocp/registry/test_registry_by_increasing_num_of_image_registry_pods.py b/tests/e2e/workloads/ocp/registry/test_registry_by_increasing_num_of_image_registry_pods.py index 66dc8276d9c..24622b3a65d 100644 --- a/tests/e2e/workloads/ocp/registry/test_registry_by_increasing_num_of_image_registry_pods.py +++ b/tests/e2e/workloads/ocp/registry/test_registry_by_increasing_num_of_image_registry_pods.py @@ -61,13 +61,13 @@ def finalizer(): request.addfinalizer(finalizer) @pytest.mark.polarion_id("OCS-1900") - def test_registry_by_increasing_num_of_registry_pods(self, count=3): + def test_registry_by_increasing_num_of_registry_pods(self, threading_lock, count=3): """ Test 
registry by increasing number of registry pods and validate all the image-registry pod should have the same PVC backend. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # Increase the replica count to 3 assert modify_registry_pod_count( diff --git a/tests/lvmo/test_lvm_alerts.py b/tests/lvmo/test_lvm_alerts.py index 73ac1c76d60..d4761bf67c0 100644 --- a/tests/lvmo/test_lvm_alerts.py +++ b/tests/lvmo/test_lvm_alerts.py @@ -61,8 +61,10 @@ class TestLvmCapacityAlerts(ManageTest): block = False @pytest.fixture() - def init_lvm(self): - self.lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + def init_lvm(self, threading_lock): + self.lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) disk1 = self.lvm.pv_data["pv_list"][0] log.info(f"PV List: {self.lvm.pv_data['pv_list']}") self.disk_size = self.lvm.pv_data[disk1]["pv_size"] @@ -91,6 +93,7 @@ def test_thin_pool_capacity_alert( pvc_factory, pod_factory, volume_binding_mode, + threading_lock, ): """ diff --git a/tests/lvmo/test_lvm_clone_base.py b/tests/lvmo/test_lvm_clone_base.py index c289911cbf7..6fd593be28c 100644 --- a/tests/lvmo/test_lvm_clone_base.py +++ b/tests/lvmo/test_lvm_clone_base.py @@ -61,6 +61,7 @@ def test_create_clone_from_pvc( pvc_clone_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete snapshot @@ -74,7 +75,9 @@ def test_create_clone_from_pvc( .* Run IO """ - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) logger.info(f"LVMCluster version is {lvm.get_lvm_version()}") logger.info( f"Lvm thin-pool overprovisionRation is {lvm.get_lvm_thin_pool_config_overprovision_ratio()}" diff --git a/tests/lvmo/test_lvm_clone_bigger_than_disk.py b/tests/lvmo/test_lvm_clone_bigger_than_disk.py index 589b14a63cb..5be016defda 100644 --- a/tests/lvmo/test_lvm_clone_bigger_than_disk.py +++ b/tests/lvmo/test_lvm_clone_bigger_than_disk.py @@ -58,6 +58,7 @@ def test_create_clone_from_pvc_bigger_than_disk( pvc_clone_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete snapshot @@ -74,7 +75,9 @@ def test_create_clone_from_pvc_bigger_than_disk( access_mode = constants.ACCESS_MODE_RWO - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) disk1 = lvm.pv_data["pv_list"][0] disk_size = lvm.pv_data[disk1]["pv_size"] pvc_size = int(float(disk_size)) * 2 diff --git a/tests/lvmo/test_lvm_multi_clone.py b/tests/lvmo/test_lvm_multi_clone.py index c2145bbae48..d73843f9237 100644 --- a/tests/lvmo/test_lvm_multi_clone.py +++ b/tests/lvmo/test_lvm_multi_clone.py @@ -65,6 +65,7 @@ def test_create_multi_clone_from_pvc( pvc_clone_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete multi snapshot @@ -77,7 +78,9 @@ def test_create_multi_clone_from_pvc( .* Run IO """ - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) logger.info(f"LVMCluster version is {lvm.get_lvm_version()}") logger.info( f"Lvm thin-pool overprovisionRation is {lvm.get_lvm_thin_pool_config_overprovision_ratio()}" diff --git a/tests/lvmo/test_lvm_multi_snapshot.py b/tests/lvmo/test_lvm_multi_snapshot.py index bd56770cec3..66f90e51e0b 100644 --- a/tests/lvmo/test_lvm_multi_snapshot.py +++ 
b/tests/lvmo/test_lvm_multi_snapshot.py @@ -66,6 +66,7 @@ def test_create_multi_snapshot_from_pvc( snapshot_restore_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete multi snapshot @@ -78,7 +79,9 @@ def test_create_multi_snapshot_from_pvc( .* Run IO """ - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) logger.info(f"LVMCluster version is {lvm.get_lvm_version()}") logger.info( f"Lvm thin-pool overprovisionRation is {lvm.get_lvm_thin_pool_config_overprovision_ratio()}" diff --git a/tests/lvmo/test_lvm_snapshot_base.py b/tests/lvmo/test_lvm_snapshot_base.py index a135419e870..0c94831d7cd 100644 --- a/tests/lvmo/test_lvm_snapshot_base.py +++ b/tests/lvmo/test_lvm_snapshot_base.py @@ -62,6 +62,7 @@ def test_create_snapshot_from_pvc( snapshot_restore_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete snapshot @@ -74,7 +75,9 @@ def test_create_snapshot_from_pvc( .* Run IO """ - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) logger.info(f"LVMCluster version is {lvm.get_lvm_version()}") logger.info( f"Lvm thin-pool overprovisionRation is {lvm.get_lvm_thin_pool_config_overprovision_ratio()}" diff --git a/tests/lvmo/test_lvm_snapshot_bigger_than_disk.py b/tests/lvmo/test_lvm_snapshot_bigger_than_disk.py index b3cc0c0d711..943dc9b83c7 100644 --- a/tests/lvmo/test_lvm_snapshot_bigger_than_disk.py +++ b/tests/lvmo/test_lvm_snapshot_bigger_than_disk.py @@ -60,6 +60,7 @@ def test_create_snapshot_from_pvc_bigger_than_disk( snapshot_restore_factory, pvc_factory, pod_factory, + threading_lock, ): """ test create delete snapshot @@ -77,7 +78,9 @@ def test_create_snapshot_from_pvc_bigger_than_disk( access_mode = constants.ACCESS_MODE_RWO - lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) disk1 = lvm.pv_data["pv_list"][0] disk_size = lvm.pv_data[disk1]["pv_size"] pvc_size = int(float(disk_size)) * 2 diff --git a/tests/lvmo/test_lvmo_pvc_resize.py b/tests/lvmo/test_lvmo_pvc_resize.py index bb35d78294a..e45d4f23068 100644 --- a/tests/lvmo/test_lvmo_pvc_resize.py +++ b/tests/lvmo/test_lvmo_pvc_resize.py @@ -58,8 +58,10 @@ class TestLVMPVCResize(ManageTest): block = False @pytest.fixture() - def init_lvm(self): - self.lvm = LVM(fstrim=True, fail_on_thin_pool_not_empty=True) + def init_lvm(self, threading_lock): + self.lvm = LVM( + fstrim=True, fail_on_thin_pool_not_empty=True, threading_lock=threading_lock + ) disk1 = self.lvm.pv_data["pv_list"][0] log.info(f"PV List: {self.lvm.pv_data['pv_list']}") self.disk_size = self.lvm.pv_data[disk1]["pv_size"] diff --git a/tests/manage/mcg/test_noobaa_prometheus.py b/tests/manage/mcg/test_noobaa_prometheus.py index f92fe94306f..3f9d603e266 100644 --- a/tests/manage/mcg/test_noobaa_prometheus.py +++ b/tests/manage/mcg/test_noobaa_prometheus.py @@ -19,9 +19,9 @@ @retry(ReturnedEmptyResponseException, tries=30, delay=10, backoff=1) -def get_bucket_used_bytes_metric(bucket_name): +def get_bucket_used_bytes_metric(bucket_name, threading_lock): response = json.loads( - PrometheusAPI() + PrometheusAPI(threading_lock=threading_lock) .get(f'query?query=NooBaa_bucket_used_bytes{{bucket_name="{bucket_name}"}}') .content.decode("utf-8") ) diff --git a/tests/manage/monitoring/conftest.py 
b/tests/manage/monitoring/conftest.py index b63efd81839..3012941789d 100644 --- a/tests/manage/monitoring/conftest.py +++ b/tests/manage/monitoring/conftest.py @@ -400,7 +400,13 @@ def wait_with_corrupted_pg(): @pytest.fixture def workload_storageutilization_05p_rbd( - project, fio_pvc_dict, fio_job_dict, fio_configmap_dict, measurement_dir, tmp_path + project, + fio_pvc_dict, + fio_job_dict, + fio_configmap_dict, + measurement_dir, + tmp_path, + threading_lock, ): fixture_name = "workload_storageutilization_05p_rbd" measured_op = workload_fio_storageutilization( @@ -412,6 +418,7 @@ def workload_storageutilization_05p_rbd( measurement_dir, tmp_path, target_percentage=0.05, + threading_lock=threading_lock, ) return measured_op @@ -425,6 +432,7 @@ def workload_storageutilization_50p_rbd( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): fixture_name = "workload_storageutilization_50p_rbd" measured_op = workload_fio_storageutilization( @@ -436,13 +444,20 @@ def workload_storageutilization_50p_rbd( measurement_dir, tmp_path, target_percentage=0.5, + threading_lock=threading_lock, ) return measured_op @pytest.fixture def workload_storageutilization_checksum_rbd( - project, fio_pvc_dict, fio_job_dict, fio_configmap_dict, measurement_dir, tmp_path + project, + fio_pvc_dict, + fio_job_dict, + fio_configmap_dict, + measurement_dir, + tmp_path, + threading_lock, ): fixture_name = "workload_storageutilization_checksum_rbd" measured_op = workload_fio_storageutilization( @@ -455,6 +470,7 @@ def workload_storageutilization_checksum_rbd( tmp_path, target_size=10, with_checksum=True, + threading_lock=threading_lock, ) return measured_op @@ -468,6 +484,7 @@ def workload_storageutilization_85p_rbd( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): fixture_name = "workload_storageutilization_85p_rbd" measured_op = workload_fio_storageutilization( @@ -479,6 +496,7 @@ def workload_storageutilization_85p_rbd( measurement_dir, tmp_path, target_percentage=0.85, + threading_lock=threading_lock, ) return measured_op @@ -492,6 +510,7 @@ def workload_storageutilization_97p_rbd( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): fixture_name = "workload_storageutilization_97p_rbd" measured_op = workload_fio_storageutilization( @@ -503,13 +522,20 @@ def workload_storageutilization_97p_rbd( measurement_dir, tmp_path, target_percentage=0.97, + threading_lock=threading_lock, ) return measured_op @pytest.fixture def workload_storageutilization_05p_cephfs( - project, fio_pvc_dict, fio_job_dict, fio_configmap_dict, measurement_dir, tmp_path + project, + fio_pvc_dict, + fio_job_dict, + fio_configmap_dict, + measurement_dir, + tmp_path, + threading_lock, ): fixture_name = "workload_storageutilization_05p_cephfs" measured_op = workload_fio_storageutilization( @@ -521,6 +547,7 @@ def workload_storageutilization_05p_cephfs( measurement_dir, tmp_path, target_percentage=0.05, + threading_lock=threading_lock, ) return measured_op @@ -534,6 +561,7 @@ def workload_storageutilization_50p_cephfs( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): fixture_name = "workload_storageutilization_50p_cephfs" measured_op = workload_fio_storageutilization( @@ -545,6 +573,7 @@ def workload_storageutilization_50p_cephfs( measurement_dir, tmp_path, target_percentage=0.5, + threading_lock=threading_lock, ) return measured_op @@ -558,6 +587,7 @@ def workload_storageutilization_85p_cephfs( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): 
fixture_name = "workload_storageutilization_85p_cephfs" measured_op = workload_fio_storageutilization( @@ -569,6 +599,7 @@ def workload_storageutilization_85p_cephfs( measurement_dir, tmp_path, target_percentage=0.85, + threading_lock=threading_lock, ) return measured_op @@ -582,6 +613,7 @@ def workload_storageutilization_97p_cephfs( measurement_dir, tmp_path, supported_configuration, + threading_lock, ): fixture_name = "workload_storageutilization_97p_cephfs" measured_op = workload_fio_storageutilization( @@ -593,6 +625,7 @@ def workload_storageutilization_97p_cephfs( measurement_dir, tmp_path, target_percentage=0.97, + threading_lock=threading_lock, ) return measured_op @@ -602,7 +635,13 @@ def workload_storageutilization_97p_cephfs( @pytest.fixture def workload_storageutilization_10g_rbd( - project, fio_pvc_dict, fio_job_dict, fio_configmap_dict, measurement_dir, tmp_path + project, + fio_pvc_dict, + fio_job_dict, + fio_configmap_dict, + measurement_dir, + tmp_path, + threading_lock, ): fixture_name = "workload_storageutilization_10G_rbd" measured_op = workload_fio_storageutilization( @@ -614,13 +653,20 @@ def workload_storageutilization_10g_rbd( measurement_dir, tmp_path, target_size=10, + threading_lock=threading_lock, ) return measured_op @pytest.fixture def workload_storageutilization_10g_cephfs( - project, fio_pvc_dict, fio_job_dict, fio_configmap_dict, measurement_dir, tmp_path + project, + fio_pvc_dict, + fio_job_dict, + fio_configmap_dict, + measurement_dir, + tmp_path, + threading_lock, ): fixture_name = "workload_storageutilization_10G_cephfs" measured_op = workload_fio_storageutilization( @@ -632,6 +678,7 @@ def workload_storageutilization_10g_cephfs( measurement_dir, tmp_path, target_size=10, + threading_lock=threading_lock, ) return measured_op diff --git a/tests/manage/monitoring/prometheus/test_alerting_works.py b/tests/manage/monitoring/prometheus/test_alerting_works.py index 6ab2ddc98c4..309001874d4 100644 --- a/tests/manage/monitoring/prometheus/test_alerting_works.py +++ b/tests/manage/monitoring/prometheus/test_alerting_works.py @@ -12,11 +12,11 @@ @blue_squad -def test_alerting_works(): +def test_alerting_works(threading_lock): """ If alerting works then there is at least one alert. """ - prometheus = ocs_ci.utility.prometheus.PrometheusAPI() + prometheus = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) alerts_response = prometheus.get( "alerts", payload={"silenced": False, "inhibited": False} ) @@ -30,11 +30,11 @@ def test_alerting_works(): @pytest.mark.polarion_id("OCS-2503") @bugzilla("1897674") @tier1 -def test_prometheus_rule_failures(): +def test_prometheus_rule_failures(threading_lock): """ There should be no PrometheusRuleFailures alert when OCS is configured. 
""" - prometheus = ocs_ci.utility.prometheus.PrometheusAPI() + prometheus = ocs_ci.utility.prometheus.PrometheusAPI(threading_lock=threading_lock) alerts_response = prometheus.get( "alerts", payload={"silenced": False, "inhibited": False} ) diff --git a/tests/manage/monitoring/prometheus/test_capacity.py b/tests/manage/monitoring/prometheus/test_capacity.py index 75b5fff6164..b927a693788 100644 --- a/tests/manage/monitoring/prometheus/test_capacity.py +++ b/tests/manage/monitoring/prometheus/test_capacity.py @@ -23,12 +23,14 @@ "ceph_cluster_total_used_bytes", "cluster:memory_usage_bytes:sum" ) @skipif_managed_service -def test_rbd_capacity_workload_alerts(workload_storageutilization_97p_rbd): +def test_rbd_capacity_workload_alerts( + workload_storageutilization_97p_rbd, threading_lock +): """ Test that there are appropriate alerts when ceph cluster is utilized via RBD interface. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) measure_end_time = workload_storageutilization_97p_rbd.get("stop") # Check utilization on 97% @@ -88,11 +90,13 @@ def test_rbd_capacity_workload_alerts(workload_storageutilization_97p_rbd): "ceph_cluster_total_used_bytes", "cluster:memory_usage_bytes:sum" ) @skipif_managed_service -def test_cephfs_capacity_workload_alerts(workload_storageutilization_97p_cephfs): +def test_cephfs_capacity_workload_alerts( + workload_storageutilization_97p_cephfs, threading_lock +): """ Test that there are appropriate alerts when ceph cluster is utilized. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) measure_end_time = workload_storageutilization_97p_cephfs.get("stop") # Check utilization on 97% diff --git a/tests/manage/monitoring/prometheus/test_ceph.py b/tests/manage/monitoring/prometheus/test_ceph.py index 01b984171a6..76c570d3c8d 100644 --- a/tests/manage/monitoring/prometheus/test_ceph.py +++ b/tests/manage/monitoring/prometheus/test_ceph.py @@ -14,14 +14,14 @@ @tier4a @pytest.mark.polarion_id("OCS-903") @skipif_managed_service -def test_corrupt_pg_alerts(measure_corrupt_pg): +def test_corrupt_pg_alerts(measure_corrupt_pg, threading_lock): """ Test that there are appropriate alerts when Placement group on one OSD is corrupted.ceph manager is unavailable and that this alert is cleared when the manager is back online. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) alerts = measure_corrupt_pg.get("prometheus_alerts") for target_label, target_msg, target_states, target_severity in [ @@ -60,14 +60,14 @@ def test_corrupt_pg_alerts(measure_corrupt_pg): @tier4a @pytest.mark.polarion_id("OCS-898") @skipif_managed_service -def test_ceph_health(measure_stop_ceph_osd, measure_corrupt_pg): +def test_ceph_health(measure_stop_ceph_osd, measure_corrupt_pg, threading_lock): """ Test that there are appropriate alerts for Ceph health triggered. For this check of Ceph Warning state is used measure_stop_ceph_osd utilization monitor and for Ceph Error state is used measure_corrupt_pg utilization. 
""" - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) alerts = measure_stop_ceph_osd.get("prometheus_alerts") target_label = constants.ALERT_CLUSTERWARNINGSTATE diff --git a/tests/manage/monitoring/prometheus/test_deployment_status.py b/tests/manage/monitoring/prometheus/test_deployment_status.py index 281d6cd2582..03f3bbc8da6 100644 --- a/tests/manage/monitoring/prometheus/test_deployment_status.py +++ b/tests/manage/monitoring/prometheus/test_deployment_status.py @@ -20,13 +20,13 @@ @tier4c @pytest.mark.polarion_id("OCS-1052") @skipif_managed_service -def test_ceph_manager_stopped(measure_stop_ceph_mgr): +def test_ceph_manager_stopped(measure_stop_ceph_mgr, threading_lock): """ Test that there is appropriate alert when ceph manager is unavailable and that this alert is cleared when the manager is back online. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # get alerts from time when manager deployment was scaled down alerts = measure_stop_ceph_mgr.get("prometheus_alerts") @@ -50,13 +50,13 @@ def test_ceph_manager_stopped(measure_stop_ceph_mgr): @tier4c @pytest.mark.polarion_id("OCS-904") @skipif_managed_service -def test_ceph_monitor_stopped(measure_stop_ceph_mon): +def test_ceph_monitor_stopped(measure_stop_ceph_mon, threading_lock): """ Test that there is appropriate alert related to ceph monitor quorum when there is even number of ceph monitors and that this alert is cleared when monitors are back online. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # get alerts from time when manager deployment was scaled down alerts = measure_stop_ceph_mon.get("prometheus_alerts") @@ -93,12 +93,12 @@ def test_ceph_monitor_stopped(measure_stop_ceph_mon): @pytest.mark.parametrize("create_mon_quorum_loss", [True]) @skipif_managed_service @skipif_ocs_version("<4.9") -def test_ceph_mons_quorum_lost(measure_stop_ceph_mon): +def test_ceph_mons_quorum_lost(measure_stop_ceph_mon, threading_lock): """ Test to verify that CephMonQuorumLost alert is seen and that this alert is cleared when monitors are back online. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # get alerts from time when manager deployment was scaled down alerts = measure_stop_ceph_mon.get("prometheus_alerts") @@ -122,12 +122,12 @@ def test_ceph_mons_quorum_lost(measure_stop_ceph_mon): @tier4c @pytest.mark.polarion_id("OCS-900") @skipif_managed_service -def test_ceph_osd_stopped(measure_stop_ceph_osd): +def test_ceph_osd_stopped(measure_stop_ceph_osd, threading_lock): """ Test that there is appropriate alert related to situation when ceph osd is down. Alert is cleared when osd disk is back online. 
""" - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # get alerts from time when manager deployment was scaled down alerts = measure_stop_ceph_osd.get("prometheus_alerts") diff --git a/tests/manage/monitoring/prometheus/test_hpa.py b/tests/manage/monitoring/prometheus/test_hpa.py index 0d09e7a054c..0c354b1160b 100644 --- a/tests/manage/monitoring/prometheus/test_hpa.py +++ b/tests/manage/monitoring/prometheus/test_hpa.py @@ -17,11 +17,11 @@ @marks.polarion_id("OCS-2375") @marks.bugzilla("1836299") @skipif_managed_service -def test_hpa_maxreplica_alert(): +def test_hpa_maxreplica_alert(threading_lock): """ Test to verify that no HPA max replica alert is triggered """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) logger.info( f"Verifying whether {constants.ALERT_KUBEHPAREPLICASMISMATCH} " diff --git a/tests/manage/monitoring/prometheus/test_noobaa.py b/tests/manage/monitoring/prometheus/test_noobaa.py index f60fd0b809a..7a013112a4c 100644 --- a/tests/manage/monitoring/prometheus/test_noobaa.py +++ b/tests/manage/monitoring/prometheus/test_noobaa.py @@ -22,11 +22,11 @@ @skipif_managed_service @skipif_disconnected_cluster @skipif_aws_creds_are_missing -def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota): +def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota, threading_lock): """ Test that there are appropriate alerts when NooBaa Bucket Quota is reached. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) alerts = measure_noobaa_exceed_bucket_quota.get("prometheus_alerts") @@ -119,12 +119,12 @@ def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota): @skipif_managed_service @skipif_disconnected_cluster @skipif_aws_creds_are_missing -def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted): +def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted, threading_lock): """ Test that there are appropriate alerts when target bucket used of namespace store used in namespace bucket is deleted. """ - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) alerts = measure_noobaa_ns_target_bucket_deleted.get("prometheus_alerts") diff --git a/tests/manage/monitoring/prometheus/test_rgw.py b/tests/manage/monitoring/prometheus/test_rgw.py index 91b0aef7004..520e242b3b0 100644 --- a/tests/manage/monitoring/prometheus/test_rgw.py +++ b/tests/manage/monitoring/prometheus/test_rgw.py @@ -19,13 +19,13 @@ @pytest.mark.polarion_id("OCS-2323") @pytest.mark.bugzilla("1953615") @skipif_managed_service -def test_rgw_unavailable(measure_stop_rgw): +def test_rgw_unavailable(measure_stop_rgw, threading_lock): """ Test that there is appropriate alert when RGW is unavailable and that this alert is cleared when the RGW interface is back online. 
""" - api = prometheus.PrometheusAPI() + api = prometheus.PrometheusAPI(threading_lock=threading_lock) # get alerts from time when manager deployment was scaled down alerts = measure_stop_rgw.get("prometheus_alerts") diff --git a/tests/manage/monitoring/prometheusmetrics/test_monitoring_defaults.py b/tests/manage/monitoring/prometheusmetrics/test_monitoring_defaults.py index 405351ef031..b409352f938 100644 --- a/tests/manage/monitoring/prometheusmetrics/test_monitoring_defaults.py +++ b/tests/manage/monitoring/prometheusmetrics/test_monitoring_defaults.py @@ -33,13 +33,13 @@ @pytest.mark.first @pytest.mark.polarion_id("OCS-1261") @skipif_managed_service -def test_monitoring_enabled(): +def test_monitoring_enabled(threading_lock): """ OCS Monitoring is enabled after OCS installation (which is why this test has a post deployment marker) by asking for values of one ceph and one noobaa related metrics. """ - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) if ( storagecluster_independent_check() @@ -120,12 +120,12 @@ def test_ceph_mgr_dashboard_not_deployed(): @tier1 @pytest.mark.polarion_id("OCS-1267") @skipif_managed_service -def test_ceph_rbd_metrics_available(): +def test_ceph_rbd_metrics_available(threading_lock): """ Ceph RBD metrics should be provided via OCP Prometheus as well. See also: https://ceph.com/rbd/new-in-nautilus-rbd-performance-monitoring/ """ - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) list_of_metrics_without_results = metrics.get_missing_metrics( prometheus, metrics.ceph_rbd_metrics ) @@ -143,7 +143,7 @@ def test_ceph_rbd_metrics_available(): @metrics_for_external_mode_required @pytest.mark.polarion_id("OCS-1268") @skipif_managed_service -def test_ceph_metrics_available(): +def test_ceph_metrics_available(threading_lock): """ Ceph metrics as listed in KNIP-634 should be provided via OCP Prometheus. @@ -155,7 +155,7 @@ def test_ceph_metrics_available(): Since ODF 4.9 only subset of all ceph metrics ``ceph_metrics_healthy`` will be always available, as noted in BZ 2028649. """ - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) list_of_metrics_without_results = metrics.get_missing_metrics( prometheus, metrics.ceph_metrics_healthy, @@ -176,14 +176,14 @@ def test_ceph_metrics_available(): @pytest.mark.post_ocp_upgrade @pytest.mark.polarion_id("OCS-1302") @skipif_managed_service -def test_monitoring_reporting_ok_when_idle(workload_idle): +def test_monitoring_reporting_ok_when_idle(workload_idle, threading_lock): """ When nothing is happening, OCP Prometheus reports OCS status as OK. If this test case fails, the status is either reported wrong or the cluster is in a broken state. Either way, a failure here is not good. 
""" - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) health_result = prometheus.query_range( query="ceph_health_status", diff --git a/tests/manage/monitoring/prometheusmetrics/test_ocs_utilization.py b/tests/manage/monitoring/prometheusmetrics/test_ocs_utilization.py index ca77fbb8a03..a573680b1e1 100644 --- a/tests/manage/monitoring/prometheusmetrics/test_ocs_utilization.py +++ b/tests/manage/monitoring/prometheusmetrics/test_ocs_utilization.py @@ -32,12 +32,12 @@ @marks.polarion_id("OCS-2364") @marks.bugzilla("1849309") @skipif_managed_service -def test_mcg_cpu_usage(workload_idle): +def test_mcg_cpu_usage(workload_idle, threading_lock): """ Without any IO workload, cpu utilization of MCG pods should be minimal. No pod should utilize more than 0.1 cpu units. """ - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) cpu_result = prometheus.query_range( query=CPU_USAGE_POD + '{namespace="openshift-storage",pod=~"^noobaa.*"}', start=workload_idle["start"], diff --git a/tests/manage/monitoring/prometheusmetrics/test_rgw.py b/tests/manage/monitoring/prometheusmetrics/test_rgw.py index 0ef739e6bb6..70c7e3d204b 100644 --- a/tests/manage/monitoring/prometheusmetrics/test_rgw.py +++ b/tests/manage/monitoring/prometheusmetrics/test_rgw.py @@ -25,7 +25,9 @@ @tier4c @pytest.mark.polarion_id("OCS-2385") @skipif_managed_service -def test_ceph_rgw_metrics_after_metrics_exporter_respin(rgw_deployments): +def test_ceph_rgw_metrics_after_metrics_exporter_respin( + rgw_deployments, threading_lock +): """ RGW metrics should be provided via OCP Prometheus even after ocs-metrics-exporter pod is respinned. @@ -52,7 +54,7 @@ def test_ceph_rgw_metrics_after_metrics_exporter_respin(rgw_deployments): ) logger.info("Collect RGW metrics") - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) list_of_metrics_without_results = metrics.get_missing_metrics( prometheus, metrics.ceph_rgw_metrics ) diff --git a/tests/manage/monitoring/test_workload_fixture.py b/tests/manage/monitoring/test_workload_fixture.py index 5bba23f8b13..b68fd8f380e 100644 --- a/tests/manage/monitoring/test_workload_fixture.py +++ b/tests/manage/monitoring/test_workload_fixture.py @@ -54,7 +54,7 @@ @blue_squad @pytest.mark.libtest @skipif_managed_service -def test_workload_rbd(workload_storageutilization_50p_rbd): +def test_workload_rbd(workload_storageutilization_50p_rbd, threading_lock): """ Purpose of this test is to make the workload fixture executed, and show how to query prometheus. @@ -62,7 +62,7 @@ def test_workload_rbd(workload_storageutilization_50p_rbd): Note that this test is valid only on 3 osd cluster with all pools using 3 way replication. """ - prometheus = PrometheusAPI() + prometheus = PrometheusAPI(threading_lock=threading_lock) # Asking for values of `ceph_osd_stat_bytes_used` for every 15s in # when the workload fixture was utilizing 50% of the OCS storage. 
result_used = prometheus.query_range( diff --git a/tests/manage/monitoring/test_workload_with_distruptions.py b/tests/manage/monitoring/test_workload_with_distruptions.py index a1803e45b5c..6826afdc501 100644 --- a/tests/manage/monitoring/test_workload_with_distruptions.py +++ b/tests/manage/monitoring/test_workload_with_distruptions.py @@ -263,7 +263,7 @@ def finalizer(): @tier3 @pytest.mark.polarion_id("OCS-5158") @blue_squad - def test_ceph_osd_slow_ops_alert(self, setup): + def test_ceph_osd_slow_ops_alert(self, setup, threading_lock): """ Test to verify bz #1966139, more info about Prometheus alert - #1885441 @@ -279,7 +279,7 @@ def test_ceph_osd_slow_ops_alert(self, setup): storage ends - fail the test """ - api = PrometheusAPI() + api = PrometheusAPI(threading_lock=threading_lock) while get_percent_used_capacity() < self.full_osd_threshold: time_passed_sec = time.perf_counter() - self.start_workload_time diff --git a/tests/manage/pv_services/pvc_clone/test_clone_when_pvc_full.py b/tests/manage/pv_services/pvc_clone/test_clone_when_pvc_full.py index 3e8c1e947c0..6f034f3b98f 100644 --- a/tests/manage/pv_services/pvc_clone/test_clone_when_pvc_full.py +++ b/tests/manage/pv_services/pvc_clone/test_clone_when_pvc_full.py @@ -47,7 +47,7 @@ def setup(self, project_factory, pvc_clone_factory, create_pvcs_and_pods): access_modes_cephfs=[constants.ACCESS_MODE_RWO], ) - def test_clone_when_full(self, pvc_clone_factory, pod_factory): + def test_clone_when_full(self, pvc_clone_factory, pod_factory, threading_lock): """ Create a clone from an existing PVC when the PVC is 100% utilized. Verify data integrity. @@ -57,7 +57,7 @@ def test_clone_when_full(self, pvc_clone_factory, pod_factory): """ pvc_size_expanded = 6 file_name = "fio_full" - prometheus_api = PrometheusAPI() + prometheus_api = PrometheusAPI(threading_lock=threading_lock) # Run IO to utilize 100% of volume log.info("Run IO on all pods to utilise 100% of PVCs") diff --git a/tests/manage/pv_services/pvc_resize/test_pvc_expansion_when_full.py b/tests/manage/pv_services/pvc_resize/test_pvc_expansion_when_full.py index 9f348143bd3..4b1c5cd046f 100644 --- a/tests/manage/pv_services/pvc_resize/test_pvc_expansion_when_full.py +++ b/tests/manage/pv_services/pvc_resize/test_pvc_expansion_when_full.py @@ -43,7 +43,7 @@ def setup(self, create_pvcs_and_pods): access_modes_cephfs=[constants.ACCESS_MODE_RWO], ) - def test_pvc_expansion_when_full(self): + def test_pvc_expansion_when_full(self, threading_lock): """ Verify PVC expansion when the PVC is 100% utilized. Verify utilization alert will stop firing after volume expansion. @@ -79,7 +79,7 @@ def test_pvc_expansion_when_full(self): ) log.info(f"Verified: Used space on pod {pod_obj.name} is 100%") - prometheus_api = PrometheusAPI() + prometheus_api = PrometheusAPI(threading_lock=threading_lock) # Wait till utilization alerts starts for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
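
The change of the shared `threading_lock` fixture from `threading.Lock()` to `threading.RLock()` matters because the same lock is now handed down through nested helpers, for example a Prometheus call that internally issues further `oc` commands via `run_cmd`/`exec_cmd` while the caller may already hold the lock. A minimal sketch of the difference; the helper names are invented for illustration and are not part of the patch:

```python
import threading

# With threading.Lock(), the nested acquisition below would deadlock,
# because a plain lock cannot be re-acquired by the thread that holds it.
# threading.RLock() keeps a per-thread recursion count instead.
lock = threading.RLock()

def run_oc_query():
    with lock:                   # outer acquisition, e.g. a Prometheus query helper
        return refresh_oc_login()

def refresh_oc_login():
    with lock:                   # nested acquisition by the same thread
        return "oc call finished"

print(run_oc_query())            # completes; with threading.Lock() it would hang here
```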
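With the new guard in `ocs_ci/utility/prometheus.py`, constructing `PrometheusAPI` without a lock raises `NoThreadingLockUsedError`, so tests are expected to request the session-scoped `threading_lock` fixture and pass it down. A rough sketch of the intended call pattern, assuming `PrometheusAPI.get` returns the usual requests-style response object (the query string and assertion are illustrative only):

```python
import ocs_ci.utility.prometheus as prometheus


def test_ceph_health_metric(threading_lock):
    # threading_lock is the session-scoped RLock fixture from tests/conftest.py;
    # it serializes the 'oc' calls made while resolving the Prometheus route and token.
    api = prometheus.PrometheusAPI(threading_lock=threading_lock)
    response = api.get("query?query=ceph_health_status")
    assert response.ok

    # Omitting the lock is now rejected at construction time:
    # prometheus.PrometheusAPI()  # raises NoThreadingLockUsedError
```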
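In `pytest_runtest_makereport` the lock is looked up with `call.getfixturevalue("threading_lock")`; pytest's `CallInfo` object does not advertise such a method, and a failed lookup would only be logged by the surrounding `try/except`. One alternative is to resolve the fixture from the test item instead. This is a best-effort sketch under that assumption, not the author's implementation: `item._request` is a private attribute of pytest function items, while `getfixturevalue` itself is public `FixtureRequest` API.

```python
# Hypothetical alternative lookup inside pytest_runtest_makereport(item, call),
# mirroring the hunk in ocs_ci/framework/pytest_customization/ocscilib.py:
metrics = item.get_closest_marker("gather_metrics_on_fail").args
try:
    # Resolve the session-scoped fixture through the item's FixtureRequest;
    # getfixturevalue() instantiates it on demand if the test never requested it.
    threading_lock = item._request.getfixturevalue("threading_lock")
    collect_prometheus_metrics(
        metrics,
        f"{item.name}-{call.when}",
        call.start,
        call.stop,
        threading_lock=threading_lock,
    )
except Exception:
    log.exception("Failed to collect prometheus metrics")
```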