Implement BG alert check and resource utilization checks

Signed-off-by: Mahesh Shetty <[email protected]>
red-hat-storage · Dec 12, 2024 · a7388df · a7388df
1 parent cf1b5d1
commit a7388df
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 23 deletions.
diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py
@@ -4298,6 +4298,24 @@ def get_mon_db_size_in_kb(mon_pod_obj):
     return mon_db_size_kb
 
 
+def get_noobaa_db_usage_percent():
+    """
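+    Get the usage percentage of the filesystem holding /var/lib/pgsql/ on the noobaa db pod
+
+    Returns:
+        str: Use% field of the `df -h` output, including the "%" sign (e.g. "42%")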
+
+    """
+    noobaa_db_pod_obj = pod.get_noobaa_pods(
+        noobaa_label=constants.NOOBAA_DB_LABEL_47_AND_ABOVE
+    )
+    cmd_out = noobaa_db_pod_obj[0].exec_cmd_on_pod(
+        command="df -h /var/lib/pgsql/", out_yaml_format=False
+    )
+    df_out = cmd_out.split()
+    return df_out[-2]
+
+
 def get_noobaa_db_size():
     """
     Get noobaa db size
@@ -4313,7 +4331,7 @@ def get_noobaa_db_size():
         command="df -h /var/lib/pgsql/", out_yaml_format=False
     )
     df_out = cmd_out.split()
-    return df_out[2]
+    return df_out[-5]
 
 
 def get_noobaa_db_used_space():

diff --git a/ocs_ci/helpers/mcg_stress_helper.py b/ocs_ci/helpers/mcg_stress_helper.py
@@ -1,8 +1,11 @@
 import logging
 import concurrent.futures
 import time
+import textwrap
 
-from ocs_ci.helpers.helpers import get_noobaa_db_size, get_noobaa_db_used_space
+from tabulate import tabulate
+
+from ocs_ci.helpers.helpers import get_noobaa_db_size, get_noobaa_db_usage_percent
 from ocs_ci.ocs import constants
 from ocs_ci.ocs.resources.mcg import MCG
 from ocs_ci.ocs.resources.objectbucket import OBC
@@ -24,6 +27,8 @@
     CephHealthException,
     CommandFailed,
 )
+from ocs_ci.ocs.resources.pod import pod_resource_utilization_raw_output_from_adm_top
+from ocs_ci.utility.prometheus import PrometheusAPI
 
 logger = logging.getLogger(__name__)
 
@@ -368,7 +373,7 @@ def delete_objects_in_batches(bucket, batch_size):
         )
 
 
-def run_background_cluster_checks(scale_noobaa_db_pv, event=None):
+def run_background_cluster_checks(scale_noobaa_db_pv, event=None, threading_lock=None):
     """
     Run background checks to verify noobaa health
     and cluster health overall
@@ -381,19 +386,37 @@ def run_background_cluster_checks(scale_noobaa_db_pv, event=None):
 
     """
     ceph_cluster = CephCluster()
+    prometheus_api = PrometheusAPI(threading_lock=threading_lock)
+    prometheus_alert_list = list()
+
+    logger.info(
+        "\n"
+        "\nNow starting background check operations to check the following"
+        "\n1. Nooba Health"
+        "\n2. Ceph Health"
+        "\n3. Noobaa DB usage"
+        "\n4. Prometheus Alerts"
+        "\n5. Memory and CPU utilization for Noobaa pods"
+        "\n"
+    )
 
     @retry(NoobaaHealthException, tries=10, delay=60)
     def check_noobaa_health():
 
         while True:
 
             ceph_cluster.noobaa_health_check()
-            logger.info("BACKGROUND CHECK: Noobaa is healthy... rechecking in 1 minute")
-            time.sleep(60)
+            logger.info(
+                "\n"
+                "\n[BACKGROUND CHECK]"
+                "\nNoobaa is healthy... rechecking in 1 minute"
+                "\n"
+            )
 
             if event.is_set():
-                logger.info("BACKGROUND CHECK: Stopping the Noobaa health check")
+                logger.info("[BACKGROUND CHECK] Stopping the Noobaa health check")
                 break
+            time.sleep(60)
 
     @retry(CephHealthException, tries=10, delay=60)
     def check_ceph_health():
@@ -402,45 +425,109 @@ def check_ceph_health():
 
             if ceph_cluster.get_ceph_health() == constants.CEPH_HEALTH_ERROR:
                 raise CephHealthException
-            logger.info("BACKGROUND CHECK: Ceph is healthy... rechecking in 1 minute")
-            time.sleep(60)
+            logger.info(
+                "\n"
+                "\n[BACKGROUND CHECK]"
+                "\nCeph is healthy... rechecking in 1 minute"
+                "\n"
+            )
 
             if event.is_set():
-                logger.info("BACKGROUND CHECK: Stopping the Ceph health check")
+                logger.info("[BACKGROUND CHECK] Stopping the Ceph health check")
                 break
+            time.sleep(60)
 
     @retry(CommandFailed, tries=10, delay=60)
     def check_noobaa_db_size():
 
         while True:
-
-            nb_db_pv_used = get_noobaa_db_used_space()
-            nb_db_pv_size = get_noobaa_db_size()
-            used_percent = int((nb_db_pv_used * 100) / nb_db_pv_size)
+            used_percent = int(get_noobaa_db_usage_percent().split("%")[0])
+            nb_db_pv_size = int(get_noobaa_db_size().split("G")[0])
             if used_percent > 85:
                 logger.info(
-                    f"BACKGROUND CHECK: Noobaa db is {used_percent} percentage. Increasing the noobaa db by 50%"
+                    f"\n"
+                    f"\n[BACKGROUND CHECK]"
+                    f"\nNoobaa db is {used_percent} percentage. Increasing the noobaa db by 50%"
+                    f"\n"
                 )
-                new_size = int(nb_db_pv_size + int(nb_db_pv_size.split("G")[0]) / 2)
+                new_size = int(nb_db_pv_size + (nb_db_pv_size // 2))
                 scale_noobaa_db_pv(pvc_size=new_size)
                 logger.info(
-                    f"BACKGROUND CHECK: Scaled noobaa db to new size {new_size}"
+                    f"\n"
+                    f"\n[BACKGROUND CHECK]"
+                    f"\nScaled noobaa db to new size {new_size}"
+                    f"\n"
                 )
             logger.info(
-                f"BACKGROUND CHECK: Current noobaa db usage is at {used_percent}%... Rechecking in 5 minutes..."
+                f"\n"
+                f"\n[BACKGROUND CHECK]"
+                f"\nCurrent noobaa db usage is at {used_percent}%... Rechecking in 5 minutes..."
+                f"\n"
             )
+
+            if event.is_set():
+                logger.info("[BACKGROUND CHECK] Stopping the Noobaa db size check")
+                break
             time.sleep(300)
 
+    def check_prometheus_alerts():
+
+        while True:
+
+            prometheus_api.prometheus_log(prometheus_alert_list)
+            alert_tab = list()
+            alert_printed = list()
+            alert_tab.append(["Alert Name", "Description", "State"])
+            for alert in prometheus_alert_list:
+                if alert["labels"]["alertname"] in alert_printed:
+                    continue
+                alert_tab.append(
+                    [
+                        alert["labels"]["alertname"].strip(),
+                        "\n".join(
+                            textwrap.wrap(alert["annotations"]["description"], width=50)
+                        ),
+                        alert["state"],
+                    ]
+                )
+                alert_printed.append(alert["labels"]["alertname"])
+            logger.info(
+                f"\n"
+                f"\n[BACKGROUND CHECK]"
+                f"\nThese are the alerts so far in Prometheus: "
+                f"\n{tabulate(alert_tab[1:], headers=alert_tab[0], tablefmt='grid')}"
+                f"\n"
+            )
+
             if event.is_set():
-                logger.info("BACKGROUND CHECK: Stopping the Noobaa db size check")
+                logger.info("[BACKGROUND CHECK] Stopping Prometheus alert logging")
                 break
+            time.sleep(300)
 
-    logger.info("Initiating background ops")
-    executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
+    def check_noobaa_pod_resource_utilization():
+
+        while True:
+            logger.info(
+                f"\n"
+                f"\n[BACKGROUND CHECK]"
+                f"\nCurrent noobaa pod resource utilization: "
+                f"\n{pod_resource_utilization_raw_output_from_adm_top(selector=constants.NOOBAA_APP_LABEL)}"
+                f"\n"
+            )
+
+            if event.is_set():
+                logger.info(
+                    "[BACKGROUND CHECK] Stopping noobaa pod resource utilization checks"
+                )
+                break
+            time.sleep(300)
+
+    executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
     futures_obj = list()
     futures_obj.append(executor.submit(check_noobaa_health))
     futures_obj.append(executor.submit(check_ceph_health))
     futures_obj.append(executor.submit(check_noobaa_db_size))
-
+    futures_obj.append(executor.submit(check_prometheus_alerts))
+    futures_obj.append(executor.submit(check_noobaa_pod_resource_utilization))
     for future in futures_obj:
         future.result()
diff --git a/ocs_ci/ocs/resources/pod.py b/ocs_ci/ocs/resources/pod.py
@@ -3432,20 +3432,26 @@ def wait_for_osd_pods_having_ids(osd_ids, timeout=180, sleep=10):
 
 def pod_resource_utilization_raw_output_from_adm_top(
     namespace=config.ENV_DATA["cluster_namespace"],
+    selector=None,
 ):
     """
     Gets the pod's memory utilization using adm top command.
 
     Args:
         namespace (str) : The pod's namespace where the adm top command has to be run
+        selector (str): Label selector passed as --selector to filter the pods, e.g. 'app=noobaa'. If None, no filter is applied
 
     Returns:
         str : Raw output of adm top pods command
 
     """
     obj = ocp.OCP()
+    command = f"adm top pods -n {namespace}"
+    if selector:
+        command += f" --selector={selector}"
+
     resource_utilization_all_pods = obj.exec_oc_cmd(
-        command=f"adm top pods -n {namespace}", out_yaml_format=False
+        command=command, out_yaml_format=False
     )
     logger.info("Command RAW output of adm top pods")
     logger.info(f"{resource_utilization_all_pods}")

diff --git a/tests/cross_functional/stress/test_noobaa_under_stress.py b/tests/cross_functional/stress/test_noobaa_under_stress.py
@@ -32,6 +32,7 @@ def test_noobaa_under_stress(
         bucket_factory,
         scale_noobaa_resources_session,
         scale_noobaa_db_pod_pv_size,
+        threading_lock,
     ):
         """
         Stress Noobaa by performing bulk s3 operations. This consists mainly 3 stages
@@ -60,7 +61,10 @@ def test_noobaa_under_stress(
         bg_executor = ThreadPoolExecutor(max_workers=1)
 
         bg_future = bg_executor.submit(
-            run_background_cluster_checks, scale_noobaa_db_pod_pv_size, event=bg_event
+            run_background_cluster_checks,
+            scale_noobaa_db_pod_pv_size,
+            event=bg_event,
+            threading_lock=threading_lock,
         )
 
         # Iterate and stress the cluster with object upload
@@ -87,6 +91,7 @@ def test_noobaa_under_stress(
                     self.base_setup_buckets,
                     iteration_no=i,
                     event=event,
+                    multiplier=i,
                 )
             )