From 46079cf913463a899dac8ad490069ee1d1970578 Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Fri, 6 Dec 2024 18:21:56 +0530 Subject: [PATCH] Background check implementation and Implementing multiplier upload Signed-off-by: Mahesh Shetty --- ocs_ci/helpers/helpers.py | 18 +++ ocs_ci/helpers/mcg_stress_helper.py | 111 ++++++++++++++++-- ocs_ci/ocs/constants.py | 5 + tests/conftest.py | 4 + .../stress/test_noobaa_under_stress.py | 19 +++ 5 files changed, 147 insertions(+), 10 deletions(-) diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py index e3fca3a71762..623d94ffcb40 100644 --- a/ocs_ci/helpers/helpers.py +++ b/ocs_ci/helpers/helpers.py @@ -4298,6 +4298,24 @@ def get_mon_db_size_in_kb(mon_pod_obj): return mon_db_size_kb +def get_noobaa_db_size(): + """ + Get noobaa db size + + Returns: + str: Noobaa db size + + """ + noobaa_db_pod_obj = pod.get_noobaa_pods( + noobaa_label=constants.NOOBAA_DB_LABEL_47_AND_ABOVE + ) + cmd_out = noobaa_db_pod_obj[0].exec_cmd_on_pod( + command="df -h /var/lib/pgsql/", out_yaml_format=False + ) + df_out = cmd_out.split() + return df_out[2] + + def get_noobaa_db_used_space(): """ Get noobaa db size diff --git a/ocs_ci/helpers/mcg_stress_helper.py b/ocs_ci/helpers/mcg_stress_helper.py index f78ca692ba7e..3eea7f57fcb2 100644 --- a/ocs_ci/helpers/mcg_stress_helper.py +++ b/ocs_ci/helpers/mcg_stress_helper.py @@ -1,6 +1,9 @@ import logging import concurrent.futures +import time +from ocs_ci.helpers.helpers import get_noobaa_db_size, get_noobaa_db_used_space +from ocs_ci.ocs import constants from ocs_ci.ocs.resources.mcg import MCG from ocs_ci.ocs.resources.objectbucket import OBC from ocs_ci.ocs.resources.bucket_policy import NoobaaAccount @@ -14,11 +17,20 @@ list_objects_in_batches, s3_delete_objects, ) +from ocs_ci.utility.retry import retry +from ocs_ci.ocs.cluster import CephCluster +from ocs_ci.ocs.exceptions import ( + NoobaaHealthException, + CephHealthException, + CommandFailed, +) logger = logging.getLogger(__name__) -def upload_objs_to_buckets(mcg_obj, pod_obj, buckets, iteration_no, event=None): +def upload_objs_to_buckets( + mcg_obj, pod_obj, buckets, iteration_no, event=None, multiplier=1 +): """ This will upload objects present in the stress-cli pod to the buckets provided concurrently @@ -46,15 +58,16 @@ def upload_objs_to_buckets(mcg_obj, pod_obj, buckets, iteration_no, event=None): logger.info( f"OBJECT UPLOAD: Uploading objects to the bucket {bucket.name}" ) - future = executor.submit( - sync_object_directory, - pod_obj, - src_path, - f"s3://{bucket.name}/{iteration_no}/", - s3_obj, - timeout=20000, - ) - futures.append(future) + for i in range(multiplier): + future = executor.submit( + sync_object_directory, + pod_obj, + src_path, + f"s3://{bucket.name}/{iteration_no}/{i+1}/", + s3_obj, + timeout=20000, + ) + futures.append(future) logger.info( "OBJECT UPLOAD: Waiting for the objects upload to complete for all the buckets" @@ -353,3 +366,81 @@ def delete_objects_in_batches(bucket, batch_size): logger.info( f"Total objects deleted {total_objs_deleted} in bucket {bucket_name}" ) + + +def run_background_cluster_checks(scale_noobaa_db_pv, event=None): + """ + Run background checks to verify noobaa health + and cluster health overall + + 1. Check Noobaa Health + 2. Check Ceph Health + 3. Check Noobaa db usage + 4. Check for any alerts + 5. 
Memory and CPU utilization
+
+    Args:
+        scale_noobaa_db_pv (function): Callable that scales the noobaa db pv,
+            e.g. the scale_noobaa_db_pod_pv_size fixture
+        event (threading.Event): Event used to signal the background checks to stop
+
+    """
+    ceph_cluster = CephCluster()
+
+    @retry(NoobaaHealthException, tries=10, delay=60)
+    def check_noobaa_health():
+        while True:
+            ceph_cluster.noobaa_health_check()
+            logger.info("BACKGROUND CHECK: Noobaa is healthy... rechecking in 1 minute")
+            time.sleep(60)
+
+            if event and event.is_set():
+                logger.info("BACKGROUND CHECK: Stopping the Noobaa health check")
+                break
+
+    @retry(CephHealthException, tries=10, delay=60)
+    def check_ceph_health():
+        while True:
+            if ceph_cluster.get_ceph_health() == constants.CEPH_HEALTH_ERROR:
+                raise CephHealthException("Ceph cluster health is in HEALTH_ERR state")
+            logger.info("BACKGROUND CHECK: Ceph is healthy... rechecking in 1 minute")
+            time.sleep(60)
+
+            if event and event.is_set():
+                logger.info("BACKGROUND CHECK: Stopping the Ceph health check")
+                break
+
+    @retry(CommandFailed, tries=10, delay=60)
+    def check_noobaa_db_size():
+        while True:
+            # get_noobaa_db_used_space/get_noobaa_db_size return df -h style
+            # strings (e.g. "50G"); strip the G suffix before doing arithmetic
+            nb_db_pv_used = float(get_noobaa_db_used_space().split("G")[0])
+            nb_db_pv_size = float(get_noobaa_db_size().split("G")[0])
+            used_percent = int((nb_db_pv_used * 100) / nb_db_pv_size)
+            if used_percent > 85:
+                logger.info(
+                    f"BACKGROUND CHECK: Noobaa db usage is at {used_percent}%. Increasing the noobaa db pv size by 50%"
+                )
+                new_size = int(nb_db_pv_size * 1.5)
+                scale_noobaa_db_pv(pvc_size=new_size)
+                logger.info(
+                    f"BACKGROUND CHECK: Scaled noobaa db to new size {new_size}"
+                )
+            logger.info(
+                f"BACKGROUND CHECK: Current noobaa db usage is at {used_percent}%... Rechecking in 5 minutes..."
+            )
+            time.sleep(300)
+
+            if event and event.is_set():
+                logger.info("BACKGROUND CHECK: Stopping the Noobaa db size check")
+                break
+
+    logger.info("Initiating background ops")
+    executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
+    futures_obj = list()
+    futures_obj.append(executor.submit(check_noobaa_health))
+    futures_obj.append(executor.submit(check_ceph_health))
+    futures_obj.append(executor.submit(check_noobaa_db_size))
+
+    for future in futures_obj:
+        future.result()
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index 0666b7aa31aa..95c31de92276 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -140,6 +140,11 @@
 HEALTHY_OB_CLI_MODE = "Mode:OPTIMAL"
 HEALTHY_PV_BS = ["OPTIMAL", "LOW_CAPACITY"]
 
+# Ceph health states
+CEPH_HEALTH_WARN = "HEALTH_WARN"
+CEPH_HEALTH_ERROR = "HEALTH_ERR"
+CEPH_HEALTH_OK = "HEALTH_OK"
+
 # noobaa-core config.js parameters
 CONFIG_JS_PREFIX = "CONFIG_JS_"
 BUCKET_REPLICATOR_DELAY_PARAM = CONFIG_JS_PREFIX + "BUCKET_REPLICATOR_DELAY"
diff --git a/tests/conftest.py b/tests/conftest.py
index 2e77d31d14f3..c40905daa1d2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8390,6 +8390,10 @@ def finalizer():
 
 @pytest.fixture()
 def scale_noobaa_db_pod_pv_size(request):
+    return scale_noobaa_db_pv(request)
+
+
+def scale_noobaa_db_pv(request):
     """
     This fixture helps to scale the noobaa db pv size. It
     follows KCS: https://access.redhat.com/solutions/6976547
diff --git a/tests/cross_functional/stress/test_noobaa_under_stress.py b/tests/cross_functional/stress/test_noobaa_under_stress.py
index 23b9646d72cd..e7c5bca81adb 100644
--- a/tests/cross_functional/stress/test_noobaa_under_stress.py
+++ b/tests/cross_functional/stress/test_noobaa_under_stress.py
@@ -11,6 +11,7 @@
     list_objs_from_bucket,
     download_objs_from_bucket,
     delete_objects_in_batches,
+    run_background_cluster_checks,
 )
 
 logger = logging.getLogger(__name__)
@@ -29,6 +30,7 @@ def test_noobaa_under_stress(
     rgw_obj_session,
     stress_test_directory_setup,
     bucket_factory,
+    scale_noobaa_resources_session,
 ):
     """
     Stress Noobaa by performing bulk s3 operations. This consists mainly 3 stages
@@ -162,3 +164,25 @@
     logger.info("Waiting for all the delete object operations to complete")
     for future in futures:
         future.result()
+
+
+def test_sample(scale_noobaa_db_pod_pv_size):
+    """
+    Sample test that runs the background cluster checks for five minutes
+    and then signals them to stop
+
+    """
+    import time
+
+    bg_event = Event()
+    executor = ThreadPoolExecutor(max_workers=1)
+
+    bg_future = executor.submit(
+        run_background_cluster_checks, scale_noobaa_db_pod_pv_size, event=bg_event
+    )
+
+    # let the background checks run for a while before stopping them
+    time.sleep(300)
+
+    bg_event.set()
+    bg_future.result()
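
For reference, the multiplier added to upload_objs_to_buckets fans each bucket's upload out into several sync operations, one per copy, each writing under its own s3://<bucket>/<iteration>/<copy>/ prefix. The standalone sketch below reproduces that fan-out outside the framework; do_sync and upload_with_multiplier are illustrative stand-ins, not part of ocs_ci.

# Standalone sketch of the multiplier fan-out; do_sync stands in for
# sync_object_directory, and all names here are illustrative only.
from concurrent.futures import ThreadPoolExecutor


def do_sync(src_path, dst_uri):
    # placeholder for the real sync of a local directory to an S3 prefix
    print(f"syncing {src_path} -> {dst_uri}")


def upload_with_multiplier(buckets, src_path, iteration_no, multiplier=1):
    futures = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for bucket in buckets:
            # each copy lands under its own prefix, e.g. s3://bucket-a/2/1/
            for i in range(multiplier):
                dst = f"s3://{bucket}/{iteration_no}/{i + 1}/"
                futures.append(executor.submit(do_sync, src_path, dst))
    for future in futures:
        future.result()


upload_with_multiplier(["bucket-a", "bucket-b"], "/stress/objects", iteration_no=2, multiplier=3)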
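
check_noobaa_db_size assumes get_noobaa_db_used_space and get_noobaa_db_size return df -h style strings with a G suffix; if another unit ever shows up, the percentage math breaks. Below is a minimal unit-aware sketch of that computation, assuming the same string format. parse_df_size and db_used_percent are hypothetical helpers introduced only for this illustration.

def parse_df_size(value):
    """Convert a df -h style size string (e.g. '50G', '512M') to GiB as a float."""
    units = {"K": 1 / (1024 * 1024), "M": 1 / 1024, "G": 1, "T": 1024}
    return float(value[:-1]) * units[value[-1].upper()]


def db_used_percent(used_str, size_str):
    """Return the used percentage of the noobaa db pv, rounded down."""
    return int(parse_df_size(used_str) * 100 / parse_df_size(size_str))


if __name__ == "__main__":
    # Example: 46G used out of a 50G pv -> 92%, which would trigger the resize
    print(db_used_percent("46G", "50G"))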
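
scale_noobaa_db_pod_pv_size (and the scale_noobaa_db_pv helper extracted from it) ultimately grow the noobaa db PVC, following the referenced KCS. As a rough standalone sketch of what that resize amounts to, the snippet below patches the PVC directly with oc. It assumes the PVC is named db-noobaa-db-pg-0 in the openshift-storage namespace and that the storage class allows volume expansion; it is not the fixture's actual implementation.

# Hedged sketch, not the fixture's implementation. Assumes the noobaa db PVC is
# "db-noobaa-db-pg-0" in "openshift-storage" and that `oc` is already logged in.
import json
import subprocess


def expand_noobaa_db_pvc(new_size_gi, pvc_name="db-noobaa-db-pg-0", namespace="openshift-storage"):
    """Patch the noobaa db PVC to request a larger size (expansion only)."""
    patch = {"spec": {"resources": {"requests": {"storage": f"{new_size_gi}Gi"}}}}
    subprocess.run(
        ["oc", "-n", namespace, "patch", "pvc", pvc_name, "--type", "merge", "-p", json.dumps(patch)],
        check=True,
    )


# Example: grow the pv to 75Gi once usage crosses the 85% threshold
# expand_noobaa_db_pvc(75)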