From e444e7ec9b1b2426b54cf154a1b2b3185c2f8d1f Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Mon, 18 Mar 2024 10:47:03 +0530 Subject: [PATCH] 4.14 System test: Test log based replication deletion sync feature with noobaa related disruptive ops :bomb: (#8534) Signed-off-by: Mahesh Shetty --- ocs_ci/ocs/bucket_utils.py | 36 ++++ ocs_ci/ocs/constants.py | 2 + ocs_ci/ocs/resources/mockup_bucket_logger.py | 28 +++ tests/conftest.py | 82 +++++++++ tests/cross_functional/conftest.py | 69 +++++--- .../test_mcg_replication_with_disruptions.py | 164 ++++++++++++++++++ 6 files changed, 358 insertions(+), 23 deletions(-) diff --git a/ocs_ci/ocs/bucket_utils.py b/ocs_ci/ocs/bucket_utils.py index 7a3a2d6825a..6a992ebce0d 100644 --- a/ocs_ci/ocs/bucket_utils.py +++ b/ocs_ci/ocs/bucket_utils.py @@ -2604,3 +2604,39 @@ def bulk_s3_put_bucket_lifecycle_config(mcg_obj, buckets, lifecycle_config): Bucket=bucket.name, LifecycleConfiguration=lifecycle_config ) logger.info("Applied lifecyle rule on all the buckets") + + +def upload_test_objects_to_source_and_wait_for_replication( + mcg_obj, source_bucket, target_bucket, mockup_logger, timeout +): + """ + Upload a set of objects to the source bucket, log the operations and wait for the replication to complete. + + """ + logger.info("Uploading test objects and waiting for replication to complete") + mockup_logger.upload_test_objs_and_log(source_bucket.name) + + assert compare_bucket_object_list( + mcg_obj, + source_bucket.name, + target_bucket.name, + timeout=timeout, + ), f"Standard replication failed to complete in {timeout} seconds" + + +def delete_objects_from_source_and_wait_for_deletion_sync( + mcg_obj, source_bucket, target_bucket, mockup_logger, timeout +): + """ + Delete all objects from the source bucket, log the operations and wait for the deletion sync to complete.
+ + """ + logger.info("Deleting source objects and waiting for deletion sync with target") + mockup_logger.delete_all_objects_and_log(source_bucket.name) + + assert compare_bucket_object_list( + mcg_obj, + source_bucket.name, + target_bucket.name, + timeout=timeout, + ), f"Deletion sync failed to complete in {timeout} seconds" diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index 22b5ef74c02..6b2e76237a6 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -288,6 +288,7 @@ DEFAULT_NOOBAA_BACKINGSTORE = "noobaa-default-backing-store" DEFAULT_NOOBAA_BUCKETCLASS = "noobaa-default-bucket-class" NOOBAA_RESOURCE_NAME = "noobaa" +NOOBAA_DB_PVC_NAME = "db-noobaa-db-pg-0" MIN_PV_BACKINGSTORE_SIZE_IN_GB = 17 JENKINS_BUILD = "jax-rs-build" JENKINS_BUILD_COMPLETE = "Complete" @@ -1109,6 +1110,7 @@ FUSIONAAS_PLATFORM, ] BAREMETAL_PLATFORMS = [BAREMETAL_PLATFORM, BAREMETALPSI_PLATFORM] +DEFAULT_AWS_REGION = "us-east-2" HCI_PROVIDER_CLIENT_PLATFORMS = [ HCI_BAREMETAL, diff --git a/ocs_ci/ocs/resources/mockup_bucket_logger.py b/ocs_ci/ocs/resources/mockup_bucket_logger.py index ddc532dc135..8c535493a23 100644 --- a/ocs_ci/ocs/resources/mockup_bucket_logger.py +++ b/ocs_ci/ocs/resources/mockup_bucket_logger.py @@ -56,6 +56,10 @@ def __init__(self, awscli_pod, mcg_obj, bucket_factory, platform, region): f"ls -A1 {constants.AWSCLI_TEST_OBJ_DIR}" ).split(" ") + @property + def standard_test_obj_list(self): + return self._standard_test_obj_list + def upload_test_objs_and_log(self, bucket_name): """ Uploads files from files_dir to the MCG bucket and write matching @@ -97,6 +101,30 @@ def upload_arbitrary_object_and_log(self, bucket_name): self._upload_mockup_logs(bucket_name, [obj_name], "PUT") + def delete_objs_and_log(self, bucket_name, objs): + """ + Delete list of objects from the MCG bucket and write + matching mockup logs + + Args: + bucket_name(str): Name of the MCG bucket + objs(list): List of the objects to delete + + """ + logger.info(f"Deleting 
the {objs} from the bucket") + obj_list = list_objects_from_bucket( + self.awscli_pod, + f"s3://{bucket_name}", + s3_obj=self.mcg_obj, + ) + if set(objs).issubset(set(obj_list)): + for i in range(len(objs)): + s3cmd = craft_s3_command( + f"rm s3://{bucket_name}/{objs[i]}", self.mcg_obj + ) + self.awscli_pod.exec_cmd_on_pod(s3cmd) + self._upload_mockup_logs(bucket_name, objs, "DELETE") + def delete_all_objects_and_log(self, bucket_name): """ Deletes all objects from the MCG bucket and write matching mockup logs diff --git a/tests/conftest.py b/tests/conftest.py index fe45b516215..ddf0b954d0f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,6 +56,8 @@ from ocs_ci.ocs.ocp import OCP from ocs_ci.ocs.resources import pvc from ocs_ci.ocs.resources.bucket_policy import gen_bucket_policy +from ocs_ci.ocs.resources.mcg_replication_policy import AwsLogBasedReplicationPolicy +from ocs_ci.ocs.resources.mockup_bucket_logger import MockupBucketLogger from ocs_ci.ocs.scale_lib import FioPodScale from ocs_ci.ocs.utils import ( setup_ceph_toolbox, @@ -7439,3 +7441,83 @@ def finalizer(): request.addfinalizer(finalizer) return factory + + +@pytest.fixture() +def reduce_replication_delay_setup(add_env_vars_to_noobaa_core_class): + """ + A fixture to reduce the replication delay to one minute. + + Args: + add_env_vars_to_noobaa_core_class (function): A function to add env vars to the noobaa-core pod + + """ + log.warning("Reducing replication delay") + + def factory(new_delay_in_miliseconds=60 * 1000): + new_env_var_tuples = [ + (constants.BUCKET_REPLICATOR_DELAY_PARAM, new_delay_in_miliseconds), + (constants.BUCKET_LOG_REPLICATOR_DELAY_PARAM, new_delay_in_miliseconds), + ] + add_env_vars_to_noobaa_core_class(new_env_var_tuples) + + return factory + + +@pytest.fixture() +def aws_log_based_replication_setup( + awscli_pod_session, mcg_obj_session, bucket_factory, reduce_replication_delay_setup +): + """ + A fixture to set up standard log-based replication with deletion sync.
+ + Args: + awscli_pod_session(Pod): A pod running the AWS CLI + mcg_obj_session(MCG): An MCG object + bucket_factory: A bucket factory fixture + + Returns: + MockupBucketLogger: A MockupBucketLogger object + Bucket: The source bucket + Bucket: The target bucket + + """ + + reduce_replication_delay_setup() + + def factory(bucketclass_dict=None): + log.info("Starting log-based replication setup") + if bucketclass_dict is None: + bucketclass_dict = { + "interface": "OC", + "namespace_policy_dict": { + "type": "Single", + "namespacestore_dict": { + constants.AWS_PLATFORM: [(1, constants.DEFAULT_AWS_REGION)] + }, + }, + } + target_bucket = bucket_factory(bucketclass=bucketclass_dict)[0] + + mockup_logger = MockupBucketLogger( + awscli_pod=awscli_pod_session, + mcg_obj=mcg_obj_session, + bucket_factory=bucket_factory, + platform=constants.AWS_PLATFORM, + region=constants.DEFAULT_AWS_REGION, + ) + replication_policy = AwsLogBasedReplicationPolicy( + destination_bucket=target_bucket.name, + sync_deletions=True, + logs_bucket=mockup_logger.logs_bucket_uls_name, + ) + + source_bucket = bucket_factory( + 1, bucketclass=bucketclass_dict, replication_policy=replication_policy + )[0] + + log.info("log-based replication setup complete") + + return mockup_logger, source_bucket, target_bucket + + return factory diff --git a/tests/cross_functional/conftest.py b/tests/cross_functional/conftest.py index 441f1559d37..36c65dd6f8d 100644 --- a/tests/cross_functional/conftest.py +++ b/tests/cross_functional/conftest.py @@ -319,31 +319,10 @@ def finalizer(): @pytest.fixture() -def noobaa_db_backup_and_recovery(request, snapshot_factory): - """ - Verify noobaa backup and recovery - - 1. Take snapshot db-noobaa-db-0 PVC and retore it to PVC - 2. Scale down the statefulset noobaa-db - 3. Get the yaml of the current PVC, db-noobaa-db-0 and - change the parameter persistentVolumeReclaimPolicy to Retain for restored PVC - 4. 
Delete both PVCs, the PV for the original claim db-noobaa-db-0 will be removed. - The PV for claim db-noobaa-db-0-snapshot-restore will move to ‘Released’ - 5. Edit again restore PV and remove the claimRef section. - The volume will transition to Available. - 6. Edit the yaml db-noobaa-db-0.yaml and change the setting volumeName to restored PVC. - 7. Scale up the stateful set again and the pod should be running - - """ +def noobaa_db_backup(request, snapshot_factory): restore_pvc_objs = [] - def factory(snapshot_factory=snapshot_factory): - # Get noobaa pods before execution - noobaa_pods = pod.get_noobaa_pods() - - # Get noobaa PVC before execution - noobaa_pvc_obj = pvc.get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"]) - noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName") + def factory(noobaa_pvc_obj): # Take snapshot db-noobaa-db-0 PVC logger.info(f"Creating snapshot of the {noobaa_pvc_obj[0].name} PVC") @@ -381,6 +360,15 @@ def factory(snapshot_factory=snapshot_factory): f"Succeesfuly created PVC {restore_pvc_obj.name} " f"from snapshot {snap_obj.name}" ) + return restore_pvc_objs, snap_obj + + return factory + + +@pytest.fixture() +def noobaa_db_recovery_from_backup(request): + def factory(snap_obj, noobaa_pvc_obj, noobaa_pods): + noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName") # Scale down the statefulset noobaa-db modify_statefulset_replica_count( @@ -474,6 +462,41 @@ def factory(snapshot_factory=snapshot_factory): "Changed the parameter persistentVolumeReclaimPolicy to Delete again" ) + return factory + + +@pytest.fixture() +def noobaa_db_backup_and_recovery( + request, snapshot_factory, noobaa_db_backup, noobaa_db_recovery_from_backup +): + """ + Verify noobaa backup and recovery + + 1. Take snapshot db-noobaa-db-0 PVC and restore it to PVC + 2. Scale down the statefulset noobaa-db + 3.
Get the yaml of the current PVC, db-noobaa-db-0 and + change the parameter persistentVolumeReclaimPolicy to Retain for restored PVC + 4. Delete both PVCs, the PV for the original claim db-noobaa-db-0 will be removed. + The PV for claim db-noobaa-db-0-snapshot-restore will move to ‘Released’ + 5. Edit again restore PV and remove the claimRef section. + The volume will transition to Available. + 6. Edit the yaml db-noobaa-db-0.yaml and change the setting volumeName to restored PVC. + 7. Scale up the stateful set again and the pod should be running + + """ + restore_pvc_objs = [] + + def factory(snapshot_factory=snapshot_factory): + global restore_pvc_objs + # Get noobaa pods before execution + noobaa_pods = pod.get_noobaa_pods() + + # Get noobaa PVC before execution + noobaa_pvc_obj = pvc.get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"]) + + restore_pvc_objs, snap_obj = noobaa_db_backup(noobaa_pvc_obj) + noobaa_db_recovery_from_backup(snap_obj, noobaa_pvc_obj, noobaa_pods) + def finalizer(): # Get the statefulset replica count sst_obj = OCP( diff --git a/tests/cross_functional/system_test/test_mcg_replication_with_disruptions.py b/tests/cross_functional/system_test/test_mcg_replication_with_disruptions.py index 3b97899cae7..10f244b5b99 100644 --- a/tests/cross_functional/system_test/test_mcg_replication_with_disruptions.py +++ b/tests/cross_functional/system_test/test_mcg_replication_with_disruptions.py @@ -1,7 +1,11 @@ import logging import pytest +import random +import time +from concurrent.futures import ThreadPoolExecutor +from ocs_ci.ocs import constants from ocs_ci.framework.testlib import ( E2ETest, skipif_ocs_version, @@ -15,19 +19,28 @@ skipif_vsphere_ipi, magenta_squad, mcg, + bugzilla, + polarion_id, ) from ocs_ci.ocs.node import get_worker_nodes, get_node_objs from ocs_ci.ocs.bucket_utils import ( compare_bucket_object_list, patch_replication_policy_to_bucket, write_random_test_objects_to_bucket, + upload_test_objects_to_source_and_wait_for_replication, + 
update_replication_policy, ) from ocs_ci.ocs import ocp +from ocs_ci.ocs.resources.pvc import get_pvc_objs from ocs_ci.ocs.utils import get_pod_name_by_pattern from ocs_ci.ocs.resources.pod import ( delete_pods, wait_for_pods_to_be_running, get_rgw_pods, + get_noobaa_db_pod, + get_noobaa_core_pod, + get_noobaa_pods, + wait_for_noobaa_pods_running, ) from ocs_ci.utility.retry import retry from ocs_ci.ocs.exceptions import CommandFailed, ResourceWrongStatusException @@ -228,3 +241,154 @@ def test_replication_with_disruptions( mcg_obj_session, source_bucket_name, target_bucket_name ) logger.info("Objects sync works even when the cluster is rebooted") + + +@system_test +@magenta_squad +@skipif_vsphere_ipi +class TestLogBasedReplicationWithDisruptions: + @retry(Exception, tries=10, delay=5) + def delete_objs_in_batch(self, objs_to_delete, mockup_logger, source_bucket): + """ + This function deletes objects in a batch + """ + for obj in objs_to_delete: + mockup_logger.delete_objs_and_log(source_bucket.name, [obj]) + # adding momentary sleep just to slow down the deletion + # process + time.sleep(5) + logger.info(f"Successfully deleted these objects: {objs_to_delete}") + + @polarion_id("OCS-5457") + @bugzilla("2266805") + def test_log_based_replication_with_disruptions( + self, + mcg_obj_session, + aws_log_based_replication_setup, + noobaa_db_backup, + noobaa_db_recovery_from_backup, + setup_mcg_bg_features, + validate_mcg_bg_features, + ): + """ + This is a system test flow to test log based bucket replication + deletion sync is not impacted due to some noobaa specific disruptions + like noobaa pod restarts, noobaa db backup & recovery etc + + 1. Setup log based bucket replication between the buckets + 2. Upload some objects and make sure replication works + 3. Keep deleting some objects from the source bucket and make sure + deletion sync works as expected throughout. + 4.
In another thread, restart the noobaa pods (db & core), make sure + deletion sync for the step-3 deletions works as expected + 5. Now take backup of Noobaa db using PV backup method + 6. Remove the log based replication rules, perform some deletion in + source bucket. Make sure deletion sync doesn't work + 7. Recover noobaa db from the backup taken in step-5 + 8. Now check if deletion sync works by deleting some objects from + source bucket. Note: Expectation is still unclear + 9. Now patch the bucket to remove complete replication policy and + make sure neither replication nor deletion sync works + + """ + # entry criteria setup + feature_setup_map = setup_mcg_bg_features( + num_of_buckets=5, + object_amount=5, + is_disruptive=True, + skip_any_features=["nsfs", "rgw kafka", "caching", "replication"], + ) + + mockup_logger, source_bucket, target_bucket = aws_log_based_replication_setup() + + # upload test objects to the bucket and verify replication + upload_test_objects_to_source_and_wait_for_replication( + mcg_obj_session, + source_bucket, + target_bucket, + mockup_logger, + 600, + ) + + # Delete objects in the first set in a batch and perform noobaa pod + # restarts at the same time and make sure deletion sync works + + objs_in_bucket = mockup_logger.standard_test_obj_list + objs_to_delete = random.sample(objs_in_bucket, 3) + + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit( + self.delete_objs_in_batch, objs_to_delete, mockup_logger, source_bucket + ) + + # Restart noobaa pods + nb_core_pod = get_noobaa_core_pod() + nb_db_pod = get_noobaa_db_pod() + nb_core_pod.delete() + nb_db_pod.delete() + wait_for_pods_to_be_running(pod_names=[nb_core_pod.name, nb_db_pod.name]) + + # Wait for the object deletion worker in the BG to complete + future.result() + assert compare_bucket_object_list( + mcg_obj_session, + source_bucket.name, + target_bucket.name, + timeout=600, + ), f"Deletion sync failed to complete for the objects
{objs_to_delete} deleted in the first bucket set" + + # Take noobaa db backup and remove deletion replication policy for the second bucket set + # Get noobaa pods before execution + noobaa_pods = get_noobaa_pods() + + # Get noobaa PVC before execution + noobaa_pvc_obj = get_pvc_objs(pvc_names=[constants.NOOBAA_DB_PVC_NAME]) + + _, snap_obj = noobaa_db_backup(noobaa_pvc_obj) + + disable_deletion_sync = source_bucket.replication_policy + disable_deletion_sync["rules"][0]["sync_deletions"] = False + update_replication_policy(source_bucket.name, disable_deletion_sync) + logger.info("Deleting all the objects from the second bucket") + mockup_logger.delete_all_objects_and_log(source_bucket.name) + assert not compare_bucket_object_list( + mcg_obj_session, + source_bucket.name, + target_bucket.name, + timeout=300, + ), "Deletion sync was done but not expected" + + # Do noobaa db recovery and see if the deletion sync works now + noobaa_db_recovery_from_backup(snap_obj, noobaa_pvc_obj, noobaa_pods) + wait_for_noobaa_pods_running(timeout=420) + + assert compare_bucket_object_list( + mcg_obj_session, + source_bucket.name, + target_bucket.name, + timeout=600, + ), "Deletion sync was not done but expected" + + # Remove replication policy and upload some objects to the bucket + # make sure the replication itself doesn't take place + disable_replication = source_bucket.replication_policy + disable_replication["rules"] = [] + update_replication_policy(source_bucket.name, dict()) + + logger.info("Uploading test objects and waiting for replication to complete") + mockup_logger.upload_test_objs_and_log(source_bucket.name) + + assert not compare_bucket_object_list( + mcg_obj_session, + source_bucket.name, + target_bucket.name, + timeout=300, + ), "Standard replication completed even though replication policy is removed" + + validate_mcg_bg_features( + feature_setup_map, + run_in_bg=False, + skip_any_features=["nsfs", "rgw kafka", "caching"], + object_amount=5, + ) + logger.info("No 
issues seen with the MCG bg feature validation")