diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py
index 0e10fb6d03c0..194d0432d7fc 100644
--- a/ocs_ci/ocs/cluster.py
+++ b/ocs_ci/ocs/cluster.py
@@ -30,6 +30,7 @@
     ThinPoolUtilityWrong,
     TimeoutExpiredError,
     ResourceWrongStatusException,
+    CephHealthException,
 )
 from ocs_ci.ocs.resources import ocs, storage_cluster
 import ocs_ci.ocs.constants as constant
@@ -50,14 +51,24 @@
 from ocs_ci.framework import config
 from ocs_ci.ocs import ocp, constants, exceptions
 from ocs_ci.ocs.exceptions import PoolNotFound
-from ocs_ci.ocs.resources.pvc import get_all_pvc_objs
+from ocs_ci.ocs.resources.pvc import (
+    get_all_pvc_objs,
+    get_deviceset_pvcs,
+    get_deviceset_pvs,
+)
 from ocs_ci.ocs.ocp import OCP, wait_for_cluster_connectivity
 from ocs_ci.ocs.resources.ocs import OCS
 from ocs_ci.ocs.resources.pvc import PVC
 from ocs_ci.utility.connection import Connection
 from ocs_ci.utility.lvmo_utils import get_lvm_cluster_name
-from ocs_ci.ocs.resources.pod import get_mds_pods, wait_for_pods_to_be_running
+from ocs_ci.ocs.resources.pod import (
+    get_mds_pods,
+    wait_for_pods_to_be_running,
+    get_osd_pods,
+    delete_pods,
+)
 from ocs_ci.utility.decorators import switch_to_orig_index_at_last
+from ocs_ci.ocs.resources.pv import get_pv_size
 
 logger = logging.getLogger(__name__)
 
@@ -3268,3 +3279,279 @@ def client_clusters_health_check():
         client_cluster_health_check()
 
     logger.info("The client clusters health check passed successfully")
+
+
+def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs):
+    """
+    Check that the pods, PVCs, and PVs are in the expected state post resizing the osd.
+    It will perform the following steps:
+    1. Check that the old osd pods are in a terminating state or deleted
+    2. Check that the new osd pods are running, and we have exactly the same number
+    of osd pods as the old ones.
+    3. Check that the PVCs are in a Bound state
+    4. Check that the old PVC and PV names are equal to the current PVC and PV names
+
+    Args:
+        old_osd_pods (list): The old osd pod objects before resizing the osd
+        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
+        old_osd_pvs (list): The old osd PV objects before resizing the osd
+
+    Raises:
+        ResourceWrongStatusException: If the pods, PVCs, and PVs are not in the expected state
+ + """ + old_osd_pods_count = len(old_osd_pods) + logger.info("Wait for the OSD pods to reach the status Terminated or be deleted") + old_osd_pod_names = [p.name for p in old_osd_pods] + res = pod.wait_for_pods_to_be_in_statuses( + expected_statuses=[constants.STATUS_TERMINATING], + pod_names=old_osd_pod_names, + timeout=300, + sleep=20, + ) + if not res: + raise ResourceWrongStatusException( + "The OSD pods failed to reach the status Terminated or be deleted" + ) + + logger.info("Check that the new OSD pods are running") + ocp_pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]) + ocp_pod.wait_for_resource( + condition=constants.STATUS_RUNNING, + selector=constants.OSD_APP_LABEL, + resource_count=old_osd_pods_count, + timeout=300, + sleep=20, + ) + + logger.info( + f"Check that the number of the new OSD pods are exactly {old_osd_pods_count}" + ) + for osd_pods in TimeoutSampler(timeout=180, sleep=10, func=get_osd_pods): + osd_pods_count = len(osd_pods) + logger.info(f"number of osd pods = {osd_pods_count}") + if old_osd_pods_count == osd_pods_count: + break + + logger.info("Check that the PVCs are in a Bound state") + ocp_pvc = OCP(kind=constants.PVC, namespace=config.ENV_DATA["cluster_namespace"]) + ocp_pvc.wait_for_resource( + timeout=30, + sleep=5, + condition=constants.STATUS_BOUND, + selector=constants.OSD_PVC_GENERIC_LABEL, + resource_count=len(old_osd_pvcs), + ) + + current_osd_pvcs = get_deviceset_pvcs() + old_pvc_names = [p.name for p in old_osd_pvcs] + current_pvc_names = [p.name for p in current_osd_pvcs] + logger.info(f"Old PVC names = {old_pvc_names}") + logger.info(f"Current PVC names = {current_pvc_names}") + + current_osd_pvs = get_deviceset_pvs() + old_pv_names = [p.name for p in old_osd_pvs] + current_pv_names = [p.name for p in current_osd_pvs] + logger.info(f"Old PV names = {old_pv_names}") + logger.info(f"Current PV names = {current_pv_names}") + + logger.info( + "Check that the old PVC and PV names are equal to the current PVC and PV names" + ) + if not old_pvc_names == current_pvc_names: + raise ResourceWrongStatusException( + f"The old PVC names {old_pvc_names} are not equal to the " + f"current PVC names {current_pvc_names}" + ) + if not old_pv_names == current_pv_names: + raise ResourceWrongStatusException( + f"The old PV names {old_pv_names} are not equal to the " + f"current PV names {current_pv_names}" + ) + + +def check_resources_size_post_resize_osd(expected_storage_size): + """ + Check if the current storagecluster size, PVCs, and PVs are in the expected size. 
+
+    Args:
+        expected_storage_size (str): The expected storage size after resizing the osd
+
+    Raises:
+        ResourceWrongStatusException: If the storagecluster, PVC, or PV sizes do not match the expected size
+
+    """
+    logger.info(f"The expected storage size is {expected_storage_size}")
+
+    current_storage_size = storage_cluster.get_storage_size()
+    logger.info(f"The current storage size is {current_storage_size}")
+    logger.info(
+        "Check that the current storage size is equal to the expected storage size"
+    )
+    if current_storage_size != expected_storage_size:
+        raise ResourceWrongStatusException(
+            f"The current storage size {current_storage_size} is not equal "
+            f"to the expected size {expected_storage_size}"
+        )
+
+    logger.info(
+        "Check that the PVC and PV sizes are equal to the expected storage size"
+    )
+    current_osd_pvcs = get_deviceset_pvcs()
+    expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")
+    pvc_sizes = [pvc.size for pvc in current_osd_pvcs]
+    logger.info(f"PVC sizes = {pvc_sizes}")
+    if not all([p_size == expected_storage_size_in_gb for p_size in pvc_sizes]):
+        raise ResourceWrongStatusException(
+            f"The PVC sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
+        )
+
+    current_osd_pvs = get_deviceset_pvs()
+    pv_sizes = [get_pv_size(pv.get()) for pv in current_osd_pvs]
+    logger.info(f"PV sizes = {pv_sizes}")
+    if not all([p_size == expected_storage_size_in_gb for p_size in pv_sizes]):
+        raise ResourceWrongStatusException(
+            f"The PV sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
+        )
+
+
+def check_ceph_state_post_resize_osd(expected_storage_size):
+    """
+    Check the Ceph state post resize osd. It will perform the following steps:
+    1. Check that the Ceph capacity is equal to the expected storage size
+    2. Check the Ceph device classes and osd tree
+
+    Args:
+        expected_storage_size (str): The expected storage size after resizing the osd
+
+    Raises:
+        CephHealthException: If the Ceph capacity is not equal to the expected storage size, or
+            the Ceph device classes and osd tree checks failed
+
+    """
+    ceph_cluster = CephCluster()
+    ceph_capacity = ceph_cluster.get_ceph_capacity()
+    expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")
+    logger.info(
+        f"Check that the Ceph capacity {ceph_capacity} is equal "
+        f"to the expected storage size {expected_storage_size_in_gb}"
+    )
+    if int(ceph_capacity) != expected_storage_size_in_gb:
+        raise CephHealthException(
+            f"The Ceph capacity {ceph_capacity} is not equal to the "
+            f"expected storage size {expected_storage_size_in_gb}"
+        )
+
+    logger.info("Check the Ceph device classes and osd tree")
+    device_class = storage_cluster.get_device_class()
+    ct_pod = pod.get_ceph_tools_pod()
+    try:
+        storage_cluster.verify_storage_device_class(device_class)
+        storage_cluster.verify_device_class_in_osd_tree(ct_pod, device_class)
+    except AssertionError as ex:
+        raise CephHealthException(ex)
+    if not check_ceph_osd_tree():
+        raise CephHealthException("The OSD tree is not created/modified correctly")
+
+
+def base_ceph_verification_steps_post_resize_osd(
+    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
+):
+    """
+    Check the Ceph verification steps post resize OSD.
+    It will perform the following steps:
+    1. Check the resources state post resize OSD
+    2. Check the resources size post resize OSD
+    3. Check the Ceph state post resize OSD
+
+    Args:
+        old_osd_pods (list): The old osd pod objects before resizing the osd
+        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
+        old_osd_pvs (list): The old osd PV objects before resizing the osd
+        expected_storage_size (str): The expected storage size after resizing the osd
+
+    Raises:
+        ResourceWrongStatusException: If the resources are not in the expected state or size
+        CephHealthException: If Ceph is not in the expected state.
+
+    """
+    logger.info("Check the resources state post resize OSD")
+    check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs)
+    logger.info("Check the resources size post resize OSD")
+    check_resources_size_post_resize_osd(expected_storage_size)
+    logger.info("Check the Ceph state post resize OSD")
+    check_ceph_state_post_resize_osd(expected_storage_size)
+    logger.info("All the Ceph verification steps post resize osd finished successfully")
+
+
+def ceph_verification_steps_post_resize_osd(
+    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
+):
+    """
+    Run the Ceph verification steps post resize OSD, as described in the function
+    'base_ceph_verification_steps_post_resize_osd' above.
+    If one of the exceptions 'ResourceWrongStatusException' or 'CephHealthException' is raised,
+    it restarts the osd pods and tries again, up to a fixed number of tries.
+    Returns True if the verification steps eventually succeed, and False if all the tries fail.
+
+    Args:
+        old_osd_pods (list): The old osd pod objects before resizing the osd
+        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
+        old_osd_pvs (list): The old osd PV objects before resizing the osd
+        expected_storage_size (str): The expected storage size after resizing the osd
+
+    Returns:
+        bool: True, if the Ceph verification steps post resize OSD succeeded within the given tries.
+            False, otherwise
+
+    """
+    num_of_tries = 5
+    for i in range(num_of_tries):
+        try:
+            base_ceph_verification_steps_post_resize_osd(
+                old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
+            )
+            return True
+        except (ResourceWrongStatusException, CephHealthException) as ex:
+            logger.warning(
+                f"The Ceph verification steps failed due to the error: {str(ex)}. "
+                f"Try to restart the OSD pods before the next iteration"
+            )
+            old_osd_pods = get_osd_pods()
+            delete_pods(old_osd_pods, wait=False)
+
+    logger.warning(
+        f"Failed to complete the Ceph verification steps post resize osd after {num_of_tries} tries"
+    )
+    return False
+
+
+def check_ceph_health_after_resize_osd(
+    ceph_health_tries=40, ceph_rebalance_timeout=900
+):
+    """
+    Check the Ceph health after resizing the osd
+
+    Args:
+        ceph_health_tries (int): The number of tries to wait for the Ceph health to be OK.
+        ceph_rebalance_timeout (int): The time in seconds to wait for the Ceph cluster to finish rebalancing.
+ + """ + if config.RUN.get("io_in_bg"): + logger.info( + "Increase the time to wait for Ceph health to be health OK, " + "because we run IO in the background" + ) + additional_ceph_health_tries = int(config.RUN.get("io_load") * 1.3) + ceph_health_tries += additional_ceph_health_tries + + additional_ceph_rebalance_timeout = int(config.RUN.get("io_load") * 100) + ceph_rebalance_timeout += additional_ceph_rebalance_timeout + + ceph_health_check( + namespace=config.ENV_DATA["cluster_namespace"], tries=ceph_health_tries + ) + ceph_cluster_obj = CephCluster() + assert ceph_cluster_obj.wait_for_rebalance( + timeout=ceph_rebalance_timeout + ), "Data re-balance failed to complete" diff --git a/ocs_ci/ocs/resources/pod.py b/ocs_ci/ocs/resources/pod.py index 6b319bf10fa5..dbe298ebc85a 100644 --- a/ocs_ci/ocs/resources/pod.py +++ b/ocs_ci/ocs/resources/pod.py @@ -3583,3 +3583,64 @@ def _check_if_pod_deleted(label, namespace): namespace=namespace, ) sampler.wait_for_func_status(True) + + +def calculate_md5sum_of_pod_files(pods_for_integrity_check, pod_file_name): + """ + Calculate the md5sum of the pod files, and save it in the pod objects + + Args: + pods_for_integrity_check (list): The list of the pod objects to calculate the md5sum + pod_file_name (str): The pod file name to save the md5sum + + """ + # Wait for IO to finish + logger.info("Wait for IO to finish on pods") + for pod_obj in pods_for_integrity_check: + pod_obj.get_fio_results() + logger.info(f"IO finished on pod {pod_obj.name}") + # Calculate md5sum + pod_file_name = ( + pod_file_name + if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM) + else pod_obj.get_storage_path(storage_type="block") + ) + logger.info( + f"Calculate the md5sum of the file {pod_file_name} in the pod {pod_obj.name}" + ) + pod_obj.pvc.md5sum = cal_md5sum( + pod_obj, + pod_file_name, + pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK, + ) + + +def verify_md5sum_on_pod_files(pods_for_integrity_check, pod_file_name): + """ + Verify the md5sum of the pod files + + Args: + pods_for_integrity_check (list): The list of the pod objects to verify the md5sum + pod_file_name (str): The pod file name to verify its md5sum + + Raises: + AssertionError: If file doesn't exist or md5sum mismatch + + """ + for pod_obj in pods_for_integrity_check: + pod_file_name = ( + pod_obj.get_storage_path(storage_type="block") + if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK) + else pod_file_name + ) + verify_data_integrity( + pod_obj, + pod_file_name, + pod_obj.pvc.md5sum, + pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK, + ) + logger.info( + f"Verified: md5sum of {pod_file_name} on pod {pod_obj.name} " + f"matches with the original md5sum" + ) + logger.info("Data integrity check passed on all pods") diff --git a/ocs_ci/ocs/resources/storage_cluster.py b/ocs_ci/ocs/resources/storage_cluster.py index 94c7b5baffe3..fc2ecb58da7b 100644 --- a/ocs_ci/ocs/resources/storage_cluster.py +++ b/ocs_ci/ocs/resources/storage_cluster.py @@ -1648,26 +1648,7 @@ def get_osd_size(): int: osd size """ - sc = get_storage_cluster() - size = ( - sc.get() - .get("items")[0] - .get("spec") - .get("storageDeviceSets")[0] - .get("dataPVCTemplate") - .get("spec") - .get("resources") - .get("requests") - .get("storage") - ) - if size.isdigit or config.DEPLOYMENT.get("local_storage"): - # In the case of UI deployment of LSO cluster, the value in StorageCluster CR - # is set to 1, so we can not take OSD size from there. For LSO we will return - # the size from PVC. 
-        pvc = get_deviceset_pvcs()[0]
-        return int(pvc.get()["status"]["capacity"]["storage"][:-2])
-    else:
-        return int(size[:-2])
+    return int(get_storage_size()[:-2])
 
 
 def get_deviceset_count():
@@ -2669,3 +2650,56 @@ def validate_serviceexport():
     assert mon_count == len(
         get_mon_pods()
     ), f"Mon serviceexport count mismatch {mon_count} != {len(get_mon_pods())}"
+
+
+def get_storage_size():
+    """
+    Get the storagecluster storage size
+
+    Returns:
+        str: The storagecluster storage size
+
+    """
+    sc = get_storage_cluster()
+    storage = (
+        sc.get()
+        .get("items")[0]
+        .get("spec")
+        .get("storageDeviceSets")[0]
+        .get("dataPVCTemplate")
+        .get("spec")
+        .get("resources")
+        .get("requests")
+        .get("storage")
+    )
+    if storage.isdigit() or config.DEPLOYMENT.get("local_storage"):
+        # In the case of UI deployment of LSO cluster, the value in StorageCluster CR
+        # is set to 1, so we can not take OSD size from there. For LSO we will return
+        # the size from PVC.
+        pvc = get_deviceset_pvcs()[0]
+        return pvc.get()["status"]["capacity"]["storage"]
+    else:
+        return storage
+
+
+def resize_osd(new_osd_size):
+    """
+    Resize the OSD (e.g., from 512 to 1024, 1024 to 2048, etc.)
+
+    Args:
+        new_osd_size (str): The new osd size (e.g., 512Gi, 1024Gi, 1Ti, 2Ti, etc.)
+
+    Returns:
+        bool: True if the patch was applied successfully, False otherwise
+
+    """
+    sc = get_storage_cluster()
+    # Patch the OSD storage size
+    path = "/spec/storageDeviceSets/0/dataPVCTemplate/spec/resources/requests/storage"
+    params = f"""[{{ "op": "replace", "path": "{path}", "value": "{new_osd_size}"}}]"""
+    res = sc.patch(
+        resource_name=sc.get()["items"][0]["metadata"]["name"],
+        params=params.strip("\n"),
+        format_type="json",
+    )
+    return res
diff --git a/tests/functional/z_cluster/cluster_expansion/test_resize_osd.py b/tests/functional/z_cluster/cluster_expansion/test_resize_osd.py
new file mode 100644
index 000000000000..76ba908a7548
--- /dev/null
+++ b/tests/functional/z_cluster/cluster_expansion/test_resize_osd.py
@@ -0,0 +1,157 @@
+import random
+import pytest
+import logging
+
+from ocs_ci.framework.pytest_customization.marks import (
+    polarion_id,
+    skipif_aws_i3,
+    skipif_bm,
+    skipif_external_mode,
+    skipif_bmpsi,
+    skipif_ibm_power,
+    skipif_lso,
+    skipif_managed_service,
+    skipif_hci_provider_and_client,
+    brown_squad,
+)
+from ocs_ci.framework.testlib import (
+    ignore_leftovers,
+    ManageTest,
+    tier1,
+)
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.cluster import (
+    ceph_verification_steps_post_resize_osd,
+    check_ceph_health_after_resize_osd,
+)
+from ocs_ci.ocs.resources.pod import (
+    get_osd_pods,
+    calculate_md5sum_of_pod_files,
+    verify_md5sum_on_pod_files,
+)
+from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs, get_deviceset_pvs
+from ocs_ci.ocs.resources.storage_cluster import resize_osd, get_storage_size
+from ocs_ci.helpers.sanity_helpers import Sanity
+
+
+logger = logging.getLogger(__name__)
+
+
+@brown_squad
+@ignore_leftovers
+@polarion_id("OCS-1191")
+@skipif_managed_service
+@skipif_aws_i3
+@skipif_bm
+@skipif_bmpsi
+@skipif_lso
+@skipif_external_mode
+@skipif_ibm_power
+@skipif_hci_provider_and_client
+class TestResizeOSD(ManageTest):
+    """
+    Automates the resize OSD test procedure
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup(self, create_pvcs_and_pods):
+        """
+        Init all the data for the resize osd test
+
+        """
+        self.create_pvcs_and_pods = create_pvcs_and_pods
+
+        self.old_osd_pods = get_osd_pods()
+        self.old_storage_size = get_storage_size()
+        self.old_osd_pvcs = get_deviceset_pvcs()
+        self.old_osd_pvs = get_deviceset_pvs()
+        self.new_storage_size = None
+
+        self.pod_file_name = "fio_test"
+        self.sanity_helpers = Sanity()
+        pvc_size = random.randint(3, 7)
+        self.pvcs1, self.pods_for_integrity_check = create_pvcs_and_pods(
+            pvc_size=pvc_size, num_of_rbd_pvc=6, num_of_cephfs_pvc=6
+        )
+        pvc_size = random.randint(3, 8)
+        self.pvcs2, self.pods_for_run_io = create_pvcs_and_pods(
+            pvc_size=pvc_size, num_of_rbd_pvc=5, num_of_cephfs_pvc=5
+        )
+
+    def run_io_on_pods(self, pods, size="1G", runtime=30):
+        """
+        Run IO on the pods
+
+        Args:
+            pods (list): The list of pods for running the IO
+            size (str): Size in MB or Gi, e.g. '200M'. Default value is '1G'
+            runtime (int): The number of seconds IO should run for
+
+        """
+        logger.info("Starting IO on all pods")
+        for pod_obj in pods:
+            storage_type = (
+                "block"
+                if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
+                else "fs"
+            )
+            rate = f"{random.randint(1, 5)}M"
+            pod_obj.run_io(
+                storage_type=storage_type,
+                size=size,
+                runtime=runtime,
+                rate=rate,
+                fio_filename=self.pod_file_name,
+                end_fsync=1,
+            )
+            logger.info(f"IO started on pod {pod_obj.name}")
+        logger.info("Started IO on all pods")
+
+    def prepare_data_before_resize_osd(self):
+        """
+        Prepare the data before resizing the osd
+
+        """
+        logger.info("Run IO on the pods for integrity check")
+        self.run_io_on_pods(self.pods_for_integrity_check)
+        logger.info("Calculate the md5sum of the pods for integrity check")
+        calculate_md5sum_of_pod_files(self.pods_for_integrity_check, self.pod_file_name)
+        runtime = 180
+        logger.info(f"Run IO on the pods in the test background for {runtime} seconds")
+        self.run_io_on_pods(self.pods_for_run_io, size="2G", runtime=runtime)
+
+    def verification_steps_post_resize_osd(self):
+        assert ceph_verification_steps_post_resize_osd(
+            self.old_osd_pods,
+            self.old_osd_pvcs,
+            self.old_osd_pvs,
+            self.new_storage_size,
+        )
+        logger.info("Verify the md5sum of the pods for integrity check")
+        verify_md5sum_on_pod_files(self.pods_for_integrity_check, self.pod_file_name)
+        check_ceph_health_after_resize_osd()
+
+        logger.info("Try to create more resources and run IO")
+        pvc_size = random.randint(3, 7)
+        self.pvcs3, self.pods_for_run_io = self.create_pvcs_and_pods(
+            pvc_size=pvc_size, num_of_rbd_pvc=6, num_of_cephfs_pvc=6
+        )
+        self.run_io_on_pods(self.pods_for_run_io, size="2G")
+        logger.info("Check the cluster health")
+        self.sanity_helpers.health_check()
+
+    @tier1
+    def test_resize_osd(self):
+        """
+        Test resize OSD
+        """
+        self.prepare_data_before_resize_osd()
+
+        logger.info(f"The current osd size is {self.old_storage_size}")
+        size = int(self.old_storage_size[0:-2])
+        size_type = self.old_storage_size[-2:]
+        self.new_storage_size = f"{size * 2}{size_type}"
+        logger.info(f"Increase the osd size to {self.new_storage_size}")
+        resize_osd(self.new_storage_size)
+
+        self.verification_steps_post_resize_osd()
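
Reviewer note: a minimal usage sketch of the new helpers introduced in this patch, mirroring the flow of test_resize_osd above. The wrapper name resize_osd_and_verify is hypothetical and not part of the change; the "double the size" value is only an example, and the old_* snapshots must be taken before the StorageCluster is patched.

# Minimal sketch (assumes a running ODF cluster and the helpers added in this PR).
from ocs_ci.ocs.cluster import (
    ceph_verification_steps_post_resize_osd,
    check_ceph_health_after_resize_osd,
)
from ocs_ci.ocs.resources.pod import get_osd_pods
from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs, get_deviceset_pvs
from ocs_ci.ocs.resources.storage_cluster import get_storage_size, resize_osd


def resize_osd_and_verify():
    # Snapshot the current OSD pods, PVCs, and PVs before the resize,
    # so the verification helpers can compare the old and new resources.
    old_osd_pods = get_osd_pods()
    old_osd_pvcs = get_deviceset_pvcs()
    old_osd_pvs = get_deviceset_pvs()

    # Double the current per-device storage size, e.g. "512Gi" -> "1024Gi".
    old_size = get_storage_size()
    size, unit = int(old_size[:-2]), old_size[-2:]
    new_size = f"{size * 2}{unit}"
    resize_osd(new_size)

    # Wait for the new OSD pods, the PVC/PV sizes, and the Ceph capacity to
    # reflect the new size; the helper retries with OSD pod restarts on failure.
    assert ceph_verification_steps_post_resize_osd(
        old_osd_pods, old_osd_pvcs, old_osd_pvs, new_size
    )
    # Finally, wait for Ceph health OK and for data rebalance to complete.
    check_ceph_health_after_resize_osd()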