Skip to content

Commit

Permalink
Create the resize osd test and the verification steps
Browse files Browse the repository at this point in the history
Signed-off-by: Itzhak Kave <[email protected]>
  • Loading branch information
Itzhak Kave committed Apr 7, 2024
1 parent f85b8c2 commit 6b1af11
Show file tree
Hide file tree
Showing 5 changed files with 549 additions and 22 deletions.
274 changes: 272 additions & 2 deletions ocs_ci/ocs/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ThinPoolUtilityWrong,
TimeoutExpiredError,
ResourceWrongStatusException,
StorageSizeNotReflectedException,
)
from ocs_ci.ocs.resources import ocs, storage_cluster
import ocs_ci.ocs.constants as constant
Expand All @@ -50,14 +51,24 @@
from ocs_ci.framework import config
from ocs_ci.ocs import ocp, constants, exceptions
from ocs_ci.ocs.exceptions import PoolNotFound
from ocs_ci.ocs.resources.pvc import get_all_pvc_objs
from ocs_ci.ocs.resources.pvc import (
get_all_pvc_objs,
get_deviceset_pvcs,
get_deviceset_pvs,
)
from ocs_ci.ocs.ocp import OCP, wait_for_cluster_connectivity
from ocs_ci.ocs.resources.ocs import OCS
from ocs_ci.ocs.resources.pvc import PVC
from ocs_ci.utility.connection import Connection
from ocs_ci.utility.lvmo_utils import get_lvm_cluster_name
from ocs_ci.ocs.resources.pod import get_mds_pods, wait_for_pods_to_be_running
from ocs_ci.ocs.resources.pod import (
get_mds_pods,
wait_for_pods_to_be_running,
get_osd_pods,
delete_pods,
)
from ocs_ci.utility.decorators import switch_to_orig_index_at_last
from ocs_ci.ocs.resources.pv import get_pv_size

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -3268,3 +3279,262 @@ def client_clusters_health_check():
client_cluster_health_check()

logger.info("The client clusters health check passed successfully")


def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs):
    """
    Check that the pods, PVCs, and PVs are in the expected state post resizing the osd.

    It will perform the following steps:
    1. Check that the old osd pods are in a terminating state or deleted
    2. Check that the new osd pods are running, and that there are exactly as many
       osd pods as there were before the resize
    3. Check that the PVCs are in a Bound state
    4. Check that the old PVC and PV names are equal to the current PVC and PV names.
       The names are compared as sorted lists, so the order in which the resources
       are listed does not affect the result.

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd

    Raises:
        AssertionError: If the old osd pods did not reach the expected status, or
            if the PVC or PV names changed

    """
    old_osd_pods_count = len(old_osd_pods)
    logger.info("Wait for the OSD pods to reach the status Terminated or to be deleted")
    old_osd_pod_names = [p.name for p in old_osd_pods]
    res = pod.wait_for_pods_to_be_in_statuses(
        expected_statuses=[constants.STATUS_TERMINATING],
        pod_names=old_osd_pod_names,
        timeout=300,
        sleep=20,
    )
    assert res, "The OSD pods failed to reach the status Terminated or to be deleted"

    logger.info("Check that the new OSD pods are running")
    ocp_pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    ocp_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=old_osd_pods_count,
        timeout=300,
        sleep=20,
    )

    logger.info(
        f"Check that the number of the new OSD pods are exactly {old_osd_pods_count}"
    )
    # Keep sampling the osd pods until their count matches the count before the resize.
    # NOTE(review): presumably TimeoutSampler raises once the timeout elapses - confirm.
    for osd_pods in TimeoutSampler(timeout=180, sleep=10, func=get_osd_pods):
        osd_pods_count = len(osd_pods)
        logger.info(f"number of osd pods = {osd_pods_count}")
        if old_osd_pods_count == osd_pods_count:
            break

    logger.info("Check that the PVCs are in a Bound state")
    ocp_pvc = OCP(kind=constants.PVC, namespace=config.ENV_DATA["cluster_namespace"])
    ocp_pvc.wait_for_resource(
        timeout=30,
        sleep=5,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=len(old_osd_pvcs),
    )

    # Sort the names so the comparison checks that the same resources exist,
    # regardless of the order in which they were listed.
    current_osd_pvcs = get_deviceset_pvcs()
    old_pvc_names = sorted(p.name for p in old_osd_pvcs)
    current_pvc_names = sorted(p.name for p in current_osd_pvcs)
    logger.info(f"Old PVC names = {old_pvc_names}")
    logger.info(f"Current PVC names = {current_pvc_names}")

    current_osd_pvs = get_deviceset_pvs()
    old_pv_names = sorted(p.name for p in old_osd_pvs)
    current_pv_names = sorted(p.name for p in current_osd_pvs)
    logger.info(f"Old PV names = {old_pv_names}")
    logger.info(f"Current PV names = {current_pv_names}")

    logger.info(
        "Check that the old PVC and PV names are equal to the current PVC and PV names"
    )
    assert old_pvc_names == current_pvc_names, (
        f"The old PVC names {old_pvc_names} are not equal to the "
        f"current PVC names {current_pvc_names}"
    )
    assert old_pv_names == current_pv_names, (
        f"The old PV names {old_pv_names} are not equal to the "
        f"current PV names {current_pv_names}"
    )


def check_storage_size_is_reflected(expected_storage_size):
    """
    Check that the expected storage size is reflected in the current storage size, PVCs, PVs,
    and ceph capacity.

    The checks run in that order, and the first mismatch raises.

    Args:
        expected_storage_size (str): The expected storage size

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size

    """
    logger.info(f"The expected storage size is {expected_storage_size}")

    current_storage_size = storage_cluster.get_storage_size()
    logger.info(f"The current storage size is {current_storage_size}")
    logger.info(
        "Check that the current storage size equal to the expected storage size"
    )
    # Compare the value that was just read and logged, so the comparison and the
    # error message always refer to the same reading.
    if current_storage_size != expected_storage_size:
        raise StorageSizeNotReflectedException(
            f"The current storage size {current_storage_size} is not equal "
            f"to the expected size {expected_storage_size}"
        )

    logger.info(
        "Check that the PVC and PV sizes are equal to the expected storage size"
    )
    # Computed once and reused by the PVC, PV and Ceph capacity checks below
    expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")

    current_osd_pvcs = get_deviceset_pvcs()
    pvc_sizes = [pvc.size for pvc in current_osd_pvcs]
    logger.info(f"PVC sizes = {pvc_sizes}")
    if any(p_size != expected_storage_size_in_gb for p_size in pvc_sizes):
        raise StorageSizeNotReflectedException(
            f"The PVC sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
        )

    current_osd_pvs = get_deviceset_pvs()
    pv_sizes = [get_pv_size(pv.get()) for pv in current_osd_pvs]
    logger.info(f"PV sizes {pv_sizes}")
    if any(p_size != expected_storage_size_in_gb for p_size in pv_sizes):
        raise StorageSizeNotReflectedException(
            f"The PV sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
        )

    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    logger.info(
        f"Check that the Ceph capacity {ceph_capacity} is equal "
        f"to the expected storage size {expected_storage_size_in_gb}"
    )
    # The capacity is converted with int() before it is compared to the expected size
    if int(ceph_capacity) != expected_storage_size_in_gb:
        raise StorageSizeNotReflectedException(
            f"The Ceph capacity {ceph_capacity} is not equal to the "
            f"expected storage size {expected_storage_size_in_gb}"
        )


def check_ceph_state_post_resize_osd():
    """
    Check the Ceph state post resize osd.

    It will perform the following steps:
    1. Verify the storage device class
    2. Verify the device class in the Ceph osd tree, using the Ceph tools pod
    3. Check the Ceph osd tree

    The Ceph capacity is not checked by this function.

    Raises:
        AssertionError: If 'check_ceph_osd_tree' returns a falsy value

    """
    logger.info("Check the Ceph device classes and osd tree")
    device_class = storage_cluster.get_device_class()
    ct_pod = pod.get_ceph_tools_pod()
    storage_cluster.verify_storage_device_class(device_class)
    storage_cluster.verify_device_class_in_osd_tree(ct_pod, device_class)
    assert check_ceph_osd_tree(), "The Ceph osd tree check failed post resize osd"


def base_ceph_verification_steps_post_resize_osd(
    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
):
    """
    Run the Ceph verification steps that follow an OSD resize, one after another.

    The steps, in order:
    1. Resources state check (pods, PVCs and PVs)
    2. Resources size check against the expected storage size
    3. Ceph state check

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size

    """
    # Each entry pairs the log line with the step it announces. The steps are
    # wrapped in lambdas so that nothing runs before its log line is written.
    verification_steps = (
        (
            "Check the resources state post resize OSD",
            lambda: check_resources_state_post_resize_osd(
                old_osd_pods, old_osd_pvcs, old_osd_pvs
            ),
        ),
        (
            "Check the resources size post resize OSD",
            lambda: check_storage_size_is_reflected(expected_storage_size),
        ),
        (
            "Check the Ceph state post resize OSD",
            lambda: check_ceph_state_post_resize_osd(),
        ),
    )
    for step_description, run_step in verification_steps:
        logger.info(step_description)
        run_step()

    logger.info("All the Ceph verification steps post resize osd finished successfully")


def ceph_verification_steps_post_resize_osd(
    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size, num_of_tries=6
):
    """
    Try to execute the function 'base_ceph_verification_steps_post_resize_osd' a number of tries
    until success, ignoring the exception 'StorageSizeNotReflectedException'.
    In every iteration, if we get the exception 'StorageSizeNotReflectedException', it will restart
    the osd pods and try again until it reaches the maximum tries.

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd
        num_of_tries (int): The number of tries to try executing the
            function 'base_ceph_verification_steps_post_resize_osd'.

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size. The exception of the last failed try is raised;
            if no try was executed at all, an exception without a message is raised.

    """
    # Python unbinds the 'except ... as <name>' target when the handler exits.
    # The last failure is therefore kept under a separate name so that it can be
    # re-raised after the loop.
    last_error = StorageSizeNotReflectedException()
    for _ in range(num_of_tries):
        try:
            base_ceph_verification_steps_post_resize_osd(
                old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
            )
            return
        except StorageSizeNotReflectedException as err:
            last_error = err
            logger.warning(
                f"The Ceph verification steps failed due to the error: {str(err)}. "
                "Try to restart the OSD pods before the next iteration"
            )
            # The pods being deleted become the "old" pods for the next try
            old_osd_pods = get_osd_pods()
            delete_pods(old_osd_pods, wait=False)

    logger.warning(
        f"Failed to complete the Ceph verification steps post resize osd after {num_of_tries} tries"
    )
    raise last_error


def check_ceph_health_after_resize_osd(
    ceph_health_tries=40, ceph_rebalance_timeout=900
):
    """
    Run the Ceph health check and wait for the rebalance after resizing the osd.

    When 'io_in_bg' is set in 'config.RUN', both limits are extended by an amount
    derived from the 'io_load' value in 'config.RUN'.

    Args:
        ceph_health_tries (int): The number of tries to wait for the Ceph health to be OK.
        ceph_rebalance_timeout (int): The time to wait for the Ceph cluster rebalanced.

    Raises:
        AssertionError: If 'wait_for_rebalance' returns a falsy value

    """
    if config.RUN.get("io_in_bg"):
        logger.info(
            "Increase the time to wait for Ceph health to be health OK, "
            "because we run IO in the background"
        )
        # Background IO: allow more health check tries and a longer rebalance wait
        ceph_health_tries = ceph_health_tries + int(config.RUN.get("io_load") * 1.3)
        ceph_rebalance_timeout = ceph_rebalance_timeout + int(
            config.RUN.get("io_load") * 100
        )

    cluster_namespace = config.ENV_DATA["cluster_namespace"]
    ceph_health_check(namespace=cluster_namespace, tries=ceph_health_tries)

    is_rebalanced = CephCluster().wait_for_rebalance(timeout=ceph_rebalance_timeout)
    assert is_rebalanced, "Data re-balance failed to complete"
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,3 +690,7 @@ class NoRunningCephToolBoxException(Exception):

class UsernameNotFoundException(Exception):
    """
    Exception type named for a username that was not found.

    It adds nothing to the base 'Exception'; the places that raise it are not
    visible from this section of the file.
    """


class StorageSizeNotReflectedException(Exception):
    """
    Raised by the storage size verification when the current storage size, the
    osd PVC or PV sizes, or the Ceph capacity differ from the expected storage size.
    """
61 changes: 61 additions & 0 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -3583,3 +3583,64 @@ def _check_if_pod_deleted(label, namespace):
namespace=namespace,
)
sampler.wait_for_func_status(True)


def calculate_md5sum_of_pod_files(pods_for_integrity_check, pod_file_name):
    """
    Calculate the md5sum of the pod files, and save it in the pod objects

    For every pod, the function waits for the fio results, calculates the md5sum,
    and stores it in the 'md5sum' attribute of the pod's PVC object.
    For a pod whose PVC volume mode is Block, the md5sum is calculated on the
    block storage path of the pod instead of 'pod_file_name'.

    Args:
        pods_for_integrity_check (list): The list of the pod objects to calculate the md5sum
        pod_file_name (str): The pod file name to save the md5sum

    """
    logger.info("Wait for IO to finish on pods")
    for pod_obj in pods_for_integrity_check:
        # Wait for IO to finish
        pod_obj.get_fio_results()
        logger.info(f"IO finished on pod {pod_obj.name}")

        # Calculate md5sum
        is_block = pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
        # Use a per-pod local: overwriting 'pod_file_name' here would make every
        # pod after a Block mode pod use that pod's device path as its file name.
        file_path = (
            pod_obj.get_storage_path(storage_type="block")
            if is_block
            else pod_file_name
        )
        logger.info(
            f"Calculate the md5sum of the file {file_path} in the pod {pod_obj.name}"
        )
        pod_obj.pvc.md5sum = cal_md5sum(pod_obj, file_path, is_block)


def verify_md5sum_on_pod_files(pods_for_integrity_check, pod_file_name):
    """
    Verify the md5sum of the pod files

    For every pod, the md5sum stored in the 'md5sum' attribute of the pod's PVC
    object is passed to 'verify_data_integrity' as the original md5sum.
    For a pod whose PVC volume mode is Block, the block storage path of the pod is
    verified instead of 'pod_file_name'.

    Args:
        pods_for_integrity_check (list): The list of the pod objects to verify the md5sum
        pod_file_name (str): The pod file name to verify its md5sum

    Raises:
        AssertionError: If file doesn't exist or md5sum mismatch

    """
    for pod_obj in pods_for_integrity_check:
        is_block = pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
        # Use a per-pod local: overwriting 'pod_file_name' here would make every
        # pod after a Block mode pod use that pod's device path as its file name.
        file_path = (
            pod_obj.get_storage_path(storage_type="block")
            if is_block
            else pod_file_name
        )
        verify_data_integrity(pod_obj, file_path, pod_obj.pvc.md5sum, is_block)
        logger.info(
            f"Verified: md5sum of {file_path} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    logger.info("Data integrity check passed on all pods")
Loading

0 comments on commit 6b1af11

Please sign in to comment.