Create the resize osd test and the verification steps
Signed-off-by: Itzhak Kave <[email protected]>
Itzhak Kave committed Apr 4, 2024
1 parent f85b8c2 commit 14956f1
Showing 4 changed files with 562 additions and 22 deletions.
291 changes: 289 additions & 2 deletions ocs_ci/ocs/cluster.py
@@ -30,6 +30,7 @@
ThinPoolUtilityWrong,
TimeoutExpiredError,
ResourceWrongStatusException,
CephHealthException,
)
from ocs_ci.ocs.resources import ocs, storage_cluster
import ocs_ci.ocs.constants as constant
@@ -50,14 +51,24 @@
from ocs_ci.framework import config
from ocs_ci.ocs import ocp, constants, exceptions
from ocs_ci.ocs.exceptions import PoolNotFound
from ocs_ci.ocs.resources.pvc import get_all_pvc_objs
from ocs_ci.ocs.resources.pvc import (
get_all_pvc_objs,
get_deviceset_pvcs,
get_deviceset_pvs,
)
from ocs_ci.ocs.ocp import OCP, wait_for_cluster_connectivity
from ocs_ci.ocs.resources.ocs import OCS
from ocs_ci.ocs.resources.pvc import PVC
from ocs_ci.utility.connection import Connection
from ocs_ci.utility.lvmo_utils import get_lvm_cluster_name
from ocs_ci.ocs.resources.pod import get_mds_pods, wait_for_pods_to_be_running
from ocs_ci.ocs.resources.pod import (
get_mds_pods,
wait_for_pods_to_be_running,
get_osd_pods,
delete_pods,
)
from ocs_ci.utility.decorators import switch_to_orig_index_at_last
from ocs_ci.ocs.resources.pv import get_pv_size

logger = logging.getLogger(__name__)

@@ -3268,3 +3279,279 @@ def client_clusters_health_check():
client_cluster_health_check()

logger.info("The client clusters health check passed successfully")


def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs):
"""
    Check that the pods, PVCs, and PVs are in the expected state post resizing the osd.
    It will perform the following steps:
    1. Check that the old osd pods are in a Terminating state or deleted
    2. Check that the new osd pods are running, and that we have exactly the same
       number of osd pods as the old ones.
    3. Check that the PVCs are in a Bound state
    4. Check that the old PVC and PV names are equal to the current PVC and PV names

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd

    Raises:
        ResourceWrongStatusException: If the pods, PVCs, or PVs are not in the expected state

    """
old_osd_pods_count = len(old_osd_pods)
logger.info("Wait for the OSD pods to reach the status Terminated or be deleted")
old_osd_pod_names = [p.name for p in old_osd_pods]
res = pod.wait_for_pods_to_be_in_statuses(
expected_statuses=[constants.STATUS_TERMINATING],
pod_names=old_osd_pod_names,
timeout=300,
sleep=20,
)
if not res:
raise ResourceWrongStatusException(
"The OSD pods failed to reach the status Terminated or be deleted"
)

logger.info("Check that the new OSD pods are running")
ocp_pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
ocp_pod.wait_for_resource(
condition=constants.STATUS_RUNNING,
selector=constants.OSD_APP_LABEL,
resource_count=old_osd_pods_count,
timeout=300,
sleep=20,
)

logger.info(
f"Check that the number of the new OSD pods are exactly {old_osd_pods_count}"
)
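    # TimeoutSampler calls get_osd_pods repeatedly, and raises TimeoutExpiredError
    # if the expected pod count is not reached before the timeout expires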
for osd_pods in TimeoutSampler(timeout=180, sleep=10, func=get_osd_pods):
osd_pods_count = len(osd_pods)
logger.info(f"number of osd pods = {osd_pods_count}")
if old_osd_pods_count == osd_pods_count:
break

logger.info("Check that the PVCs are in a Bound state")
ocp_pvc = OCP(kind=constants.PVC, namespace=config.ENV_DATA["cluster_namespace"])
ocp_pvc.wait_for_resource(
timeout=30,
sleep=5,
condition=constants.STATUS_BOUND,
selector=constants.OSD_PVC_GENERIC_LABEL,
resource_count=len(old_osd_pvcs),
)

current_osd_pvcs = get_deviceset_pvcs()
old_pvc_names = [p.name for p in old_osd_pvcs]
current_pvc_names = [p.name for p in current_osd_pvcs]
logger.info(f"Old PVC names = {old_pvc_names}")
logger.info(f"Current PVC names = {current_pvc_names}")

current_osd_pvs = get_deviceset_pvs()
old_pv_names = [p.name for p in old_osd_pvs]
current_pv_names = [p.name for p in current_osd_pvs]
logger.info(f"Old PV names = {old_pv_names}")
logger.info(f"Current PV names = {current_pv_names}")

logger.info(
"Check that the old PVC and PV names are equal to the current PVC and PV names"
)
    if old_pvc_names != current_pvc_names:
raise ResourceWrongStatusException(
f"The old PVC names {old_pvc_names} are not equal to the "
f"current PVC names {current_pvc_names}"
)
    if old_pv_names != current_pv_names:
raise ResourceWrongStatusException(
f"The old PV names {old_pv_names} are not equal to the "
f"current PV names {current_pv_names}"
)


def check_resources_size_post_resize_osd(expected_storage_size):
"""
    Check that the current storagecluster, PVC, and PV sizes match the expected size.

    Args:
        expected_storage_size (str): The expected storage size after resizing the osd

    Raises:
        ResourceWrongStatusException: If the storagecluster, PVCs, or PVs are not in the expected size

    """
logger.info(f"The expected storage size is {expected_storage_size}")

current_storage_size = storage_cluster.get_storage_size()
logger.info(f"The current storage size is {current_storage_size}")
    logger.info(
        "Check that the current storage size is equal to the expected storage size"
    )
    if current_storage_size != expected_storage_size:
        raise ResourceWrongStatusException(
            f"The current storage size {current_storage_size} is not equal "
            f"to the expected size {expected_storage_size}"
        )

logger.info(
"Check that the PVC and PV sizes are equal to the expected storage size"
)
current_osd_pvcs = get_deviceset_pvcs()
expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")
pvc_sizes = [pvc.size for pvc in current_osd_pvcs]
logger.info(f"PVC sizes = {pvc_sizes}")
if not all([p_size == expected_storage_size_in_gb for p_size in pvc_sizes]):
raise ResourceWrongStatusException(
f"The PVC sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
)

current_osd_pvs = get_deviceset_pvs()
pv_sizes = [get_pv_size(pv.get()) for pv in current_osd_pvs]
logger.info(f"PV sizes {pv_sizes}")
if not all([p_size == expected_storage_size_in_gb for p_size in pv_sizes]):
raise ResourceWrongStatusException(
f"The PV sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
)


def check_ceph_state_post_resize_osd(expected_storage_size):
"""
    Check the Ceph state post resize osd. It will perform the following steps:
    1. Check that the Ceph capacity is equal to the expected storage size
    2. Check the Ceph device classes and osd tree

    Args:
        expected_storage_size (str): The expected storage size after resizing the osd

    Raises:
        CephHealthException: If the Ceph capacity is not equal to the expected storage size,
            or the Ceph device classes and osd tree checks fail

    """
ceph_cluster = CephCluster()
ceph_capacity = ceph_cluster.get_ceph_capacity()
expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")
logger.info(
f"Check that the Ceph capacity {ceph_capacity} is equal "
f"to the expected storage size {expected_storage_size_in_gb}"
)
    if int(ceph_capacity) != expected_storage_size_in_gb:
raise CephHealthException(
f"The Ceph capacity {ceph_capacity} is not equal to the "
f"expected storage size {expected_storage_size_in_gb}"
)

logger.info("Check the Ceph device classes and osd tree")
device_class = storage_cluster.get_device_class()
ct_pod = pod.get_ceph_tools_pod()
try:
storage_cluster.verify_storage_device_class(device_class)
storage_cluster.verify_device_class_in_osd_tree(ct_pod, device_class)
except AssertionError as ex:
raise CephHealthException(ex)
if not check_ceph_osd_tree():
raise CephHealthException("The OSD tree is not created/modified correctly")


def base_ceph_verification_steps_post_resize_osd(
old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
):
"""
    Check the Ceph verification steps post resize OSD.
    It will perform the following steps:
    1. Check the resources state post resize OSD
    2. Check the resources size post resize OSD
    3. Check the Ceph state post resize OSD

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd

    Raises:
        ResourceWrongStatusException: If the resources are not in the expected state or size
        CephHealthException: If Ceph is not in the expected state

    """
logger.info("Check the resources state post resize OSD")
check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs)
logger.info("Check the resources size post resize OSD")
check_resources_size_post_resize_osd(expected_storage_size)
logger.info("Check the Ceph state post resize OSD")
check_ceph_state_post_resize_osd(expected_storage_size)
logger.info("All the Ceph verification steps post resize osd finished successfully")


def ceph_verification_steps_post_resize_osd(
old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
):
"""
    Check the Ceph verification steps post resize OSD, as described in the function
    'base_ceph_verification_steps_post_resize_osd' above.
    If one of the exceptions 'ResourceWrongStatusException' or 'CephHealthException' is
    raised, it restarts the osd pods and tries again, up to several times.
    If it still fails after the given number of tries it returns False; otherwise it
    returns True.

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd

    Returns:
        bool: True, if the Ceph verification steps post resize OSD succeeded within the
            given number of tries. False, otherwise

    """
num_of_tries = 5
    for _ in range(num_of_tries):
try:
base_ceph_verification_steps_post_resize_osd(
old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
)
return True
except (ResourceWrongStatusException, CephHealthException) as ex:
logger.warning(
f"The Ceph verification steps failed due to the error: {str(ex)}. "
f"Try to restart the OSD pods before the next iteration"
)
old_osd_pods = get_osd_pods()
delete_pods(old_osd_pods, wait=False)

logger.warning(
f"Failed to complete the Ceph verification steps post resize osd after {num_of_tries} tries"
)
return False
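

# A minimal usage sketch (illustrative, not part of the commit): since this wrapper
# returns a bool rather than raising, callers are expected to assert on the result.
# The "2Ti" value is an assumption for the example:
#
#     res = ceph_verification_steps_post_resize_osd(
#         old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size="2Ti"
#     )
#     assert res, "The Ceph verification steps post resize OSD failed"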


def check_ceph_health_after_resize_osd(
ceph_health_tries=40, ceph_rebalance_timeout=900
):
"""
    Check Ceph health after resize osd

    Args:
        ceph_health_tries (int): The number of tries to wait for the Ceph health to be OK
        ceph_rebalance_timeout (int): The time to wait for the Ceph cluster to complete rebalancing

    """
if config.RUN.get("io_in_bg"):
logger.info(
"Increase the time to wait for Ceph health to be health OK, "
"because we run IO in the background"
)
additional_ceph_health_tries = int(config.RUN.get("io_load") * 1.3)
ceph_health_tries += additional_ceph_health_tries

additional_ceph_rebalance_timeout = int(config.RUN.get("io_load") * 100)
ceph_rebalance_timeout += additional_ceph_rebalance_timeout

ceph_health_check(
namespace=config.ENV_DATA["cluster_namespace"], tries=ceph_health_tries
)
ceph_cluster_obj = CephCluster()
assert ceph_cluster_obj.wait_for_rebalance(
timeout=ceph_rebalance_timeout
), "Data re-balance failed to complete"
61 changes: 61 additions & 0 deletions ocs_ci/ocs/resources/pod.py
@@ -3583,3 +3583,64 @@ def _check_if_pod_deleted(label, namespace):
namespace=namespace,
)
sampler.wait_for_func_status(True)


def calculate_md5sum_of_pod_files(pods_for_integrity_check, pod_file_name):
"""
    Calculate the md5sum of the pod files, and save it in the pod objects

    Args:
        pods_for_integrity_check (list): The list of the pod objects to calculate the md5sum
        pod_file_name (str): The pod file name to save the md5sum

    """
    # Wait for IO to finish
    logger.info("Wait for IO to finish on pods")
    for pod_obj in pods_for_integrity_check:
        pod_obj.get_fio_results()
        logger.info(f"IO finished on pod {pod_obj.name}")
        # Calculate the md5sum per pod, using a local name so that a block-mode
        # pod's device path does not leak into the next iteration
        file_name = (
            pod_file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        logger.info(
            f"Calculate the md5sum of the file {file_name} in the pod {pod_obj.name}"
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )


def verify_md5sum_on_pod_files(pods_for_integrity_check, pod_file_name):
"""
    Verify the md5sum of the pod files

    Args:
        pods_for_integrity_check (list): The list of the pod objects to verify the md5sum
        pod_file_name (str): The pod file name to verify its md5sum

    Raises:
        AssertionError: If the file doesn't exist or the md5sum doesn't match

    """
    for pod_obj in pods_for_integrity_check:
        # Use a local name so a block-mode pod's device path doesn't override the
        # given file name for the following pods
        file_name = (
            pod_obj.get_storage_path(storage_type="block")
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else pod_file_name
        )
        verify_data_integrity(
            pod_obj,
            file_name,
            pod_obj.pvc.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        logger.info(
            f"Verified: the md5sum of {file_name} on pod {pod_obj.name} "
            f"matches the original md5sum"
        )
    logger.info("Data integrity check passed on all pods")
