Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create new test file for testing resize OSD #9583

Merged
merged 4 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,3 +694,7 @@ class UsernameNotFoundException(Exception):

class MultiStorageClusterExternalCephHealth(Exception):
    # NOTE(review): raised for Ceph health problems on multi-storage-cluster
    # external setups, judging by the name — usage not visible here; confirm.
    pass


class StorageSizeNotReflectedException(Exception):
    """Raised when an expected storage size is not reflected in the cluster
    resources (storage cluster size, PVCs, PVs, or Ceph capacity)."""

    pass
317 changes: 317 additions & 0 deletions ocs_ci/ocs/resources/osd_resize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
import logging

from ocs_ci.ocs.exceptions import (
StorageSizeNotReflectedException,
ResourceWrongStatusException,
CephHealthException,
)
from ocs_ci.ocs.resources.pod import (
get_osd_pods,
delete_pods,
wait_for_pods_to_be_in_statuses,
get_ceph_tools_pod,
)
from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs, get_deviceset_pvs, get_pvc_size
from ocs_ci.ocs.resources.pv import get_pv_size
from ocs_ci.ocs.resources.storage_cluster import (
get_storage_size,
get_device_class,
verify_storage_device_class,
verify_device_class_in_osd_tree,
)
from ocs_ci.ocs.cluster import check_ceph_osd_tree, CephCluster
from ocs_ci.utility.utils import ceph_health_check, TimeoutSampler, convert_device_size
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP
from ocs_ci.framework import config


logger = logging.getLogger(__name__)


def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pvs):
    """
    Check that the pods, PVCs, and PVs are in the expected state post resizing the osd.
    It will perform the following steps:
    1. Check that the old osd pods are in a terminating state or deleted
    2. Check that the new osd pods are running, and we have exactly the same number
       of osd pods as the old ones.
    3. Check that the new osd pod names are different than the old osd pod names
    4. Check that the PVCs are in a Bound state
    5. Check that the old PVC and PV names are equal to the current PVC and PV names

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd

    Raises:
        ResourceWrongStatusException: If the following occurs:
            1. The OSD pods failed to reach the status Terminated or to be deleted
            2. The new osd pod names overlap with the old osd pod names
            3. The old PVC and PV names are not equal to the current PVC and PV names

    """
    old_osd_pods_count = len(old_osd_pods)
    logger.info("Wait for the OSD pods to reach the status Terminated or to be deleted")
    old_osd_pod_names = [p.name for p in old_osd_pods]
    res = wait_for_pods_to_be_in_statuses(
        expected_statuses=[constants.STATUS_TERMINATING],
        pod_names=old_osd_pod_names,
        timeout=300,
        sleep=20,
    )
    if not res:
        raise ResourceWrongStatusException(
            "The OSD pods failed to reach the status Terminated or to be deleted"
        )

    logger.info("Check that the new OSD pods are running")
    ocp_pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    ocp_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=old_osd_pods_count,
        timeout=300,
        sleep=20,
    )

    logger.info(
        f"Check that the number of the new OSD pods are exactly {old_osd_pods_count}"
    )
    # Sample until the osd pod count settles back to the pre-resize count
    for osd_pods in TimeoutSampler(timeout=180, sleep=10, func=get_osd_pods):
        osd_pods_count = len(osd_pods)
        logger.info(f"number of osd pods = {osd_pods_count}")
        if old_osd_pods_count == osd_pods_count:
            break

    logger.info("Verify that the new osd pod names are different than the old ones")
    osd_pods = get_osd_pods()
    new_name_set = {p.name for p in osd_pods}
    old_name_set = {p.name for p in old_osd_pods}
    if new_name_set.intersection(old_name_set):
        raise ResourceWrongStatusException(
            f"There are shared values between the new osd pod names and the old osd pod names. "
            f"old osd pod names = {old_name_set}, new osd pod names = {new_name_set}"
        )

    logger.info("Check that the PVCs are in a Bound state")
    ocp_pvc = OCP(kind=constants.PVC, namespace=config.ENV_DATA["cluster_namespace"])
    ocp_pvc.wait_for_resource(
        timeout=30,
        sleep=5,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=len(old_osd_pvcs),
    )

    current_osd_pvcs = get_deviceset_pvcs()
    old_pvc_names = [p.name for p in old_osd_pvcs]
    current_pvc_names = [p.name for p in current_osd_pvcs]
    logger.info(f"Old PVC names = {old_pvc_names}")
    logger.info(f"Current PVC names = {current_pvc_names}")

    current_osd_pvs = get_deviceset_pvs()
    old_pv_names = [p.name for p in old_osd_pvs]
    current_pv_names = [p.name for p in current_osd_pvs]
    logger.info(f"Old PV names = {old_pv_names}")
    logger.info(f"Current PV names = {current_pv_names}")

    logger.info(
        "Check that the old PVC and PV names are equal to the current PVC and PV names"
    )
    # A resize must reuse the same PVCs/PVs — any name change means the
    # devices were recreated rather than expanded.
    if old_pvc_names != current_pvc_names:
        raise ResourceWrongStatusException(
            f"The old PVC names {old_pvc_names} are not equal to the "
            f"current PVC names {current_pvc_names}"
        )
    if old_pv_names != current_pv_names:
        raise ResourceWrongStatusException(
            f"The old PV names {old_pv_names} are not equal to the "
            f"current PV names {current_pv_names}"
        )


def check_storage_size_is_reflected(expected_storage_size):
    """
    Check that the expected storage size is reflected in the current storage size, PVCs, PVs,
    and ceph capacity.

    Args:
        expected_storage_size (str): The expected storage size

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size

    """
    logger.info(f"The expected storage size is {expected_storage_size}")
    current_storage_size = get_storage_size()
    logger.info(f"The current storage size is {current_storage_size}")

    # Normalize both sizes to GB (1024-based) before comparing
    expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB", 1024)
    current_storage_size_in_gb = convert_device_size(current_storage_size, "GB", 1024)
    logger.info(
        "Check that the current storage size equal to the expected storage size"
    )
    if current_storage_size_in_gb != expected_storage_size_in_gb:
        raise StorageSizeNotReflectedException(
            f"The current storage size {current_storage_size} is not equal "
            f"to the expected size {expected_storage_size}"
        )

    logger.info(
        "Check that the PVC and PV sizes are equal to the expected storage size"
    )
    current_osd_pvcs = get_deviceset_pvcs()
    pvc_sizes = [get_pvc_size(pvc) for pvc in current_osd_pvcs]
    logger.info(f"PVC sizes = {pvc_sizes}")
    if not all(p_size == expected_storage_size_in_gb for p_size in pvc_sizes):
        raise StorageSizeNotReflectedException(
            f"The PVC sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
        )

    current_osd_pvs = get_deviceset_pvs()
    pv_sizes = [get_pv_size(pv.get()) for pv in current_osd_pvs]
    logger.info(f"PV sizes {pv_sizes}")
    if not all(p_size == expected_storage_size_in_gb for p_size in pv_sizes):
        raise StorageSizeNotReflectedException(
            f"The PV sizes are not equal to the expected storage size {expected_storage_size_in_gb}"
        )

    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    # Ceph capacity is reported with the default conversion base, so
    # recompute the expected size without forcing the 1024 base above.
    expected_storage_size_in_gb = convert_device_size(expected_storage_size, "GB")
    logger.info(
        f"Check that the Ceph capacity {ceph_capacity} is equal "
        f"to the expected storage size {expected_storage_size_in_gb}"
    )
    if int(ceph_capacity) != expected_storage_size_in_gb:
        raise StorageSizeNotReflectedException(
            f"The Ceph capacity {ceph_capacity} is not equal to the "
            f"expected storage size {expected_storage_size_in_gb}"
        )


def check_ceph_state_post_resize_osd():
    """
    Check the Ceph state post resize osd.
    The function verifies the Ceph device classes and the Ceph osd tree.

    Raises:
        CephHealthException: In case the Ceph device classes and osd tree checks
            didn't finish successfully

    """
    logger.info("Check the Ceph device classes and osd tree")
    current_device_class = get_device_class()
    tools_pod = get_ceph_tools_pod()
    try:
        verify_storage_device_class(current_device_class)
        verify_device_class_in_osd_tree(tools_pod, current_device_class)
    except AssertionError as assertion_error:
        # Surface device-class verification failures as a Ceph health problem
        raise CephHealthException(assertion_error)

    osd_tree_is_valid = check_ceph_osd_tree()
    if not osd_tree_is_valid:
        raise CephHealthException("The ceph osd tree checks didn't finish successfully")


def base_ceph_verification_steps_post_resize_osd(
    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
):
    """
    Check the Ceph verification steps post resize OSD.
    It will perform the following steps:
    1. Check the resources state post resize OSD
    2. Check the resources size post resize OSD
    3. Check the Ceph state post resize OSD

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size

    """
    # Run the verification steps in order, logging each one before it starts
    verification_steps = (
        (
            "Check the resources state post resize OSD",
            lambda: check_resources_state_post_resize_osd(
                old_osd_pods, old_osd_pvcs, old_osd_pvs
            ),
        ),
        (
            "Check the resources size post resize OSD",
            lambda: check_storage_size_is_reflected(expected_storage_size),
        ),
        (
            "Check the Ceph state post resize OSD",
            check_ceph_state_post_resize_osd,
        ),
    )
    for step_description, step in verification_steps:
        logger.info(step_description)
        step()
    logger.info("All the Ceph verification steps post resize osd finished successfully")


def ceph_verification_steps_post_resize_osd(
    old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size, num_of_tries=6
):
    """
    Try to execute the function 'base_ceph_verification_steps_post_resize_osd' a number of tries
    until success, ignoring the exception 'StorageSizeNotReflectedException'.
    In every iteration, if we get the exception 'StorageSizeNotReflectedException', it will restart
    the osd pods and try again until it reaches the maximum tries.

    Args:
        old_osd_pods (list): The old osd pod objects before resizing the osd
        old_osd_pvcs (list): The old osd PVC objects before resizing the osd
        old_osd_pvs (list): The old osd PV objects before resizing the osd
        expected_storage_size (str): The expected storage size after resizing the osd
        num_of_tries (int): The number of tries to try executing the
            function 'base_ceph_verification_steps_post_resize_osd'.

    Raises:
        StorageSizeNotReflectedException: If the current storage size, PVCs, PVs, and ceph capacity
            are not in the expected size

    """
    # Seed with a meaningful exception so a degenerate num_of_tries < 1 does
    # not raise an empty, message-less error.
    last_ex = StorageSizeNotReflectedException(
        f"Failed to complete the Ceph verification steps post resize osd "
        f"after {num_of_tries} tries"
    )
    for i in range(1, num_of_tries + 1):
        try:
            logger.info(
                f"Ceph verification steps post resize osd: try {i} of {num_of_tries}"
            )
            base_ceph_verification_steps_post_resize_osd(
                old_osd_pods, old_osd_pvcs, old_osd_pvs, expected_storage_size
            )
            return
        except StorageSizeNotReflectedException as local_ex:
            last_ex = local_ex
            logger.warning(
                f"The Ceph verification steps failed due to the error: {str(local_ex)}. "
                f"Try to restart the OSD pods before the next iteration"
            )
            # Restart the osd pods and let the next iteration verify against
            # the freshly created pods.
            old_osd_pods = get_osd_pods()
            delete_pods(old_osd_pods, wait=False)

    logger.warning(
        f"Failed to complete the Ceph verification steps post resize osd after {num_of_tries} tries"
    )
    raise last_ex


def check_ceph_health_after_resize_osd(
    ceph_health_tries=40, ceph_rebalance_timeout=900
):
    """
    Check Ceph health after resize osd

    Args:
        ceph_health_tries (int): The number of tries to wait for the Ceph health to be OK.
        ceph_rebalance_timeout (int): The time to wait for the Ceph cluster rebalanced.

    """
    if config.RUN.get("io_in_bg"):
        logger.info(
            "Increase the time to wait for Ceph health to be health OK, "
            "because we run IO in the background"
        )
        # Guard against 'io_load' being unset/None when 'io_in_bg' is set,
        # which would otherwise raise a TypeError on the arithmetic below.
        io_load = float(config.RUN.get("io_load") or 0)
        additional_ceph_health_tries = int(io_load * 1.3)
        ceph_health_tries += additional_ceph_health_tries

        additional_ceph_rebalance_timeout = int(io_load * 100)
        ceph_rebalance_timeout += additional_ceph_rebalance_timeout

    ceph_health_check(
        namespace=config.ENV_DATA["cluster_namespace"], tries=ceph_health_tries
    )
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=ceph_rebalance_timeout
    ), "Data re-balance failed to complete"
61 changes: 61 additions & 0 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -3583,3 +3583,64 @@ def _check_if_pod_deleted(label, namespace):
namespace=namespace,
)
sampler.wait_for_func_status(True)


def calculate_md5sum_of_pod_files(pods_for_integrity_check, pod_file_name):
    """
    Calculate the md5sum of the pod files, and save it in the pod objects

    Args:
        pods_for_integrity_check (list): The list of the pod objects to calculate the md5sum
        pod_file_name (str): The pod file name to save the md5sum

    """
    # Wait for IO to finish
    logger.info("Wait for IO to finish on pods")
    for pod_obj in pods_for_integrity_check:
        pod_obj.get_fio_results()
        logger.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum. Use a per-pod local name instead of rebinding the
        # 'pod_file_name' parameter — otherwise, once a block-mode pod is
        # processed, every later filesystem-mode pod would wrongly use the
        # previous pod's block storage path.
        file_name = (
            pod_file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        logger.info(
            f"Calculate the md5sum of the file {file_name} in the pod {pod_obj.name}"
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )


def verify_md5sum_on_pod_files(pods_for_integrity_check, pod_file_name):
    """
    Verify the md5sum of the pod files

    Args:
        pods_for_integrity_check (list): The list of the pod objects to verify the md5sum
        pod_file_name (str): The pod file name to verify its md5sum

    Raises:
        AssertionError: If file doesn't exist or md5sum mismatch

    """
    for pod_obj in pods_for_integrity_check:
        # Use a per-pod local name instead of rebinding the 'pod_file_name'
        # parameter — otherwise, once a block-mode pod is processed, every
        # later filesystem-mode pod would wrongly use the previous pod's
        # block storage path.
        file_name = (
            pod_obj.get_storage_path(storage_type="block")
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else pod_file_name
        )
        verify_data_integrity(
            pod_obj,
            file_name,
            pod_obj.pvc.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        logger.info(
            f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    logger.info("Data integrity check passed on all pods")
Loading
Loading