Skip to content

Commit

Permalink
Test CLI tool for disk replacement procedure (#9655)
Browse files Browse the repository at this point in the history
* Test CLI tool for disk replacement procedure

Signed-off-by: oviner <[email protected]>
  • Loading branch information
OdedViner authored May 13, 2024
1 parent 02ef1fa commit 670d9b7
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 85 deletions.
2 changes: 1 addition & 1 deletion ocs_ci/helpers/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4586,7 +4586,7 @@ def retrieve_cli_binary(cli_type="mcg"):
if cli_type == "mcg":
local_cli_path = constants.NOOBAA_OPERATOR_LOCAL_CLI_PATH
elif cli_type == "odf":
local_cli_path = constants.CLI_TOOL_LOCAL_PATH
local_cli_path = os.path.join(config.RUN["bin_dir"], "odf-cli")
local_cli_dir = os.path.dirname(local_cli_path)
live_deployment = config.DEPLOYMENT["live_deployment"]
if live_deployment and semantic_version >= version.VERSION_4_13:
Expand Down
176 changes: 97 additions & 79 deletions ocs_ci/ocs/osd_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,20 @@
delete_osd_removal_job,
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.helpers.helpers import retrieve_cli_binary
from ocs_ci.utility.utils import run_cmd_interactive


logger = logging.getLogger(__name__)


def osd_device_replacement(nodes):
def osd_device_replacement(nodes, cli_tool=False):
"""
Replacing randomly picked osd device
Args:
node (OCS): The OCS object representing the node
nodes (OCS): The OCS object representing the node
cli_tool (bool): if True, use the odf-cli tool to replace the disk; otherwise use "oc" commands
"""
logger.info("Picking a PV which to be deleted from the platform side")
osd_pvs = get_deviceset_pvs()
Expand Down Expand Up @@ -101,91 +105,105 @@ def osd_device_replacement(nodes):
== claim_name
][0]
osd_deployment_name = osd_deployment.name
osd_pod_name = osd_pod.name

# Delete the volume from the platform side
logger.info(f"Deleting {volume_path} from the platform side")
nodes.detach_volume(volume_path, osd_node)

# Scale down OSD deployment
logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

# Force delete OSD pod if necessary
osd_pod_name = osd_pod.name
logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
try:
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
except TimeoutError:
osd_pod.delete(force=True)
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

# Run ocs-osd-removal job
osd_removal_job = run_osd_removal_job([osd_id])
assert osd_removal_job, "ocs-osd-removal failed to create"
is_completed = verify_osd_removal_job_completed_successfully(osd_id)
assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
logger.info("ocs-osd-removal-job completed successfully")

osd_pvc_name = osd_pvc.name

if ocp_version < version.VERSION_4_6:
# Delete the OSD prepare job
logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
osd_prepare_job.delete()
osd_prepare_job.ocp.wait_for_delete(
resource_name=osd_prepare_job_name, timeout=120
)

# Delete the OSD PVC
logger.info(f"Deleting OSD PVC {osd_pvc_name}")
osd_pvc.delete()
osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

# Delete the OSD deployment
logger.info(f"Deleting OSD deployment {osd_deployment_name}")
osd_deployment.delete()
osd_deployment.ocp.wait_for_delete(
resource_name=osd_deployment_name, timeout=120
if cli_tool:
retrieve_cli_binary(cli_type="odf")
run_cmd_interactive(
cmd=f"odf-cli purge-osd {osd_id}",
prompts_answers={
"yes-force-destroy-osd": "yes-force-destroy-osd",
"completed removal of OSD": "",
},
string_answer=True,
raise_exception=False,
)
else:
# If ocp version is '4.6' and above the osd removal job should
# delete the OSD prepare job, OSD PVC, OSD deployment
# We just need to verify the old PV is in the expected status
logger.info(f"Verify that the old PV '{osd_pv_name}' is in the expected status")
if cluster.is_lso_cluster():
expected_old_pv_statuses = [constants.STATUS_RELEASED]
# Scale down OSD deployment
logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

# Force delete OSD pod if necessary
logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
try:
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
except TimeoutError:
osd_pod.delete(force=True)
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

# Run ocs-osd-removal job
osd_removal_job = run_osd_removal_job([osd_id])
assert osd_removal_job, "ocs-osd-removal failed to create"
is_completed = verify_osd_removal_job_completed_successfully(osd_id)
assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
logger.info("ocs-osd-removal-job completed successfully")

osd_pvc_name = osd_pvc.name

if ocp_version < version.VERSION_4_6:
# Delete the OSD prepare job
logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
osd_prepare_job.delete()
osd_prepare_job.ocp.wait_for_delete(
resource_name=osd_prepare_job_name, timeout=120
)

# Delete the OSD PVC
logger.info(f"Deleting OSD PVC {osd_pvc_name}")
osd_pvc.delete()
osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

# Delete the OSD deployment
logger.info(f"Deleting OSD deployment {osd_deployment_name}")
osd_deployment.delete()
osd_deployment.ocp.wait_for_delete(
resource_name=osd_deployment_name, timeout=120
)
else:
expected_old_pv_statuses = [
constants.STATUS_RELEASED,
constants.STATUS_FAILED,
]
try:
if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses:
try:
logger.info(f"Verifying deletion of PV {osd_pv_name}")
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except TimeoutError:
osd_pv.delete()
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except Exception as e:
logger.error(f"Old PV does not exist {e}")

# If we use LSO, we need to create and attach a new disk manually
if cluster.is_lso_cluster():
node.add_disk_to_node(osd_node)

if ocp_version < version.VERSION_4_6:
# Delete the rook ceph operator pod to trigger reconciliation
rook_operator_pod = get_operator_pods()[0]
logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
rook_operator_pod.delete()

# Delete the OSD removal job
logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
is_deleted = delete_osd_removal_job(osd_id)
assert is_deleted, "Failed to delete ocs-osd-removal-job"
logger.info("ocs-osd-removal-job deleted successfully")
# If ocp version is '4.6' and above the osd removal job should
# delete the OSD prepare job, OSD PVC, OSD deployment
# We just need to verify the old PV is in the expected status
logger.info(
f"Verify that the old PV '{osd_pv_name}' is in the expected status"
)
if cluster.is_lso_cluster():
expected_old_pv_statuses = [constants.STATUS_RELEASED]
else:
expected_old_pv_statuses = [
constants.STATUS_RELEASED,
constants.STATUS_FAILED,
]
try:
if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses:
try:
logger.info(f"Verifying deletion of PV {osd_pv_name}")
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except TimeoutError:
osd_pv.delete()
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except Exception as e:
logger.error(f"Old PV does not exist {e}")

# If we use LSO, we need to create and attach a new disk manually
if cluster.is_lso_cluster():
node.add_disk_to_node(osd_node)

if ocp_version < version.VERSION_4_6:
# Delete the rook ceph operator pod to trigger reconciliation
rook_operator_pod = get_operator_pods()[0]
logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
rook_operator_pod.delete()

# Delete the OSD removal job
logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
is_deleted = delete_osd_removal_job(osd_id)
assert is_deleted, "Failed to delete ocs-osd-removal-job"
logger.info("ocs-osd-removal-job deleted successfully")

timeout = 600
# Wait for OSD PVC to get created and reach Bound state
Expand Down
17 changes: 12 additions & 5 deletions ocs_ci/utility/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,25 +497,32 @@ def run_cmd(
return mask_secrets(completed_process.stdout.decode(), secrets)


def run_cmd_interactive(
    cmd, prompts_answers, timeout=300, string_answer=False, raise_exception=True
):
    """
    Run a command and answer its interactive prompts.

    Spawns *cmd* via pexpect and, for each prompt pattern in
    *prompts_answers*, waits for the prompt and sends the matching answer.

    Args:
        cmd (str): Command to be executed
        prompts_answers (dict): Prompts (pexpect patterns) as keys and
            answers as values
        timeout (int): Timeout in seconds for pexpect to wait for each prompt
        string_answer (bool): If True, send the answer string as-is;
            otherwise append the Enter key to the answer before sending
        raise_exception (bool): If True, raise InteractivePromptException
            when an unexpected prompt is matched; if False, continue and
            send the answer anyway

    Raises:
        InteractivePromptException: on an unexpected prompt (only when
            raise_exception is True), or when sending an answer fails

    """
    child = pexpect.spawn(cmd)
    for prompt, answer in prompts_answers.items():
        # expect() returns a nonzero index when a pattern other than the
        # expected one matched first — treat that as an unexpected prompt
        if child.expect(prompt, timeout=timeout):
            if raise_exception:
                raise InteractivePromptException("Unexpected Prompt")
        if string_answer:
            send_line = answer
        else:
            send_line = "".join([answer, constants.ENTER_KEY])
        # sendline() returns the number of bytes written; 0 means nothing
        # was sent to the child process
        if not child.sendline(send_line):
            raise InteractivePromptException("Failed to provide answer to the prompt")


Expand Down
20 changes: 20 additions & 0 deletions tests/functional/z_cluster/nodes/test_disk_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
skipif_external_mode,
skipif_managed_service,
skipif_hci_provider_and_client,
skipif_ocs_version,
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.helpers.helpers import (
Expand Down Expand Up @@ -236,3 +237,22 @@ def test_recovery_from_volume_deletion(
self.sanity_helpers.create_resources(
pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
)

@bugzilla("2234479")
@vsphere_platform_required
@skipif_ocs_version("<4.15")
@pytest.mark.polarion_id("OCS-5502")
@skipif_external_mode
def test_recovery_from_volume_deletion_cli_tool(
    self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
):
    """
    Test cluster recovery from disk deletion from the platform side,
    performing the OSD replacement with the odf-cli tool
    (``odf-cli purge-osd``) instead of plain "oc" commands.

    Related Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2234479
    """
    osd_operations.osd_device_replacement(nodes, cli_tool=True)
    # Verify the cluster is functional again by creating new resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )

0 comments on commit 670d9b7

Please sign in to comment.