From 514ceea2434aa97f3d5acf8dc4b0ae21d31e1a8d Mon Sep 17 00:00:00 2001
From: Itzhak Kave
Date: Mon, 8 Jan 2024 17:06:09 +0200
Subject: [PATCH] Create the node functionalities for IBMCloud BM, and create
 a new test file for testing the node restart functionalities

Signed-off-by: Itzhak Kave
---
 ocs_ci/ocs/platform_nodes.py                   | 108 +++++++-
 ocs_ci/utility/ibmcloud_bm.py                  | 153 +++++++++++
 .../z_cluster/nodes/test_nodes_restart_hci.py  | 244 ++++++++++++++++++
 3 files changed, 504 insertions(+), 1 deletion(-)
 create mode 100644 ocs_ci/utility/ibmcloud_bm.py
 create mode 100644 tests/functional/z_cluster/nodes/test_nodes_restart_hci.py

diff --git a/ocs_ci/ocs/platform_nodes.py b/ocs_ci/ocs/platform_nodes.py
index dcb671b30829..e31094f5373b 100644
--- a/ocs_ci/ocs/platform_nodes.py
+++ b/ocs_ci/ocs/platform_nodes.py
@@ -60,7 +60,11 @@
     get_module_ip,
     get_terraform_ignition_provider,
 )
-from ocs_ci.ocs.node import wait_for_nodes_status, get_nodes_in_statuses
+from ocs_ci.ocs.node import (
+    wait_for_nodes_status,
+    get_nodes_in_statuses,
+    get_node_internal_ip,
+)
 from ocs_ci.utility.vsphere_nodes import VSPHERENode
 from paramiko.ssh_exception import NoValidConnectionsError, AuthenticationException
 from semantic_version import Version
@@ -92,6 +96,7 @@ def __init__(self):
             "rosa": AWSNodes,
             "vsphere_upi": VMWareUPINodes,
             "fusion_aas": AWSNodes,
+            "hci_baremetal": IBMCloudBMNodes,
         }
 
     def get_nodes_platform(self):
@@ -3070,3 +3075,104 @@ def restart_nodes_by_stop_and_start_teardown(self):
         node_names = [n.name for n in not_ready_nodes]
         if node_names:
             self.gcp.start_instances(node_names)
+
+
+class IBMCloudBMNodes(NodesBase):
+    """
+    IBM Cloud for Bare metal machines class
+
+    """
+
+    def __init__(self):
+        super(IBMCloudBMNodes, self).__init__()
+        from ocs_ci.utility import ibmcloud_bm
+
+        self.ibmcloud_bm = ibmcloud_bm.IBMCloudBM()
+
+    def get_machines(self, nodes):
+        """
+        Get the machines associated with the given nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+
+        Returns:
+            list: List of dictionaries. List of the machines associated with the given nodes
+
+        """
+        node_ips = [get_node_internal_ip(n) for n in nodes]
+        return self.ibmcloud_bm.get_machines_by_ips(node_ips)
+
+    def stop_nodes(self, nodes):
+        """
+        Stop nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.stop_machines(machines)
+
+    def start_nodes(self, nodes):
+        """
+        Start nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.start_machines(machines)
+
+    def restart_nodes(self, nodes, force=False):
+        """
+        Restart nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            force (bool): If True, it will force restarting the nodes. False, otherwise.
+                Default value is False.
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.restart_machines(machines, force=force)
+
+    def restart_nodes_by_stop_and_start(self, nodes, wait=True, timeout=300):
+        """
+        Restart the nodes by stop and start
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            wait (bool): If True, wait for the nodes to be ready. False, otherwise
+            timeout (int): The time to wait for the nodes to be ready
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.restart_machines_by_stop_and_start(machines)
+        if wait:
+            node_names = [n.name for n in nodes]
+            wait_for_nodes_status(node_names, timeout=timeout)
+
+    def restart_nodes_by_stop_and_start_teardown(self):
+        """
+        Start the nodes in a NotReady state
+
+        """
+        nodes_not_ready = get_nodes_in_statuses([constants.NODE_NOT_READY])
+        machines = self.get_machines(nodes_not_ready)
+        self.ibmcloud_bm.start_machines(machines)
+
+    def create_nodes(self, node_conf, node_type, num_nodes):
+        """
+        Create nodes
+
+        """
+        raise NotImplementedError("Create nodes functionality is not implemented")
+
+    def terminate_nodes(self, nodes, wait=True):
+        """
+        Terminate nodes
+
+        """
+        raise NotImplementedError("Terminate nodes functionality is not implemented")
diff --git a/ocs_ci/utility/ibmcloud_bm.py b/ocs_ci/utility/ibmcloud_bm.py
new file mode 100644
index 000000000000..6b4e6fd7b106
--- /dev/null
+++ b/ocs_ci/utility/ibmcloud_bm.py
@@ -0,0 +1,153 @@
+# -*- coding: utf8 -*-
+"""
+Module for interactions with the IBM Cloud Bare metal machines.
+
+"""
+
+import json
+import logging
+import time
+
+from ocs_ci.framework import config
+from ocs_ci.ocs.exceptions import CommandFailed
+from ocs_ci.utility.utils import run_cmd
+
+
+logger = logging.getLogger(name=__file__)
+ibm_config = config.AUTH.get("ibmcloud", {})
+
+
+def login():
+    """
+    Login to the IBM Cloud account
+    """
+    api_key = ibm_config["api_key"]
+    login_cmd = f"ibmcloud login --apikey {api_key}"
+    account_id = ibm_config.get("account_id")
+    if account_id:
+        login_cmd += f" -c {account_id}"
+    api_endpoint = ibm_config.get("api_endpoint")
+    if api_endpoint:
+        login_cmd += f" -a {api_endpoint}"
+    region = config.ENV_DATA.get("region")
+    if region:
+        login_cmd += f" -r {region}"
+    logger.info("Logging in to IBM Cloud")
+    run_cmd(login_cmd, secrets=[api_key])
+    logger.info("Successfully logged in to IBM Cloud")
+    config.RUN["ibmcloud_last_login"] = time.time()
+
+
+def run_ibmcloud_bm_cmd(cmd, secrets=None, timeout=600, ignore_error=False, **kwargs):
+    """
+    Wrapper function for `run_cmd` which, if needed, performs an IBM Cloud login
+    before running the ibmcloud bare metal command. If run_cmd fails because the
+    IBM Cloud session got disconnected, it logs in again and retries the command.
+
+    Args:
+        cmd (str): command to run
+        secrets (list): A list of secrets to be masked with asterisks
+            This kwarg is popped in order to not interfere with
+            subprocess.run(``**kwargs``)
+        timeout (int): Timeout for the command, defaults to 600 seconds.
+        ignore_error (bool): True if ignore non zero return code and do not
+            raise the exception.
+
+    """
+    last_login = config.RUN.get("ibmcloud_last_login", 0)
+    timeout_from_last_login = time.time() - last_login
+    basic_cmd = "ibmcloud sl hardware "
+    cmd = basic_cmd + cmd
+
+    # Login if the time since the last login is greater than 9.5 minutes.
+    if not last_login or timeout_from_last_login > 570:
+        login()
+    try:
+        return run_cmd(cmd, secrets, timeout, ignore_error, **kwargs)
+    except CommandFailed as ex:
+        if "Please login" in str(ex):
+            login()
+            return run_cmd(cmd, secrets, timeout, ignore_error, **kwargs)
+
+
+class IBMCloudBM(object):
+    """
+    Wrapper for the IBM Cloud Bare metal machines
+    """
+
+    def get_all_machines(self):
+        """
+        Get all the IBMCloud Bare metal machines
+
+        Returns:
+            list: List of dictionaries. List of all the IBMCloud Bare metal machines
+
+        """
+        cmd = "list --output json"
+        machine_list = json.loads(run_ibmcloud_bm_cmd(cmd))
+        return machine_list
+
+    def get_machines_by_ips(self, machine_ips):
+        """
+        Get the IBMCloud Bare metal machines that have the given machine IPs
+
+        Args:
+            machine_ips (list): The list of the machine IPs to search for.
+
+        Returns:
+            list: List of dictionaries. The machines that have the given machine IPs
+
+        """
+        machine_list = self.get_all_machines()
+        return [m for m in machine_list if m["primaryIpAddress"] in machine_ips]
+
+    def stop_machines(self, machines):
+        """
+        Stop the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to stop
+
+        """
+        for m in machines:
+            logger.info(f"Powering off the machine with ip {m['primaryIpAddress']}")
+            cmd = f"power-off {m['id']} -f"
+            run_ibmcloud_bm_cmd(cmd)
+
+    def start_machines(self, machines):
+        """
+        Start the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to start
+
+        """
+        for m in machines:
+            logger.info(f"Powering on the machine with ip {m['primaryIpAddress']}")
+            cmd = f"power-on {m['id']}"
+            run_ibmcloud_bm_cmd(cmd)
+
+    def restart_machines(self, machines, force=False):
+        """
+        Reboot the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to restart
+            force (bool): If False, will perform a soft reboot. Otherwise, if True, will perform a hard reboot
+
+        """
+        reboot_type = "hard" if force else "soft"
+        for m in machines:
+            logger.info(f"Rebooting the machine with the ip {m['primaryIpAddress']}")
+            cmd = f"reboot {m['id']} -f --{reboot_type}"
+            run_ibmcloud_bm_cmd(cmd)
+
+    def restart_machines_by_stop_and_start(self, machines):
+        """
+        Restart the IBMCloud Bare metal machines by stop and start
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to restart
+
+        """
+        self.stop_machines(machines)
+        self.start_machines(machines)
diff --git a/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py b/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py
new file mode 100644
index 000000000000..60dda1ae4a57
--- /dev/null
+++ b/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py
@@ -0,0 +1,244 @@
+import logging
+import pytest
+import random
+
+
+from ocs_ci.framework.pytest_customization.marks import brown_squad
+from ocs_ci.framework.testlib import (
+    tier4a,
+    tier4b,
+    ignore_leftovers,
+    ManageTest,
+    bugzilla,
+    hci_provider_and_client_required,
+)
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.constants import HCI_PROVIDER
+from ocs_ci.ocs.exceptions import ResourceWrongStatusException
+from ocs_ci.ocs.node import (
+    get_node_objs,
+    recover_node_to_ready_state,
+    get_osd_running_nodes,
+    get_node_osd_ids,
+    wait_for_osd_ids_come_up_on_node,
+    wait_for_nodes_status,
+    get_nodes,
+    wait_for_node_count_to_reach_status,
+    drain_nodes,
+    schedule_nodes,
+)
+from ocs_ci.ocs.resources import pod
+from ocs_ci.helpers.sanity_helpers import Sanity
+from ocs_ci.ocs.cluster import (
+    ceph_health_check,
+)
+from ocs_ci.framework import config
+from ocs_ci.utility.utils import switch_to_correct_cluster_at_setup
+
+logger = logging.getLogger(__name__)
+
+
+@brown_squad
+@ignore_leftovers
+@hci_provider_and_client_required
+class TestNodesRestartHCI(ManageTest):
+    """
+    Test nodes restart scenarios when using the HCI platform
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup(self, request, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers):
+        """
+        Initialize the Sanity instance, and create pods and PVCs factory
+
+        """
+        self.orig_index = config.cur_index
+        switch_to_correct_cluster_at_setup(request)
+        self.sanity_helpers = Sanity()
+
+    @pytest.fixture(autouse=True)
+    def teardown(self, request, nodes):
+        """
+        Make sure all nodes are up again
+
+        """
+
+        def finalizer():
+            ocp_nodes = get_node_objs()
+            for n in ocp_nodes:
+                recover_node_to_ready_state(n)
+
+            logger.info("Switch to the original cluster index")
+            config.switch_ctx(self.orig_index)
+            ceph_health_check()
+
+        request.addfinalizer(finalizer)
+
+    @tier4a
+    @pytest.mark.polarion_id("OCS-3980")
+    @pytest.mark.parametrize(
+        "cluster_type",
+        [HCI_PROVIDER],
+    )
+    def test_osd_node_restart_and_check_osd_pods_status(self, cluster_type, nodes):
+        """
+        1) Restart one of the osd nodes.
+        2) Check that the osd pods associated with the node change to a Terminating state.
+        3) Wait for the node to reach Ready state.
+        4) Check that the new osd pods with the same ids start on the same node.
+
+        """
+        osd_node_name = random.choice(get_osd_running_nodes())
+        osd_node = get_node_objs([osd_node_name])[0]
+
+        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
+        logger.info(f"osd pod ids: {old_osd_pod_ids}")
+        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
+        node_osd_pod_names = [p.name for p in node_osd_pods]
+
+        logger.info(f"Going to restart the node {osd_node_name}")
+        nodes.restart_nodes(nodes=[osd_node], wait=False)
+
+        logger.info("Verify the node osd pods go into a Terminating state")
+        res = pod.wait_for_pods_to_be_in_statuses(
+            [constants.STATUS_TERMINATING], node_osd_pod_names
+        )
+        assert res, "Not all the node osd pods are in a Terminating state"
+
+        wait_for_nodes_status(node_names=[osd_node_name], timeout=300)
+
+        if len(get_nodes(constants.WORKER_MACHINE)) <= 3:
+            assert wait_for_osd_ids_come_up_on_node(
+                osd_node_name, old_osd_pod_ids, timeout=300
+            )
+            logger.info(
+                f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
+            )
+        else:
+            new_osd_pods = pod.wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
+            new_osd_pod_names = [p.name for p in new_osd_pods]
+            logger.info(
+                f"Wait for the new osd pods with the ids {old_osd_pod_ids} to be running"
+            )
+            res = pod.wait_for_pods_to_be_in_statuses(
+                constants.STATUS_RUNNING,
+                new_osd_pod_names,
+                raise_pod_not_found_error=True,
+            )
+            assert res, "Not all the node osd pods are in a Running state"
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(
+                *[HCI_PROVIDER, constants.WORKER_MACHINE],
+            ),
+            pytest.param(
+                *[HCI_PROVIDER, constants.MASTER_MACHINE],
+            ),
+        ],
+    )
+    def test_nodes_restart(self, cluster_type, nodes, node_type):
+        """
+        Test nodes restart (from the platform layer)
+
+        """
+        node_count = len(get_nodes(node_type=node_type))
+        ocp_nodes = get_nodes(node_type=node_type)
+        ocp_node = random.choice(ocp_nodes)
+
+        nodes.restart_nodes(nodes=[ocp_node])
+        wait_for_node_count_to_reach_status(node_count=node_count, node_type=node_type)
+        ceph_health_check()
+
+    @tier4b
+    @bugzilla("1754287")
+    @pytest.mark.polarion_id("OCS-2015")
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_rolling_nodes_restart(self, cluster_type, nodes, node_type):
+        """
+        Test restart nodes one after the other and check health status in between
+
+        """
+        node_count = len(get_nodes(node_type))
+        ocp_nodes = get_nodes(node_type=node_type)
+
+        for node in ocp_nodes:
+            nodes.restart_nodes(nodes=[node])
+            wait_for_node_count_to_reach_status(
+                node_count=node_count, node_type=node_type
+            )
+            ceph_health_check(tries=40)
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_node_maintenance_restart(self, cluster_type, nodes, node_type):
+        """
+        - Mark as unschedulable and drain 1 worker node in the provider cluster
+        - Restart the node
+        - Mark the node as schedulable
+
+        """
+        typed_nodes = get_nodes(node_type=node_type)
+        assert typed_nodes, f"Failed to find a {node_type} node."
+        typed_node = random.choice(typed_nodes)
+        typed_node_name = typed_node.name
+
+        # Get the current reboot events from the node
+        reboot_events_cmd = (
+            f"get events -A --field-selector involvedObject.name="
+            f"{typed_node_name},reason=Rebooted -o yaml"
+        )
+
+        # Find the number of reboot events in the node
+        num_events = len(typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"])
+
+        # Unschedule and drain the node
+        drain_nodes([typed_node_name])
+        # Wait for the node to become unschedulable
+        wait_for_nodes_status(
+            node_names=[typed_node_name],
+            status=constants.NODE_READY_SCHEDULING_DISABLED,
+        )
+
+        # Restart the node
+        nodes.restart_nodes(nodes=[typed_node], wait=False)
+
+        # Verify that the node restarted
+        try:
+            wait_for_nodes_status(
+                node_names=[typed_node_name],
+                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
+            )
+        except ResourceWrongStatusException:
+            # Sometimes, the node will be back to a running state quickly so
+            # that the status change won't be detected. Verify the node was
+            # actually restarted by checking the reboot events count.
+            new_num_events = len(
+                typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"]
+            )
+            assert new_num_events > num_events, (
+                f"Reboot event not found. "
+                f"Node {typed_node_name} did not restart."
+            )
+
+        # Wait for the node to be Ready
+        wait_for_nodes_status(
+            node_names=[typed_node_name],
+            status=constants.NODE_READY_SCHEDULING_DISABLED,
+        )
+
+        # Mark the node as schedulable
+        schedule_nodes([typed_node_name])
+
+        self.sanity_helpers.health_check()
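
Usage sketch (not part of the patch): a minimal example of how the new IBMCloudBMNodes
class is expected to be reached at runtime. The factory class name (PlatformNodesFactory)
and the conftest wiring behind the `nodes` fixture are assumptions about the surrounding
ocs-ci code, not something this diff adds; only the "hci_baremetal" mapping,
IBMCloudBMNodes, and IBMCloudBM come from the patch itself.

    from ocs_ci.ocs import platform_nodes
    from ocs_ci.ocs.node import get_node_objs, get_osd_running_nodes


    def soft_reboot_one_osd_node():
        # Assumed factory: get_nodes_platform() should return IBMCloudBMNodes when
        # the platform key resolves to the new "hci_baremetal" entry in the map
        nodes = platform_nodes.PlatformNodesFactory().get_nodes_platform()

        # Pick one OSD node; IBMCloudBMNodes maps it to its bare metal machine
        # by the node internal IP (get_machines_by_ips)
        osd_node = get_node_objs([get_osd_running_nodes()[0]])[0]

        # force=False ends up running "ibmcloud sl hardware reboot <id> -f --soft"
        nodes.restart_nodes(nodes=[osd_node], force=False)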