diff --git a/conf/deployment/fusion_hci_pc/provider_bm_upi_1az_rhcos_nvme_3m_3w.yaml b/conf/deployment/fusion_hci_pc/provider_bm_upi_1az_rhcos_nvme_3m_3w.yaml
index 82e399088f1..83290b384fd 100644
--- a/conf/deployment/fusion_hci_pc/provider_bm_upi_1az_rhcos_nvme_3m_3w.yaml
+++ b/conf/deployment/fusion_hci_pc/provider_bm_upi_1az_rhcos_nvme_3m_3w.yaml
@@ -11,6 +11,7 @@ ENV_DATA:
   master_replicas: 3
   mon_type: 'hostpath'
   osd_type: 'nvme'
+  region: 'us-east'
 REPORTING:
   # This is to be used in internal image for must gather on HCI
   ocs_must_gather_image: "quay.io/rhceph-dev/ocs-must-gather"
diff --git a/ocs_ci/framework/pytest_customization/marks.py b/ocs_ci/framework/pytest_customization/marks.py
index c2690b82781..8675f654320 100644
--- a/ocs_ci/framework/pytest_customization/marks.py
+++ b/ocs_ci/framework/pytest_customization/marks.py
@@ -259,6 +259,11 @@
     reason="Test runs ONLY on OSD or ROSA cluster",
 )
 
+provider_client_platform_required = pytest.mark.skipif(
+    (config.ENV_DATA["platform"].lower() not in HCI_PROVIDER_CLIENT_PLATFORMS),
+    reason="Test runs ONLY on cluster with HCI provider-client platform",
+)
+
 provider_client_ms_platform_required = pytest.mark.skipif(
     (config.ENV_DATA["platform"].lower() not in HCI_PC_OR_MS_PLATFORM),
     reason="Test runs ONLY on cluster with managed service or HCI provider-client platform",
diff --git a/ocs_ci/ocs/platform_nodes.py b/ocs_ci/ocs/platform_nodes.py
index dcb671b3082..fd523dfe568 100644
--- a/ocs_ci/ocs/platform_nodes.py
+++ b/ocs_ci/ocs/platform_nodes.py
@@ -60,7 +60,11 @@
     get_module_ip,
     get_terraform_ignition_provider,
 )
-from ocs_ci.ocs.node import wait_for_nodes_status, get_nodes_in_statuses
+from ocs_ci.ocs.node import (
+    wait_for_nodes_status,
+    get_nodes_in_statuses,
+    get_node_internal_ip,
+)
 from ocs_ci.utility.vsphere_nodes import VSPHERENode
 from paramiko.ssh_exception import NoValidConnectionsError, AuthenticationException
 from semantic_version import Version
@@ -92,6 +96,7 @@
             "rosa": AWSNodes,
             "vsphere_upi": VMWareUPINodes,
             "fusion_aas": AWSNodes,
+            "hci_baremetal": IBMCloudBMNodes,
         }
 
     def get_nodes_platform(self):
@@ -3070,3 +3075,127 @@ def restart_nodes_by_stop_and_start_teardown(self):
         node_names = [n.name for n in not_ready_nodes]
         if node_names:
             self.gcp.start_instances(node_names)
+
+
+class IBMCloudBMNodes(NodesBase):
+    """
+    IBM Cloud for Bare metal machines class
+
+    """
+
+    def __init__(self):
+        super(IBMCloudBMNodes, self).__init__()
+        from ocs_ci.utility import ibmcloud_bm
+
+        self.ibmcloud_bm = ibmcloud_bm.IBMCloudBM()
+
+    def get_machines(self, nodes):
+        """
+        Get the machines associated with the given nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+
+        Returns:
+            list: List of dictionaries. List of the machines associated with the given nodes
+
+        """
+        node_ips = [get_node_internal_ip(n) for n in nodes]
+        return self.ibmcloud_bm.get_machines_by_ips(node_ips)
+
+    def stop_nodes(self, nodes, wait=True):
+        """
+        Stop nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            wait (bool): If True, wait for the nodes to be in a NotReady state. False, otherwise
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.stop_machines(machines)
+        if wait:
+            node_names = [n.name for n in nodes]
+            wait_for_nodes_status(
+                node_names, constants.NODE_NOT_READY, timeout=180, sleep=5
+            )
+
+    def start_nodes(self, nodes, wait=True):
+        """
+        Start nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            wait (bool): If True, wait for the nodes to be ready. False, otherwise
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.start_machines(machines)
+        if wait:
+            node_names = [n.name for n in nodes]
+            wait_for_nodes_status(
+                node_names, constants.NODE_READY, timeout=720, sleep=20
+            )
+
+    def restart_nodes(self, nodes, wait=True, force=False):
+        """
+        Restart nodes
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            wait (bool): If True, wait for the nodes to be ready. False, otherwise
+            force (bool): If True, it will force restarting the nodes. False, otherwise.
+                Default value is False.
+
+        """
+        machines = self.get_machines(nodes)
+        self.ibmcloud_bm.restart_machines(machines, force=force)
+        if wait:
+            node_names = [n.name for n in nodes]
+            logger.info(
+                f"Wait for the nodes {node_names} to reach the status {constants.NODE_NOT_READY}"
+            )
+            wait_for_nodes_status(
+                node_names, constants.NODE_NOT_READY, timeout=180, sleep=5
+            )
+            logger.info(
+                f"Wait for the nodes {node_names} to be in a Ready status again"
+            )
+            wait_for_nodes_status(
+                node_names, constants.NODE_READY, timeout=720, sleep=20
+            )
+
+    def restart_nodes_by_stop_and_start(self, nodes, wait=True):
+        """
+        Restart the nodes by stop and start
+
+        Args:
+            nodes (list): The OCS objects of the nodes
+            wait (bool): If True, wait for the nodes to be ready. False, otherwise
+
+        """
+        self.stop_nodes(nodes, wait=True)
+        self.start_nodes(nodes, wait=wait)
+
+    def restart_nodes_by_stop_and_start_teardown(self):
+        """
+        Start the nodes in a NotReady state
+
+        """
+        nodes_not_ready = get_nodes_in_statuses([constants.NODE_NOT_READY])
+        machines = self.get_machines(nodes_not_ready)
+        self.ibmcloud_bm.start_machines(machines)
+
+    def create_nodes(self, node_conf, node_type, num_nodes):
+        """
+        Create nodes
+
+        """
+        raise NotImplementedError("Create nodes functionality not implemented")
+
+    def terminate_nodes(self, nodes, wait=True):
+        """
+        Terminate nodes
+
+        """
+        raise NotImplementedError("terminate nodes functionality is not implemented")
diff --git a/ocs_ci/utility/ibmcloud_bm.py b/ocs_ci/utility/ibmcloud_bm.py
new file mode 100644
index 00000000000..d70b4d6a477
--- /dev/null
+++ b/ocs_ci/utility/ibmcloud_bm.py
@@ -0,0 +1,154 @@
+# -*- coding: utf8 -*-
+"""
+Module for interactions with IBM Cloud Cluster.
+
+"""
+
+import json
+import logging
+
+from ocs_ci.framework import config
+from ocs_ci.ocs.exceptions import CommandFailed
+from ocs_ci.utility.utils import run_cmd
+
+
+logger = logging.getLogger(name=__file__)
+ibm_config = config.AUTH.get("ibmcloud", {})
+
+
+class IBMCloudBM(object):
+    """
+    Wrapper for IBM Cloud with Bare metal machines
+    """
+
+    def __init__(self, region=None):
+        """
+        Constructor for IBM Cloud Bare Metal machines
+
+        Args:
+            region (str): The region of the IBM Cloud Bare Metal machines
+
+        """
+        self.api_key = ibm_config["api_key"]
+        self.account_id = ibm_config.get("account_id")
+        self.region = region or config.ENV_DATA.get("region")
+
+    def login(self):
+        """
+        Login to IBM Cloud account
+        """
+        login_cmd = f"ibmcloud login --apikey {self.api_key} -c {self.account_id} -r {self.region}"
+        logger.info("Logging in to IBM Cloud")
+        run_cmd(login_cmd, secrets=[self.api_key])
+        logger.info("Successfully logged in to IBM Cloud")
+
+    def run_ibmcloud_bm_cmd(
+        self, cmd, secrets=None, timeout=600, ignore_error=False, **kwargs
+    ):
+        """
+        Wrapper function for `run_cmd` which, if needed, will perform an IBM Cloud
+        login before running the ibmcloud bare metal command. If run_cmd fails
+        because the IBM Cloud session got disconnected, it will log in and retry.
+
+        Args:
+            cmd (str): command to run
+            secrets (list): A list of secrets to be masked with asterisks
+                This kwarg is popped in order to not interfere with
+                subprocess.run(``**kwargs``)
+            timeout (int): Timeout for the command, defaults to 600 seconds.
+            ignore_error (bool): True if ignore non zero return code and do not
+                raise the exception.
+        """
+        basic_cmd = "ibmcloud sl hardware "
+        cmd = basic_cmd + cmd
+
+        try:
+            return run_cmd(cmd, secrets, timeout, ignore_error, **kwargs)
+        except CommandFailed as ex:
+            login_error_messages = [
+                "Error: Failed to get",
+                "Access Denied",
+                "Please login",
+            ]
+            # Check if we need to re-login to IBM Cloud account
+            if any([error_msg in str(ex) for error_msg in login_error_messages]):
+                self.login()
+                return run_cmd(cmd, secrets, timeout, ignore_error, **kwargs)
+
+    def get_all_machines(self):
+        """
+        Get all the IBMCloud Bare metal machines
+
+        Returns:
+            list: List of dictionaries. List of all the IBMCloud Bare metal machines
+
+        """
+        cmd = "list --output json"
+        machine_list = json.loads(self.run_ibmcloud_bm_cmd(cmd))
+        return machine_list
+
+    def get_machines_by_ips(self, machine_ips):
+        """
+        Get the IBMCloud Bare metal machines that have the given machine IPs
+
+        Args:
+            machine_ips (list): The list of the machine IPs to search for.
+
+        Returns:
+            list: The IBMCloud Bare metal machines that have the given machine IPs
+
+        """
+        machine_list = self.get_all_machines()
+        return [m for m in machine_list if m["primaryIpAddress"] in machine_ips]
+
+    def stop_machines(self, machines):
+        """
+        Stop the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to stop
+
+        """
+        for m in machines:
+            logger.info(f"Powering off the machine with ip {m['primaryIpAddress']}")
+            cmd = f"power-off {m['id']} -f"
+            self.run_ibmcloud_bm_cmd(cmd)
+
+    def start_machines(self, machines):
+        """
+        Start the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to start
+
+        """
+        for m in machines:
+            logger.info(f"Powering on the machine with ip {m['primaryIpAddress']}")
+            cmd = f"power-on {m['id']}"
+            self.run_ibmcloud_bm_cmd(cmd)
+
+    def restart_machines(self, machines, force=False):
+        """
+        Reboot the IBMCloud Bare metal machines
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to restart
+            force (bool): If False, will perform a soft reboot. Otherwise, if True,
+                will perform a hard reboot
+
+        """
+        reboot_type = "hard" if force else "soft"
+        for m in machines:
+            logger.info(f"Rebooting the machine with the ip {m['primaryIpAddress']}")
+            cmd = f"reboot {m['id']} -f --{reboot_type}"
+            self.run_ibmcloud_bm_cmd(cmd)
+
+    def restart_machines_by_stop_and_start(self, machines):
+        """
+        Restart the IBMCloud Bare metal machines by stop and start
+
+        Args:
+            machines (list): List of the IBMCloud Bare metal machine objects to restart
+
+        """
+        self.stop_machines(machines)
+        self.start_machines(machines)
diff --git a/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py b/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py
new file mode 100644
index 00000000000..ac3b23f6d28
--- /dev/null
+++ b/tests/functional/z_cluster/nodes/test_nodes_restart_hci.py
@@ -0,0 +1,273 @@
+import logging
+import pytest
+import random
+
+
+from ocs_ci.framework.pytest_customization.marks import brown_squad
+from ocs_ci.framework.testlib import (
+    tier4a,
+    tier4b,
+    ignore_leftovers,
+    ManageTest,
+    bugzilla,
+    provider_client_platform_required,
+    polarion_id,
+)
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.constants import HCI_PROVIDER
+from ocs_ci.ocs.exceptions import ResourceWrongStatusException
+from ocs_ci.ocs.node import (
+    get_node_objs,
+    recover_node_to_ready_state,
+    get_osd_running_nodes,
+    get_node_osd_ids,
+    wait_for_nodes_status,
+    get_nodes,
+    wait_for_node_count_to_reach_status,
+    drain_nodes,
+    schedule_nodes,
+)
+from ocs_ci.ocs.resources import pod
+from ocs_ci.helpers.sanity_helpers import Sanity
+from ocs_ci.ocs.cluster import (
+    ceph_health_check,
+)
+from ocs_ci.framework import config
+from ocs_ci.utility.utils import switch_to_correct_cluster_at_setup
+
+logger = logging.getLogger(__name__)
+
+
+@brown_squad
+@ignore_leftovers
+@provider_client_platform_required
+class TestNodesRestartHCI(ManageTest):
+    """
+    Test nodes restart scenarios when using HCI platform
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup(self, request, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers):
+        """
+        Initialize Sanity instance, and create pods and PVCs factory
+
+        """
+        self.orig_index = config.cur_index
+        switch_to_correct_cluster_at_setup(request)
+        self.sanity_helpers = Sanity()
+
+    @pytest.fixture(autouse=True)
+    def teardown(self, request, nodes):
+        """
+        Make sure all nodes are up again
+
+        """
+
+        def finalizer():
+            ocp_nodes = get_node_objs()
+            for n in ocp_nodes:
+                recover_node_to_ready_state(n)
+
+            logger.info("Switch to the original cluster index")
+            config.switch_ctx(self.orig_index)
+            ceph_health_check()
+
+        request.addfinalizer(finalizer)
+
+    @tier4a
+    @pytest.mark.polarion_id("OCS-3980")
+    @pytest.mark.parametrize(
+        "cluster_type",
+        [HCI_PROVIDER],
+    )
+    def test_osd_node_restart_and_check_osd_pods_status(self, cluster_type, nodes):
+        """
+        1) Restart one of the osd nodes.
+        2) Check that the osd pods associated with the node change to a Terminating state.
+        3) Wait for the node to reach the Ready state.
+        4) Wait for the new osd pods with the old ids to be running
+
+        """
+        osd_node_name = random.choice(get_osd_running_nodes())
+        osd_node = get_node_objs([osd_node_name])[0]
+
+        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
+        logger.info(f"osd pod ids: {old_osd_pod_ids}")
+        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
+        node_osd_pod_names = [p.name for p in node_osd_pods]
+
+        logger.info(f"Going to restart the node {osd_node_name}")
+        nodes.restart_nodes(nodes=[osd_node], wait=False)
+
+        logger.info("Verify the node osd pods go into a Terminating state")
+        res = pod.wait_for_pods_to_be_in_statuses(
+            [constants.STATUS_TERMINATING], node_osd_pod_names, timeout=480, sleep=20
+        )
+        assert res, "Not all the node osd pods are in a Terminating state"
+
+        logger.info(f"Wait for the node {osd_node_name} to be ready")
+        wait_for_nodes_status(node_names=[osd_node_name], timeout=720, sleep=20)
+
+        new_osd_pods = pod.wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
+        new_osd_pod_names = [p.name for p in new_osd_pods]
+        logger.info(
+            f"Wait for the new osd pods with the ids {old_osd_pod_ids} to be running"
+        )
+        res = pod.wait_for_pods_to_be_in_statuses(
+            constants.STATUS_RUNNING,
+            new_osd_pod_names,
+            raise_pod_not_found_error=True,
+        )
+        assert res, "Not all the node osd pods are in a Running state"
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(
+                *[HCI_PROVIDER, constants.WORKER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5420"),
+            ),
+            pytest.param(
+                *[HCI_PROVIDER, constants.MASTER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5420"),
+            ),
+        ],
+    )
+    def test_nodes_restart(self, cluster_type, nodes, node_type):
+        """
+        Test nodes restart (from the platform layer)
+
+        """
+        node_count = len(get_nodes(node_type=node_type))
+        ocp_nodes = get_nodes(node_type=node_type)
+        ocp_node = random.choice(ocp_nodes)
+
+        nodes.restart_nodes(nodes=[ocp_node], wait=True)
+        logger.info("Wait for the expected node count to be ready...")
+        wait_for_node_count_to_reach_status(node_count=node_count, node_type=node_type)
+        ceph_health_check()
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(
+                *[HCI_PROVIDER, constants.WORKER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5421"),
+            ),
+            pytest.param(
+                *[HCI_PROVIDER, constants.MASTER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5421"),
+            ),
+        ],
+    )
+    def test_nodes_restart_by_stop_and_start(self, cluster_type, nodes, node_type):
+        """
+        Test nodes restart by stop and start (from the platform layer)
+
+        """
+        node_count = len(get_nodes(node_type=node_type))
+        ocp_nodes = get_nodes(node_type=node_type)
+        ocp_node = random.choice(ocp_nodes)
+
+        nodes.restart_nodes_by_stop_and_start(nodes=[ocp_node], wait=True)
+        logger.info("Wait for the expected node count to be ready...")
+        wait_for_node_count_to_reach_status(node_count=node_count, node_type=node_type)
+        ceph_health_check()
+
+    @tier4b
+    @bugzilla("1754287")
+    @pytest.mark.polarion_id("OCS-2015")
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_rolling_nodes_restart(self, cluster_type, nodes, node_type):
+        """
+        Test restart nodes one after the other and check health status in between
+
+        """
+        node_count = len(get_nodes(node_type))
+        ocp_nodes = get_nodes(node_type=node_type)
+
+        for node in ocp_nodes:
+            nodes.restart_nodes(nodes=[node])
+            wait_for_node_count_to_reach_status(
+                node_count=node_count, node_type=node_type
+            )
+            ceph_health_check(tries=40)
+
+    @tier4a
+    @polarion_id("OCS-4482")
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_node_maintenance_restart(self, cluster_type, nodes, node_type):
+        """
+        - Mark as unschedulable and drain 1 worker node in the provider cluster
+        - Restart the node
+        - Mark the node as schedulable
+
+        """
+        typed_nodes = get_nodes(node_type=node_type)
+        assert typed_nodes, f"Failed to find a {node_type} node."
+        typed_node = random.choice(typed_nodes)
+        typed_node_name = typed_node.name
+
+        # Get the current reboot events from the node
+        reboot_events_cmd = (
+            f"get events -A --field-selector involvedObject.name="
+            f"{typed_node_name},reason=Rebooted -o yaml"
+        )
+
+        # Find the number of reboot events in the node
+        num_events = len(typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"])
+
+        # Unschedule and drain the node
+        drain_nodes([typed_node_name])
+        # Wait for the node to be unschedulable
+        wait_for_nodes_status(
+            node_names=[typed_node_name],
+            status=constants.NODE_READY_SCHEDULING_DISABLED,
+        )
+
+        # Restart the node
+        nodes.restart_nodes(nodes=[typed_node], wait=False)
+
+        # Verify that the node restarted
+        try:
+            wait_for_nodes_status(
+                node_names=[typed_node_name],
+                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
+                timeout=180,
+                sleep=5,
+            )
+        except ResourceWrongStatusException:
+            # Sometimes, the node will be back to running state quickly so
+            # that the status change won't be detected. Verify the node was
+            # actually restarted by checking the reboot events count
+            new_num_events = len(
+                typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"]
+            )
+            assert new_num_events > num_events, (
+                f"Reboot event not found. Node {typed_node_name} did not restart."
+            )
+
+        # Wait for the node to be Ready
+        wait_for_nodes_status(
+            node_names=[typed_node_name],
+            status=constants.NODE_READY_SCHEDULING_DISABLED,
+            timeout=720,
+            sleep=20,
+        )
+
+        # Mark the node as schedulable
+        schedule_nodes([typed_node_name])
+
+        self.sanity_helpers.health_check()
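
Usage note (illustrative, not part of the diff): a minimal sketch of how the new bare metal node helpers are expected to be driven, assuming config.AUTH["ibmcloud"] carries api_key/account_id and ENV_DATA carries the region added in the conf change above. The node name "compute-0" is hypothetical.

    from ocs_ci.ocs.node import get_node_objs
    from ocs_ci.ocs.platform_nodes import PlatformNodesFactory

    # With ENV_DATA["platform"] set to "hci_baremetal", the factory should
    # resolve to the new IBMCloudBMNodes class added to its class map.
    nodes = PlatformNodesFactory().get_nodes_platform()

    # Power-cycle one worker; under the hood this maps the node's internal IP
    # to an IBM Cloud machine and runs "ibmcloud sl hardware power-off/power-on".
    worker = get_node_objs(["compute-0"])  # hypothetical node name
    nodes.restart_nodes_by_stop_and_start(nodes=worker, wait=True)

The wrapper logs in with the API key only when an ibmcloud command fails with a login-related error, so no explicit login call is needed in test code.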