diff --git a/ocs_ci/helpers/stretchcluster_helper.py b/ocs_ci/helpers/stretchcluster_helper.py
index 3e745511f70..4dede44b723 100644
--- a/ocs_ci/helpers/stretchcluster_helper.py
+++ b/ocs_ci/helpers/stretchcluster_helper.py
@@ -141,7 +141,6 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -164,7 +163,6 @@ def check_errors_regex(desc_out, err_msgs):
                 count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -187,7 +185,7 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=2,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
+        break

     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -261,6 +259,7 @@ def check_errors_regex(desc_out, err_msgs):

         if check_errors_regex(desc_out, error_messages) and not restarted:
+            logger.info(f"{pod.name} description:\n{desc_out}")
             pod_node = get_pod_node(pod)
             logger.info(
                 f"We need to restart the all the nodes in the zone of node {pod_node.name}"
@@ -295,7 +294,11 @@ def check_errors_regex(desc_out, err_msgs):
             "because of known errors and no nodes restart was done."
             "Please check..."
         )
-        raise Exception
+        raise Exception(
+            "Raising exception because none of the pods are failing "
+            "because of known errors and no nodes restart was done. "
+            "Please check..."
+        )

     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -319,4 +322,4 @@ def recover_from_ceph_stuck(sc_obj):

     """
     sc_obj.reset_conn_score()
-    return sc_obj.check_ceph_accessibility(timeout=30)
+    return sc_obj.check_ceph_accessibility(timeout=120)
diff --git a/ocs_ci/ocs/cnv/virtual_machine.py b/ocs_ci/ocs/cnv/virtual_machine.py
index bcf003f65a1..044dd4a4aba 100644
--- a/ocs_ci/ocs/cnv/virtual_machine.py
+++ b/ocs_ci/ocs/cnv/virtual_machine.py
@@ -646,6 +646,9 @@ def delete(self):
         """
         Delete the VirtualMachine
         """
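+        # Stop the VM first if it is still running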
+        if self.ready():
+            self.stop()
         if self.secret_obj:
             self.secret_obj.delete()
         self.vm_ocp_obj.delete(resource_name=self._vm_name)
diff --git a/ocs_ci/ocs/resources/stretchcluster.py b/ocs_ci/ocs/resources/stretchcluster.py
index e1100cfedbf..5a204e7ce94 100644
--- a/ocs_ci/ocs/resources/stretchcluster.py
+++ b/ocs_ci/ocs/resources/stretchcluster.py
@@ -284,6 +284,7 @@ def check_for_write_pause(self, label, start_time, end_time):
                     failed += 1
                 elif failed <= max_fail_expected:
                     failed += 1
+                    break
                 else:
                     raise
@@ -468,20 +469,24 @@ def check_ceph_accessibility(self, timeout, delay=60, grace=180):
             ceph_out = ceph_tools_pod.exec_sh_cmd_on_pod(
                 command=command, timeout=timeout + grace
             )
-            logger.info(ceph_out)
+            logger.info(f"Ceph status output:\n{ceph_out}")
             if "monclient(hunting): authenticate timed out" in ceph_out:
                 logger.warning("Ceph was hung for sometime.")
                 return False
             return True
         except Exception as err:
-            if "TimeoutExpired" in err.args[0]:
+            if (
+                "TimeoutExpired" in err.args[0]
+                or "monclient(hunting): authenticate timed out" in err.args[0]
+            ):
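+                # Treat a mon authentication timeout the same as a command timeout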
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
             elif (
                 "connect: no route to host" in err.args[0]
                 or "error dialing backend" in err.args[0]
             ):
-                ceph_tools_pod.delete(wait=False)
+                ceph_tools_pod.delete(force=True)
             raise

     def get_out_of_quorum_nodes(self):
diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py
index 4adb635addc..08ff761150b 100644
--- a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py
+++ b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py
@@ -20,6 +20,8 @@
     wait_for_pods_to_be_in_statuses,
     get_deployment_name,
     wait_for_pods_by_label_count,
+    get_all_pods,
+    get_pod_node,
 )
 from ocs_ci.ocs.resources.pvc import get_pvc_objs
 from ocs_ci.ocs.resources.stretchcluster import StretchCluster
@@ -27,6 +29,8 @@

 logger = logging.getLogger(__name__)

+CNV_WORKLOAD_NAMESPACE = "namespace-cnv-workload"
+

 @pytest.fixture(scope="class")
 def setup_logwriter_workloads(
@@ -141,7 +145,9 @@ def finalizer():
 def setup_cnv_workload(request, cnv_workload_class, setup_cnv):

     logger.info("Setting up CNV workload and creating some data")
-    vm_obj = cnv_workload_class(volume_interface=constants.VM_VOLUME_PVC)[0]
+    vm_obj = cnv_workload_class(
+        volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
+    )[0]
     vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
     md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
@@ -198,8 +204,14 @@ def test_single_mon_failures(self):
         logger.info("testing single mon failures scenario")
         sc_obj = StretchCluster()

-        # get mon-pod of a single zone
-        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
+        # get mon-pod of a zone where the cnv workloads
+        # are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No VMI pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         mon_pod_to_fail = random.choice(mon_pods_in_zone).name

         # get the deployment of the mon-pod
@@ -267,8 +279,14 @@ def test_single_osd_failure(self):
         logger.info("testing single osd failure scenarios")
         sc_obj = StretchCluster()

-        # get osd-pod of a single zone
-        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")
+        # get osd-pod of a zone where the cnv
+        # workloads are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No VMI pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         osd_pod_to_fail = random.choice(osd_pods_in_zone).name

         # get the deployment of the osd-pod
diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
index 5cbf9563c0b..c2c77b92562 100644
--- a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
+++ b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
@@ -15,7 +15,7 @@
 )

 from ocs_ci.ocs.resources.stretchcluster import StretchCluster
-from ocs_ci.ocs.exceptions import CephHealthException
+from ocs_ci.ocs.exceptions import CephHealthException, CommandFailed
 from ocs_ci.ocs import constants
 from ocs_ci.ocs.node import get_all_nodes
@@ -26,6 +26,7 @@
     wait_for_pods_to_be_in_statuses,
     get_ceph_tools_pod,
 )
+from ocs_ci.utility.retry import retry

 logger = logging.getLogger(__name__)

@@ -185,29 +186,37 @@ def test_netsplit(
         logger.info(f"Ended netsplit at {end_time}")

         # check vm data written before the failure for integrity
-        logger.info("Waiting for VM SSH connectivity!")
-        vm_obj.wait_for_ssh_connectivity()
-        md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
-        assert (
-            md5sum_before == md5sum_after
-        ), "Data integrity of the file inside VM is not maintained during the failure"
-        logger.info(
-            "Data integrity of the file inside VM is maintained during the failure"
-        )
+        @retry(CommandFailed, tries=10, delay=10, backoff=1)
+        def _validate_vm_workload():
+            """
+            Validate VM workload post recovery

-        # check if new data can be created
-        vm_obj.run_ssh_cmd(
-            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
-        )
-        logger.info("Successfully created new data inside VM")
+            """
+            logger.info("Waiting for VM SSH connectivity!")
+            vm_obj.wait_for_ssh_connectivity()
+            md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
+            assert (
+                md5sum_before == md5sum_after
+            ), "Data integrity of the file inside VM is not maintained during the failure"
+            logger.info(
+                "Data integrity of the file inside VM is maintained during the failure"
+            )
+
+            # check if new data can be created
+            vm_obj.run_ssh_cmd(
+                command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
+            )
+            logger.info("Successfully created new data inside VM")
+
+            # check if the data can be copied back to local machine
+            vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
+            logger.info("VM data is successfully copied back to local machine")

-        # check if the data can be copied back to local machine
-        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
-        logger.info("VM data is successfully copied back to local machine")
+            # stop the VM
+            vm_obj.stop()
+            logger.info("Stopped the VM successfully")

-        # stop the VM
-        vm_obj.stop()
-        logger.info("Stoped the VM successfully")
+        _validate_vm_workload()

         # get all the running logwriter pods
         sc_obj.get_logwriter_reader_pods(