diff --git a/ocs_ci/ocs/resources/stretchcluster.py b/ocs_ci/ocs/resources/stretchcluster.py
index e83a9f531b35..7946e60bcff9 100644
--- a/ocs_ci/ocs/resources/stretchcluster.py
+++ b/ocs_ci/ocs/resources/stretchcluster.py
@@ -202,28 +202,31 @@ def check_for_read_pause(self, label, start_time, end_time):

         """
         paused = 0
+        max_fail_expected = len(self.workload_map[label][0]) - 2
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
+            try:
+                pause_count = 0
+                time_var = start_time
+                pod_log = get_pod_logs(
+                    pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
                 )
-                continue
-            pause_count = 0
-            time_var = start_time
-            pod_log = get_pod_logs(
-                pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
-            )
-            logger.info(f"Current pod: {pod_obj.name}")
-            while time_var <= (end_time + timedelta(minutes=1)):
-                t_time = time_var.strftime("%H:%M")
-                if f" {t_time}" not in pod_log:
-                    pause_count += 1
-                    logger.info(f"Read pause: {t_time}")
+                logger.info(f"Current pod: {pod_obj.name}")
+                while time_var <= (end_time + timedelta(minutes=1)):
+                    t_time = time_var.strftime("%H:%M")
+                    if f" {t_time}" not in pod_log:
+                        pause_count += 1
+                        logger.info(f"Read pause: {t_time}")
+                    else:
+                        logger.info(f"Read success: {t_time}")
+                    time_var = time_var + timedelta(minutes=1)
+                if pause_count > 5:
+                    paused += 1
+            except CommandFailed:
+                if failed <= max_fail_expected:
+                    failed += 1
                 else:
-                    logger.info(f"Read success: {t_time}")
-                time_var = time_var + timedelta(minutes=1)
-            if pause_count > 5:
-                paused += 1
+                    raise
         return paused

     @retry(CommandFailed, tries=6, delay=10)
@@ -241,13 +244,14 @@ def check_for_write_pause(self, label, start_time, end_time):

         """
         paused = 0
+        max_fail_expected = (
+            len(self.workload_map[label][0]) - 2
+            if label == constants.LOGWRITER_CEPHFS_LABEL
+            else 1
+        )
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
-                )
-                continue
-            excepted = 0
+            no_such_file_expected = 1
             for file_name in self.logfile_map[label][0]:
                 pause_count = 0
                 try:
@@ -269,13 +273,16 @@ def check_for_write_pause(self, label, start_time, end_time):
                         "No such file or directory" in err.args[0]
                         and label == constants.LOGWRITER_RBD_LABEL
                     ):
-                        if excepted == 0:
+                        if no_such_file_expected == 1:
                             logger.info(
                                 f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
                             )
-                            excepted += 1
+                            no_such_file_expected += 1
                         else:
                             raise UnexpectedBehaviour
+                        failed += 1
+                    elif failed <= max_fail_expected:
+                        failed += 1
                     else:
                         raise

@@ -437,7 +444,7 @@ def check_for_data_loss(self, label):
         return True

     @retry(CommandFailed, tries=15, delay=5)
-    def check_ceph_accessibility(self, timeout, delay=5, grace=120):
+    def check_ceph_accessibility(self, timeout, delay=60, grace=180):
         """
         Check for ceph access for the 'timeout' seconds

@@ -469,7 +476,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
             if "TimeoutExpired" in err.args[0]:
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
-            elif "connect: no route to host" in err.args[0]:
+            elif (
+                "connect: no route to host" in err.args[0]
+                or "error dialing backend" in err.args[0]
+            ):
                 ceph_tools_pod.delete(wait=False)
             raise

@@ -634,7 +644,7 @@ def cephfs_failure_checks(
             self.check_for_read_pause(
                 constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
             )
-            == 0
+            <= 2
         ), "Read operations are paused for CephFS workloads even for the ones in available zones"

         logger.info("All read operations are successful for CephFs workload")
@@ -653,7 +663,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
            self.check_for_write_pause(
                 constants.LOGWRITER_RBD_LABEL,
                 start_time,
                 end_time,
             )
-            == 0
+            <= 1
         ), "Write operations paused for RBD workloads even for the ones in available zone"
         logger.info("all write operations are successful for RBD workloads")
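[Note: the reworked `check_for_read_pause` above drops the zone-based skipping in favor of a per-pod try/except, and its core detection is a minute-by-minute scan of the pod log for ` HH:MM` stamps: a pod counts as paused only when more than five minutes in the window leave no trace in its log. Below is a minimal, self-contained sketch of that scan, assuming logwriter-style lines that embed an `HH:MM` timestamp; `count_missing_minutes` and the sample log are illustrative, not ocs-ci code.]

```python
from datetime import datetime, timedelta


def count_missing_minutes(pod_log: str, start_time: datetime, end_time: datetime) -> int:
    """Count minutes in [start_time, end_time + 1m] that never appear in the log."""
    missing = 0
    time_var = start_time
    while time_var <= end_time + timedelta(minutes=1):
        stamp = time_var.strftime("%H:%M")
        # A minute with no " HH:MM" hit means no log line was written then,
        # i.e. the reader was paused for that minute.
        if f" {stamp}" not in pod_log:
            missing += 1
        time_var += timedelta(minutes=1)
    return missing


if __name__ == "__main__":
    log = "... 10:00 read ok\n... 10:02 read ok\n"
    start = datetime(2024, 1, 1, 10, 0)
    end = datetime(2024, 1, 1, 10, 2)
    # 10:01 and the 10:03 grace minute are absent from the log, so this prints 2.
    print(count_missing_minutes(log, start, end))
```

The same tolerance idea drives `max_fail_expected`: with pods spread across zones, up to `len(pods) - 2` of them may be unreachable (`CommandFailed`) during a netsplit without failing the check, which is also why the callers now accept `<= 2` paused CephFS readers and `<= 1` paused RBD writer instead of demanding zero.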
maybe ceph is hung.") return False - elif "connect: no route to host" in err.args[0]: + elif ( + "connect: no route to host" in err.args[0] + or "error dialing backend" in err.args[0] + ): ceph_tools_pod.delete(wait=False) raise @@ -634,7 +644,7 @@ def cephfs_failure_checks( self.check_for_read_pause( constants.LOGREADER_CEPHFS_LABEL, start_time, end_time ) - == 0 + <= 2 ), "Read operations are paused for CephFS workloads even for the ones in available zones" logger.info("All read operations are successful for CephFs workload") @@ -653,7 +663,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs): start_time, end_time, ) - == 0 + <= 1 ), "Write operations paused for RBD workloads even for the ones in available zone" logger.info("all write operations are successful for RBD workloads") diff --git a/tests/conftest.py b/tests/conftest.py index 3d510b2a11e9..b24bd1384188 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8576,8 +8576,8 @@ def setup_cnv(request): cnv_obj = CNVInstaller() installed = False if not cnv_obj.post_install_verification(): - cnv_obj.deploy_cnv(check_cnv_deployed=False, check_cnv_ready=False) installed = True + cnv_obj.deploy_cnv() def finalizer(): """ diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py index 8576bc54f38a..545a713df44a 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py @@ -136,8 +136,47 @@ def finalizer(): request.addfinalizer(finalizer) +@pytest.fixture(scope="class") +def setup_cnv_workload(request, cnv_workload, setup_cnv): + + logger.info("Setting up CNV workload and creating some data") + vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)[0] + vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400") + md5sum_before = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt") + + def finalizer(): + + # check vm data written before the failure for integrity + logger.info("Waiting for VM SSH connectivity!") + vm_obj.wait_for_ssh_connectivity() + md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt") + assert ( + md5sum_before == md5sum_after + ), "Data integrity of the file inside VM is not maintained during the failure" + logger.info( + "Data integrity of the file inside VM is maintained during the failure" + ) + + # check if new data can be created + vm_obj.run_ssh_cmd( + command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600" + ) + logger.info("Successfully created new data inside VM") + + # check if the data can be copied back to local machine + vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt") + logger.info("VM data is successfully copied back to local machine") + + # stop the VM + vm_obj.stop() + logger.info("Stoped the VM successfully") + + request.addfinalizer(finalizer) + + @turquoise_squad @stretchcluster_required +@pytest.mark.usefixtures("setup_cnv_workload") @pytest.mark.usefixtures("setup_logwriter_workloads") class TestMonAndOSDFailures: """ diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py index db4a51de01ce..e572f13792d1 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py @@ -84,14 +84,15 @@ def finalizer(): @pytest.mark.parametrize( argnames="zones, duration", 
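[Note: `setup_cnv_workload` follows a capture-then-verify fixture pattern: record a checksum during setup, let the test body induce mon/OSD failures, and assert integrity in the finalizer, which pytest runs at teardown even when the test itself fails. A trimmed-down sketch of the pattern, assuming plain pytest; `FakeVM` stands in for the SSH-backed VM handle and is purely illustrative.]

```python
import hashlib

import pytest


class FakeVM:
    """Stand-in for the SSH-backed VM handle used by the real fixture."""

    def __init__(self):
        self.files = {}

    def write(self, name, data):
        self.files[name] = data

    def md5(self, name):
        return hashlib.md5(self.files[name]).hexdigest()


@pytest.fixture
def vm_with_checksum(request):
    vm = FakeVM()
    # data written before the failure window (mirrors the dd + md5sum steps)
    vm.write("/file_1.txt", b"\x00" * 1024)
    md5_before = vm.md5("/file_1.txt")

    def finalizer():
        # runs at teardown, after the test body has induced (and recovered
        # from) the failure; the checksum must be unchanged
        assert vm.md5("/file_1.txt") == md5_before, "VM data integrity lost"

    request.addfinalizer(finalizer)
    return vm


def test_survives_failure(vm_with_checksum):
    # the real test fails mons/OSDs here; new writes must still succeed
    vm_with_checksum.write("/file_2.txt", b"\x00" * 2048)
```

Scoping the real fixture to the class means one VM serves every test in `TestMonAndOSDFailures`, and the integrity check runs once, after the last failure scenario has completed.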
diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
index db4a51de01ce..e572f13792d1 100644
--- a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
+++ b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
@@ -84,14 +84,15 @@ def finalizer():
     @pytest.mark.parametrize(
         argnames="zones, duration",
         argvalues=[
-            # pytest.param(
-            #     constants.NETSPLIT_DATA_1_DATA_2,
-            #     30,
-            #     marks=[
-            #         pytest.mark.polarion_id("OCS-5069"),
-            #         pytest.mark.polarion_id("OCS-5071"),
-            #     ],
-            # ),
+            pytest.param(
+                constants.NETSPLIT_DATA_1_DATA_2,
+                30,
+                marks=[
+                    pytest.mark.polarion_id("OCS-5069"),
+                    pytest.mark.polarion_id("OCS-5071"),
+                    pytest.mark.bugzilla("2265992"),
+                ],
+            ),
             pytest.param(
                 constants.NETSPLIT_ARBITER_DATA_1,
                 15,
@@ -118,7 +119,7 @@ def finalizer():
             ),
         ],
         ids=[
-            # "Data-1-Data-2",
+            "Data-1-Data-2",
            "Arbiter-Data-1",
            "Arbiter-Data-1-and-Arbiter-Data-2",
            "Arbiter-Data-1-and-Data-1-Data-2",
@@ -195,17 +196,9 @@ def test_netsplit(
         )
         logger.info(f"Netsplit induced at {start_time} for zones {zones}")

-        # get the nodes which are present in the
-        # out of quorum zone
-        if (
-            zones != constants.NETSPLIT_ARBITER_DATA_1
-            and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
-        ):
-            sc_obj.get_out_of_quorum_nodes()
-
         # check for ceph accessibility and note the end time (UTC)
         timeout = (end_time - datetime.now(timezone.utc)).total_seconds()
-        if not sc_obj.check_ceph_accessibility(timeout=timeout):
+        if not sc_obj.check_ceph_accessibility(timeout=int(timeout)):
             assert recover_from_ceph_stuck(
                 sc_obj
             ), "Something went wrong. not expected. please check rook-ceph logs"
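[Note: the `timeout=int(timeout)` cast matters because `timedelta.total_seconds()` returns a float, and the remaining window is likely interpolated into a command string downstream, where a value like `1799.999988` is fragile. A small sketch of the computation, standard library only; `remaining_seconds` is an illustrative helper, not ocs-ci code.]

```python
from datetime import datetime, timedelta, timezone


def remaining_seconds(end_time: datetime) -> int:
    """Whole seconds left until end_time (UTC), floored at zero."""
    left = (end_time - datetime.now(timezone.utc)).total_seconds()  # float
    return max(int(left), 0)


if __name__ == "__main__":
    deadline = datetime.now(timezone.utc) + timedelta(minutes=30)
    print(remaining_seconds(deadline))  # e.g. 1799, safe to pass as `timeout`
```

Flooring at zero also guards the edge case where the netsplit window has already elapsed by the time the accessibility check starts, so a negative duration never reaches the shell.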