diff --git a/ocs_ci/helpers/stretchcluster_helper.py b/ocs_ci/helpers/stretchcluster_helper.py index 3e745511f70..4dede44b723 100644 --- a/ocs_ci/helpers/stretchcluster_helper.py +++ b/ocs_ci/helpers/stretchcluster_helper.py @@ -141,7 +141,6 @@ def check_errors_regex(desc_out, err_msgs): replica_count=4, namespace=constants.STRETCH_CLUSTER_NAMESPACE, ) - break elif ( app_label.split("=")[1] in str(pod.get_labels()) @@ -164,7 +163,6 @@ def check_errors_regex(desc_out, err_msgs): count=4, namespace=constants.STRETCH_CLUSTER_NAMESPACE, ) - break elif ( app_label.split("=")[1] in str(pod.get_labels()) @@ -187,7 +185,7 @@ def check_errors_regex(desc_out, err_msgs): replica_count=2, namespace=constants.STRETCH_CLUSTER_NAMESPACE, ) - break + break # fetch workload pod details now and make sure all of them are running logger.info("Checking if the logwriter pods are up and running now") @@ -261,6 +259,7 @@ def check_errors_regex(desc_out, err_msgs): if check_errors_regex(desc_out, error_messages) and not restarted: + logger.info(f"{pod.name} description:\n{desc_out}") pod_node = get_pod_node(pod) logger.info( f"We need to restart the all the nodes in the zone of node {pod_node.name}" @@ -295,7 +294,11 @@ def check_errors_regex(desc_out, err_msgs): "because of known errors and no nodes restart was done." "Please check..." ) - raise Exception + raise Exception( + "Raising exception because none of the pods are failing" + "because of known errors and no nodes restart was done." + "Please check..." + ) # fetch workload pod details now and make sure all of them are running logger.info("Checking if the logwriter pods are up and running now") @@ -319,4 +322,4 @@ def recover_from_ceph_stuck(sc_obj): """ sc_obj.reset_conn_score() - return sc_obj.check_ceph_accessibility(timeout=30) + return sc_obj.check_ceph_accessibility(timeout=120) diff --git a/ocs_ci/ocs/resources/stretchcluster.py b/ocs_ci/ocs/resources/stretchcluster.py index e1100cfedbf..5a204e7ce94 100644 --- a/ocs_ci/ocs/resources/stretchcluster.py +++ b/ocs_ci/ocs/resources/stretchcluster.py @@ -284,6 +284,7 @@ def check_for_write_pause(self, label, start_time, end_time): failed += 1 elif failed <= max_fail_expected: failed += 1 + break else: raise @@ -468,20 +469,23 @@ def check_ceph_accessibility(self, timeout, delay=60, grace=180): ceph_out = ceph_tools_pod.exec_sh_cmd_on_pod( command=command, timeout=timeout + grace ) - logger.info(ceph_out) + logger.info(f"Ceph status output:\n{ceph_out}") if "monclient(hunting): authenticate timed out" in ceph_out: logger.warning("Ceph was hung for sometime.") return False return True except Exception as err: - if "TimeoutExpired" in err.args[0]: + if ( + "TimeoutExpired" in err.args[0] + or "monclient(hunting): authenticate timed out" in err.args[0] + ): logger.error("Ceph status check got timed out. maybe ceph is hung.") return False elif ( "connect: no route to host" in err.args[0] or "error dialing backend" in err.args[0] ): - ceph_tools_pod.delete(wait=False) + ceph_tools_pod.delete(force=True) raise def get_out_of_quorum_nodes(self): diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py index 4adb635addc..08ff761150b 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_mon_osd_failures.py @@ -20,6 +20,8 @@ wait_for_pods_to_be_in_statuses, get_deployment_name, wait_for_pods_by_label_count, + get_all_pods, + get_pod_node, ) from ocs_ci.ocs.resources.pvc import get_pvc_objs from ocs_ci.ocs.resources.stretchcluster import StretchCluster @@ -27,6 +29,8 @@ logger = logging.getLogger(__name__) +CNV_WORKLOAD_NAMESPACE = "namespace-cnv-workload" + @pytest.fixture(scope="class") def setup_logwriter_workloads( @@ -141,7 +145,9 @@ def finalizer(): def setup_cnv_workload(request, cnv_workload_class, setup_cnv): logger.info("Setting up CNV workload and creating some data") - vm_obj = cnv_workload_class(volume_interface=constants.VM_VOLUME_PVC)[0] + vm_obj = cnv_workload_class( + volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE + )[0] vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400") md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt") @@ -198,8 +204,14 @@ def test_single_mon_failures(self): logger.info("testing single mon failures scenario") sc_obj = StretchCluster() - # get mon-pod of a single zone - mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1") + # get mon-pod of a zone where the cnv workloads + # are running + pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE) + assert len(pod_objs) != 0, "No vmi pod instances are running" + node_obj = get_pod_node(pod_objs[0]) + mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone( + node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL] + ) mon_pod_to_fail = random.choice(mon_pods_in_zone).name # get the deployment of the mon-pod @@ -267,8 +279,14 @@ def test_single_osd_failure(self): logger.info("testing single osd failure scenarios") sc_obj = StretchCluster() - # get osd-pod of a single zone - osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1") + # get osd-pod of a zone where the cnv + # workloads are running + pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE) + assert len(pod_objs) != 0, "No vmi pod instances are running" + node_obj = get_pod_node(pod_objs[0]) + osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone( + node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL] + ) osd_pod_to_fail = random.choice(osd_pods_in_zone).name # get the deployment of the osd-pod