Address review comments
Signed-off-by: Mahesh Shetty <[email protected]>
Mahesh Shetty authored and Mahesh Shetty committed Dec 19, 2024
1 parent 335d044 commit 07af854
Showing 3 changed files with 38 additions and 13 deletions.
13 changes: 8 additions & 5 deletions ocs_ci/helpers/stretchcluster_helper.py
@@ -141,7 +141,6 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
 
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -164,7 +163,6 @@ def check_errors_regex(desc_out, err_msgs):
                 count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
 
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -187,7 +185,7 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=2,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
+        break
 
     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -261,6 +259,7 @@ def check_errors_regex(desc_out, err_msgs):
 
         if check_errors_regex(desc_out, error_messages) and not restarted:
 
+            logger.info(f"{pod.name} description:\n{desc_out}")
             pod_node = get_pod_node(pod)
             logger.info(
                 f"We need to restart the all the nodes in the zone of node {pod_node.name}"
@@ -295,7 +294,11 @@ def check_errors_regex(desc_out, err_msgs):
                 "because of known errors and no nodes restart was done."
                 "Please check..."
             )
-            raise Exception
+            raise Exception(
+                "Raising exception because none of the pods are failing"
+                "because of known errors and no nodes restart was done."
+                "Please check..."
+            )
 
     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -319,4 +322,4 @@ def recover_from_ceph_stuck(sc_obj):
     """
 
     sc_obj.reset_conn_score()
-    return sc_obj.check_ceph_accessibility(timeout=30)
+    return sc_obj.check_ceph_accessibility(timeout=120)
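For context, recover_from_ceph_stuck() resets the mon connection scores and then re-checks Ceph accessibility, now waiting up to 120 seconds instead of 30. A hedged usage sketch (the import paths match the files in this diff; the assertion message is illustrative):

from ocs_ci.helpers.stretchcluster_helper import recover_from_ceph_stuck
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

sc_obj = StretchCluster()
# reset_conn_score() clears stale mon connection scores, then the helper
# re-runs the ceph status check with the longer 120s timeout
assert recover_from_ceph_stuck(sc_obj), "Ceph still inaccessible after conn-score reset"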
10 changes: 7 additions & 3 deletions ocs_ci/ocs/resources/stretchcluster.py
@@ -284,6 +284,7 @@ def check_for_write_pause(self, label, start_time, end_time):
                     failed += 1
                 elif failed <= max_fail_expected:
                     failed += 1
+                    break
                 else:
                     raise
 
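As I read this hunk, the added break makes check_for_write_pause stop scanning further entries once a tolerated failure has been counted, instead of continuing the loop. A self-contained sketch of that tolerate-then-stop pattern (all names here are illustrative, not the module's API):

def run_with_tolerance(checks, max_fail_expected=2):
    """Run callables in order, absorbing up to max_fail_expected failures."""
    failed = 0
    for check in checks:
        try:
            check()
        except Exception:
            if failed <= max_fail_expected:
                failed += 1
                break  # absorb this failure and stop scanning, as in the diff
            raise
    return failed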
@@ -468,20 +469,23 @@ def check_ceph_accessibility(self, timeout, delay=60, grace=180):
             ceph_out = ceph_tools_pod.exec_sh_cmd_on_pod(
                 command=command, timeout=timeout + grace
             )
-            logger.info(ceph_out)
+            logger.info(f"Ceph status output:\n{ceph_out}")
             if "monclient(hunting): authenticate timed out" in ceph_out:
                 logger.warning("Ceph was hung for sometime.")
                 return False
             return True
         except Exception as err:
-            if "TimeoutExpired" in err.args[0]:
+            if (
+                "TimeoutExpired" in err.args[0]
+                or "monclient(hunting): authenticate timed out" in err.args[0]
+            ):
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
             elif (
                 "connect: no route to host" in err.args[0]
                 or "error dialing backend" in err.args[0]
             ):
-                ceph_tools_pod.delete(wait=False)
+                ceph_tools_pod.delete(force=True)
             raise
 
     def get_out_of_quorum_nodes(self):
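The widened condition above now treats both a command timeout and a mon "authenticate timed out" message as Ceph being hung (returning False), while no-route-to-host and dialing errors get the tools pod force-deleted before the exception is re-raised, since a graceful delete can hang when the pod's node is unreachable. A standalone sketch of that classification, using only the message substrings from the diff (the function name and return values are mine):

def classify_ceph_failure(err: Exception) -> str:
    """Map an exception from the ceph status command to a recovery action."""
    text = err.args[0] if err.args else str(err)
    if (
        "TimeoutExpired" in text
        or "monclient(hunting): authenticate timed out" in text
    ):
        return "ceph-hung"  # report Ceph as inaccessible
    if (
        "connect: no route to host" in text
        or "error dialing backend" in text
    ):
        return "tools-pod-unreachable"  # force-delete the tools pod, then re-raise
    return "unknown"  # unexpected failure: let the caller re-raise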
@@ -20,13 +20,17 @@
     wait_for_pods_to_be_in_statuses,
     get_deployment_name,
     wait_for_pods_by_label_count,
+    get_all_pods,
+    get_pod_node,
 )
 from ocs_ci.ocs.resources.pvc import get_pvc_objs
 from ocs_ci.ocs.resources.stretchcluster import StretchCluster
 from ocs_ci.ocs import constants
 
 logger = logging.getLogger(__name__)
 
+CNV_WORKLOAD_NAMESPACE = "namespace-cnv-workload"
+
 
 @pytest.fixture(scope="class")
 def setup_logwriter_workloads(
@@ -141,7 +145,9 @@ def finalizer():
 def setup_cnv_workload(request, cnv_workload_class, setup_cnv):
 
     logger.info("Setting up CNV workload and creating some data")
-    vm_obj = cnv_workload_class(volume_interface=constants.VM_VOLUME_PVC)[0]
+    vm_obj = cnv_workload_class(
+        volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
+    )[0]
     vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
     md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
 
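The fixture writes a 100 MiB file inside the VM (bs=1024 × count=102400) and records its md5sum so tests can check data integrity after a failure is injected. A sketch of the matching post-recovery check, reusing cal_md5sum_vm exactly as it is called above (the surrounding assertion is illustrative):

# after the failure scenario has been recovered:
md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
assert md5sum_before == md5sum_after, (
    "Data written before the failure no longer matches its original checksum"
)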
@@ -198,8 +204,14 @@ def test_single_mon_failures(self):
         logger.info("testing single mon failures scenario")
         sc_obj = StretchCluster()
 
-        # get mon-pod of a single zone
-        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
+        # get mon-pod of a zone where the cnv workloads
+        # are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No vmi pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         mon_pod_to_fail = random.choice(mon_pods_in_zone).name
 
         # get the deployment of the mon-pod
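The zone is derived from the node hosting the first CNV pod via its zone label. A slightly more defensive variant of that lookup (a sketch; it assumes constants.ZONE_LABEL holds the topology zone label key, as the diff implies):

node_labels = node_obj.get()["metadata"]["labels"]
zone = node_labels.get(constants.ZONE_LABEL)
assert zone, f"Node {node_obj.name} carries no zone label"
mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(zone)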
@@ -267,8 +279,14 @@ def test_single_osd_failure(self):
         logger.info("testing single osd failure scenarios")
         sc_obj = StretchCluster()
 
-        # get osd-pod of a single zone
-        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")
+        # get osd-pod of a zone where the cnv
+        # workloads are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No vmi pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         osd_pod_to_fail = random.choice(osd_pods_in_zone).name
 
         # get the deployment of the osd-pod
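The mon and osd tests now share an identical zone-discovery preamble. A hypothetical helper that would deduplicate it, built only from calls already imported in this file (a suggestion, not part of this commit):

def get_cnv_workload_zone():
    """Return the zone of the node running the first CNV workload pod."""
    pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
    assert len(pod_objs) != 0, "No vmi pod instances are running"
    node_obj = get_pod_node(pod_objs[0])
    return node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]

Each test would then pass get_cnv_workload_zone() to get_mon_pods_in_a_zone or get_osd_pods_in_a_zone.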
