Address review comments
Signed-off-by: Mahesh Shetty <[email protected]>
Mahesh Shetty authored and Mahesh Shetty committed Dec 19, 2024
1 parent 335d044 commit 07af854
Showing 3 changed files with 38 additions and 13 deletions.
13 changes: 8 additions & 5 deletions ocs_ci/helpers/stretchcluster_helper.py
@@ -141,7 +141,6 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
 
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -164,7 +163,6 @@ def check_errors_regex(desc_out, err_msgs):
                 count=4,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
 
         elif (
             app_label.split("=")[1] in str(pod.get_labels())
@@ -187,7 +185,7 @@ def check_errors_regex(desc_out, err_msgs):
                 replica_count=2,
                 namespace=constants.STRETCH_CLUSTER_NAMESPACE,
             )
-            break
+        break
 
     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -261,6 +259,7 @@ def check_errors_regex(desc_out, err_msgs):
 
         if check_errors_regex(desc_out, error_messages) and not restarted:
 
+            logger.info(f"{pod.name} description:\n{desc_out}")
             pod_node = get_pod_node(pod)
             logger.info(
                 f"We need to restart the all the nodes in the zone of node {pod_node.name}"
@@ -295,7 +294,11 @@ def check_errors_regex(desc_out, err_msgs):
                 "because of known errors and no nodes restart was done."
                 "Please check..."
             )
-            raise Exception
+            raise Exception(
+                "Raising exception because none of the pods are failing"
+                "because of known errors and no nodes restart was done."
+                "Please check..."
+            )
 
     # fetch workload pod details now and make sure all of them are running
     logger.info("Checking if the logwriter pods are up and running now")
@@ -319,4 +322,4 @@ def recover_from_ceph_stuck(sc_obj):
     """
 
     sc_obj.reset_conn_score()
-    return sc_obj.check_ceph_accessibility(timeout=30)
+    return sc_obj.check_ceph_accessibility(timeout=120)
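For context, recover_from_ceph_stuck() resets the mon connection scores and then re-checks Ceph accessibility, now waiting up to 120 seconds instead of 30. A hedged usage sketch (the import paths match the files in this diff; the assertion message is illustrative):

from ocs_ci.helpers.stretchcluster_helper import recover_from_ceph_stuck
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

sc_obj = StretchCluster()
# reset_conn_score() clears stale mon connection scores, then the helper
# re-runs the ceph status check with the longer 120s timeout
assert recover_from_ceph_stuck(sc_obj), "Ceph still inaccessible after conn-score reset"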
10 changes: 7 additions & 3 deletions ocs_ci/ocs/resources/stretchcluster.py
@@ -284,6 +284,7 @@ def check_for_write_pause(self, label, start_time, end_time):
                     failed += 1
                 elif failed <= max_fail_expected:
                     failed += 1
+                    break
                 else:
                     raise
 
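As I read this hunk, the added break makes check_for_write_pause stop scanning further entries once a tolerated failure has been counted, instead of continuing the loop. A self-contained sketch of that tolerate-then-stop pattern (all names here are illustrative, not the module's API):

def run_with_tolerance(checks, max_fail_expected=2):
    """Run callables in order, absorbing up to max_fail_expected failures."""
    failed = 0
    for check in checks:
        try:
            check()
        except Exception:
            if failed <= max_fail_expected:
                failed += 1
                break  # absorb this failure and stop scanning, as in the diff
            raise
    return failed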
@@ -468,20 +469,23 @@ def check_ceph_accessibility(self, timeout, delay=60, grace=180):
             ceph_out = ceph_tools_pod.exec_sh_cmd_on_pod(
                 command=command, timeout=timeout + grace
             )
-            logger.info(ceph_out)
+            logger.info(f"Ceph status output:\n{ceph_out}")
             if "monclient(hunting): authenticate timed out" in ceph_out:
                 logger.warning("Ceph was hung for sometime.")
                 return False
             return True
         except Exception as err:
-            if "TimeoutExpired" in err.args[0]:
+            if (
+                "TimeoutExpired" in err.args[0]
+                or "monclient(hunting): authenticate timed out" in err.args[0]
+            ):
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
             elif (
                 "connect: no route to host" in err.args[0]
                 or "error dialing backend" in err.args[0]
             ):
-                ceph_tools_pod.delete(wait=False)
+                ceph_tools_pod.delete(force=True)
             raise
 
     def get_out_of_quorum_nodes(self):
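The widened condition above now treats both a command timeout and a mon "authenticate timed out" message as Ceph being hung (returning False), while no-route-to-host and dialing errors get the tools pod force-deleted before the exception is re-raised, since a graceful delete can hang when the pod's node is unreachable. A standalone sketch of that classification, using only the message substrings from the diff (the function name and return values are mine):

def classify_ceph_failure(err: Exception) -> str:
    """Map an exception from the ceph status command to a recovery action."""
    text = err.args[0] if err.args else str(err)
    if (
        "TimeoutExpired" in text
        or "monclient(hunting): authenticate timed out" in text
    ):
        return "ceph-hung"  # report Ceph as inaccessible
    if (
        "connect: no route to host" in text
        or "error dialing backend" in text
    ):
        return "tools-pod-unreachable"  # force-delete the tools pod, then re-raise
    return "unknown"  # unexpected failure: let the caller re-raise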
@@ -20,13 +20,17 @@
     wait_for_pods_to_be_in_statuses,
     get_deployment_name,
     wait_for_pods_by_label_count,
+    get_all_pods,
+    get_pod_node,
 )
 from ocs_ci.ocs.resources.pvc import get_pvc_objs
 from ocs_ci.ocs.resources.stretchcluster import StretchCluster
 from ocs_ci.ocs import constants
 
 logger = logging.getLogger(__name__)
 
+CNV_WORKLOAD_NAMESPACE = "namespace-cnv-workload"
+
 
 @pytest.fixture(scope="class")
 def setup_logwriter_workloads(
@@ -141,7 +145,9 @@ def finalizer():
 def setup_cnv_workload(request, cnv_workload_class, setup_cnv):
 
     logger.info("Setting up CNV workload and creating some data")
-    vm_obj = cnv_workload_class(volume_interface=constants.VM_VOLUME_PVC)[0]
+    vm_obj = cnv_workload_class(
+        volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
+    )[0]
     vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
     md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
 
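The fixture writes a 100 MiB file inside the VM (bs=1024 × count=102400) and records its md5sum so tests can check data integrity after a failure is injected. A sketch of the matching post-recovery check, reusing cal_md5sum_vm exactly as it is called above (the surrounding assertion is illustrative):

# after the failure scenario has been recovered:
md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
assert md5sum_before == md5sum_after, (
    "Data written before the failure no longer matches its original checksum"
)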
@@ -198,8 +204,14 @@ def test_single_mon_failures(self):
         logger.info("testing single mon failures scenario")
         sc_obj = StretchCluster()
 
-        # get mon-pod of a single zone
-        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
+        # get mon-pod of a zone where the cnv workloads
+        # are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No vmi pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         mon_pod_to_fail = random.choice(mon_pods_in_zone).name
 
         # get the deployment of the mon-pod
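The zone is derived from the node hosting the first CNV pod via its zone label. A slightly more defensive variant of that lookup (a sketch; it assumes constants.ZONE_LABEL holds the topology zone label key, as the diff implies):

node_labels = node_obj.get()["metadata"]["labels"]
zone = node_labels.get(constants.ZONE_LABEL)
assert zone, f"Node {node_obj.name} carries no zone label"
mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(zone)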
@@ -267,8 +279,14 @@ def test_single_osd_failure(self):
         logger.info("testing single osd failure scenarios")
         sc_obj = StretchCluster()
 
-        # get osd-pod of a single zone
-        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")
+        # get osd-pod of a zone where the cnv
+        # workloads are running
+        pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+        assert len(pod_objs) != 0, "No vmi pod instances are running"
+        node_obj = get_pod_node(pod_objs[0])
+        osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone(
+            node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+        )
         osd_pod_to_fail = random.choice(osd_pods_in_zone).name
 
         # get the deployment of the osd-pod
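The mon and osd tests now share an identical zone-discovery preamble. A hypothetical helper that would deduplicate it, built only from calls already imported in this file (a suggestion, not part of this commit):

def get_cnv_workload_zone():
    """Return the zone of the node running the first CNV workload pod."""
    pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
    assert len(pod_objs) != 0, "No vmi pod instances are running"
    node_obj = get_pod_node(pod_objs[0])
    return node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]

Each test would then pass get_cnv_workload_zone() to get_mon_pods_in_a_zone or get_osd_pods_in_a_zone.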
