Commit

Address review comments
Signed-off-by: Mahesh Shetty <[email protected]>
Mahesh Shetty authored and Mahesh Shetty committed Dec 20, 2024
1 parent 8e29845 commit eb601b1
Showing 5 changed files with 70 additions and 34 deletions.
13 changes: 8 additions & 5 deletions ocs_ci/helpers/stretchcluster_helper.py
@@ -141,7 +141,6 @@ def check_errors_regex(desc_out, err_msgs):
replica_count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
- break

elif (
app_label.split("=")[1] in str(pod.get_labels())
@@ -164,7 +163,6 @@ def check_errors_regex(desc_out, err_msgs):
count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
- break

elif (
app_label.split("=")[1] in str(pod.get_labels())
@@ -187,7 +185,7 @@ def check_errors_regex(desc_out, err_msgs):
replica_count=2,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
-     break
+ break

# fetch workload pod details now and make sure all of them are running
logger.info("Checking if the logwriter pods are up and running now")
@@ -261,6 +259,7 @@ def check_errors_regex(desc_out, err_msgs):

if check_errors_regex(desc_out, error_messages) and not restarted:

+ logger.info(f"{pod.name} description:\n{desc_out}")
pod_node = get_pod_node(pod)
logger.info(
f"We need to restart all the nodes in the zone of node {pod_node.name}"
@@ -295,7 +294,11 @@ def check_errors_regex(desc_out, err_msgs):
"because of known errors and no nodes restart was done."
"Please check..."
)
- raise Exception
+ raise Exception(
+     "Raising exception because none of the pods are failing"
+     "because of known errors and no nodes restart was done."
+     "Please check..."
+ )

# fetch workload pod details now and make sure all of them are running
logger.info("Checking if the logwriter pods are up and running now")
@@ -319,4 +322,4 @@ def recover_from_ceph_stuck(sc_obj):
"""

sc_obj.reset_conn_score()
- return sc_obj.check_ceph_accessibility(timeout=30)
+ return sc_obj.check_ceph_accessibility(timeout=120)
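Taken together, the stretchcluster_helper.py hunks change the pod-recovery loop so that it breaks once, after whichever if/elif branch scaled the affected workload back up, rather than breaking inside individual branches, and they replace the bare `raise Exception` with a descriptive message. Below is a minimal, self-contained sketch of that control flow; the label value, the print statements standing in for the real replica-count helpers, and the for/else placement of the raise are illustrative assumptions, not the module's actual code:

```python
def handle_first_matching_pod(pod_labels, app_label="app=logwriter-workload"):
    """Illustrative only: recover the first pod that matches a known workload label."""
    for labels in pod_labels:
        if "logwriter" in labels:
            print("scale the logwriter deployment back up")
        elif app_label.split("=")[1] in labels:
            print("scale the matching workload back to its expected replica count")
        else:
            continue
        # Single break after the if/elif chain (the new behaviour): stop scanning
        # the remaining pods once any branch has handled one.
        break
    else:
        # Nothing matched a known failure pattern; fail loudly with a message,
        # mirroring the switch from a bare `raise Exception` to a descriptive one.
        raise Exception(
            "None of the pods are failing because of known errors "
            "and no nodes restart was done. Please check..."
        )


handle_first_matching_pod(["app=unrelated", "app=logwriter-workload"])
```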
2 changes: 2 additions & 0 deletions ocs_ci/ocs/cnv/virtual_machine.py
@@ -646,6 +646,8 @@ def delete(self):
"""
Delete the VirtualMachine
"""
+ if self.ready():
+     self.stop()
if self.secret_obj:
self.secret_obj.delete()
self.vm_ocp_obj.delete(resource_name=self._vm_name)
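The virtual_machine.py change makes `delete()` stop a VM that is still ready before its secret and backing objects are removed. A small self-contained sketch of that teardown ordering, with a stub class standing in for the real `VirtualMachine` API (only `ready()`, `stop()`, and `delete()` are modelled here):

```python
class FakeVM:
    """Stand-in for ocs_ci's VirtualMachine; only the teardown ordering matters."""

    def __init__(self):
        self.running = True

    def ready(self):
        return self.running

    def stop(self):
        print("stopping the running VM before deletion")
        self.running = False

    def delete(self):
        # Mirrors the patched delete(): stop first if the VM is still ready,
        # then remove the secret and the VirtualMachine object itself.
        if self.ready():
            self.stop()
        print("deleting secret and VirtualMachine resources")


FakeVM().delete()
```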
10 changes: 7 additions & 3 deletions ocs_ci/ocs/resources/stretchcluster.py
@@ -284,6 +284,7 @@ def check_for_write_pause(self, label, start_time, end_time):
failed += 1
elif failed <= max_fail_expected:
failed += 1
+ break
else:
raise

@@ -468,20 +469,23 @@ def check_ceph_accessibility(self, timeout, delay=60, grace=180):
ceph_out = ceph_tools_pod.exec_sh_cmd_on_pod(
command=command, timeout=timeout + grace
)
- logger.info(ceph_out)
+ logger.info(f"Ceph status output:\n{ceph_out}")
if "monclient(hunting): authenticate timed out" in ceph_out:
logger.warning("Ceph was hung for some time.")
return False
return True
except Exception as err:
- if "TimeoutExpired" in err.args[0]:
+ if (
+     "TimeoutExpired" in err.args[0]
+     or "monclient(hunting): authenticate timed out" in err.args[0]
+ ):
logger.error("Ceph status check timed out. Maybe Ceph is hung.")
return False
elif (
"connect: no route to host" in err.args[0]
or "error dialing backend" in err.args[0]
):
- ceph_tools_pod.delete(wait=False)
+ ceph_tools_pod.delete(force=True)
raise

def get_out_of_quorum_nodes(self):
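The widened exception handling in `check_ceph_accessibility` distinguishes two failure modes: output or errors indicating that Ceph itself is hung (a timeout, or the `monclient(hunting): authenticate timed out` message) make the method report inaccessibility, while errors indicating the ceph-tools pod is unreachable (`connect: no route to host`, `error dialing backend`) now force-delete the tools pod and re-raise. A rough, self-contained sketch of that triage, not the real method:

```python
def classify_ceph_check_failure(err_text: str) -> str:
    """Map an error string from the ceph status command to a recovery action."""
    if (
        "TimeoutExpired" in err_text
        or "monclient(hunting): authenticate timed out" in err_text
    ):
        # Ceph itself looks hung: report inaccessibility rather than erroring out.
        return "report-ceph-inaccessible"
    if (
        "connect: no route to host" in err_text
        or "error dialing backend" in err_text
    ):
        # The tools pod (or its node) is unreachable: force-delete the pod so a
        # fresh one gets scheduled, then let the original error propagate.
        return "force-delete-tools-pod-and-reraise"
    return "unhandled"


print(classify_ceph_check_failure("monclient(hunting): authenticate timed out"))
```

Reporting inaccessibility (returning False) for the hung case lets the caller, such as `recover_from_ceph_stuck` with its timeout raised from 30 to 120 seconds, decide how to proceed instead of failing on the spot.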
28 changes: 23 additions & 5 deletions (file name not shown)
@@ -20,13 +20,17 @@
wait_for_pods_to_be_in_statuses,
get_deployment_name,
wait_for_pods_by_label_count,
+ get_all_pods,
+ get_pod_node,
)
from ocs_ci.ocs.resources.pvc import get_pvc_objs
from ocs_ci.ocs.resources.stretchcluster import StretchCluster
from ocs_ci.ocs import constants

logger = logging.getLogger(__name__)

+ CNV_WORKLOAD_NAMESPACE = "namespace-cnv-workload"


@pytest.fixture(scope="class")
def setup_logwriter_workloads(
@@ -141,7 +145,9 @@ def finalizer():
def setup_cnv_workload(request, cnv_workload_class, setup_cnv):

logger.info("Setting up CNV workload and creating some data")
- vm_obj = cnv_workload_class(volume_interface=constants.VM_VOLUME_PVC)[0]
+ vm_obj = cnv_workload_class(
+     volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
+ )[0]
vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

@@ -198,8 +204,14 @@ def test_single_mon_failures(self):
logger.info("testing single mon failures scenario")
sc_obj = StretchCluster()

- # get mon-pod of a single zone
- mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
+ # get mon-pod of a zone where the cnv workloads
+ # are running
+ pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+ assert len(pod_objs) != 0, "No vmi pod instances are running"
+ node_obj = get_pod_node(pod_objs[0])
+ mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(
+     node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+ )
mon_pod_to_fail = random.choice(mon_pods_in_zone).name

# get the deployment of the mon-pod
@@ -267,8 +279,14 @@ def test_single_osd_failure(self):
logger.info("testing single osd failure scenarios")
sc_obj = StretchCluster()

- # get osd-pod of a single zone
- osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")
+ # get osd-pod of a zone where the cnv
+ # workloads are running
+ pod_objs = get_all_pods(namespace=CNV_WORKLOAD_NAMESPACE)
+ assert len(pod_objs) != 0, "No vmi pod instances are running"
+ node_obj = get_pod_node(pod_objs[0])
+ osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone(
+     node_obj.get()["metadata"]["labels"][constants.ZONE_LABEL]
+ )
osd_pod_to_fail = random.choice(osd_pods_in_zone).name

# get the deployment of the osd-pod
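Both tests now derive the zone to attack from wherever the CNV workload actually landed instead of hard-coding `"data-1"`: take any pod from the CNV workload namespace, resolve its node, read the node's zone label, and pick a mon/OSD pod from that zone, so the induced failure sits on the VM's own I/O path. A self-contained sketch of that lookup, with plain dictionaries standing in for the Kubernetes objects and an assumed value for `constants.ZONE_LABEL`:

```python
import random

ZONE_LABEL = "topology.kubernetes.io/zone"  # assumed value of constants.ZONE_LABEL

# Dict stand-ins for what get_all_pods / get_pod_node / get_mon_pods_in_a_zone return.
nodes = {
    "worker-1": {"metadata": {"labels": {ZONE_LABEL: "data-1"}}},
    "worker-2": {"metadata": {"labels": {ZONE_LABEL: "data-2"}}},
}
cnv_pods = [{"name": "virt-launcher-vm-0", "node": "worker-2"}]
mon_pods_by_zone = {"data-1": ["rook-ceph-mon-a"], "data-2": ["rook-ceph-mon-b"]}

assert len(cnv_pods) != 0, "No vmi pod instances are running"
node_name = cnv_pods[0]["node"]
zone = nodes[node_name]["metadata"]["labels"][ZONE_LABEL]
mon_pod_to_fail = random.choice(mon_pods_by_zone[zone])
print(f"Failing {mon_pod_to_fail} in zone {zone}, the zone hosting the CNV workload")
```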
51 changes: 30 additions & 21 deletions tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
@@ -15,7 +15,7 @@
)

from ocs_ci.ocs.resources.stretchcluster import StretchCluster
- from ocs_ci.ocs.exceptions import CephHealthException
+ from ocs_ci.ocs.exceptions import CephHealthException, CommandFailed

from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_all_nodes
@@ -26,6 +26,7 @@
wait_for_pods_to_be_in_statuses,
get_ceph_tools_pod,
)
+ from ocs_ci.utility.retry import retry

logger = logging.getLogger(__name__)

@@ -185,29 +186,37 @@ def test_netsplit(
logger.info(f"Ended netsplit at {end_time}")

# check vm data written before the failure for integrity
- logger.info("Waiting for VM SSH connectivity!")
- vm_obj.wait_for_ssh_connectivity()
- md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
- assert (
-     md5sum_before == md5sum_after
- ), "Data integrity of the file inside VM is not maintained during the failure"
- logger.info(
-     "Data integrity of the file inside VM is maintained during the failure"
- )
-
- # check if new data can be created
- vm_obj.run_ssh_cmd(
-     command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
- )
- logger.info("Successfully created new data inside VM")
-
- # check if the data can be copied back to local machine
- vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
- logger.info("VM data is successfully copied back to local machine")
-
- # stop the VM
- vm_obj.stop()
- logger.info("Stopped the VM successfully")
+ @retry(CommandFailed, tries=10, delay=10, backoff=1)
+ def _validate_vm_workload():
+     """
+     Validate vm workload post recovery
+     """
+     logger.info("Waiting for VM SSH connectivity!")
+     vm_obj.wait_for_ssh_connectivity()
+     md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
+     assert (
+         md5sum_before == md5sum_after
+     ), "Data integrity of the file inside VM is not maintained during the failure"
+     logger.info(
+         "Data integrity of the file inside VM is maintained during the failure"
+     )
+
+     # check if new data can be created
+     vm_obj.run_ssh_cmd(
+         command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
+     )
+     logger.info("Successfully created new data inside VM")
+
+     # check if the data can be copied back to local machine
+     vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
+     logger.info("VM data is successfully copied back to local machine")
+
+     # stop the VM
+     vm_obj.stop()
+     logger.info("Stopped the VM successfully")
+
+ _validate_vm_workload()

# get all the running logwriter pods
sc_obj.get_logwriter_reader_pods(
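`_validate_vm_workload` is wrapped in `@retry(CommandFailed, tries=10, delay=10, backoff=1)`, so a transient SSH or exec failure while the VM is still recovering simply re-runs the whole validation instead of failing the test immediately (with `backoff=1` the 10-second delay stays constant across attempts). The sketch below is a minimal retry decorator written for illustration only, not ocs_ci's actual `ocs_ci.utility.retry.retry` implementation:

```python
import functools
import time


class CommandFailed(Exception):
    """Stand-in for ocs_ci.ocs.exceptions.CommandFailed."""


def retry(exception, tries=3, delay=1, backoff=1):
    """Re-run the wrapped callable while `exception` is raised, sleeping `delay`
    seconds between attempts and multiplying the delay by `backoff` each time."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            wait, remaining = delay, tries
            while remaining > 1:
                try:
                    return func(*args, **kwargs)
                except exception:
                    time.sleep(wait)
                    wait *= backoff
                    remaining -= 1
            # Final attempt: let any exception propagate to the caller.
            return func(*args, **kwargs)

        return wrapper

    return decorator


attempts = {"count": 0}


@retry(CommandFailed, tries=3, delay=1, backoff=1)
def flaky_validation():
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise CommandFailed("ssh not ready yet")
    print(f"validation succeeded on attempt {attempts['count']}")


flaky_validation()
```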
