Add cnv workload to mon & osd failure test
Signed-off-by: Mahesh Shetty <[email protected]>
Mahesh Shetty authored and Mahesh Shetty committed Nov 21, 2024
1 parent fd19eca · commit 813fc52
Showing 4 changed files with 92 additions and 50 deletions.
ocs_ci/ocs/resources/stretchcluster.py (41 additions & 31 deletions)
```diff
@@ -202,28 +202,31 @@ def check_for_read_pause(self, label, start_time, end_time):
         """
         paused = 0
+        max_fail_expected = len(self.workload_map[label][0]) - 2
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
-                )
-                continue
-            pause_count = 0
-            time_var = start_time
-            pod_log = get_pod_logs(
-                pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
-            )
-            logger.info(f"Current pod: {pod_obj.name}")
-            while time_var <= (end_time + timedelta(minutes=1)):
-                t_time = time_var.strftime("%H:%M")
-                if f" {t_time}" not in pod_log:
-                    pause_count += 1
-                    logger.info(f"Read pause: {t_time}")
-                else:
-                    logger.info(f"Read success: {t_time}")
-                time_var = time_var + timedelta(minutes=1)
-            if pause_count > 5:
-                paused += 1
+            try:
+                pause_count = 0
+                time_var = start_time
+                pod_log = get_pod_logs(
+                    pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
+                )
+                logger.info(f"Current pod: {pod_obj.name}")
+                while time_var <= (end_time + timedelta(minutes=1)):
+                    t_time = time_var.strftime("%H:%M")
+                    if f" {t_time}" not in pod_log:
+                        pause_count += 1
+                        logger.info(f"Read pause: {t_time}")
+                    else:
+                        logger.info(f"Read success: {t_time}")
+                    time_var = time_var + timedelta(minutes=1)
+                if pause_count > 5:
+                    paused += 1
+            except CommandFailed:
+                if failed <= max_fail_expected:
+                    failed += 1
+                else:
+                    raise
         return paused
 
     @retry(CommandFailed, tries=6, delay=10)
```
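Instead of skipping pods by non-quorum zone, the check now tolerates log-collection failures from up to `len(pods) - 2` pods and re-raises anything beyond that budget. A minimal sketch of the tolerance-counter pattern in isolation (`fetch_logs` is a hypothetical stand-in for `get_pod_logs`, and the pause detection is simplified):

```python
# Sketch only: mirrors the except-branch above, under the assumption that a
# CommandFailed during log collection is expected from pods in the failed zone.
class CommandFailed(Exception):
    pass

def count_paused(pods, fetch_logs, max_fail_expected):
    """Count pods whose logs show a read pause, tolerating a bounded
    number of pods whose logs cannot be fetched at all."""
    paused = failed = 0
    for pod in pods:
        try:
            if fetch_logs(pod).count("Read pause") > 5:
                paused += 1
        except CommandFailed:
            if failed <= max_fail_expected:
                failed += 1   # expected: this pod sits in the failed zone
            else:
                raise         # too many unreachable pods to blame on the failure
    return paused
```

Note that because the bound is checked before the increment, this structure admits `max_fail_expected + 1` failures before re-raising.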
```diff
@@ -241,13 +244,14 @@ def check_for_write_pause(self, label, start_time, end_time):
         """
         paused = 0
+        max_fail_expected = (
+            len(self.workload_map[label][0]) - 2
+            if label == constants.LOGWRITER_CEPHFS_LABEL
+            else 1
+        )
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
-                )
-                continue
-            excepted = 0
+            no_such_file_expected = 1
             for file_name in self.logfile_map[label][0]:
                 pause_count = 0
                 try:
```
```diff
@@ -269,13 +273,16 @@ def check_for_write_pause(self, label, start_time, end_time):
                     if (
                         "No such file or directory" in err.args[0]
                         and label == constants.LOGWRITER_RBD_LABEL
                     ):
-                        if excepted == 0:
+                        if no_such_file_expected == 1:
                             logger.info(
                                 f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
                             )
-                            excepted += 1
+                            no_such_file_expected += 1
                         else:
                             raise UnexpectedBehaviour
+                        failed += 1
+                    elif failed <= max_fail_expected:
+                        failed += 1
                     else:
                         raise
```
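On the RBD path each logwriter file lives on a single pod, so one "No such file or directory" per pod is expected and absorbed; a second one raises UnexpectedBehaviour, and any other CommandFailed is counted against max_fail_expected as before. A compact, hypothetical restatement of that decision table:

```python
# Hypothetical helper restating the except-branch above; not the committed code.
class CommandFailed(Exception):
    pass

class UnexpectedBehaviour(Exception):
    pass

def classify(err_msg, is_rbd, missing_seen, failed, max_fail_expected):
    """Return updated (missing_seen, failed) counters, or raise."""
    if "No such file or directory" in err_msg and is_rbd:
        if missing_seen:
            raise UnexpectedBehaviour  # a second missing logfile is unexplained
        return True, failed + 1        # first miss: the file lives on the other pod
    if failed <= max_fail_expected:
        return missing_seen, failed + 1
    raise CommandFailed(err_msg)       # beyond the tolerated budget
```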

```diff
@@ -437,7 +444,7 @@ def check_for_data_loss(self, label):
         return True
 
     @retry(CommandFailed, tries=15, delay=5)
-    def check_ceph_accessibility(self, timeout, delay=5, grace=120):
+    def check_ceph_accessibility(self, timeout, delay=60, grace=180):
         """
         Check for ceph access for the 'timeout' seconds
```
```diff
@@ -469,7 +476,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
             if "TimeoutExpired" in err.args[0]:
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
-            elif "connect: no route to host" in err.args[0]:
+            elif (
+                "connect: no route to host" in err.args[0]
+                or "error dialing backend" in err.args[0]
+            ):
                 ceph_tools_pod.delete(wait=False)
             raise
```

```diff
@@ -634,7 +644,7 @@ def cephfs_failure_checks(
                 self.check_for_read_pause(
                     constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
                 )
-                == 0
+                <= 2
             ), "Read operations are paused for CephFS workloads even for the ones in available zones"
         logger.info("All read operations are successful for CephFs workload")
```

```diff
@@ -653,7 +663,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
                     start_time,
                     end_time,
                 )
-                == 0
+                <= 1
             ), "Write operations paused for RBD workloads even for the ones in available zone"
         logger.info("all write operations are successful for RBD workloads")
```
tests/conftest.py (1 addition & 1 deletion)
```diff
@@ -8576,8 +8576,8 @@ def setup_cnv(request):
     cnv_obj = CNVInstaller()
     installed = False
     if not cnv_obj.post_install_verification():
-        cnv_obj.deploy_cnv(check_cnv_deployed=False, check_cnv_ready=False)
+        cnv_obj.deploy_cnv()
         installed = True
 
     def finalizer():
         """
```
Third changed file (39 additions & 0 deletions)
```diff
@@ -136,8 +136,47 @@ def finalizer():
     request.addfinalizer(finalizer)
 
 
+@pytest.fixture(scope="class")
+def setup_cnv_workload(request, cnv_workload, setup_cnv):
+
+    logger.info("Setting up CNV workload and creating some data")
+    vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)[0]
+    vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
+    md5sum_before = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
+
+    def finalizer():
+
+        # check vm data written before the failure for integrity
+        logger.info("Waiting for VM SSH connectivity!")
+        vm_obj.wait_for_ssh_connectivity()
+        md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
+        assert (
+            md5sum_before == md5sum_after
+        ), "Data integrity of the file inside VM is not maintained during the failure"
+        logger.info(
+            "Data integrity of the file inside VM is maintained during the failure"
+        )
+
+        # check if new data can be created
+        vm_obj.run_ssh_cmd(
+            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
+        )
+        logger.info("Successfully created new data inside VM")
+
+        # check if the data can be copied back to local machine
+        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
+        logger.info("VM data is successfully copied back to local machine")
+
+        # stop the VM
+        vm_obj.stop()
+        logger.info("Stopped the VM successfully")
+
+    request.addfinalizer(finalizer)
+
+
 @turquoise_squad
 @stretchcluster_required
+@pytest.mark.usefixtures("setup_cnv_workload")
 @pytest.mark.usefixtures("setup_logwriter_workloads")
 class TestMonAndOSDFailures:
     """
```
tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py (11 additions & 18 deletions)
```diff
@@ -84,14 +84,15 @@ def finalizer():
     @pytest.mark.parametrize(
         argnames="zones, duration",
         argvalues=[
-            # pytest.param(
-            #     constants.NETSPLIT_DATA_1_DATA_2,
-            #     30,
-            #     marks=[
-            #         pytest.mark.polarion_id("OCS-5069"),
-            #         pytest.mark.polarion_id("OCS-5071"),
-            #     ],
-            # ),
+            pytest.param(
+                constants.NETSPLIT_DATA_1_DATA_2,
+                30,
+                marks=[
+                    pytest.mark.polarion_id("OCS-5069"),
+                    pytest.mark.polarion_id("OCS-5071"),
+                    pytest.mark.bugzilla("2265992"),
+                ],
+            ),
             pytest.param(
                 constants.NETSPLIT_ARBITER_DATA_1,
                 15,
```
```diff
@@ -118,7 +119,7 @@ def finalizer():
             ),
         ],
         ids=[
-            # "Data-1-Data-2",
+            "Data-1-Data-2",
             "Arbiter-Data-1",
             "Arbiter-Data-1-and-Arbiter-Data-2",
             "Arbiter-Data-1-and-Data-1-Data-2",
```
```diff
@@ -195,17 +196,9 @@ def test_netsplit(
             )
         logger.info(f"Netsplit induced at {start_time} for zones {zones}")
 
-        # get the nodes which are present in the
-        # out of quorum zone
-        if (
-            zones != constants.NETSPLIT_ARBITER_DATA_1
-            and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
-        ):
-            sc_obj.get_out_of_quorum_nodes()
-
         # check for ceph accessibility and note the end time (UTC)
         timeout = (end_time - datetime.now(timezone.utc)).total_seconds()
-        if not sc_obj.check_ceph_accessibility(timeout=timeout):
+        if not sc_obj.check_ceph_accessibility(timeout=int(timeout)):
             assert recover_from_ceph_stuck(
                 sc_obj
             ), "Something went wrong. not expected. please check rook-ceph logs"
```