Skip to content

Commit

Permalink
Merge pull request #343 from ldoktor/reliability
Browse files Browse the repository at this point in the history
CI: Improve e2e tests reliability
  • Loading branch information
wainersm authored Feb 6, 2024
2 parents ee79ae5 + 2d78020 commit ed63a3d
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ccruntime_e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
if [ $RUNNING_INSTANCE = "s390x" ]; then
args=""
fi
./run-local.sh -r "${{ matrix.runtimeclass }}" "${args}"
./run-local.sh -t -r "${{ matrix.runtimeclass }}" "${args}"
env:
RUNNING_INSTANCE: ${{ matrix.instance }}

Expand Down
49 changes: 24 additions & 25 deletions tests/e2e/operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ build_operator () {
# so it's better to check it before adding the target repo.
local sd="$(git config --global --get safe.directory ${project_dir} || true)"
if [ "${sd}" == "" ]; then
echo "Add repo ${project_dir} to git's safe.directory"
echo "::debug:: Add repo ${project_dir} to git's safe.directory"
git config --global --add safe.directory "${project_dir}"
else
echo "Repo ${project_dir} already in git's safe.directory"
echo "::debug:: Repo ${project_dir} already in git's safe.directory"
fi

pushd "$project_dir" >/dev/null
Expand Down Expand Up @@ -67,9 +67,9 @@ build_pre_install_img() {
handle_older_containerd() {
command -v containerd >/dev/null || return
local version=$(containerd -v | awk '{ print $3 }' | sed 's/^v//')
echo "system's containerd version: $version"
echo "::debug:: system's containerd version: $version"
if [[ "$version" =~ ^1.6 || "$version" =~ ^1.5 ]]; then
echo "Old system's containerd ($version). Configuring the operator to install a newer one"
echo "::warning:: Old system's containerd ($version). Configuring the operator to install a newer one"
pushd "$project_dir" >/dev/null
for kfile in $(find config/ -name "kustomization.yaml" \
-exec grep -l INSTALL_OFFICIAL_CONTAINERD {} \;);do
Expand Down Expand Up @@ -104,10 +104,10 @@ install_operator() {
local cmd="kubectl get pods -n "$op_ns" --no-headers |"
cmd+="egrep -q ${controller_pod}.*'\<Running\>'"
if ! wait_for_process 120 10 "$cmd"; then
echo "ERROR: ${controller_pod} pod is not running"
echo "::error:: ${controller_pod} pod is not running"

local pod_id="$(get_pods_regex $controller_pod $op_ns)"
echo "DEBUG: Pod $pod_id"
echo "::debug:: Pod $pod_id"
debug_pod "$pod_id" "$op_ns"

return 1
Expand Down Expand Up @@ -135,10 +135,10 @@ install_ccruntime() {
cmd="kubectl get pods -n "$op_ns" --no-headers |"
cmd+="egrep -q ${pod}.*'\<Running\>'"
if ! wait_for_process 600 30 "$cmd"; then
echo "ERROR: $pod pod is not running"
echo "::error:: $pod pod is not running"

local pod_id="$(get_pods_regex $pod $op_ns)"
echo "DEBUG: Pod $pod_id"
echo "::debug:: Pod $pod_id"
debug_pod "$pod_id" "$op_ns"

return 1
Expand All @@ -149,7 +149,7 @@ install_ccruntime() {
# There could be a case where it is not even if the pods above are running.
cmd="kubectl get runtimeclass | grep -q ${runtimeclass}"
if ! wait_for_process 300 30 "$cmd"; then
echo "ERROR: runtimeclass ${runtimeclass} is not up"
echo "::error:: runtimeclass ${runtimeclass} is not up"
return 1
fi
# To keep operator running, we should resume registry stopped during containerd restart.
Expand All @@ -164,12 +164,12 @@ uninstall_ccruntime() {
popd >/dev/null

# Wait and ensure ccruntime pods are gone
#
local cmd="! sudo -E kubectl get pods -n $op_ns |"
cmd+="grep -q -e cc-operator-daemon-install"
# (ensure failing kubectl keeps iterating)
local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')"
cmd+=" && ! echo \$_OUT | grep -q -e cc-operator-daemon-install"
cmd+=" -e cc-operator-pre-install-daemon"
if ! wait_for_process 720 30 "$cmd"; then
echo "ERROR: there are ccruntime pods still running"
echo "::error:: there are ccruntime pods still running"
echo "::group::Describe pods from $op_ns namespace"
kubectl -n "$op_ns" describe pods || true
echo "::endgroup::"
Expand All @@ -183,7 +183,7 @@ uninstall_ccruntime() {
# Labels should be gone
if kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}' | \
grep -q -e cc-preinstall -e katacontainers.io; then
echo "ERROR: there are labels left behind"
echo "::error:: there are labels left behind"
kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}'

return 1
Expand All @@ -207,7 +207,7 @@ kustomization_set_image() {
# and this can introduce false-positive on the tests. So let's check the old image really
# exist.
if ! grep -q "name: ${old}$" ./kustomization.yaml; then
echo "ERROR: expected image ${old} in ${overlay_dir}/kustomization.yaml"
echo "::error:: expected image ${old} in ${overlay_dir}/kustomization.yaml"
return 1
fi

Expand Down Expand Up @@ -242,15 +242,14 @@ uninstall_operator() {
popd >/dev/null

# Wait and ensure the controller pod is gone
#
local pod="cc-operator-controller-manager"
local cmd="! kubectl get pods -n $op_ns |"
cmd+="grep -q $pod"
# (ensure failing kubectl keeps iterating)
local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')"
cmd+="&& ! echo \$_OUT | grep -q -e cc-operator-controller-manager"
if ! wait_for_process 180 30 "$cmd"; then
echo "ERROR: the controller manager is still running"
echo "::error:: the controller manager is still running"

local pod_id="$(get_pods_regex $pod $op_ns)"
echo "DEBUG: Pod $pod_id"
echo "::debug:: Pod $pod_id"
debug_pod "$pod_id" "$op_ns"

return 1
Expand All @@ -269,7 +268,7 @@ wait_for_stabilization() {

while read -r pod container restart_count; do
if [ "${restart_counts[$pod-$container]--1}" != "$restart_count" ]; then
echo "DEBUG: Pod: $pod, Container: $container, Restart count: $restart_count"
echo "::debug:: Pod: $pod, Container: $container, Restart count: $restart_count"
restart_counts["$pod-$container"]=$restart_count
change=1
fi
Expand All @@ -278,10 +277,10 @@ wait_for_stabilization() {
[ $change -eq 0 ] && ((iteration+=1))

if [ $iteration -gt 3 ]; then
echo "INFO: No new restarts in 3x21s, proceeding..."
echo "::info:: No new restarts in 3x21s, proceeding..."
break
elif [ $count -gt 20 ]; then
echo "ERROR: Pods are still restarting after 20x21s, bailing out!"
echo "::error:: Pods are still restarting after 20x21s, bailing out!"
return 1
fi

Expand Down Expand Up @@ -335,7 +334,7 @@ main() {
wait_for_stabilization
;;
*)
echo "Unknown command '$1'"
echo "::error:: Unknown command '$1'"
usage && exit 1
esac
fi
Expand Down
3 changes: 3 additions & 0 deletions tests/e2e/operator_tests.bats
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
load "${BATS_TEST_DIRNAME}/lib.sh"
test_tag="[cc][operator]"

# Set 10m timeout for each test
export BATS_TEST_TIMEOUT=600

is_operator_installed() {
[ "$(kubectl get deployment -n "$ns" --no-headers 2>/dev/null | wc -l)" \
-gt 0 ]
Expand Down
47 changes: 31 additions & 16 deletions tests/e2e/run-local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ step_start_cluster=0
step_install_operator=0
runtimeclass=""
undo="false"
timeout="false"

usage() {
cat <<-EOF
Expand All @@ -29,44 +30,58 @@ usage() {
the tests. Defaults to "kata-qemu".
-u: undo the installation and configuration before exiting. Useful for
baremetal machine were it needs to do clean up for the next tests.
-t: enable default timeout for each operation (useful for CI)
EOF
}

parse_args() {
while getopts "hr:u" opt; do
while getopts "hr:ut" opt; do
case $opt in
h) usage && exit 0;;
r) runtimeclass="$OPTARG";;
u) undo="true";;
t) timeout="true";;
*) usage && exit 1;;
esac
done
}

run() {
duration=$1; shift
if [ "$timeout" == "true" ]; then
timeout $duration "$@"
else
"$@"
fi
}

undo_changes() {
pushd "$script_dir" >/dev/null
# Do not try to undo steps that did not execute.
if [ $step_install_operator -eq 1 ]; then
echo "INFO: Uninstall the operator"
sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true
echo "::info:: Uninstall the operator"
run 10m sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true
fi

if [ $step_start_cluster -eq 1 ]; then
echo "INFO: Shutdown the cluster"
sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true
echo "::info:: Shutdown the cluster"
run 5m sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true
fi

if [ $step_bootstrap_env -eq 1 ]; then
echo "INFO: Undo the bootstrap"
ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true
echo "::info:: Undo the bootstrap"
run 5m ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true
fi
popd >/dev/null
}

on_exit() {
RET="$?"
if [ "$undo" == "true" ]; then
[ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET, starting the cleanup..."
undo_changes
fi
[ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET" || echo "::info:: Testing passed"
}

trap on_exit EXIT
Expand All @@ -78,28 +93,28 @@ main() {

# Check Ansible is installed.
if ! command -v ansible-playbook >/dev/null; then
echo "ERROR: ansible-playbook is required to run this script."
echo "::error:: ansible-playbook is required to run this script."
exit 1
fi

export "PATH=$PATH:/usr/local/bin"

pushd "$script_dir" >/dev/null
echo "INFO: Bootstrap the local machine"
echo "::info:: Bootstrap the local machine"
step_bootstrap_env=1
ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml
run 10m ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml

echo "INFO: Bring up the test cluster"
echo "::info:: Bring up the test cluster"
step_start_cluster=1
sudo -E PATH="$PATH" bash -c './cluster/up.sh'
run 10m sudo -E PATH="$PATH" bash -c './cluster/up.sh'
export KUBECONFIG=/etc/kubernetes/admin.conf

echo "INFO: Build and install the operator"
echo "::info:: Build and install the operator"
step_install_operator=1
sudo -E PATH="$PATH" bash -c './operator.sh'
run 20m sudo -E PATH="$PATH" bash -c './operator.sh'

echo "INFO: Run tests"
cmd="sudo -E PATH=\"$PATH\" bash -c "
echo "::info:: Run tests"
cmd="run 20m sudo -E PATH=\"$PATH\" bash -c "
if [ -z "$runtimeclass" ]; then
cmd+="'./tests_runner.sh'"
else
Expand Down

0 comments on commit ed63a3d

Please sign in to comment.