From 1becce5f5c21839c086b5b697372b5dc79cb31b5 Mon Sep 17 00:00:00 2001
From: David AMSALLEM
Date: Wed, 17 Apr 2024 15:01:43 +0200
Subject: [PATCH] Improve Quiesce management when evicting pods

* Perform a DryRun eviction before Quiescing a node.
* QuiesceUndo when a pod fails to evict.
---
 controllers/aero_info_calls.go | 36 ++++++++++++++++++++++++++++++++++++
 controllers/pod.go             |  8 ++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/controllers/aero_info_calls.go b/controllers/aero_info_calls.go
index c579a2a1a..95e857817 100644
--- a/controllers/aero_info_calls.go
+++ b/controllers/aero_info_calls.go
@@ -14,10 +14,13 @@ limitations under the License.
 package controllers
 
 import (
+	"context"
 	"fmt"
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
+	policyv1beta1 "k8s.io/api/policy/v1beta1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	asdbv1beta1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1beta1"
 	"github.com/aerospike/aerospike-kubernetes-operator/pkg/utils"
@@ -75,6 +78,28 @@ func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady(
 		}
 	}
 
+	/*
+	 * Dry run evictions on pods.
+	 * This is to avoid an unnecessary quiesce if the eviction fails.
+	 * An unquiesce must still be considered in case of error during the real eviction.
+	 */
+	for _, pod := range pods {
+		if err := r.KubeClient.CoreV1().Pods(pod.Namespace).Evict(context.TODO(),
+			&policyv1beta1.Eviction{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      pod.Name,
+					Namespace: pod.Namespace,
+				},
+				DeleteOptions: &metav1.DeleteOptions{
+					DryRun: []string{metav1.DryRunAll},
+				},
+			}); err != nil {
+
+			r.Log.Info(fmt.Sprintf("Not evictable pod %s in ns %s. Won't quiesce and retry in 30sec. Error: %s", pod.Name, pod.Namespace, err.Error()))
+			return reconcileRequeueAfter(30)
+		}
+	}
+
 	if err := r.quiescePods(policy, allHostConns, pods, ignorablePods); err != nil {
 		return reconcileError(err)
 	}
@@ -104,6 +129,17 @@ func (r *SingleClusterReconciler) quiescePods(
 	return deployment.InfoQuiesce(r.Log, policy, allHostConns, selectedHostConns, removedNSes)
 }
 
+// quiesceUndoPods builds the host connection for the given pod and calls
+// deployment.InfoQuiesceUndo on it, returning the error from either step.
+func (r *SingleClusterReconciler) quiesceUndoPods(policy *as.ClientPolicy, pod *corev1.Pod) error {
+	selectedHostConns, err := r.newPodsHostConnWithOption([]corev1.Pod{*pod}, []corev1.Pod{})
+	if err != nil {
+		return err
+	}
+
+	return deployment.InfoQuiesceUndo(r.Log, policy, selectedHostConns)
+}
+
 // TODO: Check only for migration
 func (r *SingleClusterReconciler) waitForClusterStability(
 	policy *as.ClientPolicy, allHostConns []*deployment.HostConn,
diff --git a/controllers/pod.go b/controllers/pod.go
index c974c1f0b..cfad34276 100644
--- a/controllers/pod.go
+++ b/controllers/pod.go
@@ -281,8 +281,12 @@ func (r *SingleClusterReconciler) restartPods(
 					Namespace: pod.Namespace,
 				},
 			}); err != nil {
-			r.Log.Error(err, "Failed to evict pod")
-			return reconcileError(err)
+			r.Log.Error(err, fmt.Sprintf("Not evictable pod %s in ns %s. QuiesceUndo and retry in 30sec. Error: %s", pod.Name, pod.Namespace, err.Error()))
+			// In case of error during the eviction, unquiesce the node since it has been quiesced.
+			// A failed undo is logged rather than dropped; the requeue below retries the restart.
+			if undoErr := r.quiesceUndoPods(r.getClientPolicy(), pod); undoErr != nil {
+				r.Log.Error(undoErr, fmt.Sprintf("Failed to QuiesceUndo pod %s in ns %s", pod.Name, pod.Namespace))
+			}
+			return reconcileRequeueAfter(30)
 		}
 	}
 