Skip to content

Commit

Permalink
Improve Quiesce management when evicting pods
Browse files Browse the repository at this point in the history
* Perform a DryRun eviction before Quiescing a node.
* QuiesceUndo when a pod fails to evict.
  • Loading branch information
David AMSALLEM authored and pgoron committed Apr 19, 2024
1 parent db9b2ed commit 1becce5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 2 deletions.
35 changes: 35 additions & 0 deletions controllers/aero_info_calls.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ limitations under the License.
package controllers

import (
"context"
"fmt"
"time"

corev1 "k8s.io/api/core/v1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

asdbv1beta1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1beta1"
"github.com/aerospike/aerospike-kubernetes-operator/pkg/utils"
Expand Down Expand Up @@ -75,6 +78,28 @@ func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady(
}
}

/*
* Dry run evictions on pods.
* This is to avoid an unnecessary quiesce if the eviction fails.

Check failure on line 83 in controllers/aero_info_calls.go

View workflow job for this annotation

GitHub Actions / lint (.)

`unecessary` is a misspelling of `unnecessary` (misspell)
* An unquiesce must still be considered in case of an error during the real eviction.
*/
for _, pod := range pods {
if err := r.KubeClient.CoreV1().Pods(pod.Namespace).Evict(context.TODO(),
&policyv1beta1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
},
DeleteOptions: &metav1.DeleteOptions{
DryRun: []string{metav1.DryRunAll},
},
}); err != nil {

Check failure on line 96 in controllers/aero_info_calls.go

View workflow job for this annotation

GitHub Actions / lint (.)

unnecessary leading newline (whitespace)

r.Log.Info(fmt.Sprintf("Not evictable pod %s in ns %s. Won't quiesce and retry in 30sec.", pod.Name, pod.Namespace))
return reconcileRequeueAfter(30)
}
}

if err := r.quiescePods(policy, allHostConns, pods, ignorablePods); err != nil {
return reconcileError(err)
}
Expand Down Expand Up @@ -104,6 +129,16 @@ func (r *SingleClusterReconciler) quiescePods(
return deployment.InfoQuiesce(r.Log, policy, allHostConns, selectedHostConns, removedNSes)
}

func (r *SingleClusterReconciler) quiesceUndoPods(policy *as.ClientPolicy, pod *corev1.Pod) error {

Check failure on line 132 in controllers/aero_info_calls.go

View workflow job for this annotation

GitHub Actions / lint (.)

unnecessary leading newline (whitespace)

selectedHostConns, err := r.newPodsHostConnWithOption([]corev1.Pod{*pod}, []corev1.Pod{})
if err != nil {
return err
}

return deployment.InfoQuiesceUndo(r.Log, policy, selectedHostConns)
}

// TODO: Check only for migration
func (r *SingleClusterReconciler) waitForClusterStability(
policy *as.ClientPolicy, allHostConns []*deployment.HostConn,
Expand Down
6 changes: 4 additions & 2 deletions controllers/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,10 @@ func (r *SingleClusterReconciler) restartPods(
Namespace: pod.Namespace,
},
}); err != nil {
r.Log.Error(err, "Failed to evict pod")
return reconcileError(err)
r.Log.Error(err, fmt.Sprintf("Not evictable pod %s in ns %s. QuiesceUndo and retry in 30sec. Error: %s", pod.Name, pod.Namespace, err.Error()))

Check failure on line 284 in controllers/pod.go

View workflow job for this annotation

GitHub Actions / lint (.)

line is 147 characters (lll)
// in case of error during the eviction, unquiesce the node since it has been quiesced.
r.quiesceUndoPods(r.getClientPolicy(), pod)

Check failure on line 286 in controllers/pod.go

View workflow job for this annotation

GitHub Actions / lint (.)

Error return value of `r.quiesceUndoPods` is not checked (errcheck)
return reconcileRequeueAfter(30)
}
}

Expand Down

0 comments on commit 1becce5

Please sign in to comment.