Skip to content

Commit

Permalink
Admission webhook: Undo prepare-shutdown calls if last-downscale fail…
Browse files Browse the repository at this point in the history
…s to store (#151)

Fix a snag found in #146 where if the "downscaled" annotation/configmap fails to persist, the scale operation is denied, but the pods are not informed via DELETE that they should no longer shutdown.
  • Loading branch information
seizethedave authored Jun 21, 2024
1 parent f5bef38 commit dddb21d
Show file tree
Hide file tree
Showing 4 changed files with 217 additions and 79 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## main / unreleased

* [ENHANCEMENT] prepare-downscale admission webhook: undo prepare-shutdown calls if adding the `last-downscale` annotation fails. #151

## v0.17.0

* [CHANGE] The docker base images are now based off distroless images rather than Alpine. #149
Expand Down
77 changes: 47 additions & 30 deletions pkg/admission/prep_downscale.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

const (
PrepareDownscaleWebhookPath = "/admission/prepare-downscale"
maxPrepareGoroutines = 32
)

func PrepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api *kubernetes.Clientset, useZoneTracker bool, zoneTrackerConfigMapName string) *v1.AdmissionResponse {
Expand Down Expand Up @@ -154,27 +155,36 @@ func prepareDownscale(ctx context.Context, l log.Logger, ar v1.AdmissionReview,
}
}

// Create a slice of endpoint addresses for pods to send HTTP POST requests to and to fail if any don't return 200
// It's a downscale, so we need to prepare the pods that are going away for shutdown.
eps := createEndpoints(ar, oldInfo, newInfo, port, path)

if err := sendPrepareShutdownRequests(ctx, logger, client, eps); err != nil {
// Down-scale operation is disallowed because a pod failed to prepare for shutdown and cannot be deleted
level.Error(logger).Log("msg", "downscale not allowed due to error", "err", err)
// Down-scale operation is disallowed because at least one pod failed to
// prepare for shutdown and cannot be deleted. We also need to
// un-prepare them all.

level.Error(logger).Log("msg", "downscale not allowed due to host(s) failing to prepare for downscale. unpreparing...", "err", err)
undoPrepareShutdownRequests(ctx, logger, client, eps)

return deny(
"downscale of %s/%s in %s from %d to %d replicas is not allowed because one or more pods failed to prepare for shutdown.",
ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldInfo.replicas, *newInfo.replicas,
)
}

if err := addDownscaledAnnotationToStatefulSet(ctx, api, ar.Request.Namespace, ar.Request.Name); err != nil {
level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation", "err", err)
// Down-scale operation is disallowed because we failed to add the
// annotation to the statefulset. We again need to un-prepare all pods.
level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation. unpreparing...", "err", err)
undoPrepareShutdownRequests(ctx, logger, client, eps)

return deny(
"downscale of %s/%s in %s from %d to %d replicas is not allowed because adding an annotation to the statefulset failed.",
ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldInfo.replicas, *newInfo.replicas,
)
}

// Down-scale operation is allowed because all pods successfully prepared for shutdown
// Otherwise, we've made it through the gauntlet, and the downscale is allowed.
level.Info(logger).Log("msg", "downscale allowed")
return &v1.AdmissionResponse{
Allowed: true,
Expand Down Expand Up @@ -503,20 +513,17 @@ func invokePrepareShutdown(ctx context.Context, method string, parentLogger log.
}

func sendPrepareShutdownRequests(ctx context.Context, logger log.Logger, client httpClient, eps []endpoint) error {
if len(eps) == 0 {
return nil
}

span, ctx := opentracing.StartSpanFromContext(ctx, "admission.sendPrepareShutdownRequests()")
defer span.Finish()

// Attempt to POST to every prepare-shutdown endpoint. If any fail, we'll
// undo them all with a DELETE.
if len(eps) == 0 {
return nil
}

const maxGoroutines = 32
// Attempt to POST to every prepare-shutdown endpoint.

g, ectx := errgroup.WithContext(ctx)
g.SetLimit(maxGoroutines)
g.SetLimit(maxPrepareGoroutines)
for _, ep := range eps {
ep := ep
g.Go(func() error {
Expand All @@ -527,25 +534,35 @@ func sendPrepareShutdownRequests(ctx context.Context, logger log.Logger, client
})
}

err := g.Wait()
if err != nil {
// At least one failed. Undo them all.
level.Warn(logger).Log("msg", "failed to prepare hosts for shutdown. unpreparing...", "err", err)
undoGroup, _ := errgroup.WithContext(ctx)
undoGroup.SetLimit(maxGoroutines)
for _, ep := range eps {
ep := ep
undoGroup.Go(func() error {
if err := invokePrepareShutdown(ctx, http.MethodDelete, logger, client, ep); err != nil {
level.Warn(logger).Log("msg", "failed to undo prepare shutdown request", "url", ep.url, "err", err)
}
return nil
})
}
_ = undoGroup.Wait()
return g.Wait()
}

// undoPrepareShutdownRequests sends an HTTP DELETE to each of the given endpoints.
func undoPrepareShutdownRequests(ctx context.Context, logger log.Logger, client httpClient, eps []endpoint) {
span, ctx := opentracing.StartSpanFromContext(ctx, "admission.undoPrepareShutdownRequests()")
defer span.Finish()

if len(eps) == 0 {
return
}

return err
// Unlike sendPrepareShutdownRequests, we attempt to send each pod a DELETE
// without regard for failures.

undoGroup, _ := errgroup.WithContext(ctx)
undoGroup.SetLimit(maxPrepareGoroutines)
for _, ep := range eps {
ep := ep
undoGroup.Go(func() error {
if err := invokePrepareShutdown(ctx, http.MethodDelete, logger, client, ep); err != nil {
level.Warn(logger).Log("msg", "failed to undo prepare shutdown request", "url", ep.url, "err", err)
// (We swallow the error so all of the deletes are attempted.)
}
return nil
})
}

_ = undoGroup.Wait()
}

var tenantResolver spanlogger.TenantResolver = noTenantResolver{}
Expand Down
Loading

0 comments on commit dddb21d

Please sign in to comment.