From 3d8dd051b42bdac4d6e30ccc26285064573eac7c Mon Sep 17 00:00:00 2001 From: Joseph Kwasniewski Date: Mon, 18 Sep 2023 13:13:37 -0700 Subject: [PATCH] Analytics Webhook retries --- artifacts/flagger/crd.yaml | 10 +++ charts/flagger/crds/crd.yaml | 10 +++ kustomize/base/flagger/crd.yaml | 10 +++ pkg/apis/flagger/v1beta1/canary.go | 7 ++ pkg/apis/flagger/v1beta1/status.go | 5 ++ .../flagger/v1beta1/zz_generated.deepcopy.go | 17 +++++ pkg/canary/controller.go | 1 + pkg/canary/daemonset_status.go | 5 ++ pkg/canary/deployment_status.go | 5 ++ pkg/canary/service_controller.go | 5 ++ pkg/canary/status.go | 25 +++++++ pkg/controller/scheduler.go | 28 ++++++- pkg/controller/scheduler_deployment_test.go | 74 +++++++++++++++++++ pkg/controller/scheduler_test.go | 13 ++++ 14 files changed, 212 insertions(+), 3 deletions(-) diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index ad4686100..8f50c6c1f 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -1086,6 +1086,16 @@ spec: type: object additionalProperties: type: string + status: + description: CanaryStatus defines the observed state of a canary. + type: object + properties: + retries: + description: Current number of retries the webhook has performed + type: number + retries: + description: Number of retries before incrementing canary failure count + type: number sessionAffinity: description: SessionAffinity represents the session affinity settings for a canary run. type: object diff --git a/charts/flagger/crds/crd.yaml b/charts/flagger/crds/crd.yaml index ad4686100..8f50c6c1f 100644 --- a/charts/flagger/crds/crd.yaml +++ b/charts/flagger/crds/crd.yaml @@ -1086,6 +1086,16 @@ spec: type: object additionalProperties: type: string + status: + description: CanaryStatus defines the observed state of a canary. + type: object + properties: + retries: + description: Current number of retries the webhook has performed + type: number + retries: + description: Number of retries before incrementing canary failure count + type: number sessionAffinity: description: SessionAffinity represents the session affinity settings for a canary run. type: object diff --git a/kustomize/base/flagger/crd.yaml b/kustomize/base/flagger/crd.yaml index ad4686100..8f50c6c1f 100644 --- a/kustomize/base/flagger/crd.yaml +++ b/kustomize/base/flagger/crd.yaml @@ -1086,6 +1086,16 @@ spec: type: object additionalProperties: type: string + status: + description: CanaryStatus defines the observed state of a canary. + type: object + properties: + retries: + description: Current number of retries the webhook has performed + type: number + retries: + description: Number of retries before incrementing canary failure count + type: number sessionAffinity: description: SessionAffinity represents the session affinity settings for a canary run. type: object diff --git a/pkg/apis/flagger/v1beta1/canary.go b/pkg/apis/flagger/v1beta1/canary.go index 40d162d1c..a37e52396 100644 --- a/pkg/apis/flagger/v1beta1/canary.go +++ b/pkg/apis/flagger/v1beta1/canary.go @@ -390,6 +390,13 @@ type CanaryWebhook struct { // Metadata (key-value pairs) for this webhook // +optional Metadata *map[string]string `json:"metadata,omitempty"` + + // Canary Webhook Status + Status CanaryWebhookStatus `json:"status"` + + // Retries for failing webhook before incrementing rollout failure threshold + // +optional + Retries int `json:"retries,omitempty"` } // CanaryWebhookPayload holds the deployment info and metadata sent to webhooks diff --git a/pkg/apis/flagger/v1beta1/status.go b/pkg/apis/flagger/v1beta1/status.go index fd92f08d6..91814c517 100644 --- a/pkg/apis/flagger/v1beta1/status.go +++ b/pkg/apis/flagger/v1beta1/status.go @@ -88,3 +88,8 @@ type CanaryStatus struct { // +optional Conditions []CanaryCondition `json:"conditions,omitempty"` } + +// CanaryStatus is used for state persistence (read-only) +type CanaryWebhookStatus struct { + Retries int `json:"retries"` +} diff --git a/pkg/apis/flagger/v1beta1/zz_generated.deepcopy.go b/pkg/apis/flagger/v1beta1/zz_generated.deepcopy.go index 123da753e..f139ef06d 100644 --- a/pkg/apis/flagger/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/flagger/v1beta1/zz_generated.deepcopy.go @@ -584,6 +584,7 @@ func (in *CanaryWebhook) DeepCopyInto(out *CanaryWebhook) { } } } + out.Status = in.Status return } @@ -620,6 +621,22 @@ func (in *CanaryWebhookPayload) DeepCopy() *CanaryWebhookPayload { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CanaryWebhookStatus) DeepCopyInto(out *CanaryWebhookStatus) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CanaryWebhookStatus. +func (in *CanaryWebhookStatus) DeepCopy() *CanaryWebhookStatus { + if in == nil { + return nil + } + out := new(CanaryWebhookStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CrossNamespaceObjectReference) DeepCopyInto(out *CrossNamespaceObjectReference) { *out = *in diff --git a/pkg/canary/controller.go b/pkg/canary/controller.go index 86241d4ea..72edc5930 100644 --- a/pkg/canary/controller.go +++ b/pkg/canary/controller.go @@ -29,6 +29,7 @@ type Controller interface { SetStatusWeight(canary *flaggerv1.Canary, val int) error SetStatusIterations(canary *flaggerv1.Canary, val int) error SetStatusPhase(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error + SetWebhookStatusRetries(canary *flaggerv1.Canary, webhook int, val int) error Initialize(canary *flaggerv1.Canary) error Promote(canary *flaggerv1.Canary) error HasTargetChanged(canary *flaggerv1.Canary) (bool, error) diff --git a/pkg/canary/daemonset_status.go b/pkg/canary/daemonset_status.go index 746d612b7..55512290b 100644 --- a/pkg/canary/daemonset_status.go +++ b/pkg/canary/daemonset_status.go @@ -71,3 +71,8 @@ func (c *DaemonSetController) SetStatusIterations(cd *flaggerv1.Canary, val int) func (c *DaemonSetController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error { return setStatusPhase(c.flaggerClient, cd, phase) } + +// SetWebhookStatusRetries updates the webhook retries counter +func (c *DaemonSetController) SetWebhookStatusRetries(cd *flaggerv1.Canary, webhook int, val int) error { + return SetWebhookStatusRetries(c.flaggerClient, cd, webhook, val) +} diff --git a/pkg/canary/deployment_status.go b/pkg/canary/deployment_status.go index 46027499d..2ec0a36ba 100644 --- a/pkg/canary/deployment_status.go +++ b/pkg/canary/deployment_status.go @@ -61,3 +61,8 @@ func (c *DeploymentController) SetStatusIterations(cd *flaggerv1.Canary, val int func (c *DeploymentController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error { return setStatusPhase(c.flaggerClient, cd, phase) } + +// SetWebhookStatusRetries updates the webhook retries counter +func (c *DeploymentController) SetWebhookStatusRetries(cd *flaggerv1.Canary, webhook int, val int) error { + return SetWebhookStatusRetries(c.flaggerClient, cd, webhook, val) +} diff --git a/pkg/canary/service_controller.go b/pkg/canary/service_controller.go index cb8d559e0..fdd51dfa0 100644 --- a/pkg/canary/service_controller.go +++ b/pkg/canary/service_controller.go @@ -60,6 +60,11 @@ func (c *ServiceController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1 return setStatusPhase(c.flaggerClient, cd, phase) } +// SetWebhookStatusRetries updates the webhook retries counter +func (c *ServiceController) SetWebhookStatusRetries(cd *flaggerv1.Canary, webhook int, val int) error { + return SetWebhookStatusRetries(c.flaggerClient, cd, webhook, val) +} + // GetMetadata returns the pod label selector, label value and svc ports func (c *ServiceController) GetMetadata(_ *flaggerv1.Canary) (string, string, map[string]int32, error) { return "", "", nil, nil diff --git a/pkg/canary/status.go b/pkg/canary/status.go index a8917e8c7..d93d93eb7 100644 --- a/pkg/canary/status.go +++ b/pkg/canary/status.go @@ -185,6 +185,31 @@ func setStatusPhase(flaggerClient clientset.Interface, cd *flaggerv1.Canary, pha return nil } +func SetWebhookStatusRetries(flaggerClient clientset.Interface, cd *flaggerv1.Canary, webhook int, val int) error { + firstTry := true + name, ns := cd.GetName(), cd.GetNamespace() + err := retry.RetryOnConflict(retry.DefaultBackoff, func() (err error) { + if !firstTry { + cd, err = flaggerClient.FlaggerV1beta1().Canaries(ns).Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("canary %s.%s get query failed: %w", name, ns, err) + } + } + + cdCopy := cd.DeepCopy() + cdCopy.GetAnalysis().Webhooks[webhook].Status.Retries = val + cdCopy.Status.LastTransitionTime = metav1.Now() + + err = updateStatusWithUpgrade(flaggerClient, cdCopy) + firstTry = false + return + }) + if err != nil { + return fmt.Errorf("failed after retries: %w", err) + } + return nil +} + // getStatusCondition returns a condition based on type func getStatusCondition(status flaggerv1.CanaryStatus, conditionType flaggerv1.CanaryConditionType) *flaggerv1.CanaryCondition { for i := range status.Conditions { diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index d470d993d..2dac9c1e8 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -430,6 +430,16 @@ func (c *Controller) advanceCanary(name string, namespace string) { return } } else { + webhookStatus := c.runAnalysisWebhooks(cd, canaryController) + if webhookStatus == "failure" { + if err := canaryController.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil { + c.recordEventWarningf(cd, "%v", err) + } + return + } else if webhookStatus == "retry" { + return + } + if ok := c.runAnalysis(cd); !ok { if err := canaryController.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil { c.recordEventWarningf(cd, "%v", err) @@ -723,19 +733,31 @@ func (c *Controller) runBlueGreen(canary *flaggerv1.Canary, canaryController can } -func (c *Controller) runAnalysis(canary *flaggerv1.Canary) bool { +func (c *Controller) runAnalysisWebhooks(canary *flaggerv1.Canary, canaryController canary.Controller) string { // run external checks - for _, webhook := range canary.GetAnalysis().Webhooks { + for id, webhook := range canary.GetAnalysis().Webhooks { if webhook.Type == "" || webhook.Type == flaggerv1.RolloutHook { err := CallWebhook(canary.Name, canary.Namespace, flaggerv1.CanaryPhaseProgressing, webhook) if err != nil { + if webhook.Retries > 0 && webhook.Status.Retries < webhook.Retries { + if err := canaryController.SetWebhookStatusRetries(canary, id, webhook.Status.Retries+1); err != nil { + c.recordEventWarningf(canary, "%v", err) + } + c.recordEventWarningf(canary, "Halt %s.%s advancement external check %s failed %v retrying %d more times", + canary.Name, canary.Namespace, webhook.Name, err, webhook.Retries-webhook.Status.Retries-1) + return "retry" + } c.recordEventWarningf(canary, "Halt %s.%s advancement external check %s failed %v", canary.Name, canary.Namespace, webhook.Name, err) - return false + return "failure" } } } + return "success" +} + +func (c *Controller) runAnalysis(canary *flaggerv1.Canary) bool { ok := c.runBuiltinMetricChecks(canary) if !ok { return ok diff --git a/pkg/controller/scheduler_deployment_test.go b/pkg/controller/scheduler_deployment_test.go index 51161ec44..a6402d877 100644 --- a/pkg/controller/scheduler_deployment_test.go +++ b/pkg/controller/scheduler_deployment_test.go @@ -646,3 +646,77 @@ func TestScheduler_DeploymentAlerts(t *testing.T) { // initialization done - now send alert mocks.ctrl.advanceCanary("podinfo", "default") } + +func TestScheduler_DeploymentWebhookRetry(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer ts.Close() + + tsOk := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusFound) + })) + defer ts.Close() + + canary := newDeploymentTestCanary() + canary.Spec.Analysis.Webhooks = []flaggerv1.CanaryWebhook{ + { + Name: "failing-url", + Type: "rollout", + URL: ts.URL, + Retries: 3, + }, + { + Name: "success-url", + Type: "rollout", + URL: tsOk.URL, + Retries: 5, + }, + } + mocks := newDeploymentFixture(canary) + + // initializing + mocks.ctrl.advanceCanary("podinfo", "default") + + // make primary ready + mocks.makePrimaryReady(t) + + // initialized + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseInitialized)) + + // update + dep2 := newDeploymentTestDeploymentV2() + _, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{}) + require.NoError(t, err) + + // detect changes + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing)) + mocks.makeCanaryReady(t) + + // progressing + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing)) + + // verify webhook retry counter increases + mocks.ctrl.advanceCanary("podinfo", "default") + // Failing URL 1, Success URL 0 + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 0, 1)) + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 1, 0)) + // Failing URL 2, Success URL 0 + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 0, 2)) + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 1, 0)) + // Failing URL 3, Success URL 0 + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 0, 3)) + require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", 1, 0)) + + // update failed checks to max + mocks.deployer.SyncStatus(mocks.canary, flaggerv1.CanaryStatus{Phase: flaggerv1.CanaryPhaseProgressing, FailedChecks: 10}) + + // canary failed due to failed webhook + mocks.ctrl.advanceCanary("podinfo", "default") + require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseFailed)) +} diff --git a/pkg/controller/scheduler_test.go b/pkg/controller/scheduler_test.go index 3f7d78a4b..818bf4f1e 100644 --- a/pkg/controller/scheduler_test.go +++ b/pkg/controller/scheduler_test.go @@ -60,6 +60,19 @@ func assertPhase(flaggerClient clientset.Interface, canary string, phase flagger return nil } +func assertWebhookRetries(flaggerClient clientset.Interface, canary string, webhook int, retries int) error { + c, err := flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), canary, metav1.GetOptions{}) + if err != nil { + return err + } + + if c.GetAnalysis().Webhooks[webhook].Status.Retries != retries { + return fmt.Errorf("got webhook status retries %d wanted %d", c.GetAnalysis().Webhooks[webhook].Status.Retries, retries) + } + + return nil +} + func alwaysReady() bool { return true }