From b5c69cf4477681887103751132d4016299010e64 Mon Sep 17 00:00:00 2001 From: "aleksej.paschenko" Date: Wed, 15 Jun 2022 04:45:50 +0300 Subject: [PATCH] Ketch can't deploy new apps after failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an invalid field is set in the App CR like so (it can be any invalid field): ``` volumes: - name: aaa/bbb persistentVolumeClaim: claimName: aaa/bbb ``` The following Kubernetes event will get emitted: ``` failed to get deploy events: create Pod bulletinboard-web-1-0 in StatefulSet bulletinboard-web-1 failed error: Pod "bulletinboard-web-1-0" is invalid: spec.volumes[0].name: Invalid value: "aaa/bbb": a lowercase RFC 1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?') ``` Due to this, the following for loop never breaks on its own, because the invalid field keeps the ObservedGeneration from ever increasing from 0; it only ends once the 10 minutes of the ctx have elapsed: ketch/app_controller.go at c6aa42099f24287c429c7b703fe21f531c417051 · theketchio/ketch This patch adds logic within wl, err = cli.Get(ctx) to check the events associated with the workload and return the error on a specific condition (e.Type == "Warning" && e.Reason == "FailedCreate") --- internal/controllers/app_controller.go | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/internal/controllers/app_controller.go b/internal/controllers/app_controller.go index f95ecbdf..8f89434f 100644 --- a/internal/controllers/app_controller.go +++ b/internal/controllers/app_controller.go @@ -243,6 +243,12 @@ type condition struct { Reason string } +type eventCondition struct { + Type string + Reason string + Message string +} + // workload contains the needed information for watchDeployEvents logic // deployments and statefulsets are both supported so it
became necessary // to abstract their common properties into a separate type @@ -254,6 +260,7 @@ type workload struct { Generation int ObservedGeneration int Conditions []condition + Events []eventCondition } type workloadClient struct { @@ -284,6 +291,15 @@ func (cli workloadClient) Get(ctx context.Context) (*workload, error) { for _, c := range o.Status.Conditions { w.Conditions = append(w.Conditions, condition{Type: string(c.Type), Reason: c.Reason}) } + e, err := cli.k8sClient.CoreV1().Events(cli.workloadNamespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + o.Name, TypeMeta: metav1.TypeMeta{Kind: "Pod"}}) + if err != nil { + return nil, err + } + for _, e := range e.Items { + if e.FirstTimestamp == o.ObjectMeta.CreationTimestamp { + w.Events = append(w.Events, eventCondition{Type: e.Type, Reason: e.Reason, Message: e.Message}) + } + } return &w, nil case ketchv1.StatefulSetAppType: o, err := cli.k8sClient.AppsV1().StatefulSets(cli.workloadNamespace).Get(ctx, cli.workloadName, metav1.GetOptions{}) @@ -303,6 +319,15 @@ func (cli workloadClient) Get(ctx context.Context) (*workload, error) { for _, c := range o.Status.Conditions { w.Conditions = append(w.Conditions, condition{Type: string(c.Type), Reason: c.Reason}) } + e, err := cli.k8sClient.CoreV1().Events(cli.workloadNamespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + o.Name, TypeMeta: metav1.TypeMeta{Kind: "StatefulSet"}}) + if err != nil { + return nil, err + } + for _, e := range e.Items { + if e.FirstTimestamp == o.ObjectMeta.CreationTimestamp { + w.Events = append(w.Events, eventCondition{Type: e.Type, Reason: e.Reason, Message: e.Message}) + } + } return &w, nil } return nil, fmt.Errorf("unknown workload type") @@ -495,6 +520,9 @@ func (r *AppReconciler) watchDeployEvents(ctx context.Context, app *ketchv1.App, recorder.Eventf(app, v1.EventTypeWarning, ketchv1.AppReconcileError, "error getting deployments: %s", err.Error()) return err } + if err := 
checkWorkloadEvent(wl); err != nil { + return err + } select { case <-time.After(100 * time.Millisecond): case <-timeout: @@ -681,6 +709,15 @@ func isDeploymentEvent(msg watch.Event, name string) bool { return ok && strings.HasPrefix(evt.Name, name) } +func checkWorkloadEvent(wl *workload) error { + for _, e := range wl.Events { + if e.Type == "Warning" && e.Reason == "FailedCreate" { + return errors.New(e.Message) + } + } + return nil +} + // createDeployTimeoutError gets pods that are not status == ready aggregates and returns the pod phase errors func createDeployTimeoutError(ctx context.Context, cli kubernetes.Interface, app *ketchv1.App, timeout time.Duration, namespace, group, label string) error { var deploymentVersion int