From 98b46de5efc99d70de1e8baae037b2d8bef31e81 Mon Sep 17 00:00:00 2001 From: Anvesh Reddy Pinnapureddy Date: Wed, 21 Aug 2024 11:51:49 +0530 Subject: [PATCH] Decouple full & delta snapshot backup ready conditions & Remove deprecated fields in etcd.status --- api/v1alpha1/etcd.go | 26 +- api/v1alpha1/zz_generated.deepcopy.go | 15 - .../crd-druid.gardener.cloud_etcds.yaml | 80 +-- .../bases/crd-druid.gardener.cloud_etcds.yaml | 80 +-- docs/concepts/controllers.md | 2 + docs/development/getting-started-locally.md | 2 +- docs/proposals/01-multi-node-etcd-clusters.md | 14 +- go.mod | 2 +- internal/controller/etcd/reconcile_status.go | 2 - internal/health/condition/builder.go | 8 +- internal/health/condition/builder_test.go | 18 +- .../health/condition/check_backup_ready.go | 231 ++++++-- .../condition/check_backup_ready_test.go | 520 ++++++++++++------ internal/health/status/check.go | 6 +- internal/health/status/check_test.go | 33 +- internal/utils/miscellaneous.go | 18 + test/e2e/etcd_backup_test.go | 36 +- test/e2e/etcd_multi_node_test.go | 13 - test/utils/etcd.go | 1 - 19 files changed, 674 insertions(+), 433 deletions(-) diff --git a/api/v1alpha1/etcd.go b/api/v1alpha1/etcd.go index 5b3a2cfed..3f89bf940 100644 --- a/api/v1alpha1/etcd.go +++ b/api/v1alpha1/etcd.go @@ -48,11 +48,12 @@ const ( // +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.ready` // +kubebuilder:printcolumn:name="Quorate",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` // +kubebuilder:printcolumn:name="All Members Ready",type=string,JSONPath=`.status.conditions[?(@.type=="AllMembersReady")].status` -// +kubebuilder:printcolumn:name="Backup Ready",type=string,JSONPath=`.status.conditions[?(@.type=="BackupReady")].status` // +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` -// +kubebuilder:printcolumn:name="Cluster Size",type=integer,JSONPath=`.spec.replicas`,priority=1 // +kubebuilder:printcolumn:name="Current Replicas",type=integer,JSONPath=`.status.currentReplicas`,priority=1 // +kubebuilder:printcolumn:name="Ready Replicas",type=integer,JSONPath=`.status.readyReplicas`,priority=1 +// +kubebuilder:printcolumn:name="Backup Ready",type=string,JSONPath=`.status.conditions[?(@.type=="BackupReady")].status` +// +kubebuilder:printcolumn:name="Full Backup Ready",type=string,JSONPath=`.status.conditions[?(@.type=="FullSnapshotBackupReady")].status` +// +kubebuilder:printcolumn:name="Delta Backup Ready",type=string,JSONPath=`.status.conditions[?(@.type=="DeltaSnapshotBackupReady")].status` // Etcd is the Schema for the etcds API type Etcd struct { @@ -320,6 +321,10 @@ const ( ConditionTypeAllMembersReady ConditionType = "AllMembersReady" // ConditionTypeBackupReady is a constant for a condition type indicating that the etcd backup is ready. ConditionTypeBackupReady ConditionType = "BackupReady" + // ConditionTypeFullSnapshotBackupReady is a constant for a condition type indicating that the full snapshot backup is ready. + ConditionTypeFullSnapshotBackupReady ConditionType = "FullSnapshotBackupReady" + // ConditionTypeDeltaSnapshotBackupReady is a constant for a condition type indicating that the delta snapshot backup is ready. + ConditionTypeDeltaSnapshotBackupReady ConditionType = "DeltaSnapshotBackupReady" // ConditionTypeDataVolumesReady is a constant for a condition type indicating that the etcd data volumes are ready. ConditionTypeDataVolumesReady ConditionType = "DataVolumesReady" ) @@ -374,10 +379,6 @@ type EtcdStatus struct { // Conditions represents the latest available observations of an etcd's current state. // +optional Conditions []Condition `json:"conditions,omitempty"` - // ServiceName is the name of the etcd service. - // Deprecated: this field will be removed in the future. - // +optional - ServiceName *string `json:"serviceName,omitempty"` // LastError represents the last occurred error. // Deprecated: Use LastErrors instead. // +optional @@ -388,10 +389,6 @@ type EtcdStatus struct { // LastOperation indicates the last operation performed on this resource. // +optional LastOperation *LastOperation `json:"lastOperation,omitempty"` - // Cluster size is the current size of the etcd cluster. - // Deprecated: this field will not be populated with any value and will be removed in the future. - // +optional - ClusterSize *int32 `json:"clusterSize,omitempty"` // CurrentReplicas is the current replica count for the etcd cluster. // +optional CurrentReplicas int32 `json:"currentReplicas,omitempty"` @@ -404,15 +401,6 @@ type EtcdStatus struct { // Ready is `true` if all etcd replicas are ready. // +optional Ready *bool `json:"ready,omitempty"` - // UpdatedReplicas is the count of updated replicas in the etcd cluster. - // Deprecated: this field will be removed in the future. - // +optional - UpdatedReplicas int32 `json:"updatedReplicas,omitempty"` - // LabelSelector is a label query over pods that should match the replica count. - // It must match the pod template's labels. - // Deprecated: this field will be removed in the future. - // +optional - LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` // Members represents the members of the etcd cluster // +optional Members []EtcdMemberStatus `json:"members,omitempty"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index a94f34e03..3fed4941e 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -566,11 +566,6 @@ func (in *EtcdStatus) DeepCopyInto(out *EtcdStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - if in.ServiceName != nil { - in, out := &in.ServiceName, &out.ServiceName - *out = new(string) - **out = **in - } if in.LastError != nil { in, out := &in.LastError, &out.LastError *out = new(string) @@ -588,21 +583,11 @@ func (in *EtcdStatus) DeepCopyInto(out *EtcdStatus) { *out = new(LastOperation) (*in).DeepCopyInto(*out) } - if in.ClusterSize != nil { - in, out := &in.ClusterSize, &out.ClusterSize - *out = new(int32) - **out = **in - } if in.Ready != nil { in, out := &in.Ready, &out.Ready *out = new(bool) **out = **in } - if in.LabelSelector != nil { - in, out := &in.LabelSelector, &out.LabelSelector - *out = new(v1.LabelSelector) - (*in).DeepCopyInto(*out) - } if in.Members != nil { in, out := &in.Members, &out.Members *out = make([]EtcdMemberStatus, len(*in)) diff --git a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml index 49b507783..7f865204b 100644 --- a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml +++ b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml @@ -24,16 +24,9 @@ spec: - jsonPath: .status.conditions[?(@.type=="AllMembersReady")].status name: All Members Ready type: string - - jsonPath: .status.conditions[?(@.type=="BackupReady")].status - name: Backup Ready - type: string - jsonPath: .metadata.creationTimestamp name: Age type: date - - jsonPath: .spec.replicas - name: Cluster Size - priority: 1 - type: integer - jsonPath: .status.currentReplicas name: Current Replicas priority: 1 @@ -42,6 +35,15 @@ spec: name: Ready Replicas priority: 1 type: integer + - jsonPath: .status.conditions[?(@.type=="BackupReady")].status + name: Backup Ready + type: string + - jsonPath: .status.conditions[?(@.type=="FullSnapshotBackupReady")].status + name: Full Backup Ready + type: string + - jsonPath: .status.conditions[?(@.type=="DeltaSnapshotBackupReady")].status + name: Delta Backup Ready + type: string name: v1alpha1 schema: openAPIV3Schema: @@ -1785,12 +1787,6 @@ spec: status: description: EtcdStatus defines the observed state of Etcd. properties: - clusterSize: - description: |- - Cluster size is the current size of the etcd cluster. - Deprecated: this field will not be populated with any value and will be removed in the future. - format: int32 - type: integer conditions: description: Conditions represents the latest available observations of an etcd's current state. @@ -1848,53 +1844,6 @@ spec: description: Name of the referent type: string type: object - labelSelector: - description: |- - LabelSelector is a label query over pods that should match the replica count. - It must match the pod template's labels. - Deprecated: this field will be removed in the future. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic lastError: description: |- LastError represents the last occurred error. @@ -2014,17 +1963,6 @@ spec: description: Replicas is the replica count of the etcd cluster. format: int32 type: integer - serviceName: - description: |- - ServiceName is the name of the etcd service. - Deprecated: this field will be removed in the future. - type: string - updatedReplicas: - description: |- - UpdatedReplicas is the count of updated replicas in the etcd cluster. - Deprecated: this field will be removed in the future. - format: int32 - type: integer type: object type: object served: true diff --git a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml index 49b507783..7f865204b 100644 --- a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml +++ b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml @@ -24,16 +24,9 @@ spec: - jsonPath: .status.conditions[?(@.type=="AllMembersReady")].status name: All Members Ready type: string - - jsonPath: .status.conditions[?(@.type=="BackupReady")].status - name: Backup Ready - type: string - jsonPath: .metadata.creationTimestamp name: Age type: date - - jsonPath: .spec.replicas - name: Cluster Size - priority: 1 - type: integer - jsonPath: .status.currentReplicas name: Current Replicas priority: 1 @@ -42,6 +35,15 @@ spec: name: Ready Replicas priority: 1 type: integer + - jsonPath: .status.conditions[?(@.type=="BackupReady")].status + name: Backup Ready + type: string + - jsonPath: .status.conditions[?(@.type=="FullSnapshotBackupReady")].status + name: Full Backup Ready + type: string + - jsonPath: .status.conditions[?(@.type=="DeltaSnapshotBackupReady")].status + name: Delta Backup Ready + type: string name: v1alpha1 schema: openAPIV3Schema: @@ -1785,12 +1787,6 @@ spec: status: description: EtcdStatus defines the observed state of Etcd. properties: - clusterSize: - description: |- - Cluster size is the current size of the etcd cluster. - Deprecated: this field will not be populated with any value and will be removed in the future. - format: int32 - type: integer conditions: description: Conditions represents the latest available observations of an etcd's current state. @@ -1848,53 +1844,6 @@ spec: description: Name of the referent type: string type: object - labelSelector: - description: |- - LabelSelector is a label query over pods that should match the replica count. - It must match the pod template's labels. - Deprecated: this field will be removed in the future. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic lastError: description: |- LastError represents the last occurred error. @@ -2014,17 +1963,6 @@ spec: description: Replicas is the replica count of the etcd cluster. format: int32 type: integer - serviceName: - description: |- - ServiceName is the name of the etcd service. - Deprecated: this field will be removed in the future. - type: string - updatedReplicas: - description: |- - UpdatedReplicas is the count of updated replicas in the etcd cluster. - Deprecated: this field will be removed in the future. - format: int32 - type: integer type: object type: object served: true diff --git a/docs/concepts/controllers.md b/docs/concepts/controllers.md index 66c170b74..305c6a117 100644 --- a/docs/concepts/controllers.md +++ b/docs/concepts/controllers.md @@ -79,6 +79,8 @@ Status fields related to the etcd cluster itself, such as `Members`, `PeerUrlTLS - `AllMembersReady`: indicates readiness of all members of the etcd cluster. - `Ready`: indicates overall readiness of the etcd cluster in serving traffic. - `BackupReady`: indicates health of the etcd backups, i.e., whether etcd backups are being taken regularly as per schedule. This condition is applicable only when backups are enabled for the etcd cluster. +- `FullSnapshotBackupReady`: indicates health of the full snapshot backups, i.e whether full snapshot backups are being taken regularly as per schedule. This condition is applicable only when backups are enabled for the etcd cluster. +- `DeltaSnapshotBackupReady`: indicates health of the delta snapshot backups, i.e whether delta snapshot backups are being taken regularly as per schedule. This condition is applicable only when backups are enabled for the etcd cluster. - `DataVolumesReady`: indicates health of the persistent volumes containing the etcd data. ## Compaction Controller diff --git a/docs/development/getting-started-locally.md b/docs/development/getting-started-locally.md index 1af9cc944..35802dc9b 100644 --- a/docs/development/getting-started-locally.md +++ b/docs/development/getting-started-locally.md @@ -122,7 +122,7 @@ kubectl apply -f config/samples/druid_v1alpha1_etcd.yaml ### Verify the Etcd cluster -To obtain information regarding the newly instantiated etcd cluster, perform the following step, which gives details such as the cluster size, readiness status of its members, and various other attributes. +To obtain information regarding the newly instantiated etcd cluster, perform the following step, which gives details such as the current & ready replicas, readiness status of its members, Full and Delta backup status and various other attributes. ```sh kubectl get etcd -o=wide diff --git a/docs/proposals/01-multi-node-etcd-clusters.md b/docs/proposals/01-multi-node-etcd-clusters.md index d61b6d1b6..89f56c59e 100644 --- a/docs/proposals/01-multi-node-etcd-clusters.md +++ b/docs/proposals/01-multi-node-etcd-clusters.md @@ -50,7 +50,6 @@ This document proposes an approach (along with some alternatives) to support pro - [Member name as the key](#member-name-as-the-key) - [Member Leases](#member-leases) - [Conditions](#conditions) - - [ClusterSize](#clustersize) - [Alternative](#alternative-4) - [Decision table for etcd-druid based on the status](#decision-table-for-etcd-druid-based-on-the-status) - [1. Pink of health](#1-pink-of-health) @@ -563,8 +562,6 @@ status: lastTransitionTime: "2020-11-10T12:48:01Z" reason: FullBackupSucceeded # FullBackupSucceeded|IncrementalBackupSucceeded|FullBackupFailed|IncrementalBackupFailed ... - clusterSize: 3 - ... replicas: 3 ... members: @@ -647,15 +644,6 @@ The `BackupReady` condition will be maintained by the leading `etcd-backup-resto More condition types could be introduced in the future if specific purposes arise. -### ClusterSize - -The `clusterSize` field contains the current size of the ETCD cluster. It will be actively kept up-to-date by `etcd-druid` in all scenarios. - -- Before [bootstrapping](#bootstrapping) the ETCD cluster (during cluster creation or later bootstrapping because of [quorum failure](#recommended-action-9)), `etcd-druid` will clear the `status.members` array and set `status.clusterSize` to be equal to `spec.replicas`. -- While the ETCD cluster is quorate, `etcd-druid` will actively set `status.clusterSize` to be equal to length of the `status.members` whenever the length of the array changes (say, due to scaling of the ETCD cluster). - -Given that `clusterSize` reliably represents the size of the ETCD cluster, it can be used to calculate the `Ready` [condition](#conditions). - ### Alternative The alternative is for `etcd-druid` to maintain the status in the `Etcd` status sub-resource. @@ -1255,7 +1243,7 @@ The life-cycle of these work-flows is shown below. - Take [backups](#backup) (full and incremental) at configured regular intervals - [Defragment](#defragmentation) all the members sequentially at configured regular intervals -- Cleanup superflous members from the ETCD cluster for which there is no corresponding pod (the [ordinal](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#ordinal-index) in the pod name is greater than the [cluster size](#clustersize)) at regular intervals (or whenever the `Etcd` resource [status](#status) changes by watching it) +- Cleanup superflous members from the ETCD cluster for which there is no corresponding pod (the [ordinal](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#ordinal-index) in the pod name is greater than the cluster size) at regular intervals (or whenever the `Etcd` resource [status](#status) changes by watching it) - The cleanup of [superfluous entries in `status.members` array](#13-superfluous-member-entries-in-etcd-status) is already covered [here](#recommended-action-12) ## High Availability diff --git a/go.mod b/go.mod index 3b506e3cb..046edbbc2 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,7 @@ require ( github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 github.com/prometheus/client_golang v1.18.0 + github.com/robfig/cron/v3 v3.0.1 github.com/spf13/pflag v1.0.5 go.uber.org/mock v0.4.0 go.uber.org/zap v1.27.0 @@ -107,7 +108,6 @@ require ( github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect - github.com/robfig/cron/v3 v3.0.1 // indirect github.com/shopspring/decimal v1.3.1 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/afero v1.11.0 // indirect diff --git a/internal/controller/etcd/reconcile_status.go b/internal/controller/etcd/reconcile_status.go index 94ece8f7b..6557d8f2a 100644 --- a/internal/controller/etcd/reconcile_status.go +++ b/internal/controller/etcd/reconcile_status.go @@ -69,13 +69,11 @@ func (r *Reconciler) inspectStatefulSetAndMutateETCDStatus(ctx component.Operato ready, _ := utils.IsStatefulSetReady(expectedReplicas, sts) etcd.Status.CurrentReplicas = sts.Status.CurrentReplicas etcd.Status.ReadyReplicas = sts.Status.ReadyReplicas - etcd.Status.UpdatedReplicas = sts.Status.UpdatedReplicas etcd.Status.Replicas = sts.Status.CurrentReplicas etcd.Status.Ready = &ready } else { etcd.Status.CurrentReplicas = 0 etcd.Status.ReadyReplicas = 0 - etcd.Status.UpdatedReplicas = 0 etcd.Status.Ready = pointer.Bool(false) } return ctrlutils.ContinueReconcile() diff --git a/internal/health/condition/builder.go b/internal/health/condition/builder.go index 2f71ba55f..cb6cb9acd 100644 --- a/internal/health/condition/builder.go +++ b/internal/health/condition/builder.go @@ -15,9 +15,11 @@ import ( // skipMergeConditions contain the list of conditions we don't want to add to the list if not recalculated var skipMergeConditions = map[druidv1alpha1.ConditionType]struct{}{ - druidv1alpha1.ConditionTypeReady: {}, - druidv1alpha1.ConditionTypeAllMembersReady: {}, - druidv1alpha1.ConditionTypeBackupReady: {}, + druidv1alpha1.ConditionTypeReady: {}, + druidv1alpha1.ConditionTypeAllMembersReady: {}, + druidv1alpha1.ConditionTypeBackupReady: {}, + druidv1alpha1.ConditionTypeFullSnapshotBackupReady: {}, + druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady: {}, } // Builder is an interface for building conditions. diff --git a/internal/health/condition/builder_test.go b/internal/health/condition/builder_test.go index 5f197ca31..a5bf9c6b4 100644 --- a/internal/health/condition/builder_test.go +++ b/internal/health/condition/builder_test.go @@ -63,10 +63,26 @@ var _ = Describe("Builder", func() { Type: druidv1alpha1.ConditionTypeBackupReady, LastUpdateTime: metav1.NewTime(oldConditionTime), LastTransitionTime: metav1.NewTime(oldConditionTime), - Status: druidv1alpha1.ConditionTrue, + Status: druidv1alpha1.ConditionFalse, Reason: "foobar reason", Message: "foobar message", }, + { + Type: druidv1alpha1.ConditionTypeFullSnapshotBackupReady, + LastUpdateTime: metav1.NewTime(oldConditionTime), + LastTransitionTime: metav1.NewTime(oldConditionTime), + Status: druidv1alpha1.ConditionTrue, + Reason: "full foobar reason", + Message: "full foobar message", + }, + { + Type: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + LastUpdateTime: metav1.NewTime(oldConditionTime), + LastTransitionTime: metav1.NewTime(oldConditionTime), + Status: druidv1alpha1.ConditionFalse, + Reason: "delta foobar reason", + Message: "delta foobar message", + }, } builder.WithOldConditions(oldConditions) diff --git a/internal/health/condition/check_backup_ready.go b/internal/health/condition/check_backup_ready.go index 6d1fd3900..f76afe103 100644 --- a/internal/health/condition/check_backup_ready.go +++ b/internal/health/condition/check_backup_ready.go @@ -10,13 +10,18 @@ import ( "time" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" + "github.com/gardener/etcd-druid/internal/utils" coordinationv1 "k8s.io/api/coordination/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) -type backupReadyCheck struct { +type fullSnapshotBackupReadyCheck struct { + cl client.Client +} + +type deltaSnapshotBackupReadyCheck struct { cl client.Client } @@ -25,93 +30,199 @@ const ( BackupSucceeded string = "BackupSucceeded" // BackupFailed is a constant that means that etcd backup has failed BackupFailed string = "BackupFailed" + // SnapshotUploadedOnSchedule is a constant that means that the etcd backup has been uploaded on schedule + SnapshotUploadedOnSchedule string = "SnapshotUploadedOnSchedule" + // SnapshotMissedSchedule is a constant that means that the etcd backup has missed the schedule + SnapshotMissedSchedule string = "SnapshotMissedSchedule" + // SnapshotProcessNotStarted is a constant that means the etcd snapshotting has not started yet + SnapshotProcessNotStarted string = "SnapshottingProcessNotStarted" // Unknown is a constant that means that the etcd backup status is currently not known Unknown string = "Unknown" // NotChecked is a constant that means that the etcd backup status has not been updated or rechecked NotChecked string = "NotChecked" ) -func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result { - //Default case +// BackupReadyCheck returns a result for the "BackupReady" condition check +func BackupReadyCheck(results []Result) Result { result := &result{ conType: druidv1alpha1.ConditionTypeBackupReady, status: druidv1alpha1.ConditionUnknown, reason: Unknown, - message: "Cannot determine etcd backup status", + message: "Cannot determine backup upload status", + } + + var FullSnapshotBackupReadyCheckResult, DeltaSnapshotBackupReadyCheckResult Result = nil, nil + for _, result := range results { + if result == nil { + continue + } + if result.ConditionType() == druidv1alpha1.ConditionTypeFullSnapshotBackupReady { + FullSnapshotBackupReadyCheckResult = result + continue + } + if result.ConditionType() == druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady { + DeltaSnapshotBackupReadyCheckResult = result + continue + } + } + if FullSnapshotBackupReadyCheckResult == nil || DeltaSnapshotBackupReadyCheckResult == nil { + return nil + } + var ( + fullSnapshotBackupMissedSchedule = FullSnapshotBackupReadyCheckResult.Status() == druidv1alpha1.ConditionFalse + fullSnapshotBackupSucceeded = FullSnapshotBackupReadyCheckResult.Status() == druidv1alpha1.ConditionTrue + deltaSnapshotBackupMissedSchedule = DeltaSnapshotBackupReadyCheckResult.Status() == druidv1alpha1.ConditionFalse + ) + // False case + if fullSnapshotBackupMissedSchedule || deltaSnapshotBackupMissedSchedule { + result.status = druidv1alpha1.ConditionFalse + result.reason = BackupFailed + result.message = backupFailureMessage(fullSnapshotBackupMissedSchedule, deltaSnapshotBackupMissedSchedule) + return result + } + // True case + if fullSnapshotBackupSucceeded { + result.status = druidv1alpha1.ConditionTrue + result.reason = BackupSucceeded + result.message = "Snapshot backup succeeded" + return result + } + // Unknown case + return result +} + +func (f *fullSnapshotBackupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result { + //Default case + result := &result{ + conType: druidv1alpha1.ConditionTypeFullSnapshotBackupReady, + status: druidv1alpha1.ConditionUnknown, + reason: Unknown, + message: "Cannot determine full snapshot upload status", } // Special case of etcd not being configured to take snapshots - // Do not add the BackupReady condition if backup is not configured - if etcd.Spec.Backup.Store == nil || etcd.Spec.Backup.Store.Provider == nil || len(*etcd.Spec.Backup.Store.Provider) == 0 { + // Do not add the FullSnapshotBackupReady condition if backup is not configured + if !isBackupConfigured(&etcd) { return nil } //Fetch snapshot leases var ( - fullSnapErr, incrSnapErr error - fullSnapLease = &coordinationv1.Lease{} - deltaSnapLease = &coordinationv1.Lease{} + fullSnapErr error + fullSnapLease = &coordinationv1.Lease{} + fullSnapshotInterval = 24 * time.Hour + err error ) - fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease) - incrSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease) + fullSnapErr = f.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease) + fullLeaseRenewTime := fullSnapLease.Spec.RenewTime + fullLeaseCreationTime := fullSnapLease.ObjectMeta.CreationTimestamp - //Set status to Unknown if errors in fetching snapshot leases or lease never renewed - if fullSnapErr != nil || incrSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) { + if etcd.Spec.Backup.FullSnapshotSchedule != nil { + if fullSnapshotInterval, err = utils.ComputeScheduleInterval(*etcd.Spec.Backup.FullSnapshotSchedule); err != nil { + return result + } + } + //Set status to Unknown if errors in fetching full snapshot lease + if fullSnapErr != nil { return result } + if fullLeaseRenewTime == nil { + if time.Since(fullLeaseCreationTime.Time) < fullSnapshotInterval { + return result + } else { + result.status = druidv1alpha1.ConditionFalse + result.reason = SnapshotMissedSchedule + result.message = "Full snapshot missed schedule. Backup is unhealthy" + return result + } + } else { + if time.Since(fullLeaseRenewTime.Time) < fullSnapshotInterval { + result.status = druidv1alpha1.ConditionTrue + result.reason = SnapshotUploadedOnSchedule + result.message = fmt.Sprintf("Full snapshot uploaded successfully %v ago", time.Since(fullLeaseRenewTime.Time)) + return result + } else { + result.status = druidv1alpha1.ConditionFalse + result.reason = SnapshotMissedSchedule + result.message = fmt.Sprintf("Full snapshot missed schedule, last full snapshot was taken %v ago", time.Since(fullLeaseRenewTime.Time)) + return result + } + } +} + +func (d *deltaSnapshotBackupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result { + //Default case + result := &result{ + conType: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + status: druidv1alpha1.ConditionUnknown, + reason: Unknown, + message: "Cannot determine delta snapshot upload status", + } + // Special case of etcd not being configured to take snapshots + // Do not add the DeltaSnapshotBackupReady condition if backup is not configured + if !isBackupConfigured(&etcd) { + return nil + } + var ( + incrSnapErr error + deltaSnapLease = &coordinationv1.Lease{} + ) + incrSnapErr = d.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease) deltaLeaseRenewTime := deltaSnapLease.Spec.RenewTime - fullLeaseRenewTime := fullSnapLease.Spec.RenewTime - fullLeaseCreateTime := &fullSnapLease.ObjectMeta.CreationTimestamp - - if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil { - // Most probable during reconcile of existing clusters if fresh leases are created - // Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than 24h. - if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < 24*time.Hour { - result.reason = BackupSucceeded - result.message = "Delta snapshot backup succeeded" + deltaSnapshotPeriod := etcd.Spec.Backup.DeltaSnapshotPeriod.Duration + deltaLeaseCreationTime := deltaSnapLease.ObjectMeta.CreationTimestamp + + //Set status to Unknown if error in fetching delta snapshot lease + if incrSnapErr != nil { + return result + } + + if deltaLeaseRenewTime == nil { + if time.Since(deltaLeaseCreationTime.Time) < deltaSnapshotPeriod { result.status = druidv1alpha1.ConditionTrue + result.reason = SnapshotProcessNotStarted + result.message = "Delta snapshotting has not been triggered yet" return result - } - } else if deltaLeaseRenewTime == nil && fullLeaseRenewTime != nil { - //Most probable during a startup scenario for new clusters - //Special case. Return Unknown condition for some time to allow delta backups to start up - if time.Since(fullLeaseRenewTime.Time) > 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration { - result.message = "Periodic delta snapshots not started yet" + } else if time.Since(deltaLeaseCreationTime.Time) < 3*deltaSnapshotPeriod { + result.status = druidv1alpha1.ConditionUnknown + result.reason = Unknown + result.message = "Periodic delta snapshotting has not started yet" return result } - } else if deltaLeaseRenewTime != nil && fullLeaseRenewTime != nil { - //Both snap leases are maintained. Both are expected to be renewed periodically - if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < 24*time.Hour { - result.reason = BackupSucceeded - result.message = "Snapshot backup succeeded" + } else { + if time.Since(deltaLeaseRenewTime.Time) < deltaSnapshotPeriod { result.status = druidv1alpha1.ConditionTrue + result.reason = SnapshotUploadedOnSchedule + result.message = fmt.Sprintf("Delta snapshot uploaded successfully %v ago", time.Since(deltaLeaseRenewTime.Time)) + return result + } else if time.Since(deltaLeaseRenewTime.Time) < 3*deltaSnapshotPeriod { + result.status = druidv1alpha1.ConditionUnknown + result.reason = Unknown + result.message = "One or more delta snapshots seems to have missed their scheduled times, likely due to a transient issue with updating the lease" return result } } - //Cases where snapshot leases are not updated for a long time - //If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy - + //Cases where delta snapshot lease is not renewed at all or not updated for a long time + //If delta snapshot lease is present and not updated, it is safe to assume that backup is not healthy if etcd.Status.Conditions != nil { - var prevBackupReadyStatus druidv1alpha1.Condition - for _, prevBackupReadyStatus = range etcd.Status.Conditions { - if prevBackupReadyStatus.Type == druidv1alpha1.ConditionTypeBackupReady { + var prevDeltaSnapshotBackupReadyStatus druidv1alpha1.Condition + for _, prevDeltaSnapshotBackupReadyStatus = range etcd.Status.Conditions { + if prevDeltaSnapshotBackupReadyStatus.Type == druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady { break } } - // Transition to "False" state only if present state is "Unknown" or "False" - if deltaLeaseRenewTime != nil && (prevBackupReadyStatus.Status == druidv1alpha1.ConditionUnknown || prevBackupReadyStatus.Status == druidv1alpha1.ConditionFalse) { - if time.Since(deltaLeaseRenewTime.Time) > 3*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration { + if prevDeltaSnapshotBackupReadyStatus.Status == druidv1alpha1.ConditionUnknown || prevDeltaSnapshotBackupReadyStatus.Status == druidv1alpha1.ConditionFalse { + if deltaLeaseRenewTime == nil || time.Since(deltaLeaseRenewTime.Time) > 3*deltaSnapshotPeriod { result.status = druidv1alpha1.ConditionFalse - result.reason = BackupFailed - result.message = "Stale snapshot leases. Not renewed in a long time" + result.reason = SnapshotMissedSchedule + result.message = "Stale snapshot lease. Not updated for a long time. Backup is unhealthy" return result } } } - //Transition to "Unknown" state is we cannot prove a "True" state return result } @@ -124,9 +235,33 @@ func getFullSnapLeaseName(etcd *druidv1alpha1.Etcd) string { return fmt.Sprintf("%s-full-snap", string(etcd.ObjectMeta.Name)) } -// BackupReadyCheck returns a check for the "BackupReady" condition. -func BackupReadyCheck(cl client.Client) Checker { - return &backupReadyCheck{ +// FullSnapshotBackupReadyCheck returns a check for the "FullSnapshotBackupReady" condition. +func FullSnapshotBackupReadyCheck(cl client.Client) Checker { + return &fullSnapshotBackupReadyCheck{ + cl: cl, + } +} + +// DeltaSnapshotBackupReadyCheck returns a check for the "DeltaSnapshotBackupReady" condition. +func DeltaSnapshotBackupReadyCheck(cl client.Client) Checker { + return &deltaSnapshotBackupReadyCheck{ cl: cl, } } + +func backupFailureMessage(fullSnapshotBackupMissedSchedule, deltaSnapshotBackupMissedSchedule bool) string { + if fullSnapshotBackupMissedSchedule && deltaSnapshotBackupMissedSchedule { + return "Both Full & Delta snapshot backups missed their schedule" + } + if fullSnapshotBackupMissedSchedule { + return "Full snapshot backup missed schedule" + } + return "Delta snapshot backup missed schedule" +} + +func isBackupConfigured(etcd *druidv1alpha1.Etcd) bool { + if etcd.Spec.Backup.Store == nil || etcd.Spec.Backup.Store.Provider == nil || len(*etcd.Spec.Backup.Store.Provider) == 0 { + return false + } + return true +} diff --git a/internal/health/condition/check_backup_ready_test.go b/internal/health/condition/check_backup_ready_test.go index 6280401ef..786d1c63f 100644 --- a/internal/health/condition/check_backup_ready_test.go +++ b/internal/health/condition/check_backup_ready_test.go @@ -24,8 +24,8 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" ) -var _ = Describe("BackupReadyCheck", func() { - Describe("#Check", func() { +var _ = Describe("BackupsReadyCheck", func() { + Describe("#SnapshotsBackupReadyCheck", func() { var ( storageProvider druidv1alpha1.StorageProvider = "testStorageProvider" mockCtrl *gomock.Controller @@ -78,146 +78,376 @@ var _ = Describe("BackupReadyCheck", func() { AfterEach(func() { mockCtrl.Finish() }) + Describe("#DeltaSnapshotBackupReadyCheck", func() { - Context("With no snapshot leases present", func() { - It("Should return Unknown rediness", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, _ *coordinationv1.Lease, _ ...client.GetOption) error { - return &noLeaseError - }, - ).AnyTimes() + Context("With no delta snapshot lease present", func() { + It("Should return Unknown readiness", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease, _ ...client.GetOption) error { + return &noLeaseError + }, + ).AnyTimes() + + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) + }) + }) - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) + Context("With delta snapshot lease present", func() { + + Context("With lease not renewed even once", func() { + It("Should set status to True if lease has not been renewed within the deltaSnapshotPeriod duration of lease creation", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = nil + le.Spec.HolderIdentity = nil + le.ObjectMeta.CreationTimestamp = v1.Now() + return nil + }, + ).AnyTimes() + + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) + Expect(result.Reason()).To(Equal(SnapshotProcessNotStarted)) + }) + + It("Should set status to Unknown if lease has not been renewed until 3*deltaSnapshotPeriod duration of lease creation", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = nil + le.Spec.HolderIdentity = nil + le.ObjectMeta.CreationTimestamp = v1.Time{Time: time.Now().Add(-2 * deltaSnapshotDuration)} + return nil + }, + ).AnyTimes() + + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) + }) + + It("Should set status to False if lease has not been renewed even after 3*deltaSnapshotPeriod duration of lease creation", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = nil + le.Spec.HolderIdentity = nil + le.ObjectMeta.CreationTimestamp = v1.Time{Time: time.Now().Add(-4 * deltaSnapshotDuration)} + return nil + }, + ).AnyTimes() + + etcd.Status.Conditions = []druidv1alpha1.Condition{ + { + Type: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + Status: druidv1alpha1.ConditionUnknown, + Message: "Unknown", + }, + } - Expect(result).ToNot(BeNil()) - Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) - Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) - Expect(result.Reason()).To(Equal(Unknown)) - }) - }) + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse)) + Expect(result.Reason()).To(Equal(SnapshotMissedSchedule)) + }) + }) + + Context("With lease renewed at least once", func() { + It("Should set status to True if lease has been renewed within the deltaSnapshotPeriod duration", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = &v1.MicroTime{Time: time.Now()} + return nil + }, + ).AnyTimes() + + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) + Expect(result.Reason()).To(Equal(SnapshotUploadedOnSchedule)) + }) + + It("Should set status to Unknown if lease has not been renewed within the 3*deltaSnapshotPeriod duration", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = &v1.MicroTime{Time: time.Now().Add(-2 * deltaSnapshotDuration)} + return nil + }, + ).AnyTimes() + + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) + }) + + It("Should set status to False if lease has not been renewed even after 3*deltaSnapshotPeriod duration", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = &v1.MicroTime{Time: time.Now().Add(-4 * deltaSnapshotDuration)} + return nil + }, + ).AnyTimes() + + etcd.Status.Conditions = []druidv1alpha1.Condition{ + { + Type: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + Status: druidv1alpha1.ConditionUnknown, + Message: "Unknown", + }, + } - Context("With both snapshot leases present", func() { - It("Should set status to BackupSucceeded if both leases are recently renewed", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - return nil - }, - ).AnyTimes() + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse)) + Expect(result.Reason()).To(Equal(SnapshotMissedSchedule)) + }) + }) + }) - Expect(result).ToNot(BeNil()) - Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) - Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) - Expect(result.Reason()).To(Equal(BackupSucceeded)) + Context("With no backup store configured", func() { + It("Should return nil condition", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease) error { + return &noLeaseError + }, + ).AnyTimes() + + etcd.Spec.Backup.Store = nil + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).To(BeNil()) + etcd.Spec.Backup.Store = &druidv1alpha1.StoreSpec{ + Prefix: "test-prefix", + Provider: &storageProvider, + } + }) }) - It("Should set status to BackupSucceeded if delta snap lease is recently created and empty full snap lease has been created in the last 24h", func() { - cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - le.Spec.RenewTime = nil - le.Spec.HolderIdentity = nil - le.ObjectMeta.CreationTimestamp = v1.Now() - return nil - }, - ).AnyTimes() - cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - return nil - }, - ).AnyTimes() + Context("With backup store is configured but provider is nil", func() { + It("Should return nil condition", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease) error { + return &noLeaseError + }, + ).AnyTimes() - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) + etcd.Spec.Backup.Store.Provider = nil + check := DeltaSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) - Expect(result).ToNot(BeNil()) - Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) - Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) - Expect(result.Reason()).To(Equal(BackupSucceeded)) + Expect(result).To(BeNil()) + etcd.Spec.Backup.Store.Provider = &storageProvider + }) + }) + }) + Describe("#FullSnapshotBackupReadyCheck", func() { + Context("With no full snapshot lease present", func() { + It("Should return Unknown readiness", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease, _ ...client.GetOption) error { + return &noLeaseError + }, + ).AnyTimes() + + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) + }) }) - It("Should set status to Unknown if empty delta snap lease is present but full snap lease is renewed recently", func() { - cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-5 * deltaSnapshotDuration)} - return nil - }, - ).AnyTimes() - cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - le.Spec.RenewTime = nil - le.Spec.HolderIdentity = nil - return nil - }, - ).AnyTimes() - - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) - - Expect(result).ToNot(BeNil()) - Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) - Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) - Expect(result.Reason()).To(Equal(Unknown)) - Expect(result.Message()).To(Equal("Periodic delta snapshots not started yet")) + Context("With full snapshot lease present", func() { + Context("With lease not renewed even once", func() { + It("Should set status to Unknown if lease has not been renewed at all within the first 24 hours of lease creation", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = nil + le.Spec.HolderIdentity = nil + le.ObjectMeta.CreationTimestamp = v1.Now() + return nil + }, + ).AnyTimes() + + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) + }) + + It("Should set status to False if lease has not been renewed at all even after 24 hours of lease creation", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = nil + le.Spec.HolderIdentity = nil + le.ObjectMeta.CreationTimestamp = v1.Time{Time: time.Now().Add(-25 * time.Hour)} + return nil + }, + ).AnyTimes() + + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse)) + Expect(result.Reason()).To(Equal(SnapshotMissedSchedule)) + }) + }) + + Context("With lease renewed at least once", func() { + It("Should set status to True if lease has been renewed within the last 24 hours", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = &v1.MicroTime{Time: time.Now()} + return nil + }, + ).AnyTimes() + + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) + Expect(result.Reason()).To(Equal(SnapshotUploadedOnSchedule)) + }) + + It("Should set status to False if lease has not been renewed within the last 24 hours", func() { + cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { + *le = lease + le.Spec.RenewTime = &v1.MicroTime{Time: time.Now().Add(-25 * time.Hour)} + return nil + }, + ).AnyTimes() + + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse)) + Expect(result.Reason()).To(Equal(SnapshotMissedSchedule)) + }) + }) }) - It("Should set status to Unknown if both leases are stale", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - le.Spec.RenewTime = &v1.MicroTime{ - Time: time.Now().Add(-10 * time.Minute), - } - return nil - }, - ).AnyTimes() + Context("With no backup store configured", func() { + It("Should return nil condition", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease) error { + return &noLeaseError + }, + ).AnyTimes() + + etcd.Spec.Backup.Store = nil + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) + + Expect(result).To(BeNil()) + etcd.Spec.Backup.Store = &druidv1alpha1.StoreSpec{ + Prefix: "test-prefix", + Provider: &storageProvider, + } + }) + }) - etcd.Status.Conditions = []druidv1alpha1.Condition{ - { - Type: druidv1alpha1.ConditionTypeBackupReady, - Status: druidv1alpha1.ConditionTrue, - Message: "True", - }, - } + Context("With backup store is configured but provider is nil", func() { + It("Should return nil condition", func() { + cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease) error { + return &noLeaseError + }, + ).AnyTimes() - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) + etcd.Spec.Backup.Store.Provider = nil + check := FullSnapshotBackupReadyCheck(cl) + result := check.Check(context.TODO(), etcd) - Expect(result).ToNot(BeNil()) - Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) - Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) - Expect(result.Reason()).To(Equal(Unknown)) + Expect(result).To(BeNil()) + etcd.Spec.Backup.Store.Provider = &storageProvider + }) }) + }) + }) + Describe("#BackupReadyCheck", func() { + var results []Result + BeforeEach(func() { + results = []Result{ + &result{ + ConType: druidv1alpha1.ConditionTypeFullSnapshotBackupReady, + ConStatus: druidv1alpha1.ConditionTrue, + }, + &result{ + ConType: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + ConStatus: druidv1alpha1.ConditionTrue, + }, + nil, + &result{ + ConType: druidv1alpha1.ConditionTypeReady, + ConStatus: druidv1alpha1.ConditionTrue, + }, + &result{ + ConType: druidv1alpha1.ConditionTypeDataVolumesReady, + ConStatus: druidv1alpha1.ConditionTrue, + }, + } + }) + Context("With at least one of Full or Delta snapshot backup condition check is nil", func() { + It("Should return nil condition", func() { + results = append(results[:1], results[2:]...) + result := BackupReadyCheck(results) + Expect(result).To(BeNil()) - It("Should set status to BackupFailed if both leases are stale and current condition is Unknown", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error { - *le = lease - le.Spec.RenewTime = &v1.MicroTime{ - Time: time.Now().Add(-10 * time.Minute), - } - return nil - }, - ).AnyTimes() - - etcd.Status.Conditions = []druidv1alpha1.Condition{ - { - Type: druidv1alpha1.ConditionTypeBackupReady, - Status: druidv1alpha1.ConditionUnknown, - Message: "Unknown", - }, - } - - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) + results = append(results[:], results[1:]...) + result = BackupReadyCheck(results) + Expect(result).To(BeNil()) + }) + }) + Context("With at least one of Full or Delta snapshot backup condition check is False", func() { + It("Should return False readiness", func() { + results[0].(*result).ConStatus = druidv1alpha1.ConditionFalse + result := BackupReadyCheck(results) Expect(result).ToNot(BeNil()) Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse)) @@ -225,40 +455,24 @@ var _ = Describe("BackupReadyCheck", func() { }) }) - Context("With no backup store configured", func() { - It("Should return nil condition", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, _ *coordinationv1.Lease) error { - return &noLeaseError - }, - ).AnyTimes() - - etcd.Spec.Backup.Store = nil - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) - - Expect(result).To(BeNil()) - etcd.Spec.Backup.Store = &druidv1alpha1.StoreSpec{ - Prefix: "test-prefix", - Provider: &storageProvider, - } + Context("With at least one of Full or Delta snapshot backup condition check is Unknown", func() { + It("Should return Unknown readiness", func() { + results[1].(*result).ConStatus = druidv1alpha1.ConditionUnknown + result := BackupReadyCheck(results) + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown)) + Expect(result.Reason()).To(Equal(Unknown)) }) }) - Context("With backup store is configured but provider is nil", func() { - It("Should return nil condition", func() { - cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any()).DoAndReturn( - func(_ context.Context, _ client.ObjectKey, _ *coordinationv1.Lease) error { - return &noLeaseError - }, - ).AnyTimes() - - etcd.Spec.Backup.Store.Provider = nil - check := BackupReadyCheck(cl) - result := check.Check(context.TODO(), etcd) - - Expect(result).To(BeNil()) - etcd.Spec.Backup.Store.Provider = &storageProvider + Context("With both Full and Delta snapshot backup condition checks are True", func() { + It("Should return True readiness", func() { + result := BackupReadyCheck(results) + Expect(result).ToNot(BeNil()) + Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady)) + Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue)) + Expect(result.Reason()).To(Equal(BackupSucceeded)) }) }) }) diff --git a/internal/health/status/check.go b/internal/health/status/check.go index 242090374..358f3ab90 100644 --- a/internal/health/status/check.go +++ b/internal/health/status/check.go @@ -37,7 +37,8 @@ var ( ConditionChecks = []ConditionCheckFn{ condition.AllMembersReadyCheck, condition.ReadyCheck, - condition.BackupReadyCheck, + condition.FullSnapshotBackupReadyCheck, + condition.DeltaSnapshotBackupReadyCheck, condition.DataVolumesReadyCheck, } // EtcdMemberChecks are the etcd member checks. @@ -91,10 +92,11 @@ func (c *Checker) executeConditionChecks(ctx context.Context, etcd *druidv1alpha wg.Wait() })() - results := make([]condition.Result, 0, len(ConditionChecks)) + results := make([]condition.Result, 0, len(ConditionChecks)+1) for r := range resultCh { results = append(results, r) } + results = append(results, condition.BackupReadyCheck(results)) conditions := c.conditionBuilderFn(). WithNowFunc(func() metav1.Time { return metav1.NewTime(TimeNow()) }). diff --git a/internal/health/status/check_test.go b/internal/health/status/check_test.go index e21f43849..8c5fc1329 100644 --- a/internal/health/status/check_test.go +++ b/internal/health/status/check_test.go @@ -51,12 +51,20 @@ var _ = Describe("Check", func() { Message: "bar message", }, { - Type: druidv1alpha1.ConditionTypeBackupReady, + Type: druidv1alpha1.ConditionTypeFullSnapshotBackupReady, Status: druidv1alpha1.ConditionUnknown, LastTransitionTime: metav1.NewTime(timeBefore), LastUpdateTime: metav1.NewTime(timeBefore), - Reason: "foobar reason", - Message: "foobar message", + Reason: "full foobar reason", + Message: "full foobar message", + }, + { + Type: druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, + Status: druidv1alpha1.ConditionUnknown, + LastTransitionTime: metav1.NewTime(timeBefore), + LastUpdateTime: metav1.NewTime(timeBefore), + Reason: "delta foobar reason", + Message: "delta foobar message", }, { Type: druidv1alpha1.ConditionTypeDataVolumesReady, @@ -107,7 +115,10 @@ var _ = Describe("Check", func() { return createConditionCheck(druidv1alpha1.ConditionTypeAllMembersReady, druidv1alpha1.ConditionTrue, "bar reason", "bar message") }, func(client.Client) condition.Checker { - return createConditionCheck(druidv1alpha1.ConditionTypeBackupReady, druidv1alpha1.ConditionUnknown, "foobar reason", "foobar message") + return createConditionCheck(druidv1alpha1.ConditionTypeFullSnapshotBackupReady, druidv1alpha1.ConditionUnknown, "full foobar reason", "full foobar message") + }, + func(client.Client) condition.Checker { + return createConditionCheck(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady, druidv1alpha1.ConditionUnknown, "delta foobar reason", "delta foobar message") }, func(client.Client) condition.Checker { return createConditionCheck(druidv1alpha1.ConditionTypeDataVolumesReady, druidv1alpha1.ConditionUnknown, "foobar reason", "foobar message") @@ -149,12 +160,20 @@ var _ = Describe("Check", func() { "Message": Equal("bar message"), }), MatchFields(IgnoreExtras, Fields{ - "Type": Equal(druidv1alpha1.ConditionTypeBackupReady), + "Type": Equal(druidv1alpha1.ConditionTypeFullSnapshotBackupReady), "Status": Equal(druidv1alpha1.ConditionUnknown), "LastTransitionTime": Equal(metav1.NewTime(timeBefore)), "LastUpdateTime": Equal(metav1.NewTime(timeNow)), - "Reason": Equal("foobar reason"), - "Message": Equal("foobar message"), + "Reason": Equal("full foobar reason"), + "Message": Equal("full foobar message"), + }), + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(druidv1alpha1.ConditionTypeDeltaSnapshotBackupReady), + "Status": Equal(druidv1alpha1.ConditionUnknown), + "LastTransitionTime": Equal(metav1.NewTime(timeBefore)), + "LastUpdateTime": Equal(metav1.NewTime(timeNow)), + "Reason": Equal("delta foobar reason"), + "Message": Equal("delta foobar message"), }), MatchFields(IgnoreExtras, Fields{ "Type": Equal(druidv1alpha1.ConditionTypeDataVolumesReady), diff --git a/internal/utils/miscellaneous.go b/internal/utils/miscellaneous.go index b116019de..f5a0de839 100644 --- a/internal/utils/miscellaneous.go +++ b/internal/utils/miscellaneous.go @@ -8,7 +8,9 @@ import ( "fmt" "maps" "strings" + "time" + cron "github.com/robfig/cron/v3" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -76,3 +78,19 @@ func IfConditionOr[T any](condition bool, trueVal, falseVal T) T { func PointerOf[T any](val T) *T { return &val } + +// ComputeScheduleInterval computes the interval between two activations for the given cron schedule. +// Assumes that every cron activation is at equal intervals apart, based on cron schedules such as +// "once every X hours", "once every Y days", "at 1:00pm on every Tuesday", etc. +// TODO: write a new function to accurately compute the previous activation time from the cron schedule +// in order to compute when the previous activation of the cron schedule was supposed to have occurred, +// instead of relying on the assumption that all the cron activations are evenly spaced. +func ComputeScheduleInterval(cronSchedule string) (time.Duration, error) { + schedule, err := cron.ParseStandard(cronSchedule) + if err != nil { + return 0, err + } + nextScheduledTime := schedule.Next(time.Now()) + nextNextScheduledTime := schedule.Next(nextScheduledTime) + return nextNextScheduledTime.Sub(nextScheduledTime), nil +} diff --git a/test/e2e/etcd_backup_test.go b/test/e2e/etcd_backup_test.go index a4a2727bf..c035972aa 100644 --- a/test/e2e/etcd_backup_test.go +++ b/test/e2e/etcd_backup_test.go @@ -225,18 +225,7 @@ func checkEtcdReady(ctx context.Context, cl client.Client, logger logr.Logger, e if len(etcd.Status.Conditions) == 0 { return fmt.Errorf("etcd %s status conditions is empty", etcd.Name) } - - for _, c := range etcd.Status.Conditions { - // skip BackupReady status check if etcd.Spec.Backup.Store is not configured. - if etcd.Spec.Backup.Store == nil && c.Type == v1alpha1.ConditionTypeBackupReady { - continue - } - if c.Status != v1alpha1.ConditionTrue { - return fmt.Errorf("etcd %q status %q condition %s is not True", - etcd.Name, c.Type, c.Status) - } - } - return nil + return checkEtcdConditions(etcd) }, timeout, pollingInterval).Should(BeNil()) logger.Info("etcd is ready") @@ -317,3 +306,26 @@ func purgeEtcdPVCs(ctx context.Context, cl client.Client, etcdName string) { DeleteOptions: delOptions, }))).ShouldNot(HaveOccurred()) } + +func checkEtcdConditions(etcd *v1alpha1.Etcd) error { + backupEnabled := etcd.Spec.Backup.Store != nil + for _, condition := range etcd.Status.Conditions { + // determine if backup conditions need to be checked + // skip BackupReady, FullSnapshotBackupReady & DeltaSnapshotBackupReady status checks if etcd.Spec.Backup.Store is not configured. + if !backupEnabled && isBackupCondition(condition.Type) { + continue + } + // return an error if any condition is not true + if condition.Status != v1alpha1.ConditionTrue { + return fmt.Errorf("etcd %q has a %q condition that is not True: status %s", etcd.Name, condition.Type, condition.Status) + } + } + return nil +} + +// isBackupCondition checks if the condition type refers to backups health +func isBackupCondition(conditionType v1alpha1.ConditionType) bool { + return conditionType == v1alpha1.ConditionTypeBackupReady || + conditionType == v1alpha1.ConditionTypeFullSnapshotBackupReady || + conditionType == v1alpha1.ConditionTypeDeltaSnapshotBackupReady +} diff --git a/test/e2e/etcd_multi_node_test.go b/test/e2e/etcd_multi_node_test.go index 0e9d3f629..5ac0a5296 100644 --- a/test/e2e/etcd_multi_node_test.go +++ b/test/e2e/etcd_multi_node_test.go @@ -388,19 +388,6 @@ func hibernateAndCheckEtcd(ctx context.Context, cl client.Client, logger logr.Lo if etcd.Status.Ready != nil && *etcd.Status.Ready != true { return fmt.Errorf("etcd %s is not ready", etcd.Name) } - - // TODO: uncomment me once scale down is supported, - // currently ClusterSize is not updated while scaling down. - // - // if etcd.Status.ClusterSize == nil { - // return fmt.Errorf("etcd %s cluster size is empty", etcd.Name) - // } - // - // if *etcd.Status.ClusterSize != 0 { - // return fmt.Errorf("etcd %q cluster size is %d, but expected to be 0", - // etcdName, *etcd.Status.ClusterSize) - // } - if etcd.Status.ReadyReplicas != 0 { return fmt.Errorf("etcd readyReplicas is %d, but expected to be 0", etcd.Status.ReadyReplicas) } diff --git a/test/utils/etcd.go b/test/utils/etcd.go index 4d5757aa7..b779b11e3 100644 --- a/test/utils/etcd.go +++ b/test/utils/etcd.go @@ -133,7 +133,6 @@ func (eb *EtcdBuilder) WithReadyStatus() *EtcdBuilder { ReadyReplicas: eb.etcd.Spec.Replicas, Replicas: eb.etcd.Spec.Replicas, CurrentReplicas: eb.etcd.Spec.Replicas, - UpdatedReplicas: eb.etcd.Spec.Replicas, Ready: pointer.Bool(true), Members: members, Conditions: []druidv1alpha1.Condition{