From 161ff2861593c55c30d85ff9e6e99d46967ef7bc Mon Sep 17 00:00:00 2001
From: Malay Kumar Parida
Date: Thu, 23 Nov 2023 21:04:24 +0530
Subject: [PATCH] Ensure resource profile changes are applied correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Detect a resource profile change by comparing sc.Spec.ResourceProfile with sc.Status.LastAppliedResourceProfile. If they differ, a resource profile change is in progress.
• Set the ContinueUpgradeAfterChecksEvenIfNotHealthy flag on the CephCluster CR to false while a resource profile change is in progress.
• While the change is being applied, mark the StorageCluster phase as Progressing. The StorageCluster conditions will show "New resource profile is being applied".
• Keep checking whether any pod with the new resource profile label has been in Pending state for more than 2 minutes. If so, assume the resource profile change has failed, mark the StorageCluster phase as Error, and the StorageCluster conditions will show "New resource profile failed to apply, please revert to the last working resource profile".
• Check that the Ceph daemons (mgr, mon, osd, mds, rgw) each have the expected number of running pods, where each pod carries the current resource profile label.
• If the above checks succeed, the new resource profile has been applied successfully. Set the resource profile in the status and mark the StorageCluster phase as Ready.

Signed-off-by: Malay Kumar Parida
---
 api/v1/storagecluster_types.go                |  3 +
 controllers/defaults/defaults.go              |  2 +
 controllers/storagecluster/cephcluster.go     | 15 ++-
 controllers/storagecluster/cephfilesystem.go  |  1 +
 .../storagecluster/cephobjectstores.go        |  1 +
 controllers/storagecluster/reconcile.go       | 20 ++++
 controllers/storagecluster/resourceprofile.go | 96 +++++++++++++++++++
 controllers/util/k8sutil.go                   | 23 +++++
 8 files changed, 160 insertions(+), 1 deletion(-)
 create mode 100644 controllers/storagecluster/resourceprofile.go

diff --git a/api/v1/storagecluster_types.go b/api/v1/storagecluster_types.go index 8405e9a064..924bf1751a 100644 --- a/api/v1/storagecluster_types.go +++ b/api/v1/storagecluster_types.go @@ -496,6 +496,9 @@ type StorageClusterStatus struct { // +optional FailureDomainValues []string `json:"failureDomainValues,omitempty"` + // LastAppliedResourceProfile is the resource profile that was last applied successfully and is currently in use. + LastAppliedResourceProfile string `json:"lastAppliedResourceProfile,omitempty"` + // StorageProviderEndpoint holds endpoint info on Provider cluster which is required // for consumer to establish connection with the storage providing cluster. 
StorageProviderEndpoint string `json:"storageProviderEndpoint,omitempty"` diff --git a/controllers/defaults/defaults.go b/controllers/defaults/defaults.go index 123b4867bb..6a6842d03e 100644 --- a/controllers/defaults/defaults.go +++ b/controllers/defaults/defaults.go @@ -11,6 +11,8 @@ const ( // RackTopologyKey is the node label used to distribute storage nodes // when there are not enough AZs presnet across the nodes RackTopologyKey = "topology.rook.io/rack" + // ODFResourceProfileKey is the label key used to identify the resource profile of the pod + ODFResourceProfileKey = "odf-resource-profile" // KubeMajorTopologySpreadConstraints is the minimum major kube version to support TSC // used along with KubeMinorTSC for version comparison KubeMajorTopologySpreadConstraints = "1" diff --git a/controllers/storagecluster/cephcluster.go b/controllers/storagecluster/cephcluster.go index 51a7181321..0da359b8fe 100644 --- a/controllers/storagecluster/cephcluster.go +++ b/controllers/storagecluster/cephcluster.go @@ -461,9 +461,13 @@ func newCephCluster(sc *ocsv1.StorageCluster, cephImage string, nodeCount int, s rookCephv1.KeyOSD: systemNodeCritical, }, Resources: newCephDaemonResources(sc), - ContinueUpgradeAfterChecksEvenIfNotHealthy: true, + // if resource profile change is in progress, then set this flag to false + ContinueUpgradeAfterChecksEvenIfNotHealthy: sc.Spec.ResourceProfile == sc.Status.LastAppliedResourceProfile, LogCollector: logCollector, Labels: rookCephv1.LabelsSpec{ + rookCephv1.KeyMgr: rookCephv1.Labels{defaults.ODFResourceProfileKey: sc.Spec.ResourceProfile}, + rookCephv1.KeyMon: rookCephv1.Labels{defaults.ODFResourceProfileKey: sc.Spec.ResourceProfile}, + rookCephv1.KeyOSD: rookCephv1.Labels{defaults.ODFResourceProfileKey: sc.Spec.ResourceProfile}, rookCephv1.KeyMonitoring: getCephClusterMonitoringLabels(*sc), }, CSI: rookCephv1.CSIDriverSpec{ @@ -1303,3 +1307,12 @@ func isBluestore(store rookCephv1.OSDStore) bool { return false } + +func (r *StorageClusterReconciler) getOsdCount(sc *ocsv1.StorageCluster) int { + storageClassDeviceSets := newStorageClassDeviceSets(sc, r.serverVersion) + osdCount := 0 + for _, ds := range storageClassDeviceSets { + osdCount += ds.Count + } + return osdCount +} diff --git a/controllers/storagecluster/cephfilesystem.go b/controllers/storagecluster/cephfilesystem.go index fefd9af9f5..970dc21c0b 100644 --- a/controllers/storagecluster/cephfilesystem.go +++ b/controllers/storagecluster/cephfilesystem.go @@ -41,6 +41,7 @@ func (r *StorageClusterReconciler) newCephFilesystemInstances(initStorageCluster Resources: defaults.GetProfileDaemonResources("mds", initStorageCluster), // set PriorityClassName for the MDS pods PriorityClassName: openshiftUserCritical, + Labels: cephv1.Labels{defaults.ODFResourceProfileKey: initStorageCluster.Spec.ResourceProfile}, }, }, } diff --git a/controllers/storagecluster/cephobjectstores.go b/controllers/storagecluster/cephobjectstores.go index 8086a9b6e8..231fdbcf39 100644 --- a/controllers/storagecluster/cephobjectstores.go +++ b/controllers/storagecluster/cephobjectstores.go @@ -187,6 +187,7 @@ func (r *StorageClusterReconciler) newCephObjectStoreInstances(initData *ocsv1.S Resources: defaults.GetProfileDaemonResources("rgw", initData), // set PriorityClassName for the rgw pods PriorityClassName: openshiftUserCritical, + Labels: cephv1.Labels{defaults.ODFResourceProfileKey: initData.Spec.ResourceProfile}, }, }, }, diff --git a/controllers/storagecluster/reconcile.go b/controllers/storagecluster/reconcile.go index 
da7dc3869d..8274ba558b 100644 --- a/controllers/storagecluster/reconcile.go +++ b/controllers/storagecluster/reconcile.go @@ -471,6 +471,26 @@ func (r *StorageClusterReconciler) reconcilePhases( return returnRes, nil } } + // Process resource profiles only if the cluster is not external or provider mode or noobaa standalone, and if the resource profile has changed + if !(instance.Spec.ExternalStorage.Enable || instance.Spec.AllowRemoteStorageConsumers || r.IsNoobaaStandalone) && + (instance.Spec.ResourceProfile != instance.Status.LastAppliedResourceProfile) { + err := r.ensureResourceProfileChangeApplied(instance) + if err != nil { + if err == errResourceProfileChangeApplying { + reason := ocsv1.ReconcileFailed + message := err.Error() + statusutil.SetProgressingCondition(&instance.Status.Conditions, reason, message) + instance.Status.Phase = statusutil.PhaseProgressing + return reconcile.Result{Requeue: true}, nil + } else if err == errResourceProfileChangeFailed { + reason := ocsv1.ReconcileFailed + message := err.Error() + statusutil.SetErrorCondition(&instance.Status.Conditions, reason, message) + instance.Status.Phase = statusutil.PhaseError + } + return reconcile.Result{}, err + } + } // All component operators are in a happy state. if r.conditions == nil { r.Log.Info("No component operator reported negatively.") diff --git a/controllers/storagecluster/resourceprofile.go b/controllers/storagecluster/resourceprofile.go new file mode 100644 index 0000000000..b19b862b7e --- /dev/null +++ b/controllers/storagecluster/resourceprofile.go @@ -0,0 +1,96 @@ +package storagecluster + +import ( + "errors" + "time" + + ocsv1 "github.com/red-hat-storage/ocs-operator/v4/api/v1" + "github.com/red-hat-storage/ocs-operator/v4/controllers/defaults" + "github.com/red-hat-storage/ocs-operator/v4/controllers/util" + corev1 "k8s.io/api/core/v1" +) + +const ( + resourceProfileChangeApplyingMessage = "New resource profile is being applied" + resourceProfileChangeFailedMessage = "New resource profile failed to apply, please revert to the last working resource profile" +) + +var ( + errResourceProfileChangeApplying = errors.New(resourceProfileChangeApplyingMessage) + errResourceProfileChangeFailed = errors.New(resourceProfileChangeFailedMessage) +) + +func (r *StorageClusterReconciler) ensureResourceProfileChangeApplied(sc *ocsv1.StorageCluster) error { + currentResourceProfile := sc.Spec.ResourceProfile + lastAppliedResourceProfile := sc.Status.LastAppliedResourceProfile + r.Log.Info("Applying new resource profile", "current", currentResourceProfile, "last", lastAppliedResourceProfile) + + // If any of the pods with the current resource profile label are in pending state for more than 2 minutes, then we assume that the resource profile change has failed as the pods are stuck in pending state + podList, err := util.GetPodsWithLabels(r.ctx, r.Client, sc.Namespace, map[string]string{defaults.ODFResourceProfileKey: currentResourceProfile}) + if err != nil { + return err + } + for _, pod := range podList.Items { + if pod.Status.Phase == corev1.PodPending { + if time.Since(pod.CreationTimestamp.Time) < 2*time.Minute { + return errResourceProfileChangeApplying + } + r.Log.Error(errResourceProfileChangeFailed, "Pod is stuck in pending state", "pod", pod.Name) + return errResourceProfileChangeFailed + } + } + + // Verify if expected number of mgr pods with the current resource profile label are running + if err := r.verifyDaemonWithResourceProfile(sc.Namespace, map[string]string{"app": "rook-ceph-mgr", 
defaults.ODFResourceProfileKey: currentResourceProfile}, getMgrCount(sc)); err != nil { + return err + } + + // Verify if expected number of mon pods with the current resource profile label are running + if err := r.verifyDaemonWithResourceProfile(sc.Namespace, map[string]string{"app": "rook-ceph-mon", defaults.ODFResourceProfileKey: currentResourceProfile}, getMonCount(sc)); err != nil { + return err + } + + // Verify if expected number of osd pods with the current resource profile label are running + if err := r.verifyDaemonWithResourceProfile(sc.Namespace, map[string]string{"app": "rook-ceph-osd", defaults.ODFResourceProfileKey: currentResourceProfile}, r.getOsdCount(sc)); err != nil { + return err + } + + // Verify if expected number of mds pods with the current resource profile label are running + if err := r.verifyDaemonWithResourceProfile(sc.Namespace, map[string]string{"app": "rook-ceph-mds", defaults.ODFResourceProfileKey: currentResourceProfile}, 2*getActiveMetadataServers(sc)); err != nil { + return err + } + + // If rgw is not skipped, verify if expected number of rgw pods with the current resource profile label are running + skiprgw, err := r.PlatformsShouldSkipObjectStore() + if err != nil { + return err + } + if !skiprgw { + if err := r.verifyDaemonWithResourceProfile(sc.Namespace, map[string]string{"app": "rook-ceph-rgw", defaults.ODFResourceProfileKey: currentResourceProfile}, getCephObjectStoreGatewayInstances(sc)); err != nil { + return err + } + } + + // If we are here, then all the ceph daemons have the correct count of pods with the new resource profile + // New resource profile has been applied successfully + sc.Status.LastAppliedResourceProfile = currentResourceProfile + if err = r.Client.Status().Update(r.ctx, sc); err != nil { + r.Log.Error(err, "Error updating status after resource profile change") + return err + } + r.Log.Info("Resource profile change applied successfully", "current", currentResourceProfile, "last", lastAppliedResourceProfile) + return nil +} + +// verifyDaemonWithResourceProfile verifies if a ceph daemon has the expected number of pods running with the given resource profile label +func (r *StorageClusterReconciler) verifyDaemonWithResourceProfile(namespace string, labelSelector map[string]string, count int) error { + podList, err := util.GetPodsWithLabels(r.ctx, r.Client, namespace, labelSelector) + if err != nil { + return err + } + if util.GetCountOfRunningPods(podList) != count { + r.Log.Info("pod count mismatch", "app", labelSelector["app"], "expected", count, "actual", util.GetCountOfRunningPods(podList)) + return errResourceProfileChangeApplying + } + return nil +} diff --git a/controllers/util/k8sutil.go b/controllers/util/k8sutil.go index 7e487aeb02..065c1ec429 100644 --- a/controllers/util/k8sutil.go +++ b/controllers/util/k8sutil.go @@ -9,6 +9,7 @@ import ( "github.com/go-logr/logr" configv1 "github.com/openshift/api/config/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -95,3 +96,25 @@ func RestartPod(ctx context.Context, kubeClient client.Client, logger *logr.Logg } } } + +// GetPodsWithLabels returns all the pods in a namespace that match the given label selector +func GetPodsWithLabels(ctx context.Context, kubeClient client.Client, namespace string, labelSelector map[string]string) (*corev1.PodList, error) { + podList := &corev1.PodList{} + if err := kubeClient.List(ctx, podList, client.InNamespace(namespace), 
&client.ListOptions{ + LabelSelector: labels.SelectorFromSet(labelSelector), + }); err != nil { + return nil, err + } + return podList, nil +} + +// GetCountOfRunningPods returns the count of pods in Running state in the given pod list +func GetCountOfRunningPods(podList *corev1.PodList) int { + count := 0 + for _, pod := range podList.Items { + if pod.Status.Phase == corev1.PodRunning { + count++ + } + } + return count +}
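
Below is a minimal, hypothetical sketch (not part of the patch itself) of how the new GetPodsWithLabels and GetCountOfRunningPods helpers could be exercised with the controller-runtime fake client; this is the same counting pattern verifyDaemonWithResourceProfile relies on. The namespace, pod names, and the "lean" profile value are illustrative assumptions only.

package util_test

import (
	"context"
	"testing"

	"github.com/red-hat-storage/ocs-operator/v4/controllers/defaults"
	"github.com/red-hat-storage/ocs-operator/v4/controllers/util"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// newPod builds a pod carrying the odf-resource-profile label in the given phase.
func newPod(name string, phase corev1.PodPhase) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: "openshift-storage", // illustrative namespace
			Labels: map[string]string{
				"app":                          "rook-ceph-mon",
				defaults.ODFResourceProfileKey: "lean", // illustrative profile value
			},
		},
		Status: corev1.PodStatus{Phase: phase},
	}
}

func TestCountRunningPodsWithProfileLabel(t *testing.T) {
	// Fake client pre-loaded with two Running pods and one Pending pod.
	c := fake.NewClientBuilder().WithObjects(
		newPod("rook-ceph-mon-a", corev1.PodRunning),
		newPod("rook-ceph-mon-b", corev1.PodRunning),
		newPod("rook-ceph-mon-c", corev1.PodPending),
	).Build()

	// List only the pods labeled with the new resource profile.
	podList, err := util.GetPodsWithLabels(context.TODO(), c, "openshift-storage",
		map[string]string{defaults.ODFResourceProfileKey: "lean"})
	if err != nil {
		t.Fatal(err)
	}

	// Only the Running pods count toward the expected daemon count.
	if got := util.GetCountOfRunningPods(podList); got != 2 {
		t.Fatalf("expected 2 running pods, got %d", got)
	}
}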