From ee7c71c89e29135b33199cd2238cb9d5135b1e93 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Mon, 6 May 2024 12:58:46 +0200 Subject: [PATCH] mgr: fix UpdateActiveMgrLabel to retry label update on failure fix UpdateActiveMgrLabel to retry label update on failure. Previously, the function always returned the current manager, even if update failed, leading to inconsistent labels. This fix ensures retries on failures. Fixes: https://github.com/rook/rook/issues/13601 Signed-off-by: Redouane Kachach --- cmd/rook/ceph/mgr.go | 4 ++-- pkg/operator/ceph/cluster/mgr/mgr.go | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cmd/rook/ceph/mgr.go b/cmd/rook/ceph/mgr.go index 2ace8ed385aa..839555088886 100644 --- a/cmd/rook/ceph/mgr.go +++ b/cmd/rook/ceph/mgr.go @@ -96,9 +96,9 @@ func runMgrSidecar(cmd *cobra.Command, args []string) error { clusterInfo.CephVersion = *version m := mgr.New(context, &clusterInfo, clusterSpec, "") - prevActiveMgr := "unknown" + activeMgr := "unknown" for { - prevActiveMgr, err = m.UpdateActiveMgrLabel(daemonName, prevActiveMgr) + activeMgr, err = m.UpdateActiveMgrLabel(daemonName, activeMgr) if err != nil { logger.Errorf("failed to reconcile services. %v", err) } else { diff --git a/pkg/operator/ceph/cluster/mgr/mgr.go b/pkg/operator/ceph/cluster/mgr/mgr.go index 3f8c5b4dceb0..f0a887bc62b7 100644 --- a/pkg/operator/ceph/cluster/mgr/mgr.go +++ b/pkg/operator/ceph/cluster/mgr/mgr.go @@ -271,6 +271,10 @@ func (c *Cluster) UpdateActiveMgrLabel(daemonNameToUpdate string, prevActiveMgr return "", err // force mrg_role update in the next call } + // Normally, there should only be one mgr pod with the specific name daemonNameToUpdate. However, + // during transitions, there might be additional mgr pods shutting down. To handle this, the code + // updates the label mgrRoleLabelName on all mgr pods. If this update fails, the system rolls back + // the currently active manager (currActiveMgr). This way the next call will retry the update. for i, pod := range pods.Items { labels := pod.GetLabels() @@ -288,7 +292,10 @@ func (c *Cluster) UpdateActiveMgrLabel(daemonNameToUpdate string, prevActiveMgr pod.SetLabels(labels) _, err = c.context.Clientset.CoreV1().Pods(c.clusterInfo.Namespace).Update(c.clusterInfo.Context, &pods.Items[i], metav1.UpdateOptions{}) if err != nil { - logger.Infof("cannot update the active mgr pod %q. err=%v", pods.Items[i].Name, err) + // In case of failure we report as 'active manager' the previous value, this way + // we force refreshing mgrRoleLabelName label next time this function is called + currActiveMgr = prevActiveMgr + logger.Warningf("cannot update the active mgr pod %q. err=%v", pods.Items[i].Name, err) } } }