Skip to content

Commit

Permalink
Merge pull request rook#14160 from rkachach/fix_issue_13601
Browse files Browse the repository at this point in the history
mgr: fix UpdateActiveMgrLabel to retry label update on failure
  • Loading branch information
travisn authored May 6, 2024
2 parents 1af97d0 + ee7c71c commit 905c824
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
4 changes: 2 additions & 2 deletions cmd/rook/ceph/mgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ func runMgrSidecar(cmd *cobra.Command, args []string) error {
clusterInfo.CephVersion = *version

m := mgr.New(context, &clusterInfo, clusterSpec, "")
prevActiveMgr := "unknown"
activeMgr := "unknown"
for {
prevActiveMgr, err = m.UpdateActiveMgrLabel(daemonName, prevActiveMgr)
activeMgr, err = m.UpdateActiveMgrLabel(daemonName, activeMgr)
if err != nil {
logger.Errorf("failed to reconcile services. %v", err)
} else {
Expand Down
9 changes: 8 additions & 1 deletion pkg/operator/ceph/cluster/mgr/mgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,10 @@ func (c *Cluster) UpdateActiveMgrLabel(daemonNameToUpdate string, prevActiveMgr
return "", err // force mgr_role update in the next call
}

// Normally, there should only be one mgr pod with the specific name daemonNameToUpdate. However,
// during transitions, there might be additional mgr pods shutting down. To handle this, the code
// updates the label mgrRoleLabelName on all mgr pods. If an update fails, currActiveMgr is reset
// to prevActiveMgr so that the next call retries the label update.
for i, pod := range pods.Items {

labels := pod.GetLabels()
Expand All @@ -288,7 +292,10 @@ func (c *Cluster) UpdateActiveMgrLabel(daemonNameToUpdate string, prevActiveMgr
pod.SetLabels(labels)
_, err = c.context.Clientset.CoreV1().Pods(c.clusterInfo.Namespace).Update(c.clusterInfo.Context, &pods.Items[i], metav1.UpdateOptions{})
if err != nil {
logger.Infof("cannot update the active mgr pod %q. err=%v", pods.Items[i].Name, err)
// On failure, report the previous value as the active manager so that the
// mgrRoleLabelName label is refreshed the next time this function is called.
currActiveMgr = prevActiveMgr
logger.Warningf("cannot update the active mgr pod %q. err=%v", pods.Items[i].Name, err)
}
}
}
Expand Down

0 comments on commit 905c824

Please sign in to comment.