Skip to content

Commit

Permalink
fix: prevent link and clustermachine deletion from getting stuck
Browse files Browse the repository at this point in the history
This PR contains fixes for the following issues:
- When a `Link` resource was deleted, `MachineCleanupController` was unable to remove finalizers on the `MachineSetNode` resource, as it did not pass the correct owner `""` when calling `Teardown`.
- `ClusterMachineStatusController` not handling the cases where the matching `Machine` might be missing.
- `MachineSetController` not handling the cases where the `Machine` resource was missing for any of the `MachineSetNode`s that were tearing down.

The latter two fixes prevent clusters from getting stuck if they are in a state where there is a missing machine in their machine sets.

Closes #100, closes #97.

Signed-off-by: Utku Ozdemir <[email protected]>
  • Loading branch information
utkuozdemir committed Apr 3, 2024
1 parent ae85293 commit 5dc2eaa
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController {
},
TransformFunc: func(ctx context.Context, r controller.Reader, _ *zap.Logger, clusterMachine *omni.ClusterMachine, clusterMachineStatus *omni.ClusterMachineStatus) error {
machine, err := safe.ReaderGet[*omni.Machine](ctx, r, resource.NewMetadata(resources.DefaultNamespace, omni.MachineType, clusterMachine.Metadata().ID(), resource.VersionUndefined))
if err != nil {
if err != nil && !state.IsNotFoundError(err) {
return err
}

Expand All @@ -51,7 +51,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController {
omni.LabelMachineSet,
)

if machine.TypedSpec().Value.Connected {
if machine != nil && machine.TypedSpec().Value.Connected {
clusterMachineStatus.Metadata().Labels().Set(omni.MachineStatusLabelConnected, "")
} else {
clusterMachineStatus.Metadata().Labels().Delete(omni.MachineStatusLabelConnected)
Expand Down Expand Up @@ -140,11 +140,12 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController {

_, isControlPlaneNode := clusterMachineStatus.Metadata().Labels().Get(omni.LabelControlPlaneRole)
if (status.GetStage() == machineapi.MachineStatusEvent_BOOTING || status.GetStage() == machineapi.MachineStatusEvent_RUNNING) && isControlPlaneNode {
cmsVal.ApidAvailable = machine.TypedSpec().Value.Connected
cmsVal.ApidAvailable = machine != nil && machine.TypedSpec().Value.Connected
}

// should we also mark it as not ready when the clustermachine is tearing down (?)
cmsVal.Ready = status.GetStatus().GetReady() && machine.TypedSpec().Value.Connected
cmsVal.Ready = status.GetStatus().GetReady() &&
machine != nil && machine.TypedSpec().Value.Connected

if clusterMachine.Metadata().Phase() == resource.PhaseTearingDown {
cmsVal.Stage = specs.ClusterMachineStatusSpec_DESTROYING
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"go.uber.org/zap"

"github.com/siderolabs/omni/client/pkg/omni/resources"
Expand Down Expand Up @@ -92,6 +93,10 @@ func UpdateFinalizers(ctx context.Context, r controller.ReaderWriter, rc *Reconc
for _, clusterMachine := range rc.GetClusterMachines() {
if clusterMachine.Metadata().Phase() == resource.PhaseRunning && !clusterMachine.Metadata().Finalizers().Has(ControllerName) {
if err := r.AddFinalizer(ctx, omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), ControllerName); err != nil {
if state.IsNotFoundError(err) {
continue
}

return err
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func (h *sameIDHandler[I, O]) FinalizerRemoval(ctx context.Context, r controller
return nil
}

ready, err := r.Teardown(ctx, md)
ready, err := r.Teardown(ctx, md, controller.WithOwner(""))
if err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright (c) 2024 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

package omni_test

import (
"context"
"testing"
"time"

"github.com/cosi-project/runtime/pkg/resource/rtestutils"
"github.com/cosi-project/runtime/pkg/state"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"

"github.com/siderolabs/omni/client/pkg/omni/resources"
"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni"
)

type MachineCleanupSuite struct {
OmniSuite
}

func (suite *MachineCleanupSuite) TestCleanup() {
require := suite.Require()

suite.ctx, suite.ctxCancel = context.WithTimeout(suite.ctx, time.Second*10)

suite.startRuntime()

controller := omnictrl.NewMachineCleanupController()

require.NoError(suite.runtime.RegisterController(controller))

machineID := "machine-cleanup-test-machine"

machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machine-cleanup-test-machine-set")
machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, machineID, machineSet)
machine := omni.NewMachine(resources.DefaultNamespace, machineID)

machine.Metadata().Finalizers().Add(controller.Name())

require.NoError(suite.state.Create(suite.ctx, machineSetNode))
require.NoError(suite.state.Create(suite.ctx, machine))

_, err := suite.state.Teardown(suite.ctx, machine.Metadata())

require.NoError(err)

assertNoResource[*omni.MachineSetNode](&suite.OmniSuite, machineSetNode)

assertResource[*omni.Machine](&suite.OmniSuite, machine.Metadata(), func(r *omni.Machine, assertion *assert.Assertions) {
assertion.Empty(r.Metadata().Finalizers())
})

require.NoError(suite.state.Destroy(suite.ctx, machine.Metadata()))
}

func (suite *MachineCleanupSuite) TestSkipMachineSetNodeWithOwner() {
require := suite.Require()

suite.ctx, suite.ctxCancel = context.WithTimeout(suite.ctx, time.Second*10)

suite.startRuntime()

controller := omnictrl.NewMachineCleanupController()

require.NoError(suite.runtime.RegisterController(controller))

machineID := "machine-cleanup-skip-test-machine"

machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machine-cleanup-skip-test-machine-set")
machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, machineID, machineSet)
machine := omni.NewMachine(resources.DefaultNamespace, machineID)

machine.Metadata().Finalizers().Add(controller.Name())
require.NoError(machineSetNode.Metadata().SetOwner("some-owner"))

require.NoError(suite.state.Create(suite.ctx, machineSetNode, state.WithCreateOwner("some-owner")))
require.NoError(suite.state.Create(suite.ctx, machine))

rtestutils.Destroy[*omni.Machine](suite.ctx, suite.T(), suite.state, []string{machine.Metadata().ID()})

// MachineSetNode should still be around, as it is owned by a controller - CleanupController should skip it
assertResource[*omni.MachineSetNode](&suite.OmniSuite, machine.Metadata(), func(*omni.MachineSetNode, *assert.Assertions) {})
}

func TestMachineCleanupSuite(t *testing.T) {
suite.Run(t, new(MachineCleanupSuite))
}

0 comments on commit 5dc2eaa

Please sign in to comment.