Skip to content

Commit

Permalink
feat: reset machine when it's removed from Omni
Browse files Browse the repository at this point in the history
The reset removes Talos from the machine completely.
Fixes: #419

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Oct 30, 2024
1 parent 900987b commit cc59192
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 9 deletions.
194 changes: 194 additions & 0 deletions internal/backend/runtime/omni/controllers/omni/machine_teardown.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// Copyright (c) 2024 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

package omni

import (
"context"
"fmt"
"time"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/controller/generic"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"github.com/siderolabs/talos/pkg/machinery/client"
"go.uber.org/zap"

"github.com/siderolabs/omni/client/pkg/omni/resources"
"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
"github.com/siderolabs/omni/internal/backend/runtime/talos"
)

// MachineTeardownControllerName is the name of the MachineTeardownController.
const MachineTeardownControllerName = "MachineTeardownController"

// MachineTeardownController processes additional teardown steps for a machine leaving a machine set.
type MachineTeardownController struct {
generic.NamedController
}

// NewMachineTeardownController initializes MachineTeardownController.
func NewMachineTeardownController() *MachineTeardownController {
return &MachineTeardownController{
NamedController: generic.NamedController{
ControllerName: MachineTeardownControllerName,
},
}
}

// Settings implements controller.QController interface.
func (ctrl *MachineTeardownController) Settings() controller.QSettings {
return controller.QSettings{
Inputs: []controller.Input{
{
Namespace: resources.DefaultNamespace,
Type: omni.MachineStatusType,
Kind: controller.InputQPrimary,
},
{
Namespace: resources.DefaultNamespace,
Type: omni.TalosConfigType,
Kind: controller.InputQMapped,
},
},
Concurrency: optional.Some[uint](4),
}
}

// MapInput implements controller.QController interface.
func (ctrl *MachineTeardownController) MapInput(_ context.Context, _ *zap.Logger, _ controller.QRuntime, ptr resource.Pointer) ([]resource.Pointer, error) {
if ptr.Type() == omni.TalosConfigType {
return nil, nil
}

return nil, fmt.Errorf("unexpected resource type %q", ptr.Type())
}

// Reconcile implements controller.QController interface.
func (ctrl *MachineTeardownController) Reconcile(ctx context.Context, logger *zap.Logger, r controller.QRuntime, ptr resource.Pointer) error {
machineStatus, err := safe.ReaderGetByID[*omni.MachineStatus](ctx, r, ptr.ID())
if err != nil {
if state.IsNotFoundError(err) {
return nil
}

return err
}

if machineStatus.Metadata().Phase() == resource.PhaseTearingDown {
if err := ctrl.resetMachine(ctx, r, machineStatus, logger); err != nil {
return err
}

return r.RemoveFinalizer(ctx, machineStatus.Metadata(), ctrl.Name())
}

if !machineStatus.Metadata().Finalizers().Has(ctrl.Name()) {
return r.AddFinalizer(ctx, machineStatus.Metadata(), ctrl.Name())
}

return nil
}

func (ctrl *MachineTeardownController) resetMachine(ctx context.Context, r controller.QRuntime,
machineStatus *omni.MachineStatus, logger *zap.Logger,
) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*10)
defer cancel()

client, err := ctrl.getClient(ctx, r, machineStatus)
if err != nil {
return err
}

disks, err := client.Disks(ctx)
if err != nil {
logger.Warn("machine wipe check failed", zap.Error(err))

return nil
}

var installed bool

for _, m := range disks.Messages {
for _, d := range m.Disks {
if d.SystemDisk {
installed = true

break
}
}
}

if !installed {
logger.Info("skipping machine wipe as Talos is not installed")

return nil
}

// try to wipe the machine without any attempts to retry it
err = client.Reset(ctx, false, false)
if err != nil {
logger.Warn("machine wipe failed", zap.Error(err))

return nil
}

logger.Info("wiped Talos on the machine")

return nil
}

func (ctrl *MachineTeardownController) getClient(
ctx context.Context,
r controller.QRuntime,
machineStatus *omni.MachineStatus,
) (*client.Client, error) {
address := machineStatus.TypedSpec().Value.ManagementAddress
opts := talos.GetSocketOptions(address)

clusterName, ok := machineStatus.Metadata().Labels().Get(omni.LabelCluster)
if !ok {
return client.New(ctx,
append(
opts,
client.WithTLSConfig(insecureTLSConfig),
client.WithEndpoints(address),
)...)
}

talosConfig, err := safe.ReaderGet[*omni.TalosConfig](ctx, r, omni.NewTalosConfig(resources.DefaultNamespace, clusterName).Metadata())
if err != nil && !state.IsNotFoundError(err) {
return nil, fmt.Errorf("cluster '%s' failed to get talosconfig: %w", clusterName, err)
}

if talosConfig == nil {
return client.New(ctx,
append(
opts,
client.WithTLSConfig(insecureTLSConfig),
client.WithEndpoints(address),
)...)
}

var endpoints []string

if opts == nil {
endpoints = []string{address}
}

config := omni.NewTalosClientConfig(talosConfig, endpoints...)
opts = append(opts, client.WithConfig(config))

result, err := client.New(ctx, opts...)
if err != nil {
return nil, fmt.Errorf("failed to create client to machine '%s': %w", machineStatus.Metadata().ID(), err)
}

return result, nil
}
1 change: 1 addition & 0 deletions internal/backend/runtime/omni/omni.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workl
omnictrl.NewLabelsExtractorController[*omni.MachineStatus](),
omnictrl.NewMachineRequestSetStatusController(),
omnictrl.NewClusterMachineRequestStatusController(),
omnictrl.NewMachineTeardownController(),
}

if config.Config.Auth.SAML.Enabled {
Expand Down
10 changes: 1 addition & 9 deletions internal/pkg/siderolink/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,15 +586,7 @@ func (manager *Manager) cleanupDestroyedLinks(ctx context.Context) error {
for {
select {
case event := <-events:
//nolint:exhaustive
switch event.Type {
case state.Updated:
if event.Resource.Metadata().Phase() != resource.PhaseTearingDown {
break
}

fallthrough
case state.Destroyed:
if event.Type == state.Destroyed {
link, ok := event.Resource.(*siderolink.Link)
if !ok {
return fmt.Errorf("failed to cast resource to siderolink.Link type")
Expand Down

0 comments on commit cc59192

Please sign in to comment.