From cc5919273b62320034a13c3d75902d29e94fd499 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Thu, 24 Oct 2024 21:11:44 +0300 Subject: [PATCH] feat: reset machine when it's removed from Omni The reset removes Talos from the machine completely. Fixes: https://github.com/siderolabs/omni/issues/419 Signed-off-by: Artem Chernyshev --- .../omni/controllers/omni/machine_teardown.go | 194 ++++++++++++++++++ internal/backend/runtime/omni/omni.go | 1 + internal/pkg/siderolink/manager.go | 10 +- 3 files changed, 196 insertions(+), 9 deletions(-) create mode 100644 internal/backend/runtime/omni/controllers/omni/machine_teardown.go diff --git a/internal/backend/runtime/omni/controllers/omni/machine_teardown.go b/internal/backend/runtime/omni/controllers/omni/machine_teardown.go new file mode 100644 index 00000000..0173fce0 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/machine_teardown.go @@ -0,0 +1,194 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package omni + +import ( + "context" + "fmt" + "time" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/controller/generic" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" + "github.com/siderolabs/gen/optional" + "github.com/siderolabs/talos/pkg/machinery/client" + "go.uber.org/zap" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/talos" +) + +// MachineTeardownControllerName is the name of the MachineTeardownController. +const MachineTeardownControllerName = "MachineTeardownController" + +// MachineTeardownController attempts to wipe Talos from a machine when its MachineStatus is being torn down.
+type MachineTeardownController struct {
+	generic.NamedController
+}
+
+// NewMachineTeardownController creates a MachineTeardownController named MachineTeardownControllerName.
+func NewMachineTeardownController() *MachineTeardownController {
+	named := generic.NamedController{
+		ControllerName: MachineTeardownControllerName,
+	}
+
+	return &MachineTeardownController{NamedController: named}
+}
+
+// Settings implements controller.QController interface: MachineStatus is the primary input, TalosConfig a mapped one.
+func (ctrl *MachineTeardownController) Settings() controller.QSettings {
+	primary := controller.Input{
+		Namespace: resources.DefaultNamespace,
+		Type:      omni.MachineStatusType,
+		Kind:      controller.InputQPrimary,
+	}
+	mapped := controller.Input{
+		Namespace: resources.DefaultNamespace,
+		Type:      omni.TalosConfigType,
+		Kind:      controller.InputQMapped,
+	}
+
+	return controller.QSettings{
+		Inputs:      []controller.Input{primary, mapped},
+		Concurrency: optional.Some[uint](4),
+	}
+}
+
+// MapInput implements controller.QController interface: a TalosConfig maps to nothing, other types are an error.
+func (ctrl *MachineTeardownController) MapInput(_ context.Context, _ *zap.Logger, _ controller.QRuntime, ptr resource.Pointer) ([]resource.Pointer, error) {
+	if ptr.Type() != omni.TalosConfigType {
+		return nil, fmt.Errorf("unexpected resource type %q", ptr.Type())
+	}
+
+	return nil, nil
+}
+
+// Reconcile implements controller.QController interface.
+func (ctrl *MachineTeardownController) Reconcile(ctx context.Context, logger *zap.Logger, r controller.QRuntime, ptr resource.Pointer) error {
+	machineStatus, err := safe.ReaderGetByID[*omni.MachineStatus](ctx, r, ptr.ID())
+	if err != nil {
+		if state.IsNotFoundError(err) {
+			return nil
+		}
+
+		return err
+	}
+
+	// MachineStatus is tearing down: attempt the wipe, then drop our finalizer
+	if machineStatus.Metadata().Phase() == resource.PhaseTearingDown {
+		if err := ctrl.resetMachine(ctx, r, machineStatus, logger); err != nil {
+			return err
+		}
+
+		return r.RemoveFinalizer(ctx, machineStatus.Metadata(), ctrl.Name())
+	}
+
+	if !machineStatus.Metadata().Finalizers().Has(ctrl.Name()) {
+		return r.AddFinalizer(ctx, machineStatus.Metadata(), ctrl.Name())
+	}
+
+	return nil
+}
+
+// resetMachine makes one attempt, bounded by a 10s timeout, to reset a machine that reports a system disk:
+// only a client creation error is returned, disk check and reset failures are logged and ignored.
+func (ctrl *MachineTeardownController) resetMachine(ctx context.Context, r controller.QRuntime,
+	machineStatus *omni.MachineStatus, logger *zap.Logger,
+) error {
+	ctx, cancel := context.WithTimeout(ctx, time.Second*10)
+	defer cancel()
+
+	c, err := ctrl.getClient(ctx, r, machineStatus)
+	if err != nil {
+		return err
+	}
+
+	defer c.Close() //nolint:errcheck // a close error is not actionable for a best-effort wipe
+
+	disks, err := c.Disks(ctx)
+	if err != nil {
+		logger.Warn("machine wipe check failed", zap.Error(err))
+
+		return nil
+	}
+
+	var installed bool
+
+	for _, m := range disks.Messages {
+		for _, d := range m.Disks {
+			if d.SystemDisk {
+				installed = true
+				break
+			}
+		}
+	}
+
+	if !installed {
+		logger.Info("skipping machine wipe as Talos is not installed")
+
+		return nil
+	}
+
+	// try to wipe the machine without any attempts to retry it
+	if err = c.Reset(ctx, false, false); err != nil {
+		logger.Warn("machine wipe failed", zap.Error(err))
+
+		return nil
+	}
+
+	logger.Info("wiped Talos on the machine")
+
+	return nil
+}
+
+// getClient creates a Talos client for the machine management address: it uses the cluster talosconfig when the
+// machine has a cluster label and that config exists, and falls back to insecureTLSConfig otherwise.
+func (ctrl *MachineTeardownController) getClient(
+	ctx context.Context,
+	r controller.QRuntime,
+	machineStatus *omni.MachineStatus,
+) (*client.Client, error) {
+	address := machineStatus.TypedSpec().Value.ManagementAddress
+	opts := talos.GetSocketOptions(address)
+
+	var talosConfig *omni.TalosConfig
+
+	if clusterName, ok := machineStatus.Metadata().Labels().Get(omni.LabelCluster); ok {
+		var err error
+
+		talosConfig, err = safe.ReaderGet[*omni.TalosConfig](ctx, r, omni.NewTalosConfig(resources.DefaultNamespace, clusterName).Metadata())
+		if err != nil && !state.IsNotFoundError(err) {
+			return nil, fmt.Errorf("cluster '%s' failed to get talosconfig: %w", clusterName, err)
+		}
+	}
+
+	if talosConfig == nil {
+		return client.New(ctx,
+			append(
+				opts,
+				client.WithTLSConfig(insecureTLSConfig),
+				client.WithEndpoints(address),
+			)...)
+	}
+
+	var endpoints []string
+
+	if opts == nil {
+		endpoints = []string{address}
+	}
+
+	opts = append(opts, client.WithConfig(omni.NewTalosClientConfig(talosConfig, endpoints...)))
+
+	result, err := client.New(ctx, opts...)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create client to machine '%s': %w", machineStatus.Metadata().ID(), err)
+	}
+
+	return result, nil
+}
diff --git a/internal/backend/runtime/omni/omni.go b/internal/backend/runtime/omni/omni.go
index 9b44729d..1ec91975 100644
--- a/internal/backend/runtime/omni/omni.go
+++ b/internal/backend/runtime/omni/omni.go
@@ -254,6 +254,7 @@ func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workl
 		omnictrl.NewLabelsExtractorController[*omni.MachineStatus](),
 		omnictrl.NewMachineRequestSetStatusController(),
 		omnictrl.NewClusterMachineRequestStatusController(),
+		omnictrl.NewMachineTeardownController(),
 	}
 
 	if config.Config.Auth.SAML.Enabled {
diff --git a/internal/pkg/siderolink/manager.go b/internal/pkg/siderolink/manager.go
index d99e86c7..699e42b6 100644
--- a/internal/pkg/siderolink/manager.go
+++ b/internal/pkg/siderolink/manager.go
@@ -586,15 +586,7 @@ func (manager *Manager) cleanupDestroyedLinks(ctx context.Context) error {
 	for {
 		select {
 		case event := <-events:
-			//nolint:exhaustive
-			switch event.Type {
-			case state.Updated:
-				
if event.Resource.Metadata().Phase() != resource.PhaseTearingDown { - break - } - - fallthrough - case state.Destroyed: + if event.Type == state.Destroyed { link, ok := event.Resource.(*siderolink.Link) if !ok { return fmt.Errorf("failed to cast resource to siderolink.Link type")