From c16ccb992355fc1d24eab0782a51230644ba3047 Mon Sep 17 00:00:00 2001 From: Dmitri Fedotov Date: Fri, 9 Aug 2024 16:30:59 +0300 Subject: [PATCH] add discovery state and rework initial power handling --- api/v1alpha1/server_types.go | 3 + api/v1alpha1/zz_generated.deepcopy.go | 2 +- docs/api-reference/api.md | 3 + internal/controller/server_controller.go | 191 +++++++++++------- internal/controller/server_controller_test.go | 42 ++-- 5 files changed, 148 insertions(+), 93 deletions(-) diff --git a/api/v1alpha1/server_types.go b/api/v1alpha1/server_types.go index bcff418..f319fcf 100644 --- a/api/v1alpha1/server_types.go +++ b/api/v1alpha1/server_types.go @@ -114,6 +114,9 @@ const ( // ServerStateInitial indicates that the server is in its initial state. ServerStateInitial ServerState = "Initial" + // ServerStateDiscovery indicates that the server is in its discovery state. + ServerStateDiscovery ServerState = "Discovery" + // ServerStateAvailable indicates that the server is available for use. ServerStateAvailable ServerState = "Available" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 8000b7f..24bd8c0 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -9,7 +9,7 @@ package v1alpha1 import ( corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md index a9d7422..b605de6 100644 --- a/docs/api-reference/api.md +++ b/docs/api-reference/api.md @@ -1909,6 +1909,9 @@ if no boot configuration is specified.

"Available"

ServerStateAvailable indicates that the server is available for use.

+

"Discovery"

+

ServerStateDiscovery indicates that the server is in its discovery state.

+

"Error"

ServerStateError indicates that there is an error with the server.

diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index c803990..9aab354 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -174,11 +174,15 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve // Server state-machine: // // A Server goes through the following stages: -// Initial -> Available -> Reserved -> Tainted -> Available ... +// Initial -> Discovery -> Available -> Reserved -> Tainted -> Available ... // // Initial: // In the initial state we create a ServerBootConfiguration and an Ignition to start the Probe server on the -// Server. This Probe server registers with the managers /registry/{uuid} endpoint it's address, so the reconciler can +// Server. The Server is patched to the state Discovery. +// +// Discovery: +// In the discovery state we expect the Server to come up with the Probe server running. +// This Probe server registers with the managers /registry/{uuid} endpoint it's address, so the reconciler can // fetch the server details from this endpoint. Once completed the Server is patched to the state Available. // // Available: @@ -198,89 +202,125 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve func (r *ServerReconciler) ensureServerStateTransition(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) { switch server.Status.State { case metalv1alpha1.ServerStateInitial: - // apply boot configuration - if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to apply server boot configuration: %w", err) - } - log.V(1).Info("Applied Server boot configuration") + return r.handleInitialState(ctx, log, server) + case metalv1alpha1.ServerStateDiscovery: + return r.handleDiscoveryState(ctx, log, server) + case metalv1alpha1.ServerStateAvailable: + return r.handleAvailableState(ctx, log, server) + case metalv1alpha1.ServerStateReserved: + return r.handleReservedState(ctx, log, server) + default: + return false, nil + } +} - if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready { - log.V(1).Info("Server boot configuration is not ready. Retrying ...") - return true, err - } - log.V(1).Info("Server boot configuration is ready") +func (r *ServerReconciler) handleInitialState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) { + if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to apply server boot configuration: %w", err) + } + log.V(1).Info("Applied Server boot configuration") - if err := r.pxeBootServer(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to boot server: %w", err) - } - log.V(1).Info("Booted Server in PXE") + if err := r.pxeBootServer(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to set PXE boot for server: %w", err) + } + log.V(1).Info("Set PXE Boot for Server") - ready, err := r.extractServerDetailsFromRegistry(ctx, log, server) - if !ready && err == nil { - log.V(1).Info("Server agent did not post info to registry") - return true, nil - } - if err != nil { - log.V(1).Info("Could not get server details from registry.") - return false, err - } - log.V(1).Info("Extracted Server details") + if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateDiscovery); err != nil || modified { + return false, err + } + return false, nil +} - serverBase := server.DeepCopy() - server.Spec.Power = metalv1alpha1.PowerOff - if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil { - return false, fmt.Errorf("failed to update server power state: %w", err) - } - log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff) +func (r *ServerReconciler) handleDiscoveryState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) { + serverBase := server.DeepCopy() + server.Spec.Power = metalv1alpha1.PowerOn + if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil { + return false, fmt.Errorf("failed to update server power state: %w", err) + } + log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOn) - if err := r.ensureServerPowerState(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to ensure server power state: %w", err) - } - log.V(1).Info("Server state set to power off") + if err := r.ensureServerPowerState(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server power state: %w", err) + } + log.V(1).Info("Server state set to power on") - if err := r.invalidateRegistryEntryForServer(log, server); err != nil { - return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err) - } - log.V(1).Info("Removed Server from Registry") + if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready { + log.V(1).Info("Server boot configuration is not ready. Retrying ...") + return true, err + } + log.V(1).Info("Server boot configuration is ready") - log.V(1).Info("Setting Server state set to available") - if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified { - return false, err - } - case metalv1alpha1.ServerStateAvailable: - if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil { - return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err) - } - log.V(1).Info("Ensured initial boot configuration is deleted") + ready, err := r.extractServerDetailsFromRegistry(ctx, log, server) + if !ready && err == nil { + log.V(1).Info("Server agent did not post info to registry") + return true, nil + } + if err != nil { + log.V(1).Info("Could not get server details from registry.") + return false, err + } + log.V(1).Info("Extracted Server details") - if err := r.ensureServerPowerState(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to ensure server power state: %w", err) - } - if err := r.ensureIndicatorLED(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to ensure server indicator led: %w", err) - } - log.V(1).Info("Reconciled available state") - case metalv1alpha1.ServerStateReserved: - if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready { - log.V(1).Info("Server boot configuration is not ready. Retrying ...") - return true, err - } - log.V(1).Info("Server boot configuration is ready") + if err := r.invalidateRegistryEntryForServer(log, server); err != nil { + return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err) + } + log.V(1).Info("Removed Server from Registry") - if err := r.pxeBootServer(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to boot server: %w", err) - } - log.V(1).Info("Booted Server in PXE") + log.V(1).Info("Setting Server state set to available") + if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified { + return false, err + } + return false, nil +} - if err := r.ensureServerPowerState(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to ensure server power state: %w", err) - } +func (r *ServerReconciler) handleAvailableState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) { + serverBase := server.DeepCopy() + server.Spec.Power = metalv1alpha1.PowerOff + if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil { + return false, fmt.Errorf("failed to update server power state: %w", err) + } + log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff) - if err := r.ensureIndicatorLED(ctx, log, server); err != nil { - return false, fmt.Errorf("failed to ensure server indicator led: %w", err) - } - log.V(1).Info("Reconciled reserved state") + if err := r.ensureServerPowerState(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server power state: %w", err) } + log.V(1).Info("Server state set to power off") + + if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil { + return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err) + } + log.V(1).Info("Ensured initial boot configuration is deleted") + + if err := r.ensureServerPowerState(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server power state: %w", err) + } + if err := r.ensureIndicatorLED(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server indicator led: %w", err) + } + log.V(1).Info("Reconciled available state") + return false, nil +} + +func (r *ServerReconciler) handleReservedState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) { + if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready { + log.V(1).Info("Server boot configuration is not ready. Retrying ...") + return true, err + } + log.V(1).Info("Server boot configuration is ready") + + if err := r.pxeBootServer(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to boot server: %w", err) + } + log.V(1).Info("Booted Server in PXE") + + if err := r.ensureServerPowerState(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server power state: %w", err) + } + + if err := r.ensureIndicatorLED(ctx, log, server); err != nil { + return false, fmt.Errorf("failed to ensure server indicator led: %w", err) + } + log.V(1).Info("Reconciled reserved state") return false, nil } @@ -470,11 +510,6 @@ func (r *ServerReconciler) pxeBootServer(ctx context.Context, log logr.Logger, s if err := bmcClient.SetPXEBootOnce(server.Spec.UUID); err != nil { return fmt.Errorf("failed to set PXE boot one for server: %w", err) } - - // TODO: do a proper restart if Server is already in PowerOn state - if err := bmcClient.PowerOn(server.Spec.UUID); err != nil { - return fmt.Errorf("failed to power on server: %w", err) - } return nil } diff --git a/internal/controller/server_controller_test.go b/internal/controller/server_controller_test.go index ef4c30e..bfc5442 100644 --- a/internal/controller/server_controller_test.go +++ b/internal/controller/server_controller_test.go @@ -49,12 +49,34 @@ var _ = Describe("Server Controller", func() { } Eventually(Get(bmc)).Should(Succeed()) - By("Ensuring the boot configuration has been created") + By("Ensuring that the Server resource has been created") server := &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("compute-0-%s", bmc.Name), }, } + Eventually(Object(server)).Should(SatisfyAll( + HaveField("Finalizers", ContainElement(ServerFinalizer)), + HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ + APIVersion: "metal.ironcore.dev/v1alpha1", + Kind: "BMC", + Name: bmc.Name, + UID: bmc.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + })), + HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"), + HaveField("Spec.Power", metalv1alpha1.Power("")), + HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")), + HaveField("Spec.ServerClaimRef", BeNil()), + HaveField("Status.Manufacturer", "Contoso"), + HaveField("Status.SKU", "8675309"), + HaveField("Status.SerialNumber", "437XR1138R2"), + HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED), + HaveField("Status.State", metalv1alpha1.ServerStateInitial), + )) + + By("Ensuring the boot configuration has been created") bootConfig := &metalv1alpha1.ServerBootConfiguration{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns.Name, @@ -88,7 +110,7 @@ var _ = Describe("Server Controller", func() { HaveField("Data", HaveKeyWithValue("ignition", MatchYAML(testdata.DefaultIgnition))), )) - By("Ensuring that the Server resource has been created") + By("Ensuring that the Server is set to discovery and powered on") Eventually(Object(server)).Should(SatisfyAll( HaveField("Finalizers", ContainElement(ServerFinalizer)), HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ @@ -99,11 +121,7 @@ var _ = Describe("Server Controller", func() { Controller: ptr.To(true), BlockOwnerDeletion: ptr.To(true), })), - HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"), - HaveField("Spec.Power", metalv1alpha1.Power("")), - HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")), - HaveField("Spec.ServerClaimRef", BeNil()), - HaveField("Spec.BMCRef", &v1.LocalObjectReference{Name: bmc.Name}), + HaveField("Spec.Power", metalv1alpha1.Power("On")), HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{ Kind: "ServerBootConfiguration", Namespace: ns.Name, @@ -111,11 +129,7 @@ var _ = Describe("Server Controller", func() { UID: bootConfig.UID, APIVersion: "metal.ironcore.dev/v1alpha1", }), - HaveField("Status.Manufacturer", "Contoso"), - HaveField("Status.SKU", "8675309"), - HaveField("Status.SerialNumber", "437XR1138R2"), - HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED), - HaveField("Status.State", metalv1alpha1.ServerStateInitial), + HaveField("Status.State", metalv1alpha1.ServerStateDiscovery), )) By("Patching the boot configuration to a Ready state") @@ -221,7 +235,7 @@ var _ = Describe("Server Controller", func() { Eventually(Object(server)).Should(SatisfyAll( HaveField("Finalizers", ContainElement(ServerFinalizer)), HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"), - HaveField("Spec.Power", metalv1alpha1.Power("")), + HaveField("Spec.Power", metalv1alpha1.Power("On")), HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")), HaveField("Spec.ServerClaimRef", BeNil()), HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{ @@ -235,7 +249,7 @@ var _ = Describe("Server Controller", func() { HaveField("Status.SKU", "8675309"), HaveField("Status.SerialNumber", "437XR1138R2"), HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED), - HaveField("Status.State", metalv1alpha1.ServerStateInitial), + HaveField("Status.State", metalv1alpha1.ServerStateDiscovery), )) By("Patching the boot configuration to a Ready state")