From c16ccb992355fc1d24eab0782a51230644ba3047 Mon Sep 17 00:00:00 2001
From: Dmitri Fedotov
Date: Fri, 9 Aug 2024 16:30:59 +0300
Subject: [PATCH] add discovery state and rework initial power handling
---
api/v1alpha1/server_types.go | 3 +
api/v1alpha1/zz_generated.deepcopy.go | 2 +-
docs/api-reference/api.md | 3 +
internal/controller/server_controller.go | 191 +++++++++++-------
internal/controller/server_controller_test.go | 42 ++--
5 files changed, 148 insertions(+), 93 deletions(-)
diff --git a/api/v1alpha1/server_types.go b/api/v1alpha1/server_types.go
index bcff418..f319fcf 100644
--- a/api/v1alpha1/server_types.go
+++ b/api/v1alpha1/server_types.go
@@ -114,6 +114,9 @@ const (
// ServerStateInitial indicates that the server is in its initial state.
ServerStateInitial ServerState = "Initial"
+ // ServerStateDiscovery indicates that the server is in its discovery state.
+ ServerStateDiscovery ServerState = "Discovery"
+
// ServerStateAvailable indicates that the server is available for use.
ServerStateAvailable ServerState = "Available"
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 8000b7f..24bd8c0 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -9,7 +9,7 @@ package v1alpha1
import (
corev1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/apis/meta/v1"
+ v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md
index a9d7422..b605de6 100644
--- a/docs/api-reference/api.md
+++ b/docs/api-reference/api.md
@@ -1909,6 +1909,9 @@ if no boot configuration is specified.
"Available" |
ServerStateAvailable indicates that the server is available for use.
|
+
"Discovery" |
+ServerStateDiscovery indicates that the server is in its discovery state.
+ |
"Error" |
ServerStateError indicates that there is an error with the server.
|
diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go
index c803990..9aab354 100644
--- a/internal/controller/server_controller.go
+++ b/internal/controller/server_controller.go
@@ -174,11 +174,15 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve
// Server state-machine:
//
// A Server goes through the following stages:
-// Initial -> Available -> Reserved -> Tainted -> Available ...
+// Initial -> Discovery -> Available -> Reserved -> Tainted -> Available ...
//
// Initial:
// In the initial state we create a ServerBootConfiguration and an Ignition to start the Probe server on the
-// Server. This Probe server registers with the managers /registry/{uuid} endpoint it's address, so the reconciler can
+// Server. The Server is patched to the state Discovery.
+//
+// Discovery:
+// In the discovery state we expect the Server to come up with the Probe server running.
+// This Probe server registers with the managers /registry/{uuid} endpoint it's address, so the reconciler can
// fetch the server details from this endpoint. Once completed the Server is patched to the state Available.
//
// Available:
@@ -198,89 +202,125 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve
func (r *ServerReconciler) ensureServerStateTransition(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
switch server.Status.State {
case metalv1alpha1.ServerStateInitial:
- // apply boot configuration
- if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to apply server boot configuration: %w", err)
- }
- log.V(1).Info("Applied Server boot configuration")
+ return r.handleInitialState(ctx, log, server)
+ case metalv1alpha1.ServerStateDiscovery:
+ return r.handleDiscoveryState(ctx, log, server)
+ case metalv1alpha1.ServerStateAvailable:
+ return r.handleAvailableState(ctx, log, server)
+ case metalv1alpha1.ServerStateReserved:
+ return r.handleReservedState(ctx, log, server)
+ default:
+ return false, nil
+ }
+}
- if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
- log.V(1).Info("Server boot configuration is not ready. Retrying ...")
- return true, err
- }
- log.V(1).Info("Server boot configuration is ready")
+func (r *ServerReconciler) handleInitialState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
+ if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to apply server boot configuration: %w", err)
+ }
+ log.V(1).Info("Applied Server boot configuration")
- if err := r.pxeBootServer(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to boot server: %w", err)
- }
- log.V(1).Info("Booted Server in PXE")
+ if err := r.pxeBootServer(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to set PXE boot for server: %w", err)
+ }
+ log.V(1).Info("Set PXE Boot for Server")
- ready, err := r.extractServerDetailsFromRegistry(ctx, log, server)
- if !ready && err == nil {
- log.V(1).Info("Server agent did not post info to registry")
- return true, nil
- }
- if err != nil {
- log.V(1).Info("Could not get server details from registry.")
- return false, err
- }
- log.V(1).Info("Extracted Server details")
+ if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateDiscovery); err != nil || modified {
+ return false, err
+ }
+ return false, nil
+}
- serverBase := server.DeepCopy()
- server.Spec.Power = metalv1alpha1.PowerOff
- if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
- return false, fmt.Errorf("failed to update server power state: %w", err)
- }
- log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff)
+func (r *ServerReconciler) handleDiscoveryState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
+ serverBase := server.DeepCopy()
+ server.Spec.Power = metalv1alpha1.PowerOn
+ if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
+ return false, fmt.Errorf("failed to update server power state: %w", err)
+ }
+ log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOn)
- if err := r.ensureServerPowerState(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to ensure server power state: %w", err)
- }
- log.V(1).Info("Server state set to power off")
+ if err := r.ensureServerPowerState(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server power state: %w", err)
+ }
+ log.V(1).Info("Server state set to power on")
- if err := r.invalidateRegistryEntryForServer(log, server); err != nil {
- return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err)
- }
- log.V(1).Info("Removed Server from Registry")
+ if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
+ log.V(1).Info("Server boot configuration is not ready. Retrying ...")
+ return true, err
+ }
+ log.V(1).Info("Server boot configuration is ready")
- log.V(1).Info("Setting Server state set to available")
- if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
- return false, err
- }
- case metalv1alpha1.ServerStateAvailable:
- if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil {
- return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err)
- }
- log.V(1).Info("Ensured initial boot configuration is deleted")
+ ready, err := r.extractServerDetailsFromRegistry(ctx, log, server)
+ if !ready && err == nil {
+ log.V(1).Info("Server agent did not post info to registry")
+ return true, nil
+ }
+ if err != nil {
+ log.V(1).Info("Could not get server details from registry.")
+ return false, err
+ }
+ log.V(1).Info("Extracted Server details")
- if err := r.ensureServerPowerState(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to ensure server power state: %w", err)
- }
- if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
- }
- log.V(1).Info("Reconciled available state")
- case metalv1alpha1.ServerStateReserved:
- if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
- log.V(1).Info("Server boot configuration is not ready. Retrying ...")
- return true, err
- }
- log.V(1).Info("Server boot configuration is ready")
+ if err := r.invalidateRegistryEntryForServer(log, server); err != nil {
+ return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err)
+ }
+ log.V(1).Info("Removed Server from Registry")
- if err := r.pxeBootServer(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to boot server: %w", err)
- }
- log.V(1).Info("Booted Server in PXE")
+ log.V(1).Info("Setting Server state set to available")
+ if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
+ return false, err
+ }
+ return false, nil
+}
- if err := r.ensureServerPowerState(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to ensure server power state: %w", err)
- }
+func (r *ServerReconciler) handleAvailableState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
+ serverBase := server.DeepCopy()
+ server.Spec.Power = metalv1alpha1.PowerOff
+ if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
+ return false, fmt.Errorf("failed to update server power state: %w", err)
+ }
+ log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff)
- if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
- return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
- }
- log.V(1).Info("Reconciled reserved state")
+ if err := r.ensureServerPowerState(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
+ log.V(1).Info("Server state set to power off")
+
+ if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err)
+ }
+ log.V(1).Info("Ensured initial boot configuration is deleted")
+
+ if err := r.ensureServerPowerState(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server power state: %w", err)
+ }
+ if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
+ }
+ log.V(1).Info("Reconciled available state")
+ return false, nil
+}
+
+func (r *ServerReconciler) handleReservedState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
+ if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
+ log.V(1).Info("Server boot configuration is not ready. Retrying ...")
+ return true, err
+ }
+ log.V(1).Info("Server boot configuration is ready")
+
+ if err := r.pxeBootServer(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to boot server: %w", err)
+ }
+ log.V(1).Info("Booted Server in PXE")
+
+ if err := r.ensureServerPowerState(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server power state: %w", err)
+ }
+
+ if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
+ return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
+ }
+ log.V(1).Info("Reconciled reserved state")
return false, nil
}
@@ -470,11 +510,6 @@ func (r *ServerReconciler) pxeBootServer(ctx context.Context, log logr.Logger, s
if err := bmcClient.SetPXEBootOnce(server.Spec.UUID); err != nil {
return fmt.Errorf("failed to set PXE boot one for server: %w", err)
}
-
- // TODO: do a proper restart if Server is already in PowerOn state
- if err := bmcClient.PowerOn(server.Spec.UUID); err != nil {
- return fmt.Errorf("failed to power on server: %w", err)
- }
return nil
}
diff --git a/internal/controller/server_controller_test.go b/internal/controller/server_controller_test.go
index ef4c30e..bfc5442 100644
--- a/internal/controller/server_controller_test.go
+++ b/internal/controller/server_controller_test.go
@@ -49,12 +49,34 @@ var _ = Describe("Server Controller", func() {
}
Eventually(Get(bmc)).Should(Succeed())
- By("Ensuring the boot configuration has been created")
+ By("Ensuring that the Server resource has been created")
server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("compute-0-%s", bmc.Name),
},
}
+ Eventually(Object(server)).Should(SatisfyAll(
+ HaveField("Finalizers", ContainElement(ServerFinalizer)),
+ HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{
+ APIVersion: "metal.ironcore.dev/v1alpha1",
+ Kind: "BMC",
+ Name: bmc.Name,
+ UID: bmc.UID,
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ })),
+ HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
+ HaveField("Spec.Power", metalv1alpha1.Power("")),
+ HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
+ HaveField("Spec.ServerClaimRef", BeNil()),
+ HaveField("Status.Manufacturer", "Contoso"),
+ HaveField("Status.SKU", "8675309"),
+ HaveField("Status.SerialNumber", "437XR1138R2"),
+ HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
+ HaveField("Status.State", metalv1alpha1.ServerStateInitial),
+ ))
+
+ By("Ensuring the boot configuration has been created")
bootConfig := &metalv1alpha1.ServerBootConfiguration{
ObjectMeta: metav1.ObjectMeta{
Namespace: ns.Name,
@@ -88,7 +110,7 @@ var _ = Describe("Server Controller", func() {
HaveField("Data", HaveKeyWithValue("ignition", MatchYAML(testdata.DefaultIgnition))),
))
- By("Ensuring that the Server resource has been created")
+ By("Ensuring that the Server is set to discovery and powered on")
Eventually(Object(server)).Should(SatisfyAll(
HaveField("Finalizers", ContainElement(ServerFinalizer)),
HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{
@@ -99,11 +121,7 @@ var _ = Describe("Server Controller", func() {
Controller: ptr.To(true),
BlockOwnerDeletion: ptr.To(true),
})),
- HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
- HaveField("Spec.Power", metalv1alpha1.Power("")),
- HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
- HaveField("Spec.ServerClaimRef", BeNil()),
- HaveField("Spec.BMCRef", &v1.LocalObjectReference{Name: bmc.Name}),
+ HaveField("Spec.Power", metalv1alpha1.Power("On")),
HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{
Kind: "ServerBootConfiguration",
Namespace: ns.Name,
@@ -111,11 +129,7 @@ var _ = Describe("Server Controller", func() {
UID: bootConfig.UID,
APIVersion: "metal.ironcore.dev/v1alpha1",
}),
- HaveField("Status.Manufacturer", "Contoso"),
- HaveField("Status.SKU", "8675309"),
- HaveField("Status.SerialNumber", "437XR1138R2"),
- HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
- HaveField("Status.State", metalv1alpha1.ServerStateInitial),
+ HaveField("Status.State", metalv1alpha1.ServerStateDiscovery),
))
By("Patching the boot configuration to a Ready state")
@@ -221,7 +235,7 @@ var _ = Describe("Server Controller", func() {
Eventually(Object(server)).Should(SatisfyAll(
HaveField("Finalizers", ContainElement(ServerFinalizer)),
HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
- HaveField("Spec.Power", metalv1alpha1.Power("")),
+ HaveField("Spec.Power", metalv1alpha1.Power("On")),
HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
HaveField("Spec.ServerClaimRef", BeNil()),
HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{
@@ -235,7 +249,7 @@ var _ = Describe("Server Controller", func() {
HaveField("Status.SKU", "8675309"),
HaveField("Status.SerialNumber", "437XR1138R2"),
HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
- HaveField("Status.State", metalv1alpha1.ServerStateInitial),
+ HaveField("Status.State", metalv1alpha1.ServerStateDiscovery),
))
By("Patching the boot configuration to a Ready state")