Skip to content

Commit

Permalink
add discovery state and rework initial power handling
Browse files Browse the repository at this point in the history
  • Loading branch information
defo89 committed Aug 9, 2024
1 parent 2289cae commit c16ccb9
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 93 deletions.
3 changes: 3 additions & 0 deletions api/v1alpha1/server_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ const (
// ServerStateInitial indicates that the server is in its initial state.
ServerStateInitial ServerState = "Initial"

// ServerStateDiscovery indicates that the server is in its discovery state.
ServerStateDiscovery ServerState = "Discovery"

// ServerStateAvailable indicates that the server is available for use.
ServerStateAvailable ServerState = "Available"

Expand Down
2 changes: 1 addition & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions docs/api-reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -1909,6 +1909,9 @@ if no boot configuration is specified.</p>
<tbody><tr><td><p>&#34;Available&#34;</p></td>
<td><p>ServerStateAvailable indicates that the server is available for use.</p>
</td>
</tr><tr><td><p>&#34;Discovery&#34;</p></td>
<td><p>ServerStateDiscovery indicates that the server is in its discovery state.</p>
</td>
</tr><tr><td><p>&#34;Error&#34;</p></td>
<td><p>ServerStateError indicates that there is an error with the server.</p>
</td>
Expand Down
191 changes: 113 additions & 78 deletions internal/controller/server_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,15 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve
// Server state-machine:
//
// A Server goes through the following stages:
// Initial -> Available -> Reserved -> Tainted -> Available ...
// Initial -> Discovery -> Available -> Reserved -> Tainted -> Available ...
//
// Initial:
// In the initial state we create a ServerBootConfiguration and an Ignition to start the Probe server on the
// Server. This Probe server registers with the managers /registry/{uuid} endpoint it's address, so the reconciler can
// Server. The Server is patched to the state Discovery.
//
// Discovery:
// In the discovery state we expect the Server to come up with the Probe server running.
// This Probe server registers its address with the manager's /registry/{uuid} endpoint, so the reconciler can
// fetch the server details from this endpoint. Once completed the Server is patched to the state Available.
//
// Available:
Expand All @@ -198,89 +202,125 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve
func (r *ServerReconciler) ensureServerStateTransition(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
switch server.Status.State {
case metalv1alpha1.ServerStateInitial:
// apply boot configuration
if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to apply server boot configuration: %w", err)
}
log.V(1).Info("Applied Server boot configuration")
return r.handleInitialState(ctx, log, server)
case metalv1alpha1.ServerStateDiscovery:
return r.handleDiscoveryState(ctx, log, server)
case metalv1alpha1.ServerStateAvailable:
return r.handleAvailableState(ctx, log, server)
case metalv1alpha1.ServerStateReserved:
return r.handleReservedState(ctx, log, server)
default:
return false, nil
}
}

if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
log.V(1).Info("Server boot configuration is not ready. Retrying ...")
return true, err
}
log.V(1).Info("Server boot configuration is ready")
func (r *ServerReconciler) handleInitialState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
if err := r.applyBootConfigurationAndIgnitionForDiscovery(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to apply server boot configuration: %w", err)
}
log.V(1).Info("Applied Server boot configuration")

if err := r.pxeBootServer(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to boot server: %w", err)
}
log.V(1).Info("Booted Server in PXE")
if err := r.pxeBootServer(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to set PXE boot for server: %w", err)
}
log.V(1).Info("Set PXE Boot for Server")

ready, err := r.extractServerDetailsFromRegistry(ctx, log, server)
if !ready && err == nil {
log.V(1).Info("Server agent did not post info to registry")
return true, nil
}
if err != nil {
log.V(1).Info("Could not get server details from registry.")
return false, err
}
log.V(1).Info("Extracted Server details")
if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateDiscovery); err != nil || modified {
return false, err
}
return false, nil
}

serverBase := server.DeepCopy()
server.Spec.Power = metalv1alpha1.PowerOff
if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
return false, fmt.Errorf("failed to update server power state: %w", err)
}
log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff)
func (r *ServerReconciler) handleDiscoveryState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
serverBase := server.DeepCopy()
server.Spec.Power = metalv1alpha1.PowerOn
if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
return false, fmt.Errorf("failed to update server power state: %w", err)
}
log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOn)

if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
log.V(1).Info("Server state set to power off")
if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
log.V(1).Info("Server state set to power on")

if err := r.invalidateRegistryEntryForServer(log, server); err != nil {
return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err)
}
log.V(1).Info("Removed Server from Registry")
if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
log.V(1).Info("Server boot configuration is not ready. Retrying ...")
return true, err
}
log.V(1).Info("Server boot configuration is ready")

log.V(1).Info("Setting Server state set to available")
if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
return false, err
}
case metalv1alpha1.ServerStateAvailable:
if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil {
return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err)
}
log.V(1).Info("Ensured initial boot configuration is deleted")
ready, err := r.extractServerDetailsFromRegistry(ctx, log, server)
if !ready && err == nil {
log.V(1).Info("Server agent did not post info to registry")
return true, nil
}
if err != nil {
log.V(1).Info("Could not get server details from registry.")
return false, err
}
log.V(1).Info("Extracted Server details")

if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
}
log.V(1).Info("Reconciled available state")
case metalv1alpha1.ServerStateReserved:
if ready, err := r.serverBootConfigurationIsReady(ctx, server); err != nil || !ready {
log.V(1).Info("Server boot configuration is not ready. Retrying ...")
return true, err
}
log.V(1).Info("Server boot configuration is ready")
if err := r.invalidateRegistryEntryForServer(log, server); err != nil {
return false, fmt.Errorf("failed to invalidate registry entry for server: %w", err)
}
log.V(1).Info("Removed Server from Registry")

if err := r.pxeBootServer(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to boot server: %w", err)
}
log.V(1).Info("Booted Server in PXE")
log.V(1).Info("Setting Server state set to available")
if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
return false, err
}
return false, nil
}

if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
func (r *ServerReconciler) handleAvailableState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
serverBase := server.DeepCopy()
server.Spec.Power = metalv1alpha1.PowerOff
if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
return false, fmt.Errorf("failed to update server power state: %w", err)
}
log.V(1).Info("Updated Server power state", "PowerState", metalv1alpha1.PowerOff)

if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
}
log.V(1).Info("Reconciled reserved state")
if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
log.V(1).Info("Server state set to power off")

if err := r.ensureInitialBootConfigurationIsDeleted(ctx, server); err != nil {
return false, fmt.Errorf("failed to ensure server initial boot configuration is deleted: %w", err)
}
log.V(1).Info("Ensured initial boot configuration is deleted")

if err := r.ensureServerPowerState(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server power state: %w", err)
}
if err := r.ensureIndicatorLED(ctx, log, server); err != nil {
return false, fmt.Errorf("failed to ensure server indicator led: %w", err)
}
log.V(1).Info("Reconciled available state")
return false, nil
}

// handleReservedState reconciles a Server whose status state is Reserved.
//
// The steps run strictly in order and stop at the first one that does not
// succeed:
//  1. check that the Server's boot configuration is ready,
//  2. call pxeBootServer for the Server,
//  3. ensure the Server power state,
//  4. ensure the Server indicator LED.
//
// The boolean result is true only when the boot configuration check did not
// pass (either it reported not-ready or it returned an error, which is passed
// back unwrapped); presumably the caller uses it to schedule a retry — confirm
// against the reconcile loop. Failures in steps 2-4 are wrapped with context
// and returned together with false.
func (r *ServerReconciler) handleReservedState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) (bool, error) {
	bootConfigReady, readyErr := r.serverBootConfigurationIsReady(ctx, server)
	if readyErr != nil || !bootConfigReady {
		// The same message is logged for "not ready" and for a lookup error;
		// the error, if any, is handed to the caller as-is.
		log.V(1).Info("Server boot configuration is not ready. Retrying ...")
		return true, readyErr
	}
	log.V(1).Info("Server boot configuration is ready")

	pxeErr := r.pxeBootServer(ctx, log, server)
	if pxeErr != nil {
		return false, fmt.Errorf("failed to boot server: %w", pxeErr)
	}
	log.V(1).Info("Booted Server in PXE")

	powerErr := r.ensureServerPowerState(ctx, log, server)
	if powerErr != nil {
		return false, fmt.Errorf("failed to ensure server power state: %w", powerErr)
	}

	ledErr := r.ensureIndicatorLED(ctx, log, server)
	if ledErr != nil {
		return false, fmt.Errorf("failed to ensure server indicator led: %w", ledErr)
	}

	log.V(1).Info("Reconciled reserved state")
	return false, nil
}

Expand Down Expand Up @@ -470,11 +510,6 @@ func (r *ServerReconciler) pxeBootServer(ctx context.Context, log logr.Logger, s
if err := bmcClient.SetPXEBootOnce(server.Spec.UUID); err != nil {
return fmt.Errorf("failed to set PXE boot one for server: %w", err)
}

// TODO: do a proper restart if Server is already in PowerOn state
if err := bmcClient.PowerOn(server.Spec.UUID); err != nil {
return fmt.Errorf("failed to power on server: %w", err)
}
return nil
}

Expand Down
42 changes: 28 additions & 14 deletions internal/controller/server_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,34 @@ var _ = Describe("Server Controller", func() {
}
Eventually(Get(bmc)).Should(Succeed())

By("Ensuring the boot configuration has been created")
By("Ensuring that the Server resource has been created")
server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("compute-0-%s", bmc.Name),
},
}
Eventually(Object(server)).Should(SatisfyAll(
HaveField("Finalizers", ContainElement(ServerFinalizer)),
HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{
APIVersion: "metal.ironcore.dev/v1alpha1",
Kind: "BMC",
Name: bmc.Name,
UID: bmc.UID,
Controller: ptr.To(true),
BlockOwnerDeletion: ptr.To(true),
})),
HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
HaveField("Spec.Power", metalv1alpha1.Power("")),
HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
HaveField("Spec.ServerClaimRef", BeNil()),
HaveField("Status.Manufacturer", "Contoso"),
HaveField("Status.SKU", "8675309"),
HaveField("Status.SerialNumber", "437XR1138R2"),
HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
HaveField("Status.State", metalv1alpha1.ServerStateInitial),
))

By("Ensuring the boot configuration has been created")
bootConfig := &metalv1alpha1.ServerBootConfiguration{
ObjectMeta: metav1.ObjectMeta{
Namespace: ns.Name,
Expand Down Expand Up @@ -88,7 +110,7 @@ var _ = Describe("Server Controller", func() {
HaveField("Data", HaveKeyWithValue("ignition", MatchYAML(testdata.DefaultIgnition))),
))

By("Ensuring that the Server resource has been created")
By("Ensuring that the Server is set to discovery and powered on")
Eventually(Object(server)).Should(SatisfyAll(
HaveField("Finalizers", ContainElement(ServerFinalizer)),
HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{
Expand All @@ -99,23 +121,15 @@ var _ = Describe("Server Controller", func() {
Controller: ptr.To(true),
BlockOwnerDeletion: ptr.To(true),
})),
HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
HaveField("Spec.Power", metalv1alpha1.Power("")),
HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
HaveField("Spec.ServerClaimRef", BeNil()),
HaveField("Spec.BMCRef", &v1.LocalObjectReference{Name: bmc.Name}),
HaveField("Spec.Power", metalv1alpha1.Power("On")),
HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{
Kind: "ServerBootConfiguration",
Namespace: ns.Name,
Name: server.Name,
UID: bootConfig.UID,
APIVersion: "metal.ironcore.dev/v1alpha1",
}),
HaveField("Status.Manufacturer", "Contoso"),
HaveField("Status.SKU", "8675309"),
HaveField("Status.SerialNumber", "437XR1138R2"),
HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
HaveField("Status.State", metalv1alpha1.ServerStateInitial),
HaveField("Status.State", metalv1alpha1.ServerStateDiscovery),
))

By("Patching the boot configuration to a Ready state")
Expand Down Expand Up @@ -221,7 +235,7 @@ var _ = Describe("Server Controller", func() {
Eventually(Object(server)).Should(SatisfyAll(
HaveField("Finalizers", ContainElement(ServerFinalizer)),
HaveField("Spec.UUID", "38947555-7742-3448-3784-823347823834"),
HaveField("Spec.Power", metalv1alpha1.Power("")),
HaveField("Spec.Power", metalv1alpha1.Power("On")),
HaveField("Spec.IndicatorLED", metalv1alpha1.IndicatorLED("")),
HaveField("Spec.ServerClaimRef", BeNil()),
HaveField("Spec.BootConfigurationRef", &v1.ObjectReference{
Expand All @@ -235,7 +249,7 @@ var _ = Describe("Server Controller", func() {
HaveField("Status.SKU", "8675309"),
HaveField("Status.SerialNumber", "437XR1138R2"),
HaveField("Status.IndicatorLED", metalv1alpha1.OffIndicatorLED),
HaveField("Status.State", metalv1alpha1.ServerStateInitial),
HaveField("Status.State", metalv1alpha1.ServerStateDiscovery),
))

By("Patching the boot configuration to a Ready state")
Expand Down

0 comments on commit c16ccb9

Please sign in to comment.