Go unhealthy if not ready before timeout #306

Open
wants to merge 12 commits into base: main
3 changes: 3 additions & 0 deletions azimuth_capi/config.py
@@ -207,6 +207,9 @@ class Configuration(
#: The number of seconds to wait between timer executions
timer_interval: conint(gt = 0) = 60

#: The number of seconds to wait before marking a cluster as unhealthy
cluster_timeout_seconds: conint(gt = 0) = 30 * 60

#: The field manager name to use for server-side apply
easykube_field_manager: constr(min_length = 1) = "azimuth-capi-operator"

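For orientation, a quick arithmetic sketch of how these two defaults interact (plain Python, values copied from the hunk above; not code from this PR): the timer fires every 60 seconds, so a stuck cluster is flagged roughly one tick after the 30 minute timeout expires.

# Illustration only - mirrors the default values above.
timer_interval = 60                 # seconds between timer executions
cluster_timeout_seconds = 30 * 60   # 1800 seconds before going unhealthy

# The timeout can only be observed on a timer tick, so in the worst case a
# stuck cluster is marked unhealthy about one interval after the deadline,
# i.e. roughly 31 minutes after it last made progress.
ticks_until_timeout = cluster_timeout_seconds // timer_interval   # 30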
4 changes: 4 additions & 0 deletions azimuth_capi/models/v1alpha1/cluster.py
@@ -387,6 +387,10 @@ class ClusterStatus(schema.BaseModel, extra = "allow"):
default_factory = dict,
description = "The services for the cluster, indexed by service name."
)
last_updated: schema.Optional[dt.datetime] = Field(
default = None,
description = "Used to trigger the timeout of pending states"
)


class Cluster(
36 changes: 36 additions & 0 deletions azimuth_capi/operator.py
@@ -417,6 +417,16 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
if instance.spec.paused:
logger.info("reconciliation is paused - no action taken")
return

# To help trigger timeouts if we get stuck,
# ensure we set the last_updated time.
# But be sure not to reset it if we are retrying
# the same update after a failure, so we still time out
# if we are stuck in a retry loop before reaching the end
# of the update function.
if not instance.status.last_updated:
await save_cluster_status(instance)

# Make sure that the secret exists
eksecrets = await ekclient.api("v1").resource("secrets")
secret = await eksecrets.fetch(
@@ -450,6 +460,7 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
for ng in helm_values.get("nodeGroups", [])
]
else:
# TODO(johngarbutt) look for when a lease has an error, and go unhealthy
raise kopf.TemporaryError("lease is not active", delay = 15)
if settings.zenith.enabled:
helm_values = mergeconcat(
@@ -481,6 +492,12 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
labels = patch.setdefault("metadata", {}).setdefault("labels", {})
labels[f"{settings.api_group}/cluster-template"] = instance.spec.template_name

# We might have been stuck in an error loop that has just been
# fixed by the operator, so in this case we reset last_updated
# now that we are letting the async poll loops take over
instance.status.last_updated = dt.datetime.now(dt.timezone.utc)
await save_cluster_status(instance)


@model_handler(api.Cluster, kopf.on.delete)
async def on_cluster_delete(logger, instance, name, namespace, **kwargs):
@@ -575,6 +592,25 @@ async def on_cluster_resume(instance, name, namespace, **kwargs):
await save_cluster_status(instance)


@model_handler(
api.Cluster,
kopf.timer,
# we don't set an idle timeout, as we want to run during changes
interval = settings.timer_interval)
async def check_cluster_timeout(logger, instance, patch, **kwargs):
if instance.spec.paused:
logger.info("reconciliation is paused - no action taken")
return

# Trigger a timeout check if we are in a transitional state
if instance.status.phase in {None, api.ClusterPhase.PENDING,
Collaborator: None shouldn't be possible here - is it a condition you have seen? You should get UNKNOWN instead.

Contributor Author: I believe I saw this in my testing, because this can trigger before the first update completes, because I have not set the idle timer here.

Contributor Author: ah, but I should include unknown in here, maybe I did hit unknown.

api.ClusterPhase.RECONCILING,
api.ClusterPhase.UPGRADING,
api.ClusterPhase.UNHEALTHY,
api.ClusterPhase.UNKNOWN}:
await save_cluster_status(instance)


def on_related_object_event(
*args,
# This function maps an object to a cluster name
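For context, check_cluster_timeout above relies on kopf's timer mechanism via the project's model_handler wrapper. A minimal standalone sketch of the underlying pattern follows; the API group, version and plural are placeholders for illustration, not taken from this repository.

import kopf

# Runs every `interval` seconds for each cluster object. Deliberately no
# `idle=` argument: with idle set, kopf holds the timer back while the object
# keeps changing, but this check must also fire during changes.
@kopf.timer("azimuth.example.com", "v1alpha1", "clusters", interval=60)
async def check_cluster_timeout(spec, logger, **kwargs):
    if spec.get("paused"):
        logger.info("reconciliation is paused - no action taken")
        return
    # The real handler re-derives the cluster phase (save_cluster_status),
    # which applies the timeout logic added in status.py below.
    ...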
37 changes: 31 additions & 6 deletions azimuth_capi/status.py
@@ -1,5 +1,7 @@
import datetime as dt
import logging

from .config import settings
from .models.v1alpha1 import (
ClusterPhase,
LeasePhase,
@@ -48,29 +50,28 @@ def _reconcile_cluster_phase(cluster):
Sets the overall cluster phase based on the component phases.
"""
# Only consider the lease when reconciling the cluster phase if one is set
if cluster.spec.lease_name:
if (
cluster.spec.lease_name and
cluster.status.lease_phase not in {LeasePhase.ACTIVE, LeasePhase.UPDATING}
):
if cluster.status.lease_phase in {
LeasePhase.CREATING,
LeasePhase.PENDING,
LeasePhase.STARTING
}:
cluster.status.phase = ClusterPhase.PENDING
return
if cluster.status.lease_phase == LeasePhase.ERROR:
cluster.status.phase = ClusterPhase.FAILED
return
if cluster.status.lease_phase in {
LeasePhase.TERMINATING,
LeasePhase.TERMINATED,
LeasePhase.DELETING
}:
cluster.status.phase = ClusterPhase.UNHEALTHY
return
if cluster.status.lease_phase == LeasePhase.UNKNOWN:
cluster.status.phase = ClusterPhase.PENDING
return
# At this point, either there is no lease or the lease phase is Active or Updating
if cluster.status.networking_phase in {
elif cluster.status.networking_phase in {
NetworkingPhase.PENDING,
NetworkingPhase.PROVISIONING
}:
@@ -143,6 +144,30 @@ def _reconcile_cluster_phase(cluster):
else:
cluster.status.phase = ClusterPhase.READY

# If we hit a terminal state, remove the updated timestamp.
if cluster.status.phase in {ClusterPhase.READY, ClusterPhase.FAILED}:
# reset the timeout timestamp, as we are now in a stable state
# if we do not reset last_updated here, we are unable
# to know at the start of an update if we are re-entering
# due to an error vs starting for the first time since the
# last successful update (or it's our first ever update)
cluster.status.last_updated = None
else:
# if not a terminal state, ensure timestamp has been set
if cluster.status.last_updated is None:
now = dt.datetime.now(dt.timezone.utc)
cluster.status.last_updated = now
Contributor Author: So this case should still work when the state change isn't triggered by a spec change. I think this means when we get an auto healing failure, we should timeout the operation correctly. But I should re-test that case, for sure.


# timeout pending states if we are stuck there too long
if cluster.status.phase in {ClusterPhase.PENDING,
ClusterPhase.RECONCILING,
ClusterPhase.UPGRADING}:
now = dt.datetime.now(dt.timezone.utc)
timeout_after = cluster.status.last_updated + dt.timedelta(
seconds=settings.cluster_timeout_seconds)
if now > timeout_after:
cluster.status.phase = ClusterPhase.UNHEALTHY


def lease_updated(cluster, obj):
"""
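To make the new timeout rule in _reconcile_cluster_phase concrete, here is a standalone sketch using a hypothetical helper (not code from this PR), assuming the default 30 minute timeout.

import datetime as dt

def is_timed_out(last_updated, timeout_seconds):
    # A transitional phase that has made no recorded progress since
    # last_updated for longer than timeout_seconds should be demoted
    # to UNHEALTHY.
    if last_updated is None:
        return False
    now = dt.datetime.now(dt.timezone.utc)
    return now > last_updated + dt.timedelta(seconds=timeout_seconds)

# Example: stuck for 31 minutes against the default 30 minute timeout.
stale = dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=31)
assert is_timed_out(stale, 30 * 60)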