Go unhealthy if not ready before timeout #306

Open
wants to merge 12 commits into base: main
3 changes: 3 additions & 0 deletions azimuth_capi/config.py
@@ -207,6 +207,9 @@ class Configuration(
#: The number of seconds to wait between timer executions
timer_interval: conint(gt = 0) = 60

#: The number of seconds to wait before marking a cluster as unhealthy
cluster_timeout_seconds: conint(gt = 0) = 30 * 60

#: The field manager name to use for server-side apply
easykube_field_manager: constr(min_length = 1) = "azimuth-capi-operator"

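For orientation, a quick arithmetic sketch of how these two defaults interact (plain Python, values copied from the hunk above; not code from this PR): the timer fires every 60 seconds, so a stuck cluster is flagged roughly one tick after the 30 minute timeout expires.

# Illustration only - mirrors the default values above.
timer_interval = 60                 # seconds between timer executions
cluster_timeout_seconds = 30 * 60   # 1800 seconds before going unhealthy

# The timeout can only be observed on a timer tick, so in the worst case a
# stuck cluster is marked unhealthy about one interval after the deadline,
# i.e. roughly 31 minutes after it last made progress.
ticks_until_timeout = cluster_timeout_seconds // timer_interval   # 30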
4 changes: 4 additions & 0 deletions azimuth_capi/models/v1alpha1/cluster.py
@@ -387,6 +387,10 @@ class ClusterStatus(schema.BaseModel, extra = "allow"):
default_factory = dict,
description = "The services for the cluster, indexed by service name."
)
last_updated: schema.Optional[dt.datetime] = Field(
default = None,
description = "Used to trigger the timeout of pending states"
)


class Cluster(
36 changes: 36 additions & 0 deletions azimuth_capi/operator.py
@@ -417,6 +417,16 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
if instance.spec.paused:
logger.info("reconciliation is paused - no action taken")
return

# To help trigger timeouts if we get stuck,
# ensure we set the last_updated time.
# But be sure not to reset it if we are retrying
# the same update after a failure, so we still time out
# if we are stuck in a retry loop before reaching the end
# of the update function.
if not instance.status.last_updated:
await save_cluster_status(instance)

# Make sure that the secret exists
eksecrets = await ekclient.api("v1").resource("secrets")
secret = await eksecrets.fetch(
@@ -450,6 +460,7 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
for ng in helm_values.get("nodeGroups", [])
]
else:
# TODO(johngarbutt) look for when a lease has an error, and go unhealthy
raise kopf.TemporaryError("lease is not active", delay = 15)
if settings.zenith.enabled:
helm_values = mergeconcat(
@@ -481,6 +492,12 @@ async def on_cluster_create(logger, instance, name, namespace, patch, **kwargs):
labels = patch.setdefault("metadata", {}).setdefault("labels", {})
labels[f"{settings.api_group}/cluster-template"] = instance.spec.template_name

# We might have been stuck in an error loop that has just been
# fixed by the operator, so in this case we reset last_updated
# now that we are letting the async poll loops take over
instance.status.last_updated = dt.datetime.now(dt.timezone.utc)
await save_cluster_status(instance)


@model_handler(api.Cluster, kopf.on.delete)
async def on_cluster_delete(logger, instance, name, namespace, **kwargs):
@@ -575,6 +592,25 @@ async def on_cluster_resume(instance, name, namespace, **kwargs):
await save_cluster_status(instance)


@model_handler(
api.Cluster,
kopf.timer,
# we don't set an idle timeout, as we want to run during changes
interval = settings.timer_interval)
async def check_cluster_timeout(logger, instance, patch, **kwargs):
if instance.spec.paused:
logger.info("reconciliation is paused - no action taken")
return

# Trigger a timeout check if we are in a transitional state
if instance.status.phase in {None, api.ClusterPhase.PENDING,
Collaborator: None shouldn't be possible here - is it a condition you have seen? You should get UNKNOWN instead.

Contributor Author: I believe I saw this in my testing, because this can trigger before the first update completes, because I have not set the idle timer here.

Contributor Author: ah, but I should include unknown in here, maybe I did hit unknown.

api.ClusterPhase.RECONCILING,
api.ClusterPhase.UPGRADING,
api.ClusterPhase.UNHEALTHY,
api.ClusterPhase.UNKNOWN}:
await save_cluster_status(instance)


def on_related_object_event(
*args,
# This function maps an object to a cluster name
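For context, check_cluster_timeout above relies on kopf's timer mechanism via the project's model_handler wrapper. A minimal standalone sketch of the underlying pattern follows; the API group, version and plural are placeholders for illustration, not taken from this repository.

import kopf

# Runs every `interval` seconds for each cluster object. Deliberately no
# `idle=` argument: with idle set, kopf holds the timer back while the object
# keeps changing, but this check must also fire during changes.
@kopf.timer("azimuth.example.com", "v1alpha1", "clusters", interval=60)
async def check_cluster_timeout(spec, logger, **kwargs):
    if spec.get("paused"):
        logger.info("reconciliation is paused - no action taken")
        return
    # The real handler re-derives the cluster phase (save_cluster_status),
    # which applies the timeout logic added in status.py below.
    ...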
37 changes: 31 additions & 6 deletions azimuth_capi/status.py
@@ -1,5 +1,7 @@
import datetime as dt
import logging

from .config import settings
from .models.v1alpha1 import (
ClusterPhase,
LeasePhase,
@@ -48,29 +50,28 @@ def _reconcile_cluster_phase(cluster):
Sets the overall cluster phase based on the component phases.
"""
# Only consider the lease when reconciling the cluster phase if one is set
if cluster.spec.lease_name:
if (
cluster.spec.lease_name and
cluster.status.lease_phase not in {LeasePhase.ACTIVE, LeasePhase.UPDATING}
):
if cluster.status.lease_phase in {
LeasePhase.CREATING,
LeasePhase.PENDING,
LeasePhase.STARTING
}:
cluster.status.phase = ClusterPhase.PENDING
return
if cluster.status.lease_phase == LeasePhase.ERROR:
cluster.status.phase = ClusterPhase.FAILED
return
if cluster.status.lease_phase in {
LeasePhase.TERMINATING,
LeasePhase.TERMINATED,
LeasePhase.DELETING
}:
cluster.status.phase = ClusterPhase.UNHEALTHY
return
if cluster.status.lease_phase == LeasePhase.UNKNOWN:
cluster.status.phase = ClusterPhase.PENDING
return
# At this point, either there is no lease or the lease phase is Active or Updating
if cluster.status.networking_phase in {
elif cluster.status.networking_phase in {
NetworkingPhase.PENDING,
NetworkingPhase.PROVISIONING
}:
@@ -143,6 +144,30 @@ def _reconcile_cluster_phase(cluster):
else:
cluster.status.phase = ClusterPhase.READY

# If we hit a terminal state, remove the updated timestamp.
if cluster.status.phase in {ClusterPhase.READY, ClusterPhase.FAILED}:
# reset the timeout timestamp, as we are now in a stable state
# if we do not reset last_updated here, we are unable
# to know at the start of an update if we are re-entering
# due to an error vs starting for the first time since the
# last successful update (or it's our first ever update)
cluster.status.last_updated = None
else:
# if not a terminal state, ensure timestamp has been set
if cluster.status.last_updated is None:
now = dt.datetime.now(dt.timezone.utc)
cluster.status.last_updated = now
Contributor Author: So this case should still work when the state change isn't triggered by a spec change. I think this means when we get an auto healing failure, we should timeout the operation correctly. But I should re-test that case, for sure.


# timeout pending states if we are stuck there too long
if cluster.status.phase in {ClusterPhase.PENDING,
ClusterPhase.RECONCILING,
ClusterPhase.UPGRADING}:
now = dt.datetime.now(dt.timezone.utc)
timeout_after = cluster.status.last_updated + dt.timedelta(
seconds=settings.cluster_timeout_seconds)
if now > timeout_after:
cluster.status.phase = ClusterPhase.UNHEALTHY


def lease_updated(cluster, obj):
"""
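To make the new timeout rule in _reconcile_cluster_phase concrete, here is a standalone sketch using a hypothetical helper (not code from this PR), assuming the default 30 minute timeout.

import datetime as dt

def is_timed_out(last_updated, timeout_seconds):
    # A transitional phase that has made no recorded progress since
    # last_updated for longer than timeout_seconds should be demoted
    # to UNHEALTHY.
    if last_updated is None:
        return False
    now = dt.datetime.now(dt.timezone.utc)
    return now > last_updated + dt.timedelta(seconds=timeout_seconds)

# Example: stuck for 31 minutes against the default 30 minute timeout.
stale = dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=31)
assert is_timed_out(stale, 30 * 60)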