From f4d5f73008d62d242f330bd2b8223174986308a6 Mon Sep 17 00:00:00 2001
From: Andrew Durbin
Date: Wed, 16 Oct 2024 15:21:06 -0600
Subject: [PATCH] Node Drain Request/Status API

The base of an upcoming PR to enable deferring node reboot and shutdown
operations. In an Edge-Node-Cluster each node can hold a replica copy of a
cluster volume. If the other nodes 1 and 2 in a cluster have recently
recovered from a failure, they could be rebuilding their replicas
(1.x, 1.y, 2.x, 2.y) off of the only remaining readable copies (3.x, 3.y)
on node 3. This API enables zedagent, nodeagent, and baseosmgr to request a
drain of replicas off of an edge-node (rebuild completion) before allowing
a node outage.

Signed-off-by: Andrew Durbin
---
 pkg/pillar/docs/zedkube.md      | 153 ++++++++++++++++++++++++++++++++
 pkg/pillar/kubeapi/kubetypes.go |  75 ++++++++++++++++
 pkg/pillar/kubeapi/nodedrain.go | 100 +++++++++++++++++++++
 pkg/pillar/kubeapi/nokube.go    |  13 +++
 4 files changed, 341 insertions(+)
 create mode 100644 pkg/pillar/docs/zedkube.md
 create mode 100644 pkg/pillar/kubeapi/kubetypes.go
 create mode 100644 pkg/pillar/kubeapi/nodedrain.go

diff --git a/pkg/pillar/docs/zedkube.md b/pkg/pillar/docs/zedkube.md
new file mode 100644
index 0000000000..ad899c8c88
--- /dev/null
+++ b/pkg/pillar/docs/zedkube.md
@@ -0,0 +1,153 @@
+# Clustered eve nodes (zedkube)
+
+## Overview
+
+## Components
+
+### kubenodeop
+
+kubenodeop handles cordoning, uncordoning, and draining of clustered EVE-OS nodes.
+Any given node could be hosting one or more longhorn volume replicas and thus could be the rebuild source for other node replicas.
+A drain operation should be performed before any Node Operation / Node Command which can cause an extended outage of a node, such as a reboot, shutdown, or reset.
+zedkube subscribes to NodeDrainRequest objects; kubenodeop initiates the drain and publishes NodeDrainStatus objects.
+
+### kubeapi
+
+1. `kubeapi.GetNodeDrainStatus()` to determine if the system supports drain
+    - HV!=kubevirt: NOTSUPPORTED
+    - HV=kubevirt will return:
+        - NOTSUPPORTED if in single-node mode
+        - NOTREQUESTED if in cluster mode
+1. `kubeapi.RequestNodeDrain()` to begin a drain
+
+### Drain PubSub setup (node reboot/shutdown)
+
+1. zedagent/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+    - creates publication of NodeDrainRequest
+1. nodeagent/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+
+### Drain Request path (node reboot/shutdown)
+
+1. zedagent/parseconfig.go:`scheduleDeviceOperation()`
+    - If `shouldDeferForNodeDrain()` is true:
+        - Set reboot or shutdown cmd deferred state in zedagentContext
+1. zedagent/handlenodedrain.go:`shouldDeferForNodeDrain()`
+    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
+        - Drain is requested via `kubeapi.RequestNodeDrain()`
+        - return Defer
+    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
+        - return !Defer
+    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
+        - return Defer
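For illustration only (not code from this PR): the defer decision described in the list above boils down to a small switch over `NodeDrainStatus`. A minimal sketch, assuming a caller that already holds the NodeDrainRequest publication and a log object; the package name, function name, and the context string are made up for the example:

```go
package example // hypothetical illustration, not part of this patch

import (
	"github.com/lf-edge/eve/pkg/pillar/base"
	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
	"github.com/lf-edge/eve/pkg/pillar/pubsub"
)

// shouldDeferForNodeDrain sketches the decision table above: request a drain
// and defer when none is in flight (or the last attempt failed), proceed when
// drain is unsupported or complete, and keep deferring while one is running.
func shouldDeferForNodeDrain(log *base.LogObject, status kubeapi.DrainStatus,
	pubNodeDrainRequest pubsub.Publication) bool {
	switch status {
	case kubeapi.NOTREQUESTED, kubeapi.FAILEDCORDON, kubeapi.FAILEDDRAIN:
		// No drain in flight (or the last one failed): request one and defer.
		err := kubeapi.RequestNodeDrain(pubNodeDrainRequest,
			kubeapi.DEVICEOP, "device-reboot") // context string is illustrative
		if err != nil {
			log.Errorf("RequestNodeDrain failed: %v", err)
		}
		return true
	case kubeapi.UNKNOWN, kubeapi.NOTSUPPORTED, kubeapi.COMPLETE:
		// No drain needed or already done: run the device op now.
		return false
	default:
		// REQUESTED, STARTING, CORDONED, DRAINRETRYING: drain still in progress.
		return true
	}
}
```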
+### Drain Status Handler (node reboot/shutdown)
+
+1. zedagent/handlenodedrain.go:`handleNodeDrainStatusImpl()`
+    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN:
+        - Unpublish NodeDrainRequest
+1. nodeagent/handlenodedrain.go:`handleNodeDrainStatusImplNA()`
+    - NodeDrainStatus >= REQUESTED and < COMPLETE:
+        - republish nodeagent status with drainInProgress set
+    - NodeDrainStatus == COMPLETE:
+        - republish nodeagent status with drainInProgress cleared
+1. zedagent/zedagent.go:`handleNodeAgentStatusImpl()`
+    - If there is a deferred device op and the nodeagent context reports drain complete:
+        - process the deferred reboot/shutdown
+
+### Drain PubSub setup (node eve-image update)
+
+1. baseosmgr/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+    - sets up publication of NodeDrainRequest
+
+### Drain Request path (node eve-image update)
+
+1. baseosmgr/handlebaseos.go:`baseOsHandleStatusUpdateUUID()`
+    - If the BaseOs download is complete (LOADING||LOADED||INSTALLED), not currently Activated, and the new config requests it Activated:
+        - Check `shouldDeferForNodeDrain()`; if a defer is requested, return early, as drain completion will later resume this BaseOs status update.
+1. baseosmgr/handlenodedrain.go:`shouldDeferForNodeDrain()`
+    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
+        - save BaseOsId in baseOsMgrContext.deferredBaseOsID
+        - Drain is requested via `kubeapi.RequestNodeDrain()`
+        - return Defer
+    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
+        - return !Defer
+    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
+        - return Defer
+
+### Drain Status Handler (node eve-image update)
+
+1. baseosmgr/handlenodedrain.go:`handleNodeDrainStatusImpl()`
+    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN:
+        - Unpublish NodeDrainRequest
+    - NodeDrainStatus == COMPLETE:
+        - resume the deferred baseOsMgrContext.deferredBaseOsID via `baseOsHandleStatusUpdateUUID()`
+
+### General DrainRequest Processing
+
+1. zedkube/zedkube.go:`Run()`
+    - subscribes to NodeDrainRequest from zedagent and baseosmgr
+    - creates publication of NodeDrainStatus
+    - Init NodeDrainStatus to NOTSUPPORTED
+1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigImpl()`
+    - System switching to cluster membership: NodeDrainStatus -> NOTREQUESTED
+1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigDelete()`
+    - System switching to single node: NodeDrainStatus -> NOTSUPPORTED
+1. zedkube/handlenodedrain.go:`handleNodeDrainRequestImpl()`
+    - NodeDrainStatus -> REQUESTED
+1. zedkube/kubenodeop.go:`cordonAndDrainNode()`
+    - NodeDrainStatus -> STARTING
+    - Retry cordon up to 10 times (in case the k8s API reports the object changed)
+        - when retries are exhausted: NodeDrainStatus -> FAILEDCORDON
+    - NodeDrainStatus -> CORDONED
+    - Retry drain up to 5 times
+        - between tries: NodeDrainStatus -> DRAINRETRYING
+        - on failure: NodeDrainStatus -> FAILEDDRAIN
+    - NodeDrainStatus -> COMPLETE
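For illustration only (not code from this PR): the `cordonAndDrainNode()` retry and status-transition flow above can be sketched as follows, with `publishStatus`, `tryCordon`, and `tryDrain` as hypothetical stand-ins for the real zedkube helpers and kubernetes calls:

```go
package example // hypothetical illustration, not part of this patch

import (
	"time"

	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
)

// cordonAndDrainNodeSketch mirrors the state machine described above:
// STARTING, cordon with retries, CORDONED, drain with retries, then
// COMPLETE or one of the failure states.
func cordonAndDrainNodeSketch(publishStatus func(kubeapi.DrainStatus),
	tryCordon, tryDrain func() error) {
	publishStatus(kubeapi.STARTING)

	cordoned := false
	for i := 0; i < 10; i++ { // cordon retries, e.g. on "object has changed"
		if err := tryCordon(); err == nil {
			cordoned = true
			break
		}
		time.Sleep(time.Second)
	}
	if !cordoned {
		publishStatus(kubeapi.FAILEDCORDON)
		return
	}
	publishStatus(kubeapi.CORDONED)

	for i := 0; i < 5; i++ { // drain retries, e.g. while replicas rebuild
		if err := tryDrain(); err == nil {
			publishStatus(kubeapi.COMPLETE)
			return
		}
		publishStatus(kubeapi.DRAINRETRYING)
		time.Sleep(time.Second)
	}
	publishStatus(kubeapi.FAILEDDRAIN)
}
```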
+## Debugging
+
+### PubSub NodeDrainRequest/NodeDrainStatus
+
+/run/zedagent/NodeDrainRequest/global.json
+/run/baseosmgr/NodeDrainRequest/global.json
+/run/zedkube/NodeDrainStatus/global.json
+
+The current node drain progress is available from the global NodeDrainStatus object:
+`cat /run/zedkube/NodeDrainStatus/global.json | jq .`
+
+NodeDrainStatus can be forced by writing the object (in the pillar svc container fs) to: /persist/kube-status/force-NodeDrainStatus-global.json
+
+e.g. to force disable drain:
+echo '{"Status":1,"RequestedBy":1}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+e.g. to force deviceop drain complete:
+echo '{"Status":9,"RequestedBy":2}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+e.g. to force baseosmgr drain complete:
+echo '{"Status":9,"RequestedBy":3}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+"Cannot evict pod as it would violate the pod's disruption budget":
+NodeDrainStatus can get stuck when attempting to drain a node running a pod that has an
+explicit spec.nodeName == "<node being drained>". Delete the pod to continue.
+If the workload is a statefulset declaring spec.nodeName and the node is already cordoned,
+deleting the pod is not sufficient; the statefulset must be deleted.
+
+### NodeDrainRequest/NodeDrainStatus log strings
+
+- NodeDrainRequest
+- NodeDrainStatus
+- cordonNode
+- cordonAndDrainNode
+- scheduleDeviceOperation
+- baseOsHandleStatusUpdateUUID
+- nodedrain-step
+- kubevirt_node_drain_completion_time_seconds
+...
+zgrep 'kubevirt_node_drain_completion_time_seconds' /persist/newlog/keepSentQueue/dev.log.1725511530990.gz | jq -r .content | jq -r .msg | cut -d ':' -f 2
+s34.559219
+...
diff --git a/pkg/pillar/kubeapi/kubetypes.go b/pkg/pillar/kubeapi/kubetypes.go
new file mode 100644
index 0000000000..48e973599b
--- /dev/null
+++ b/pkg/pillar/kubeapi/kubetypes.go
@@ -0,0 +1,75 @@
+// Copyright (c) 2024 Zededa, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package kubeapi
+
+import "time"
+
+// DrainStatus tracks the progress of draining a node of replica disks and workloads
+type DrainStatus uint8
+
+const (
+    UNKNOWN       DrainStatus = iota + 0 // UNKNOWN Unable to determine
+    NOTSUPPORTED                         // NOTSUPPORTED System is not both HV=kubevirt and clustered
+    NOTREQUESTED                         // NOTREQUESTED Not yet requested
+    REQUESTED                            // REQUESTED From zedagent device operation or baseosmgr new update
+    STARTING                             // STARTING Zedkube goroutine started, not yet cordoned
+    CORDONED                             // CORDONED Node Unschedulable set
+    FAILEDCORDON                         // FAILEDCORDON Node modification unable to apply
+    DRAINRETRYING                        // DRAINRETRYING Drain retry in progress, possibly waiting on a replica rebuild
+    FAILEDDRAIN                          // FAILEDDRAIN Drain retries exhausted, could be an in-progress replica rebuild
+    COMPLETE                             // COMPLETE All node workloads removed from system
+)
+
+func (status DrainStatus) String() string {
+    switch status {
+    case UNKNOWN:
+        return "Unknown"
+    case NOTSUPPORTED:
+        return "Not Supported"
+    case NOTREQUESTED:
+        return "Not Requested"
+    case REQUESTED:
+        return "Requested"
+    case STARTING:
+        return "Starting"
+    case CORDONED:
+        return "Cordoned"
+    case FAILEDCORDON:
+        return "Failed Cordon"
+    case DRAINRETRYING:
+        return "Drain Retrying"
+    case FAILEDDRAIN:
+        return "Failed Drain"
+    case COMPLETE:
+        return "Complete"
+    default:
+        return "Unknown"
+    }
+}
+
+// DrainRequester identifies the pillar microservice that initiated a node drain request
+type DrainRequester uint8
+
+const (
+    NONE     DrainRequester = iota + 1 // NONE - The default value
+    DEVICEOP                           // DEVICEOP - Node reboot or shutdown
+    UPDATE                             // UPDATE - baseos update
+)
+
+// NodeDrainRequest is the trigger for NodeDrainStatus
+//
+// Used by reboots, prepare-shutdown, and baseos updates
+type NodeDrainRequest struct {
+    RequestedAt time.Time
+    RequestedBy DrainRequester
+    Context     string
+}
+
+// NodeDrainStatus is a response to NodeDrainRequest
+//
+// Subscribe to updates to continue NodeDrainRequest operations.
+type NodeDrainStatus struct {
+    Status      DrainStatus
+    RequestedBy DrainRequester
+}
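For illustration only (not part of this patch): since `NodeDrainStatus` has no json tags, the force-override files shown in the Debugging section are just the raw enum values. A throwaway sketch to confirm the encoding:

```go
package main // hypothetical helper to print a force-override JSON body

import (
	"encoding/json"
	"fmt"

	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
)

func main() {
	// With no json tags, the struct marshals to the raw enum values:
	// COMPLETE == 9, DEVICEOP == 2, UPDATE (baseosmgr) == 3.
	b, err := json.Marshal(kubeapi.NodeDrainStatus{
		Status:      kubeapi.COMPLETE,
		RequestedBy: kubeapi.UPDATE,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"Status":9,"RequestedBy":3}
}
```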
diff --git a/pkg/pillar/kubeapi/nodedrain.go b/pkg/pillar/kubeapi/nodedrain.go
new file mode 100644
index 0000000000..d2732921ca
--- /dev/null
+++ b/pkg/pillar/kubeapi/nodedrain.go
@@ -0,0 +1,100 @@
+// Copyright (c) 2024 Zededa, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build kubevirt
+
+package kubeapi
+
+import (
+    "encoding/json"
+    "fmt"
+    "os"
+    "time"
+
+    "github.com/lf-edge/eve/pkg/pillar/base"
+    "github.com/lf-edge/eve/pkg/pillar/pubsub"
+)
+
+// forceNodeDrainPath is an alternate path to force a drain status in the event of a drain issue.
+const forceNodeDrainPath string = "/persist/kube-status/force-NodeDrainStatus-global.json"
+
+// RequestNodeDrain generates the NodeDrainRequest object and publishes it
+func RequestNodeDrain(pubNodeDrainRequest pubsub.Publication, requester DrainRequester, context string) error {
+    drainReq := NodeDrainRequest{
+        RequestedAt: time.Now(),
+        RequestedBy: requester,
+        Context:     context,
+    }
+    err := pubNodeDrainRequest.Publish("global", drainReq)
+    if err != nil {
+        return fmt.Errorf("RequestNodeDrain: error publishing drain request: %v", err)
+    }
+    return nil
+}
+
+// GetDrainStatusOverride : an alternate way to set drain status for debug
+func GetDrainStatusOverride(log *base.LogObject) *NodeDrainStatus {
+    if _, err := os.Stat(forceNodeDrainPath); err != nil {
+        return nil
+    }
+    b, err := os.ReadFile(forceNodeDrainPath)
+    if err != nil {
+        log.Warnf("Unable to read %s:%v", forceNodeDrainPath, err)
+        return nil
+    }
+    cfg := NodeDrainStatus{}
+    err = json.Unmarshal(b, &cfg)
+    if err != nil {
+        log.Warnf("Unable to Unmarshal %s to NodeDrainStatus: %v", forceNodeDrainPath, err)
+        return nil
+    }
+    if cfg.Status == COMPLETE {
+        err = os.Remove(forceNodeDrainPath)
+        if err != nil {
+            log.Warnf("could not remove %s: %v", forceNodeDrainPath, err)
+        }
+    }
+    return &cfg
+}
+
+// CleanupDrainStatusOverride is used at microservice startup to clean up
+// a previously user-written override file
+func CleanupDrainStatusOverride(log *base.LogObject) {
+    if _, err := os.Stat(forceNodeDrainPath); err != nil {
+        return
+    }
+    err := os.Remove(forceNodeDrainPath)
+    if err != nil {
+        log.Warnf("CleanupDrainStatusOverride could not remove %s: %v", forceNodeDrainPath, err)
+        return
+    }
+    return
+}
+
+// DrainStatusFaultInjectionWait returns true while the fault injection file
+// exists; the drain status goroutine waits as long as it does
+func DrainStatusFaultInjectionWait() bool {
+    injectFaultPath := "/tmp/DrainStatus_FaultInjection_Wait"
+    if _, err := os.Stat(injectFaultPath); err == nil {
+        return true
+    }
+    return false
+}
+
+// GetNodeDrainStatus is a wrapper to either return the latest NodeDrainStatus
+//
+// or return a forced status from /persist/kube-status/force-NodeDrainStatus-global.json
+func GetNodeDrainStatus(subNodeDrainStatus pubsub.Subscription, log *base.LogObject) *NodeDrainStatus {
+    override := GetDrainStatusOverride(log)
+    if override != nil {
+        return override
+    }
+
+    items := subNodeDrainStatus.GetAll()
+    glbStatus, ok := items["global"].(NodeDrainStatus)
+    if !ok {
+        // This should only be expected on an HV=kubevirt build
+        // and only very early in boot (before zedkube starts)
+        return &NodeDrainStatus{Status: UNKNOWN, RequestedBy: NONE}
+    }
+    return &glbStatus
+}
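For illustration only (not part of this patch): a subscriber such as nodeagent is expected to call `GetNodeDrainStatus()` from its NodeDrainStatus handler and track whether a drain is in flight, as described in the doc above. A minimal sketch; the function name and its boolean return (standing in for republishing nodeagent status with drainInProgress) are assumptions:

```go
package example // hypothetical illustration, not part of this patch

import (
	"github.com/lf-edge/eve/pkg/pillar/base"
	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
	"github.com/lf-edge/eve/pkg/pillar/pubsub"
)

// handleNodeDrainStatus sketches a nodeagent-style handler: report
// drainInProgress while a drain is underway and clear it otherwise.
func handleNodeDrainStatus(sub pubsub.Subscription, log *base.LogObject) bool {
	status := kubeapi.GetNodeDrainStatus(sub, log)
	if status.Status >= kubeapi.REQUESTED && status.Status < kubeapi.COMPLETE {
		log.Noticef("node drain in progress: %s", status.Status.String())
		return true // keep drainInProgress set
	}
	return false // COMPLETE or not applicable: clear drainInProgress
}
```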
diff --git a/pkg/pillar/kubeapi/nokube.go b/pkg/pillar/kubeapi/nokube.go
index 16beb48c02..f8f7d86ac5 100644
--- a/pkg/pillar/kubeapi/nokube.go
+++ b/pkg/pillar/kubeapi/nokube.go
@@ -6,6 +6,7 @@
 package kubeapi
 
 import (
+    "fmt"
     "time"
 
     "github.com/lf-edge/eve/pkg/pillar/base"
@@ -27,3 +28,15 @@ func CleanupStaleVMI() (int, error) {
 func GetPVCList(*base.LogObject) ([]string, error) {
     panic("GetPVCList is not built")
 }
+
+// RequestNodeDrain is a stub for non-kubevirt builds
+func RequestNodeDrain(pubsub.Publication, DrainRequester, string) error {
+    // Drain is not supported on non-kubevirt builds; this should not be reached
+    return fmt.Errorf("nokube requested drain, should not get here")
+}
+
+// GetNodeDrainStatus is a stub for non-kubevirt builds
+func GetNodeDrainStatus(pubsub.Subscription, *base.LogObject) *NodeDrainStatus {
+    // No in-progress drain operations to query for, just report NOTSUPPORTED
+    return &NodeDrainStatus{Status: NOTSUPPORTED}
+}
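For illustration only (not part of this patch): because both build flavors expose the same entry points, callers can share a single code path and gate drain handling on NOTSUPPORTED. A minimal sketch with a hypothetical helper name:

```go
package example // hypothetical caller, same code for kubevirt and non-kubevirt builds

import (
	"github.com/lf-edge/eve/pkg/pillar/base"
	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
	"github.com/lf-edge/eve/pkg/pillar/pubsub"
)

// drainSupported lets a caller skip drain handling entirely on non-kubevirt
// builds or single-node systems, where the stub above returns NOTSUPPORTED.
func drainSupported(sub pubsub.Subscription, log *base.LogObject) bool {
	status := kubeapi.GetNodeDrainStatus(sub, log)
	return status.Status != kubeapi.NOTSUPPORTED
}
```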