From 9a2610fbfae19c189fe259e2b14ce4ead6b0efdb Mon Sep 17 00:00:00 2001
From: Krisztian Litkey <krisztian.litkey@intel.com>
Date: Fri, 22 Nov 2024 11:34:50 +0200
Subject: [PATCH 1/4] resmgr,agent: propagate startup config error back to CR.

Update the agent's config notification callback signature and
the resmgr notification callaback to propagate errors for the
initial configuration back to the source CR before exiting.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
---
 pkg/agent/agent.go             | 13 +++++++++----
 pkg/resmgr/resource-manager.go | 14 +++++++-------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go
index af0409ef1..334b298b2 100644
--- a/pkg/agent/agent.go
+++ b/pkg/agent/agent.go
@@ -103,7 +103,7 @@ func TemplateConfigInterface() ConfigInterface {
 }
 
 // NotifyFn is a function to call when the effective configuration changes.
-type NotifyFn func(cfg interface{}) error
+type NotifyFn func(cfg interface{}) (bool, error)
 
 var (
 	// Our logger instance for the agent.
@@ -544,12 +544,17 @@ func (a *Agent) updateConfig(cfg metav1.Object) {
 		}
 	}
 
-	err := a.notifyFn(cfg)
+	fatal, err := a.notifyFn(cfg)
+	a.patchConfigStatus(a.currentCfg, cfg, err)
+
 	if err != nil {
-		log.Errorf("failed to apply configuration: %v", err)
+		if fatal {
+			log.Fatalf("failed to apply configuration: %v", err)
+		} else {
+			log.Errorf("failed to apply configuration: %v", err)
+		}
 	}
 
-	a.patchConfigStatus(a.currentCfg, cfg, err)
 	a.currentCfg = cfg
 }
 
diff --git a/pkg/resmgr/resource-manager.go b/pkg/resmgr/resource-manager.go
index cff9c7cfb..b5ba74f23 100644
--- a/pkg/resmgr/resource-manager.go
+++ b/pkg/resmgr/resource-manager.go
@@ -128,17 +128,17 @@ func (m *resmgr) Start() error {
 	return nil
 }
 
-func (m *resmgr) updateConfig(newCfg interface{}) error {
+func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) {
 	if newCfg == nil {
-		return fmt.Errorf("can't run without effective configuration...")
+		return false, fmt.Errorf("can't run without effective configuration...")
 	}
 
 	cfg, ok := newCfg.(cfgapi.ResmgrConfig)
 	if !ok {
 		if !m.running {
-			log.Fatalf("got initial configuration of unexpected type %T", newCfg)
+			return true, fmt.Errorf("got initial configuration of unexpected type %T", newCfg)
 		} else {
-			return fmt.Errorf("got configuration of unexpected type %T", newCfg)
+			return false, fmt.Errorf("got configuration of unexpected type %T", newCfg)
 		}
 	}
 
@@ -151,17 +151,17 @@ func (m *resmgr) updateConfig(newCfg interface{}) error {
 		log.InfoBlock("  <initial config> ", "%s", dump)
 
 		if err := m.start(cfg); err != nil {
-			log.Fatalf("failed to start with initial configuration: %v", err)
+			return true, fmt.Errorf("failed to start with initial configuration: %v", err)
 		}
 
 		m.running = true
-		return nil
+		return false, nil
 	}
 
 	log.Infof("configuration update %s (generation %d):", meta.GetName(), meta.GetGeneration())
 	log.InfoBlock("  <updated config> ", "%s", dump)
 
-	return m.reconfigure(cfg)
+	return false, m.reconfigure(cfg)
 }
 
 // Start resource management once we acquired initial configuration.

From a1eaf95ce28c301f3e4fae01d6f4ce277985e346 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey <krisztian.litkey@intel.com>
Date: Fri, 22 Nov 2024 15:12:12 +0200
Subject: [PATCH 2/4] test/e2e: fix get-config-node-status-error.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
---
 test/e2e/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/e2e/run.sh b/test/e2e/run.sh
index cee7d8736..40da77e8b 100755
--- a/test/e2e/run.sh
+++ b/test/e2e/run.sh
@@ -502,7 +502,7 @@ get-config-node-status-result() {
 get-config-node-status-error() {
     local resource="$1" node="$(get-hostname-for-vm)"
     vm-command-q "kubectl get -n kube-system $resource \
-                      -ojsonpath=\"{.status.nodes['$node'].error}\""
+                      -ojsonpath=\"{.status.nodes['$node'].errors}\""
 }
 
 wait-config-node-status() {

From edd0ad311044e6b5179180ff1ec6ed03d1ad5538 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey <krisztian.litkey@intel.com>
Date: Fri, 22 Nov 2024 15:13:46 +0200
Subject: [PATCH 3/4] test/e2e: allow expected helm-launch (getting available)
 errors.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
---
 test/e2e/run.sh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/e2e/run.sh b/test/e2e/run.sh
index 40da77e8b..d3fc8eced 100755
--- a/test/e2e/run.sh
+++ b/test/e2e/run.sh
@@ -353,6 +353,7 @@ helm-launch() { # script API
     #         default: 20s
     #     cfgresource: config custom resource to wait for node status to change in
     #         default: balloonspolicies/default or topologyawarepolicies/default
+    #     expect_error: don't exit, expect availability error
     #
     # Example:
     #     helm_config=$(instantiate helm-config.yaml) helm-launch balloons
@@ -361,6 +362,7 @@ helm-launch() { # script API
     local helm_config="${helm_config:-$TEST_DIR/helm-config.yaml}"
     local ds_name="${daemonset_name:-}" ctr_name="${container_name:-nri-resource-policy-$1}"
     local helm_name="${helm_name:-test}" timeout="${launch_timeout:-20s}"
+    local expect_error="${expect_error:-0}"
     local plugin="$1" cfgresource=${cfgresource:-} cfgstatus
     local deadline
     shift
@@ -403,8 +405,15 @@ helm-launch() { # script API
 
     deadline=$(deadline-for-timeout $timeout)
     vm-command "kubectl wait -n kube-system ds/${ds_name} --timeout=$timeout \
-                    --for=jsonpath='{.status.numberAvailable}'=1" || \
-        error "Timeout while waiting daemonset/${ds_name} to be available"
+                    --for=jsonpath='{.status.numberAvailable}'=1"
+
+    if [ "$COMMAND_STATUS" != "0" ]; then
+        if [ "$expect_error" != "1" ]; then
+            error "Timeout while waiting daemonset/${ds_name} to be available"
+        else
+            return 1
+        fi
+    fi
 
     timeout=$(timeout-for-deadline $deadline)
     timeout=$timeout wait-config-node-status $cfgresource
@@ -413,7 +422,6 @@ helm-launch() { # script API
     if [ "$result" != "Success" ]; then
         reason=$(get-config-node-status-error $cfgresource)
         error "Plugin $plugin configuration failed: $reason"
-        return 1
     fi
 
     vm-start-log-collection -n kube-system ds/$ds_name -c $ctr_name

From a441a0f2d9626ef46bccbbea2aff54505a0fe2a7 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey <krisztian.litkey@intel.com>
Date: Fri, 22 Nov 2024 15:19:14 +0200
Subject: [PATCH 4/4] test/e2e: add initial config error propagation test.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
---
 .../n4c16/test12-config-status/code.var.sh    | 28 ++++++++++++++++++-
 .../custom-config.yaml.in                     |  2 +-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh
index 644001707..c8472e046 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh
@@ -1,4 +1,13 @@
-helm-terminate
+CONFIG_GROUP="group.test"
+
+cleanup() {
+    vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/default" || :
+    vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/$CONFIG_GROUP" || :
+    vm-command "kubectl label nodes --all config.nri/group-" || :
+    helm-terminate || :
+}
+
+cleanup
 helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
 
 sleep 1
@@ -12,6 +21,7 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
     error "expected initial Success status"
 }
 
+# verify propagation of errors back to source CR
 vm-put-file $(RESERVED_CPU=750x instantiate custom-config.yaml) broken-config.yaml
 vm-command "kubectl apply -f broken-config.yaml"
 
@@ -26,3 +36,19 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
 }
 
 helm-terminate
+
+# verify propagation of initial configuration errors back to source CR
+vm-put-file $(CONFIG_NAME="$CONFIG_GROUP" RESERVED_CPU=750x instantiate custom-config.yaml) \
+            broken-group-config.yaml
+vm-command "kubectl apply -f broken-group-config.yaml" || \
+    error "failed to install broken group config"
+vm-command "kubectl label nodes --all config.nri/group=test" || \
+    error "failed to label nodes for group config"
+
+expect_error=1 launch_timeout=5s helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
+get-config-node-status-error topologyawarepolicies/$CONFIG_GROUP | \
+    grep "failed to parse" | grep -q 750x || {
+    error "expected initial error not found in status"
+}
+
+cleanup
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in
index 720ef00d4..4a5f812de 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in
@@ -1,7 +1,7 @@
 apiVersion: config.nri/v1alpha1
 kind: TopologyAwarePolicy
 metadata:
-  name: default
+  name: ${CONFIG_NAME:-default}
   namespace: kube-system
 spec:
   pinCPU: true