From 14c1ac32bf3e98eb54ccbcaa4b55cf19823482c4 Mon Sep 17 00:00:00 2001 From: akutz Date: Mon, 16 Dec 2024 13:21:21 -0600 Subject: [PATCH] Do not CBLO if VM watcher service fails This patch updates the behavior of the VM watcher service to no longer return an error if there is an issue starting. While this is the correct behavior, it also results in CBLO for the VM Op pod if the reason for the error is external and not resolved. For example, if a user configures the host name of vCenter to be invalid, the VM Op pod will be in CBLO until this is resolved. Again, this is technically correct, and resolves itself once the user fixes the misconfigured setting. However, the VM Op pod becoming unavailable on a single-node Supervisor also means no webhooks are running, and this would be bad. --- services/vm-watcher/vm_watcher_service.go | 16 ++++++--- .../vm-watcher/vm_watcher_service_test.go | 34 ++++++++++++++++--- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/services/vm-watcher/vm_watcher_service.go b/services/vm-watcher/vm_watcher_service.go index dbdfc74bf..5a7f9e0e6 100644 --- a/services/vm-watcher/vm_watcher_service.go +++ b/services/vm-watcher/vm_watcher_service.go @@ -84,10 +84,18 @@ func (s Service) Start(ctx context.Context) error { // error, then do not treat the error as fatal. This allows the // loop to run again, kicking off another watcher with what should // be updated credentials. - if !vsphereclient.IsInvalidLogin(err) && - !vsphereclient.IsNotAuthenticatedError(err) { - - return err + if vsphereclient.IsInvalidLogin(err) || + vsphereclient.IsNotAuthenticatedError(err) { + + logger.V(4).Error( + err, + "Authn/authz issue trying start vm watcher service") + + } else { + // Log unexpected error. + logger.Error( + err, + "Unexpected error trying to start vm watcher service") } } } diff --git a/services/vm-watcher/vm_watcher_service_test.go b/services/vm-watcher/vm_watcher_service_test.go index 986b48a04..09fc24148 100644 --- a/services/vm-watcher/vm_watcher_service_test.go +++ b/services/vm-watcher/vm_watcher_service_test.go @@ -221,8 +221,12 @@ var _ = Describe( }) When("the port is invalid", func() { + var ( + oldPort string + ) + JustBeforeEach(func() { - vcSimCtx.Suite.StartErrExpected = true + vcSimCtx.Suite.StartErrExpected = false Eventually(func(g Gomega) { vsClientMu.RLock() @@ -236,6 +240,7 @@ var _ = Describe( vsClientMu.Lock() defer vsClientMu.Unlock() + oldPort = vcSimCtx.VCClientConfig.Port vcSimCtx.VCClientConfig.Port = "1" }) @@ -247,10 +252,31 @@ var _ = Describe( }) }) - Specify("the service should fail", func() { + Specify("the service should be restarted once the port is fixed", func() { Eventually(func(g Gomega) { - g.Expect(vcSimCtx.Suite.StartErr()).To(HaveOccurred()) - g.Expect(atomic.LoadInt32(&numNewClientCalls)).To(Equal(int32(2))) + vsClientMu.RLock() + defer vsClientMu.RUnlock() + + g.Expect(vsClient.Valid()).To(BeFalse()) + g.Expect(atomic.LoadInt32(&numNewClientCalls)).To(BeNumerically(">=", int32(2))) + }).Should(Succeed()) + + By("fix the port", func() { + vsClientMu.Lock() + defer vsClientMu.Unlock() + + vcSimCtx.VCClientConfig.Port = oldPort + }) + + Eventually(func(g Gomega) { + vsClientMu.RLock() + defer vsClientMu.RUnlock() + + g.Expect(atomic.LoadInt32(&numNewClientCalls)).To(BeNumerically(">=", int32(3))) + }).Should(Succeed()) + + Consistently(func(g Gomega) { + g.Expect(vcSimCtx.Suite.StartErr()).ToNot(HaveOccurred()) }).Should(Succeed()) }) })