add leader election retry #66

Merged on Feb 7, 2024 with 37 commits
Changes from 16 commits
Commits
75b2fd8  fix leader election retry (samos123, Feb 3, 2024)
9a049f1  increase sleep from 20 to 30 (samos123, Feb 3, 2024)
d8fdf7f  add single replica e2e tests (samos123, Feb 3, 2024)
23f6904  add more descriptive name to e2e replica test (samos123, Feb 3, 2024)
fd8725a  stream all logs of lingo (samos123, Feb 3, 2024)
e365dd7  add retry to leader election process (samos123, Feb 3, 2024)
4b2acd8  increase apiserver unavailability from 30s to 60s (samos123, Feb 3, 2024)
c149556  recreate context if context deadline exceeded (samos123, Feb 4, 2024)
1dbb678  ensure apiserver outage is 2 minutes (samos123, Feb 4, 2024)
ba46c40  add log to indicate context deadline exceeded (samos123, Feb 4, 2024)
3f1d48d  kubectl sometimes returns errors when apiserver went away for too long (samos123, Feb 4, 2024)
7339da4  make wait for backoff blocking (samos123, Feb 4, 2024)
228920b  fix logging in e2e test after apiserver went down (samos123, Feb 4, 2024)
2405e3b  remove unneeded check for context deadline exceeds (samos123, Feb 4, 2024)
83c81ab  wait for apiserver to be ready (samos123, Feb 4, 2024)
c711ae6  Add more logs in e2e test (samos123, Feb 4, 2024)
b8fc6b7  address PR comments (samos123, Feb 4, 2024)
8236150  simplify tests and run in parallel (samos123, Feb 4, 2024)
c63fd0e  improve GHA job names (samos123, Feb 5, 2024)
e7cedfa  simplify leader election retry (samos123, Feb 5, 2024)
19a89af  increase wait time for scale back to 0 in e2e (samos123, Feb 5, 2024)
6ad6446  fix #67 flapping scale from 0 to 1 to 0 to 1 (samos123, Feb 5, 2024)
38ff2ad  add hostname to leader log messages (samos123, Feb 5, 2024)
d4a3947  maybe this fixes #67 (samos123, Feb 5, 2024)
b056bce  fix PR comment, thanks Alex! (samos123, Feb 6, 2024)
ba9d1e0  improve string formatting (samos123, Feb 6, 2024)
985831e  sleep for 20 sec after apiserver outage (samos123, Feb 6, 2024)
610ba31  wait wasn't long enough (samos123, Feb 6, 2024)
b7bd4cf  revert fix for #67 because it breaks scale down to 0 (samos123, Feb 6, 2024)
8bea0bf  fix #67 only the leader should scale (samos123, Feb 6, 2024)
9552e9e  simplify fix for #67 and unit tests (samos123, Feb 6, 2024)
d720074  print lingo logs of all replicas on failure (samos123, Feb 6, 2024)
fae4bab  in some cases state is incorrect so just scale to desired scale (samos123, Feb 6, 2024)
af888e6  Revert "in some cases state is incorrect so just scale to desired scale" (samos123, Feb 6, 2024)
89b8a6e  Revert "simplify fix for #67 and unit tests" (samos123, Feb 6, 2024)
3f6bab5  Revert "fix #67 only the leader should scale" (samos123, Feb 6, 2024)
7310411  remove broken test (samos123, Feb 7, 2024)
7 changes: 5 additions & 2 deletions .github/workflows/tests.yml
@@ -27,8 +27,11 @@ jobs:
run: make test-integration

e2e:
strategy:
matrix:
replicas: ["1", "3"]
runs-on: ubuntu-latest

name: E2E kind tests Lingo.replicas=${{ matrix.replicas }}
steps:
- name: Checkout code
uses: actions/checkout@v2
@@ -49,4 +52,4 @@ jobs:
sudo mv skaffold /usr/local/bin

- name: Run e2e tests
run: make test-e2e
run: REPLICAS=${{ matrix.replicas }} make test-e2e
21 changes: 20 additions & 1 deletion pkg/leader/election.go
@@ -10,6 +10,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
"k8s.io/client-go/util/flowcontrol"
)

func NewElection(clientset kubernetes.Interface, id, namespace string) *Election {
@@ -63,5 +64,23 @@ type Election struct {
}

func (le *Election) Start(ctx context.Context) {
	leaderelection.RunOrDie(ctx, le.config)
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Second)
	const backoffID = "lingo-leader-election"
	retryCount := 0
	for {
		select {
		case <-ctx.Done():
			return
		default:
			if retryCount > 0 {
				backoff.Next(backoffID, backoff.Clock.Now())

Contributor: Neat, I didn't know about this library.

				delay := backoff.Get(backoffID)
				log.Printf("Leader election failed, retrying in %v. RetryCount: %v", delay, retryCount+1)
				<-time.After(delay)

Contributor: time.After does not respect context cancellation. I put together what I think would be a better way to go about this loop as an example: https://github.com/substratusai/lingo/compare/example-leader-election-loop ... I played around with this implementation and it exits immediately on context cancellation while still accomplishing exponential backoff (go run ./hack/retry ... ctrl-c). This is done by moving time.After into the select cases.

Contributor Author (samos123): Originally I had this in a select, but I removed it to simplify things since I thought the higher-level select was good enough. I will move it back in; that makes sense. Thanks for validating this with a quick experiment!

			}
			log.Printf("Starting leader election process. RetryCount: %v", retryCount+1)
			leaderelection.RunOrDie(ctx, le.config)

Contributor: Is the idea that RunOrDie eventually exits if it loses connection to the API server?

Contributor Author (samos123): That's exactly what seems to happen. This Kong PR has more details: Kong/kubernetes-ingress-controller#578

Contributor Author (samos123, Feb 4, 2024): I confirmed this by checking the logs and seeing how often it had to retry (re-run RunOrDie) while the apiserver was down for ~2 minutes.

			retryCount++
		}
	}
}
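
Following the review discussion above (time.After not honoring context cancellation, and RunOrDie returning once the lease can no longer be renewed), here is a minimal sketch of how the retry loop could look with the backoff wait moved into the select. This is an illustration, not the merged code: the package name and the pared-down Election struct are assumptions, and the real implementation lives in pkg/leader/election.go.

// Sketch only: package name and struct fields are assumed, not taken from the repo.
package leader

import (
	"context"
	"log"
	"time"

	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/util/flowcontrol"
)

// Election is reduced to the single field this sketch needs.
type Election struct {
	config leaderelection.LeaderElectionConfig
}

func (le *Election) Start(ctx context.Context) {
	// Exponential backoff between attempts: start at 1s, cap at 15s.
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Second)
	const backoffID = "lingo-leader-election"

	for retryCount := 0; ; retryCount++ {
		if retryCount > 0 {
			backoff.Next(backoffID, backoff.Clock.Now())
			delay := backoff.Get(backoffID)
			log.Printf("Leader election failed, retrying in %v. RetryCount: %v", delay, retryCount+1)
			// Waiting inside the select means cancellation is honored immediately,
			// instead of blocking on a bare <-time.After(delay) until the delay elapses.
			select {
			case <-ctx.Done():
				return
			case <-time.After(delay):
			}
		}
		if ctx.Err() != nil {
			return
		}
		log.Printf("Starting leader election process. RetryCount: %v", retryCount+1)
		// RunOrDie blocks while this replica takes part in the election and
		// returns once leadership is lost or the lease cannot be renewed
		// (e.g. during an apiserver outage), which triggers the next retry.
		leaderelection.RunOrDie(ctx, le.config)
	}
}

This mirrors what the reviewer's example branch describes: the wait becomes a select case, so context cancellation (e.g. ctrl-c in the hack/retry experiment) exits immediately while exponential backoff is preserved.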
24 changes: 21 additions & 3 deletions tests/e2e/test.sh
@@ -6,6 +6,7 @@ set -xe
HOST=127.0.0.1
PORT=30080
BASE_URL="http://$HOST:$PORT/v1"
REPLICAS=${REPLICAS:-3}


if kind get clusters | grep -q substratus-test; then
@@ -42,6 +43,9 @@ if ! kubectl get deployment lingo; then
skaffold run
fi

kubectl patch deployment lingo --patch "{\"spec\": {\"replicas\": $REPLICAS}}"

kubectl logs -f deployment/lingo &

kubectl wait --for=condition=available --timeout=30s deployment/lingo

@@ -89,18 +93,32 @@ if [ "$replicas" -eq 1 ]; then
exit 1
fi

echo "Waiting for deployment to scale down back to 0 within 2 minutes"
# Verify that leader election works by forcing a 120 second apiserver outage
KIND_NODE=$(kind get nodes --name=substratus-test)
docker exec ${KIND_NODE} iptables -I INPUT -p tcp --dport 6443 -j DROP

Contributor: Is 6443 the port of the API server running on kind?

Contributor Author (samos123): Yep, and credit to ChatGPT for coming up with the high-level idea of simply blocking the traffic:

k describe svc kubernetes 
Name:              kubernetes
Namespace:         default
Labels:            component=apiserver
                   provider=kubernetes
Annotations:       <none>
Selector:          <none>
Type:              ClusterIP
IP Family Policy:  SingleStack
IP Families:       IPv4
IP:                10.96.0.1
IPs:               10.96.0.1
Port:              https  443/TCP
TargetPort:        6443/TCP
Endpoints:         172.18.0.2:6443
Session Affinity:  None
Events:            <none>

sleep 120
docker exec ${KIND_NODE} iptables -D INPUT -p tcp --dport 6443 -j DROP

until kubectl get pods; do

Contributor: What is "until kubectl get pods; do" doing?

Contributor Author (samos123): Even after the iptables rule is removed, it sometimes takes a while for Kubernetes to recover. This until loop waits for kubectl get pods to start working again: the script only continues once kubectl get pods returns exit code 0.

echo "Waiting for apiserver to be back up"
sleep 1
done

# rerun kubectl logs because previous one got killed when apiserver was down
kubectl logs --tail=500 -f deployment/lingo &

Contributor: I think you might want --tail=-1 if you are trying to get all logs since the restart.

Contributor Author (samos123): I didn't want all the logs, which may already have 1000+ entries, only the last 500. I tried it: the last 100 wasn't enough, but the last 500 was more than enough.


echo "Waiting for deployment to scale down back to 0 within ~1 minute"
for i in {1..15}; do
if [ "$i" -eq 15 ]; then
echo "Test failed: Expected 0 replica after not having requests for more than 1 minute, got $replicas"
exit 1
fi
replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}' || true)
if [ "$replicas" -eq 0 ]; then
echo "Test passed: Expected 0 replica after not having requests for more than 1 minute"
break
fi
sleep 8
sleep 6
done

echo "Patching stapi deployment to sleep on startup"