Skip to content

Commit

Permalink
Fix all flaky tests (#600)
Browse files Browse the repository at this point in the history
- `TestPartitionInstanceLifecycler`: Increase test polling timeout. 1s seems tight, it fails with `-race`. Fixes #572
- `TestBasicLifecycler_HeartbeatWhileRunning`: Same as above
- `TestRejoin`: Converted the test to use `require.Eventually` instead of a custom func + increased polling timeout a bit
- `TestTLSServerWithLocalhostCertWithClientCertificateEnforcementUsingClientCA1`: Wait until the server is ready before running tests
  • Loading branch information
julienduchesne authored Oct 8, 2024
1 parent f77c1dd commit 13d10a4
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 32 deletions.
17 changes: 17 additions & 0 deletions crypto/tls/test/tls_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,23 @@ func newIntegrationClientServer(
require.NoError(t, err)
}()

// Wait until the server is up and running
assert.Eventually(t, func() bool {
conn, err := net.DialTimeout("tcp", httpAddr.String(), 1*time.Second)
if err != nil {
t.Logf("error dialing http: %v", err)
return false
}
defer conn.Close()
grpcConn, err := net.DialTimeout("tcp", grpcAddr.String(), 1*time.Second)
if err != nil {
t.Logf("error dialing grpc: %v", err)
return false
}
defer grpcConn.Close()
return true
}, 2500*time.Millisecond, 1*time.Second, "server is not up")

httpURL := fmt.Sprintf("https://localhost:%d/hello", httpAddr.Port)
grpcHost := net.JoinHostPort("localhost", strconv.Itoa(grpcAddr.Port))

Expand Down
31 changes: 5 additions & 26 deletions kv/memberlist/memberlist_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"math"
"math/rand"
"net"
"reflect"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -1243,24 +1242,24 @@ func TestRejoin(t *testing.T) {
require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv2))
defer services.StopAndAwaitTerminated(context.Background(), mkv2) //nolint:errcheck

membersFunc := func() interface{} {
return mkv2.memberlist.NumMembers()
expectMembers := func(expected int) func() bool {
return func() bool { return mkv2.memberlist.NumMembers() == expected }
}

poll(t, 7*time.Second, 2, membersFunc) // Probe interval is 5s, with 2s timeout, so probe for 7s.
require.Eventually(t, expectMembers(2), 10*time.Second, 100*time.Millisecond, "expected 2 members in the cluster")

// Shutdown first KV
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), mkv1))

// Second KV should see single member now.
poll(t, 7*time.Second, 1, membersFunc)
require.Eventually(t, expectMembers(1), 10*time.Second, 100*time.Millisecond, "expected 1 member in the cluster")

// Let's start first KV again. It is not configured to join the cluster, but KV2 is rejoining.
mkv1 = NewKV(cfg1, log.NewNopLogger(), &dnsProviderMock{}, prometheus.NewPedanticRegistry())
require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv1))
defer services.StopAndAwaitTerminated(context.Background(), mkv1) //nolint:errcheck

poll(t, 7*time.Second, 2, membersFunc)
require.Eventually(t, expectMembers(2), 10*time.Second, 100*time.Millisecond, "expected 2 member in the cluster")
}

func TestMessageBuffer(t *testing.T) {
Expand Down Expand Up @@ -1601,26 +1600,6 @@ func getOrCreateData(in interface{}) *data {
return r
}

// poll repeatedly evaluates condition until we either timeout, or it succeeds.
func poll(t testing.TB, d time.Duration, want interface{}, have func() interface{}) {
t.Helper()

deadline := time.Now().Add(d)
for {
if time.Now().After(deadline) {
break
}
if reflect.DeepEqual(want, have()) {
return
}
time.Sleep(d / 100)
}
h := have()
if !reflect.DeepEqual(want, h) {
t.Fatalf("expected %v, got %v", want, h)
}
}

type testLogger struct {
}

Expand Down
8 changes: 3 additions & 5 deletions ring/basic_lifecycler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,12 +331,10 @@ func TestBasicLifecycler_HeartbeatWhileRunning(t *testing.T) {
desc, _ := getInstanceFromStore(t, store, testInstanceID)
initialTimestamp := desc.GetTimestamp()

test.Poll(t, time.Second, true, func() interface{} {
assert.Eventually(t, func() bool {
desc, _ := getInstanceFromStore(t, store, testInstanceID)
currTimestamp := desc.GetTimestamp()

return currTimestamp > initialTimestamp
})
return desc.GetTimestamp() > initialTimestamp
}, 2*time.Second, 10*time.Millisecond, "expected timestamp to be updated")

assert.Greater(t, testutil.ToFloat64(lifecycler.metrics.heartbeats), float64(0))
}
Expand Down
2 changes: 1 addition & 1 deletion ring/partition_instance_lifecycler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func TestPartitionInstanceLifecycler(t *testing.T) {
assert.Eventually(t, func() bool {
actual := getPartitionRingFromStore(t, store, ringKey)
return actual.Partitions[1].State == PartitionActive
}, time.Second, eventuallyTick)
}, 3*time.Second, eventuallyTick)
})

t.Run("should wait for the configured minimum waiting time before switching a pending partition to active", func(t *testing.T) {
Expand Down

0 comments on commit 13d10a4

Please sign in to comment.