Really fix flaky TestMultipleClients tests (#603)
* Really fix flaky `TestMultipleClients` tests
#599 went in the wrong direction, so I revert it here: it made all `TestMultipleClients` tests flaky, since not all members are necessarily joined after the updates

Instead, to fix the `TestMultipleClientsWithMixedLabelsAndExpectFailure` test, I add a couple more labeled members; that way, even if we reach 3 updates (which happened sometimes), we'll never get to 5 with a single member

Also, try to fix the `TestTLSServerWithLocalhostCertWithClientCertificateEnforcementUsingClientCA1` test by closing gRPC connections. Maybe this (golang/go#36700) is the issue?

* Set `MaxIdleConnsPerHost`

* New attempt to fix this test. Could it be aborted connections?

* Retry RSTs (see the sketch below)
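
As context for the `MaxIdleConnsPerHost` and "Retry RSTs" bullets, here is a minimal standalone sketch of the idea, not the repository's code: names like `getWithRetry` and `isReset` are illustrative, and it assumes the `github.com/hashicorp/go-cleanhttp` dependency added in the diff below. cleanhttp's non-pooled default transport disables keep-alives and sets `MaxIdleConnsPerHost` to -1, and the request is retried a few times when it fails with a TCP reset instead of the expected TLS error.

```go
package sketch

import (
	"crypto/tls"
	"net/http"
	"strings"
	"time"

	"github.com/hashicorp/go-cleanhttp"
)

// isReset reports whether an error looks like a TCP reset or a broken pipe.
// Some servers reset the connection instead of delivering the TLS alert,
// which is what made the test flaky.
func isReset(err error) bool {
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), "connection reset by peer") ||
		strings.Contains(err.Error(), "broken pipe")
}

// getWithRetry issues a GET through cleanhttp's non-pooled transport and
// retries a few times when the connection is reset, so the caller observes
// the intended TLS error rather than a spurious RST.
func getWithRetry(url string, tlsCfg *tls.Config) (*http.Response, error) {
	transport := cleanhttp.DefaultTransport() // keep-alives off, MaxIdleConnsPerHost: -1
	transport.TLSClientConfig = tlsCfg
	client := &http.Client{Transport: transport}

	var resp *http.Response
	var err error
	for attempt := 0; attempt < 3; attempt++ {
		resp, err = client.Get(url)
		if !isReset(err) {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}
	return resp, err
}
```

Because the retry only triggers on resets, a genuine TLS handshake failure still surfaces on the first attempt.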
julienduchesne authored Oct 10, 2024
1 parent 2e104a8 commit f4d4811
Showing 2 changed files with 41 additions and 36 deletions.
64 changes: 36 additions & 28 deletions crypto/tls/test/tls_integration_test.go
@@ -13,10 +13,12 @@ import (
"path/filepath"
"runtime"
"strconv"
"strings"
"testing"
"time"

"github.com/gogo/status"
"github.com/hashicorp/go-cleanhttp"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -103,6 +105,7 @@ func newIntegrationClientServer(

serv, err := server.New(cfg)
require.NoError(t, err)
defer serv.Shutdown()

serv.HTTP.HandleFunc("/hello", func(w http.ResponseWriter, _ *http.Request) {
fmt.Fprintf(w, "OK")
@@ -115,36 +118,42 @@ func newIntegrationClientServer(
require.NoError(t, err)
}()

// Wait until the server is up and running
assert.Eventually(t, func() bool {
conn, err := net.DialTimeout("tcp", httpAddr.String(), 1*time.Second)
if err != nil {
t.Logf("error dialing http: %v", err)
return false
}
defer conn.Close()
grpcConn, err := net.DialTimeout("tcp", grpcAddr.String(), 1*time.Second)
if err != nil {
t.Logf("error dialing grpc: %v", err)
return false
}
defer grpcConn.Close()
return true
}, 2500*time.Millisecond, 1*time.Second, "server is not up")

httpURL := fmt.Sprintf("https://localhost:%d/hello", httpAddr.Port)
grpcHost := net.JoinHostPort("localhost", strconv.Itoa(grpcAddr.Port))

for _, tc := range tcs {
tlsClientConfig, err := tc.tlsConfig.GetTLSConfig()
require.NoError(t, err)

// HTTP
t.Run("HTTP/"+tc.name, func(t *testing.T) {
transport := &http.Transport{TLSClientConfig: tlsClientConfig}
tlsClientConfig, err := tc.tlsConfig.GetTLSConfig()
require.NoError(t, err)

transport := cleanhttp.DefaultTransport()
transport.TLSClientConfig = tlsClientConfig
client := &http.Client{Transport: transport}

resp, err := client.Get(httpURL)
cancellableCtx, cancel := context.WithCancel(context.Background())
defer cancel()

req, err := http.NewRequestWithContext(cancellableCtx, http.MethodGet, httpURL, nil)
require.NoError(t, err)

resp, err := client.Do(req)
// We retry the request a few times in case of a TCP reset (and we're expecting an error)
// Sometimes, the server resets the connection rather than sending the TLS error
// Seems that even Google have issues with RST flakiness: https://go-review.googlesource.com/c/go/+/527196
isRST := func(err error) bool {
if err == nil {
return false
}
return strings.Contains(err.Error(), "connection reset by peer") || strings.Contains(err.Error(), "broken pipe")
}
for i := 0; i < 3 && isRST(err) && tc.httpExpectError != nil; i++ {
time.Sleep(100 * time.Millisecond)
resp, err = client.Do(req)
if err == nil {
defer resp.Body.Close()
}
}
if err == nil {
defer resp.Body.Close()
}
@@ -175,16 +184,18 @@ func newIntegrationClientServer(
dialOptions = append([]grpc.DialOption{grpc.WithDefaultCallOptions(clientConfig.CallOptions()...)}, dialOptions...)

conn, err := grpc.NewClient(grpcHost, dialOptions...)
assert.NoError(t, err, tc.name)
require.NoError(t, err, tc.name)
defer conn.Close()

cancellableCtx, cancel := context.WithCancel(context.Background())
defer cancel()

client := grpc_health_v1.NewHealthClient(conn)

// TODO: Investigate why the client doesn't really receive the
// error about the bad certificate from the server side and just
// see connection closed instead
resp, err := client.Check(context.TODO(), &grpc_health_v1.HealthCheckRequest{})
resp, err := client.Check(cancellableCtx, &grpc_health_v1.HealthCheckRequest{})
if tc.grpcExpectError != nil {
tc.grpcExpectError(t, err)
return
@@ -194,10 +205,7 @@ func newIntegrationClientServer(
assert.Equal(t, grpc_health_v1.HealthCheckResponse_SERVING, resp.Status)
}
})

}

serv.Shutdown()
}

func TestServerWithoutTlsEnabled(t *testing.T) {
13 changes: 5 additions & 8 deletions kv/memberlist/memberlist_client_test.go
@@ -590,12 +590,16 @@ func TestMultipleClientsWithMixedLabelsAndExpectFailure(t *testing.T) {
// 1) ""
// 2) "label1"
// 3) "label2"
// 4) "label3"
// 5) "label4"
//
// We expect that it won't be possible to build a memberlist cluster with mixed labels.
var membersLabel = []string{
"",
"label1",
"label2",
"label3",
"label4",
}

configGen := func(i int) KVConfig {
@@ -609,7 +613,7 @@ func TestMultipleClientsWithMixedLabelsAndExpectFailure(t *testing.T) {

err := testMultipleClientsWithConfigGenerator(t, len(membersLabel), configGen)
require.Error(t, err)
require.Contains(t, err.Error(), fmt.Sprintf("expected to see %d members, got", len(membersLabel)))
require.Contains(t, err.Error(), fmt.Sprintf("expected to see at least %d updates", len(membersLabel)))
}

func TestMultipleClientsWithMixedLabelsAndClusterLabelVerificationDisabled(t *testing.T) {
@@ -720,7 +724,6 @@ func testMultipleClientsWithConfigGenerator(t *testing.T, members int, configGen
firstKv := clients[0]
ctx, cancel := context.WithTimeout(context.Background(), casInterval*3) // Watch for 3x cas intervals.
updates := 0
gotMembers := 0
firstKv.WatchKey(ctx, key, func(in interface{}) bool {
updates++

@@ -733,18 +736,12 @@ func testMultipleClientsWithConfigGenerator(t *testing.T, members int, configGen
"tokens, oldest timestamp:", now.Sub(time.Unix(minTimestamp, 0)).String(),
"avg timestamp:", now.Sub(time.Unix(avgTimestamp, 0)).String(),
"youngest timestamp:", now.Sub(time.Unix(maxTimestamp, 0)).String())
gotMembers = len(r.Members)
return true // yes, keep watching
})
cancel() // make linter happy

t.Logf("Ring updates observed: %d", updates)

// We expect that all members are in the ring
if gotMembers != members {
return fmt.Errorf("expected to see %d members, got %d", members, gotMembers)
}

if updates < members {
// in general, at least one update from each node. (although that's not necessarily true...
// but typically we get more updates than that anyway)
