From f4d4811b2c342f61b72431f44e761ed8ea358e92 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Thu, 10 Oct 2024 10:56:12 -0400 Subject: [PATCH] Really fix flaky `TestMultipleClients` tests (#603) * Really fix flaky `TestMultipleClients` tests https://github.com/grafana/dskit/pull/599 went in a wrong direction, so I revert it here: it makes all `TestMultipleClients` tests flaky since not all members are necessarily joined after the updates Instead, to fix the `TestMultipleClientsWithMixedLabelsAndExpectFailure` test, I add a couple more labeled members, that way, even if we reach 3 updates (which happened some times), we'll never get to 5 with a single member Also, try to fix the `TestTLSServerWithLocalhostCertWithClientCertificateEnforcementUsingClientCA1` test by closing GRPC connections. Maybe this (https://github.com/golang/go/issues/36700) is the issue? * Set `MaxIdleConnsPerHost` * New attempt to fix this test. Could it be aborted connections? * Retry RSTs --- crypto/tls/test/tls_integration_test.go | 64 ++++++++++++++----------- kv/memberlist/memberlist_client_test.go | 13 ++--- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/crypto/tls/test/tls_integration_test.go b/crypto/tls/test/tls_integration_test.go index 941cd39e3..c656c7e07 100644 --- a/crypto/tls/test/tls_integration_test.go +++ b/crypto/tls/test/tls_integration_test.go @@ -13,10 +13,12 @@ import ( "path/filepath" "runtime" "strconv" + "strings" "testing" "time" "github.com/gogo/status" + "github.com/hashicorp/go-cleanhttp" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -103,6 +105,7 @@ func newIntegrationClientServer( serv, err := server.New(cfg) require.NoError(t, err) + defer serv.Shutdown() serv.HTTP.HandleFunc("/hello", func(w http.ResponseWriter, _ *http.Request) { fmt.Fprintf(w, "OK") @@ -115,36 +118,42 @@ func newIntegrationClientServer( require.NoError(t, err) }() - // Wait until the server is up and running - assert.Eventually(t, func() bool { - conn, err := net.DialTimeout("tcp", httpAddr.String(), 1*time.Second) - if err != nil { - t.Logf("error dialing http: %v", err) - return false - } - defer conn.Close() - grpcConn, err := net.DialTimeout("tcp", grpcAddr.String(), 1*time.Second) - if err != nil { - t.Logf("error dialing grpc: %v", err) - return false - } - defer grpcConn.Close() - return true - }, 2500*time.Millisecond, 1*time.Second, "server is not up") - httpURL := fmt.Sprintf("https://localhost:%d/hello", httpAddr.Port) grpcHost := net.JoinHostPort("localhost", strconv.Itoa(grpcAddr.Port)) for _, tc := range tcs { - tlsClientConfig, err := tc.tlsConfig.GetTLSConfig() - require.NoError(t, err) - // HTTP t.Run("HTTP/"+tc.name, func(t *testing.T) { - transport := &http.Transport{TLSClientConfig: tlsClientConfig} + tlsClientConfig, err := tc.tlsConfig.GetTLSConfig() + require.NoError(t, err) + + transport := cleanhttp.DefaultTransport() + transport.TLSClientConfig = tlsClientConfig client := &http.Client{Transport: transport} - resp, err := client.Get(httpURL) + cancellableCtx, cancel := context.WithCancel(context.Background()) + defer cancel() + + req, err := http.NewRequestWithContext(cancellableCtx, http.MethodGet, httpURL, nil) + require.NoError(t, err) + + resp, err := client.Do(req) + // We retry the request a few times in case of a TCP reset (and we're expecting an error) + // Sometimes, the server resets the connection rather than sending the TLS error + // Seems that even Google have issues with RST flakiness: https://go-review.googlesource.com/c/go/+/527196 + isRST := func(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "connection reset by peer") || strings.Contains(err.Error(), "broken pipe") + } + for i := 0; i < 3 && isRST(err) && tc.httpExpectError != nil; i++ { + time.Sleep(100 * time.Millisecond) + resp, err = client.Do(req) + if err == nil { + defer resp.Body.Close() + } + } if err == nil { defer resp.Body.Close() } @@ -175,16 +184,18 @@ func newIntegrationClientServer( dialOptions = append([]grpc.DialOption{grpc.WithDefaultCallOptions(clientConfig.CallOptions()...)}, dialOptions...) conn, err := grpc.NewClient(grpcHost, dialOptions...) - assert.NoError(t, err, tc.name) - require.NoError(t, err, tc.name) require.NoError(t, err, tc.name) + defer conn.Close() + + cancellableCtx, cancel := context.WithCancel(context.Background()) + defer cancel() client := grpc_health_v1.NewHealthClient(conn) // TODO: Investigate why the client doesn't really receive the // error about the bad certificate from the server side and just // see connection closed instead - resp, err := client.Check(context.TODO(), &grpc_health_v1.HealthCheckRequest{}) + resp, err := client.Check(cancellableCtx, &grpc_health_v1.HealthCheckRequest{}) if tc.grpcExpectError != nil { tc.grpcExpectError(t, err) return @@ -194,10 +205,7 @@ func newIntegrationClientServer( assert.Equal(t, grpc_health_v1.HealthCheckResponse_SERVING, resp.Status) } }) - } - - serv.Shutdown() } func TestServerWithoutTlsEnabled(t *testing.T) { diff --git a/kv/memberlist/memberlist_client_test.go b/kv/memberlist/memberlist_client_test.go index 0d14c5f76..4df4f0572 100644 --- a/kv/memberlist/memberlist_client_test.go +++ b/kv/memberlist/memberlist_client_test.go @@ -590,12 +590,16 @@ func TestMultipleClientsWithMixedLabelsAndExpectFailure(t *testing.T) { // 1) "" // 2) "label1" // 3) "label2" + // 4) "label3" + // 5) "label4" // // We expect that it won't be possible to build a memberlist cluster with mixed labels. var membersLabel = []string{ "", "label1", "label2", + "label3", + "label4", } configGen := func(i int) KVConfig { @@ -609,7 +613,7 @@ func TestMultipleClientsWithMixedLabelsAndExpectFailure(t *testing.T) { err := testMultipleClientsWithConfigGenerator(t, len(membersLabel), configGen) require.Error(t, err) - require.Contains(t, err.Error(), fmt.Sprintf("expected to see %d members, got", len(membersLabel))) + require.Contains(t, err.Error(), fmt.Sprintf("expected to see at least %d updates", len(membersLabel))) } func TestMultipleClientsWithMixedLabelsAndClusterLabelVerificationDisabled(t *testing.T) { @@ -720,7 +724,6 @@ func testMultipleClientsWithConfigGenerator(t *testing.T, members int, configGen firstKv := clients[0] ctx, cancel := context.WithTimeout(context.Background(), casInterval*3) // Watch for 3x cas intervals. updates := 0 - gotMembers := 0 firstKv.WatchKey(ctx, key, func(in interface{}) bool { updates++ @@ -733,18 +736,12 @@ func testMultipleClientsWithConfigGenerator(t *testing.T, members int, configGen "tokens, oldest timestamp:", now.Sub(time.Unix(minTimestamp, 0)).String(), "avg timestamp:", now.Sub(time.Unix(avgTimestamp, 0)).String(), "youngest timestamp:", now.Sub(time.Unix(maxTimestamp, 0)).String()) - gotMembers = len(r.Members) return true // yes, keep watching }) cancel() // make linter happy t.Logf("Ring updates observed: %d", updates) - // We expect that all members are in the ring - if gotMembers != members { - return fmt.Errorf("expected to see %d members, got %d", members, gotMembers) - } - if updates < members { // in general, at least one update from each node. (although that's not necessarily true... // but typically we get more updates than that anyway)