Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the retry of the rpc client #26795

Merged
merged 1 commit into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions configs/milvus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -406,10 +406,10 @@ grpc:
dialTimeout: 200
keepAliveTime: 10000
keepAliveTimeout: 20000
maxMaxAttempts: 5
initialBackoff: 1
maxBackoff: 10
backoffMultiplier: 2
maxMaxAttempts: 10
initialBackOff: 0.2 # seconds
maxBackoff: 10 # seconds
backoffMultiplier: 2.0 # deprecated
clientMaxSendSize: 268435456
clientMaxRecvSize: 268435456

Expand Down
4 changes: 0 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ module github.com/milvus-io/milvus
go 1.18

require (
github.com/DATA-DOG/go-sqlmock v1.5.0
github.com/aliyun/credentials-go v1.2.7
github.com/antlr/antlr4/runtime/Go/antlr v0.0.0-20210826220005-b48c857c3a0e
github.com/antonmedv/expr v1.8.9
Expand Down Expand Up @@ -53,7 +52,6 @@ require (
golang.org/x/text v0.9.0
google.golang.org/grpc v1.54.0
google.golang.org/grpc/examples v0.0.0-20220617181431-3e7b97febc7f
gorm.io/gorm v1.23.8
stathat.com/c/consistent v1.0.0
)

Expand Down Expand Up @@ -118,8 +116,6 @@ require (
github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/ianlancetaylor/cgosymbolizer v0.0.0-20221217025313-27d3c9f66b6a // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/jonboulle/clockwork v0.2.2 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/asmfmt v1.3.1 // indirect
Expand Down
9 changes: 0 additions & 9 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym
github.com/CloudyKit/fastprinter v0.0.0-20200109182630-33d98a066a53/go.mod h1:+3IMCy2vIlbG1XG/0ggNQv0SvxCAIpPM5b1nCz56Xno=
github.com/CloudyKit/jet/v3 v3.0.0/go.mod h1:HKQPgSJmdK8hdoAbKUUWajkHyHo4RaU5rMdUywE7VMo=
github.com/DATA-DOG/go-sqlmock v1.3.3/go.mod h1:f/Ixk793poVmq4qj/V1dPUg2JEAKC73Q5eFN3EC/SaM=
github.com/DATA-DOG/go-sqlmock v1.5.0 h1:Shsta01QNfFxHCfpW6YH2STWB0MudeXXEWMr20OEh60=
github.com/DATA-DOG/go-sqlmock v1.5.0/go.mod h1:f/Ixk793poVmq4qj/V1dPUg2JEAKC73Q5eFN3EC/SaM=
github.com/DataDog/zstd v1.5.0 h1:+K/VEwIAaPcHiMtQvpLD4lqW7f0Gk3xdYZmI1hD+CXo=
github.com/DataDog/zstd v1.5.0/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU=
Expand Down Expand Up @@ -468,11 +466,6 @@ github.com/jhump/gopoet v0.1.0/go.mod h1:me9yfT6IJSlOL3FCfrg+L6yzUEZ+5jW6WHt4Sk+
github.com/jhump/goprotoc v0.5.0/go.mod h1:VrbvcYrQOrTi3i0Vf+m+oqQWk9l72mjkJCYo7UvLHRQ=
github.com/jhump/protoreflect v1.11.0/go.mod h1:U7aMIjN0NWq9swDP7xDdoMfRHb35uiuTd3Z9nFXJf5E=
github.com/jhump/protoreflect v1.12.0/go.mod h1:JytZfP5d0r8pVNLZvai7U/MCuTWITgrI4tTg7puQFKI=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.4/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik=
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ=
Expand Down Expand Up @@ -1447,8 +1440,6 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorm.io/gorm v1.23.8 h1:h8sGJ+biDgBA1AD1Ha9gFCx7h8npU7AsLdlkX0n2TpE=
gorm.io/gorm v1.23.8/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
Expand Down
19 changes: 13 additions & 6 deletions internal/querycoordv2/meta/collection_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package meta
import (
"context"
"fmt"
"strconv"
"sync"
"time"

Expand Down Expand Up @@ -123,20 +124,25 @@ func (m *CollectionManager) Recover(broker Broker) error {
return err
}

ctx := log.WithTraceID(context.Background(), strconv.FormatInt(time.Now().UnixNano(), 10))
ctxLog := log.Ctx(ctx)
ctxLog.Info("recover collections and partitions from kv store")

for _, collection := range collections {
// Collections that have been dropped should be released rather than recovered
_, err = broker.GetCollectionSchema(context.Background(), collection.GetCollectionID())
_, err = broker.GetCollectionSchema(ctx, collection.GetCollectionID())
if errors.Is(err, merr.ErrCollectionNotFound) {
log.Info("skip dropped collection during recovery", zap.Int64("collection", collection.GetCollectionID()))
ctxLog.Info("skip dropped collection during recovery", zap.Int64("collection", collection.GetCollectionID()))
m.catalog.ReleaseCollection(collection.GetCollectionID())
continue
}
if err != nil {
ctxLog.Warn("failed to get collection schema", zap.Error(err))
return err
}
// Collections that have not finished loading should be released rather than recovered
if collection.GetStatus() != querypb.LoadStatus_Loaded || collection.GetReplicaNumber() <= 0 {
log.Info("skip recovery and release collection",
ctxLog.Info("skip recovery and release collection",
zap.Int64("collectionID", collection.GetCollectionID()),
zap.String("status", collection.GetStatus().String()),
zap.Int32("replicaNumber", collection.GetReplicaNumber()),
Expand All @@ -150,13 +156,14 @@ func (m *CollectionManager) Recover(broker Broker) error {
}

for collection, partitions := range partitions {
existPartitions, err := broker.GetPartitions(context.Background(), collection)
existPartitions, err := broker.GetPartitions(ctx, collection)
if errors.Is(err, merr.ErrCollectionNotFound) {
log.Info("skip dropped collection during recovery", zap.Int64("collection", collection))
ctxLog.Info("skip dropped collection during recovery", zap.Int64("collection", collection))
m.catalog.ReleaseCollection(collection)
continue
}
if err != nil {
ctxLog.Warn("failed to get partitions", zap.Error(err))
return err
}
omitPartitions := make([]int64, 0)
Expand All @@ -168,7 +175,7 @@ func (m *CollectionManager) Recover(broker Broker) error {
return true
})
if len(omitPartitions) > 0 {
log.Info("skip dropped partitions during recovery",
ctxLog.Info("skip dropped partitions during recovery",
zap.Int64("collection", collection), zap.Int64s("partitions", omitPartitions))
m.catalog.ReleasePartition(collection, omitPartitions...)
}
Expand Down
2 changes: 1 addition & 1 deletion internal/querycoordv2/meta/coordinator_broker.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func (broker *CoordinatorBroker) GetCollectionSchema(ctx context.Context, collec

err = merr.Error(resp.GetStatus())
if err != nil {
log.Warn("failed to get collection schema", zap.Error(err))
log.Ctx(ctx).Warn("failed to get collection schema", zap.Error(err))
return nil, err
}
return resp.GetSchema(), nil
Expand Down
4 changes: 3 additions & 1 deletion internal/querycoordv2/mocks/querynode.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ func (node *MockQueryNode) Start() error {
case <-node.ctx.Done():
return nil
default:
return &milvuspb.ComponentStates{}
return &milvuspb.ComponentStates{
Status: successStatus,
}
}
}, func(context.Context, *milvuspb.GetComponentStatesRequest) error {
select {
Expand Down
6 changes: 3 additions & 3 deletions internal/querycoordv2/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ func (s *Server) initMeta() error {
log.Info("recover meta...")
err := s.meta.CollectionManager.Recover(s.broker)
if err != nil {
log.Warn("failed to recover collections")
log.Warn("failed to recover collections", zap.Error(err))
return err
}
collections := s.meta.GetAll()
Expand All @@ -323,13 +323,13 @@ func (s *Server) initMeta() error {

err = s.meta.ReplicaManager.Recover(collections)
if err != nil {
log.Warn("failed to recover replicas")
log.Warn("failed to recover replicas", zap.Error(err))
return err
}

err = s.meta.ResourceManager.Recover()
if err != nil {
log.Warn("failed to recover resource groups")
log.Warn("failed to recover resource groups", zap.Error(err))
return err
}

Expand Down
3 changes: 3 additions & 0 deletions internal/querycoordv2/session/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package session
import (
"context"
"net"
"strconv"
"testing"
"time"

Expand All @@ -45,10 +46,12 @@ type ClusterTestSuite struct {

func (suite *ClusterTestSuite) SetupSuite() {
paramtable.Init()
paramtable.Get().Save("grpc.client.maxMaxAttempts", "1")
suite.setupServers()
}

func (suite *ClusterTestSuite) TearDownSuite() {
paramtable.Get().Save("grpc.client.maxMaxAttempts", strconv.FormatInt(paramtable.DefaultMaxAttempts, 10))
for _, svr := range suite.svrs {
svr.GracefulStop()
}
Expand Down
2 changes: 1 addition & 1 deletion internal/querynodev2/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryReque
))

//
failRet.Status.ErrorCode = commonpb.ErrorCode_Success
ret.Status = merr.Status(nil)
latency := tr.ElapseSpan()
metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), metrics.QueryLabel, metrics.Leader).Observe(float64(latency.Milliseconds()))
metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.Leader).Inc()
Expand Down
3 changes: 2 additions & 1 deletion internal/querynodev2/segments/result.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ func MergeInternalRetrieveResult(ctx context.Context, retrieveResults []*interna
)
var (
ret = &internalpb.RetrieveResults{
Ids: &schemapb.IDs{},
Status: merr.Status(nil),
Ids: &schemapb.IDs{},
}
skipDupCnt int64
loopEnd int
Expand Down
2 changes: 1 addition & 1 deletion internal/rootcoord/root_coord.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ func (c *Core) Init() error {
log.Error("RootCoord start failed", zap.Error(err))
}
})
log.Info("RootCoord startup success")
log.Info("RootCoord startup success", zap.String("address", c.session.Address))
return err
}
c.UpdateStateCode(commonpb.StateCode_StandBy)
Expand Down
Loading