From b29237e5d5dc1e41236d460b8310abd1faad97c1 Mon Sep 17 00:00:00 2001 From: wei liu Date: Tue, 3 Dec 2024 18:16:39 +0800 Subject: [PATCH] enhance: Add collection id to search request count metrics (#38069) (#38144) pr: #38069 #38167 --------- Signed-off-by: Wei Liu --- internal/querynodev2/handlers.go | 16 ++++----- internal/querynodev2/services.go | 18 +++++----- pkg/metrics/querynode_metrics.go | 56 ++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/internal/querynodev2/handlers.go b/internal/querynodev2/handlers.go index abe645d4e5423..fa570734b7664 100644 --- a/internal/querynodev2/handlers.go +++ b/internal/querynodev2/handlers.go @@ -193,10 +193,10 @@ func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryReque ) var err error - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() defer func() { if err != nil { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -252,12 +252,12 @@ func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryReque latency := tr.ElapseSpan() metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.Leader).Observe(float64(latency.Milliseconds())) - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() return resp, nil } func (node *QueryNode) queryChannelStream(ctx context.Context, req *querypb.QueryRequest, channel string, srv streamrpc.QueryStreamServer) error { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() msgID := req.Req.Base.GetMsgID() log := log.Ctx(ctx).With( zap.Int64("msgID", msgID), @@ -269,7 +269,7 @@ func (node *QueryNode) queryChannelStream(ctx context.Context, req *querypb.Quer var err error defer func() { if err != nil { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -350,10 +350,10 @@ func (node *QueryNode) searchChannel(ctx context.Context, req *querypb.SearchReq defer node.lifetime.Done() var err error - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.TotalLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.TotalLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() defer func() { if err != nil { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.FailLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.FailLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -404,7 +404,7 @@ func (node *QueryNode) searchChannel(ctx context.Context, req *querypb.SearchReq // update metric to prometheus latency := tr.ElapseSpan() metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.Leader).Observe(float64(latency.Milliseconds())) - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.SuccessLabel, metrics.Leader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.SuccessLabel, metrics.Leader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() metrics.QueryNodeSearchNQ.WithLabelValues(fmt.Sprint(node.GetNodeID())).Observe(float64(req.Req.GetNq())) metrics.QueryNodeSearchTopK.WithLabelValues(fmt.Sprint(node.GetNodeID())).Observe(float64(req.Req.GetTopk())) return resp, nil diff --git a/internal/querynodev2/services.go b/internal/querynodev2/services.go index b238c41b3f31e..1fa77d3d4ec74 100644 --- a/internal/querynodev2/services.go +++ b/internal/querynodev2/services.go @@ -674,10 +674,10 @@ func (node *QueryNode) SearchSegments(ctx context.Context, req *querypb.SearchRe } defer node.lifetime.Done() - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.TotalLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.TotalLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() defer func() { if !merr.Ok(resp.GetStatus()) { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.FailLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.FailLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -725,7 +725,7 @@ func (node *QueryNode) SearchSegments(ctx context.Context, req *querypb.SearchRe latency := tr.ElapseSpan() metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.FromLeader).Observe(float64(latency.Milliseconds())) - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.SuccessLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.SearchLabel, metrics.SuccessLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() resp = task.SearchResult() resp.GetCostAggregation().ResponseTime = tr.ElapseSpan().Milliseconds() @@ -852,10 +852,10 @@ func (node *QueryNode) QuerySegments(ctx context.Context, req *querypb.QueryRequ } defer node.lifetime.Done() - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() defer func() { if resp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -894,7 +894,7 @@ func (node *QueryNode) QuerySegments(ctx context.Context, req *querypb.QueryRequ // TODO QueryNodeSQLatencyInQueue QueryNodeReduceLatency latency := tr.ElapseSpan() metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FromLeader).Observe(float64(latency.Milliseconds())) - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() result := task.Result() result.GetCostAggregation().ResponseTime = latency.Milliseconds() result.GetCostAggregation().TotalNQ = node.scheduler.GetWaitingTaskTotalNQ() @@ -1049,10 +1049,10 @@ func (node *QueryNode) QueryStreamSegments(req *querypb.QueryRequest, srv queryp ) resp := &internalpb.RetrieveResults{} - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.TotalLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() defer func() { if resp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success { - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FailLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() } }() @@ -1083,7 +1083,7 @@ func (node *QueryNode) QueryStreamSegments(req *querypb.QueryRequest, srv queryp // TODO QueryNodeSQLatencyInQueue QueryNodeReduceLatency latency := tr.ElapseSpan() metrics.QueryNodeSQReqLatency.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.FromLeader).Observe(float64(latency.Milliseconds())) - metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.FromLeader).Inc() + metrics.QueryNodeSQCount.WithLabelValues(fmt.Sprint(node.GetNodeID()), metrics.QueryLabel, metrics.SuccessLabel, metrics.FromLeader, fmt.Sprint(req.GetReq().GetCollectionID())).Inc() return nil } diff --git a/pkg/metrics/querynode_metrics.go b/pkg/metrics/querynode_metrics.go index 80ce633bb8ccf..eb04d158d5580 100644 --- a/pkg/metrics/querynode_metrics.go +++ b/pkg/metrics/querynode_metrics.go @@ -162,6 +162,7 @@ var ( queryTypeLabelName, statusLabelName, requestScope, + collectionIDLabelName, }) QueryNodeSQReqLatency = prometheus.NewHistogramVec( @@ -905,4 +906,59 @@ func CleanupQueryNodeCollectionMetrics(nodeID int64, collectionID int64) { nodeIDLabelName: nodeIDLabel, collectionIDLabelName: collectionIDLabel, }) + QueryNodeNumSegments. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeSQCount. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeSearchHitSegmentNum. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeSegmentPruneRatio. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeSegmentPruneBias. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeSegmentPruneLatency. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeEntitiesSize. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) + + QueryNodeLevelZeroSize. + DeletePartialMatch( + prometheus.Labels{ + nodeIDLabelName: nodeIDLabel, + collectionIDLabelName: collectionIDLabel, + }) }