Skip to content

Commit

Permalink
fix: [2.4] querycoord collection num metric (#36471)(#38233) (#38240)
Browse files Browse the repository at this point in the history
Cherry pick from master
pr: #36471 #38233

related to: #36456

---------

Signed-off-by: shaoting-huang <[email protected]>
Signed-off-by: Congqi Xia <[email protected]>
Co-authored-by: sthuang <[email protected]>
  • Loading branch information
congqixia and shaoting-huang authored Dec 6, 2024
1 parent 12cc500 commit 9eb08e1
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 8 deletions.
4 changes: 0 additions & 4 deletions internal/querycoordv2/job/job_release.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ func (job *ReleaseCollectionJob) Execute() error {
proxyutil.SetMsgType(commonpb.MsgType_ReleaseCollection))

waitCollectionReleased(job.dist, job.checkerController, req.GetCollectionID())
metrics.QueryCoordNumCollections.WithLabelValues().Dec()
metrics.QueryCoordNumPartitions.WithLabelValues().Sub(float64(len(toRelease)))
metrics.QueryCoordReleaseCount.WithLabelValues(metrics.TotalLabel).Inc()
metrics.QueryCoordReleaseCount.WithLabelValues(metrics.SuccessLabel).Inc()
return nil
Expand Down Expand Up @@ -196,7 +194,6 @@ func (job *ReleasePartitionJob) Execute() error {
log.Warn("failed to remove replicas", zap.Error(err))
}
job.targetObserver.ReleaseCollection(req.GetCollectionID())
metrics.QueryCoordNumCollections.WithLabelValues().Dec()
// best effort: try to discard the proxy's collection meta cache;
// a failure here shall not affect the release
job.proxyManager.InvalidateCollectionMetaCache(job.ctx,
Expand All @@ -216,6 +213,5 @@ func (job *ReleasePartitionJob) Execute() error {
job.targetObserver.ReleasePartition(req.GetCollectionID(), toRelease...)
waitCollectionReleased(job.dist, job.checkerController, req.GetCollectionID(), toRelease...)
}
metrics.QueryCoordNumPartitions.WithLabelValues().Sub(float64(len(toRelease)))
return nil
}
14 changes: 10 additions & 4 deletions internal/querycoordv2/meta/collection_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,11 @@ func (m *CollectionManager) putPartition(partitions []*Partition, withSave bool)
return nil
}

// updateLoadMetrics recomputes the loaded-collection and loaded-partition
// gauges from the manager's in-memory maps. An entry is counted only when its
// LoadPercentage is exactly 100, so the gauges are set to absolute values
// rather than adjusted incrementally.
//
// NOTE(review): m.collections and m.partitions are read here without taking a
// lock; callers presumably hold m.rwmutex already — confirm.
func (m *CollectionManager) updateLoadMetrics() {
	loadedCollections := 0
	for _, coll := range m.collections {
		if coll.LoadPercentage == 100 {
			loadedCollections++
		}
	}
	metrics.QueryCoordNumCollections.WithLabelValues().Set(float64(loadedCollections))

	loadedPartitions := 0
	for _, part := range m.partitions {
		if part.LoadPercentage == 100 {
			loadedPartitions++
		}
	}
	metrics.QueryCoordNumPartitions.WithLabelValues().Set(float64(loadedPartitions))
}

func (m *CollectionManager) UpdatePartitionLoadPercent(partitionID int64, loadPercent int32) error {
m.rwmutex.Lock()
defer m.rwmutex.Unlock()
Expand Down Expand Up @@ -592,9 +597,7 @@ func (m *CollectionManager) UpdateCollectionLoadPercent(collectionID int64) (int
// if collection becomes loaded, clear its recoverTimes in load info
newCollection.RecoverTimes = 0

// TODO: what if part of the collection has been unloaded? Now we decrease the metric only after
// `ReleaseCollection` is triggered. Maybe it's hard to make this metric really accurate.
metrics.QueryCoordNumCollections.WithLabelValues().Inc()
defer m.updateLoadMetrics()
elapsed := time.Since(newCollection.CreatedAt)
metrics.QueryCoordLoadLatency.WithLabelValues().Observe(float64(elapsed.Milliseconds()))
eventlog.Record(eventlog.NewRawEvt(eventlog.Level_Info, fmt.Sprintf("Collection %d loaded", newCollection.CollectionID)))
Expand All @@ -620,6 +623,7 @@ func (m *CollectionManager) RemoveCollection(collectionID typeutil.UniqueID) err
delete(m.collectionPartitions, collectionID)
}
metrics.CleanQueryCoordMetricsWithCollectionID(collectionID)
m.updateLoadMetrics()
return nil
}

Expand All @@ -631,7 +635,8 @@ func (m *CollectionManager) RemovePartition(collectionID typeutil.UniqueID, part
m.rwmutex.Lock()
defer m.rwmutex.Unlock()

return m.removePartition(collectionID, partitionIDs...)
err := m.removePartition(collectionID, partitionIDs...)
return err
}

func (m *CollectionManager) removePartition(collectionID typeutil.UniqueID, partitionIDs ...typeutil.UniqueID) error {
Expand All @@ -644,6 +649,7 @@ func (m *CollectionManager) removePartition(collectionID typeutil.UniqueID, part
delete(m.partitions, id)
delete(partitions, id)
}
m.updateLoadMetrics()

return nil
}
Expand Down

0 comments on commit 9eb08e1

Please sign in to comment.