From c1f91589963632704fb7f3a629fca259b3010b87 Mon Sep 17 00:00:00 2001 From: Chun Han <116052805+MrPresent-Han@users.noreply.github.com> Date: Fri, 13 Dec 2024 03:54:43 -0500 Subject: [PATCH] =?UTF-8?q?fix:=20search-group-by=20failed=20to=20get=20da?= =?UTF-8?q?ta=20from=20multi-chunked-segment(##=E2=80=A6=20(#38383)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit related: #38343 Signed-off-by: MrPresent-Han Co-authored-by: MrPresent-Han --- .../operator/groupby/SearchGroupByOperator.h | 67 ++++++++++--------- internal/core/src/index/ScalarIndex.h | 2 +- internal/core/src/segcore/SegmentSealedImpl.h | 9 ++- 3 files changed, 44 insertions(+), 34 deletions(-) diff --git a/internal/core/src/exec/operator/groupby/SearchGroupByOperator.h b/internal/core/src/exec/operator/groupby/SearchGroupByOperator.h index 3a91bb1828818..f05c42c22dbfe 100644 --- a/internal/core/src/exec/operator/groupby/SearchGroupByOperator.h +++ b/internal/core/src/exec/operator/groupby/SearchGroupByOperator.h @@ -61,50 +61,53 @@ class GrowingDataGetter : public DataGetter { template class SealedDataGetter : public DataGetter { private: - std::shared_ptr> field_data_; - std::shared_ptr> str_field_data_; - const index::ScalarIndex* field_index_; + const segcore::SegmentSealed& segment_; + const FieldId field_id_; + bool from_data_; + mutable std::unordered_map> + str_view_map_; + // Getting str_view from segment is cpu-costly, this map is to cache this view for performance public: - SealedDataGetter(const segcore::SegmentSealed& segment, FieldId& field_id) { - if (segment.HasFieldData(field_id)) { - if constexpr (std::is_same_v) { - str_field_data_ = - std::make_shared>( - segment.chunk_view(field_id, 0) - .first); - } else { - auto span = segment.chunk_data(field_id, 0); - field_data_ = std::make_shared>( - span.data(), span.valid_data(), span.row_count()); - } - } else if (segment.HasIndex(field_id)) { - this->field_index_ = &(segment.chunk_scalar_index(field_id, 0)); - } else { - PanicInfo(UnexpectedError, - "The segment used to init data getter has no effective " - "data source, neither" - "index or data"); + SealedDataGetter(const segcore::SegmentSealed& segment, FieldId& field_id) + : segment_(segment), field_id_(field_id) { + from_data_ = segment_.HasFieldData(field_id_); + if (!from_data_ && !segment_.HasIndex(field_id_)) { + PanicInfo( + UnexpectedError, + "The segment:{} used to init data getter has no effective " + "data source, neither" + "index or data", + segment_.get_segment_id()); } } - SealedDataGetter(const SealedDataGetter& other) - : field_data_(other.field_data_), - str_field_data_(other.str_field_data_), - field_index_(other.field_index_) { - } - T Get(int64_t idx) const { - if (field_data_ || str_field_data_) { + if (from_data_) { + auto id_offset_pair = segment_.get_chunk_by_offset(field_id_, idx); + auto chunk_id = id_offset_pair.first; + auto inner_offset = id_offset_pair.second; if constexpr (std::is_same_v) { + if (str_view_map_.find(chunk_id) == str_view_map_.end()) { + // for now, search_group_by does not handle null values + auto [str_chunk_view, _] = + segment_.chunk_view(field_id_, + chunk_id); + str_view_map_[chunk_id] = std::move(str_chunk_view); + } + auto& str_chunk_view = str_view_map_[chunk_id]; std::string_view str_val_view = - str_field_data_->operator[](idx); + str_chunk_view.operator[](inner_offset); return std::string(str_val_view.data(), str_val_view.length()); + } else { + Span span = segment_.chunk_data(field_id_, chunk_id); + auto raw = span.operator[](inner_offset); + return raw; } - return field_data_->operator[](idx); } else { - auto raw = (*field_index_).Reverse_Lookup(idx); + auto& chunk_index = segment_.chunk_scalar_index(field_id_, 0); + auto raw = chunk_index.Reverse_Lookup(idx); AssertInfo(raw.has_value(), "field data not found"); return raw.value(); } diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index 15c95d27e0b0e..ba1ac78da992a 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -134,7 +134,7 @@ class ScalarIndex : public IndexBase { } virtual bool - IsMmapSupported() const { + IsMmapSupported() const override { return index_type_ == milvus::index::BITMAP_INDEX_TYPE || index_type_ == milvus::index::HYBRID_INDEX_TYPE; } diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index 0266916dd5f7a..3b99562b6d714 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -165,7 +165,14 @@ class SegmentSealedImpl : public SegmentSealed { std::pair get_chunk_by_offset(FieldId field_id, int64_t offset) const override { - PanicInfo(ErrorCode::Unsupported, "Not implemented"); + if (fields_.find(field_id) == fields_.end()) { + PanicInfo( + ErrorCode::FieldIDInvalid, + "Failed to get chunk offset towards a non-existing field:{}", + field_id.get()); + } + // for sealed segment, chunk id is always zero and input offset is the target offset + return std::make_pair(0, offset); } int64_t