diff --git a/configs/milvus.yaml b/configs/milvus.yaml index 4a2a62cdc7e12..ef2d99ffe3c22 100644 --- a/configs/milvus.yaml +++ b/configs/milvus.yaml @@ -871,4 +871,4 @@ trace: #maxMemSize will the whole available GPU memory. gpu: initMemSize: # Gpu Memory Pool init size - maxMemSize: # Gpu Memory Pool Max size + maxMemSize: # Gpu Memory Pool Max size \ No newline at end of file diff --git a/internal/core/src/common/FieldData.cpp b/internal/core/src/common/FieldData.cpp index 220b6a3864f8b..bd913d6541567 100644 --- a/internal/core/src/common/FieldData.cpp +++ b/internal/core/src/common/FieldData.cpp @@ -30,6 +30,9 @@ template void FieldDataImpl::FillFieldData(const void* source, ssize_t element_count) { + AssertInfo(!nullable_, + "need to fill valid_data, use the 3-argument version instead"); + if (element_count == 0) { return; } @@ -40,7 +43,38 @@ FieldDataImpl::FillFieldData(const void* source, } std::copy_n(static_cast(source), element_count * dim_, - field_data_.data() + length_ * dim_); + data_.data() + length_ * dim_); + length_ += element_count; +} + +template +void +FieldDataImpl::FillFieldData( + const void* field_data, const uint8_t* valid_data, ssize_t element_count) { + AssertInfo( + nullable_, + "no need to fill valid_data, use the 2-argument version instead"); + if (element_count == 0) { + return; + } + + std::lock_guard lck(tell_mutex_); + if (length_ + element_count > get_num_rows()) { + resize_field_data(length_ + element_count); + } + std::copy_n(static_cast(field_data), + element_count * dim_, + data_.data() + length_ * dim_); + + ssize_t byte_count = (element_count + 7) / 8; + // Note: if 'nullable == true` and valid_data is nullptr + // means null_count == 0, will fill it with 0xFF + if (valid_data == nullptr) { + valid_data_.resize(byte_count, 0xFF); + } else { + std::copy_n(valid_data, byte_count, valid_data_.data()); + } + length_ += element_count; } @@ -66,6 +100,7 @@ FieldDataImpl::FillFieldData( if (element_count == 0) { return; } + 
null_count = array->null_count(); switch (data_type_) { case DataType::BOOL: { AssertInfo(array->type()->id() == arrow::Type::type::BOOL, @@ -76,42 +111,71 @@ FieldDataImpl::FillFieldData( for (size_t index = 0; index < element_count; ++index) { values[index] = bool_array->Value(index); } + if (nullable_) { + return FillFieldData(values.data(), + bool_array->null_bitmap_data(), + element_count); + } return FillFieldData(values.data(), element_count); } case DataType::INT8: { auto array_info = GetDataInfoFromArray( array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::INT16: { auto array_info = GetDataInfoFromArray(array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::INT32: { auto array_info = GetDataInfoFromArray(array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::INT64: { auto array_info = GetDataInfoFromArray(array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::FLOAT: { auto array_info = GetDataInfoFromArray(array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::DOUBLE: { auto array_info = GetDataInfoFromArray(array); + if (nullable_) { + return FillFieldData( + array_info.first, array->null_bitmap_data(), element_count); + } return FillFieldData(array_info.first, array_info.second); } case DataType::STRING: @@ -124,6 +188,10 @@ FieldDataImpl::FillFieldData( for (size_t 
index = 0; index < element_count; ++index) { values[index] = string_array->GetString(index); } + if (nullable_) { + return FillFieldData( + values.data(), array->null_bitmap_data(), element_count); + } return FillFieldData(values.data(), element_count); } case DataType::JSON: { @@ -136,17 +204,33 @@ FieldDataImpl::FillFieldData( values[index] = Json(simdjson::padded_string(json_array->GetString(index))); } + if (nullable_) { + return FillFieldData( + values.data(), array->null_bitmap_data(), element_count); + } return FillFieldData(values.data(), element_count); } case DataType::ARRAY: { auto array_array = std::dynamic_pointer_cast(array); std::vector values(element_count); + int null_number = 0; for (size_t index = 0; index < element_count; ++index) { ScalarArray field_data; - field_data.ParseFromString(array_array->GetString(index)); + if (array_array->GetString(index) == "") { + null_number++; + continue; + } + auto success = + field_data.ParseFromString(array_array->GetString(index)); + AssertInfo(success, "parse from string failed"); values[index] = Array(field_data); } + if (nullable_) { + return FillFieldData( + values.data(), array->null_bitmap_data(), element_count); + } + AssertInfo(null_number == 0, "get empty string when not nullable"); return FillFieldData(values.data(), element_count); } case DataType::VECTOR_FLOAT: @@ -201,27 +285,33 @@ template class FieldDataImpl; template class FieldDataImpl, true>; FieldDataPtr -InitScalarFieldData(const DataType& type, int64_t cap_rows) { +InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows) { switch (type) { case DataType::BOOL: - return std::make_shared>(type, cap_rows); + return std::make_shared>(type, nullable, cap_rows); case DataType::INT8: - return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::INT16: - return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::INT32: - 
return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::INT64: - return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::FLOAT: - return std::make_shared>(type, cap_rows); + return std::make_shared>(type, nullable, cap_rows); case DataType::DOUBLE: - return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::STRING: case DataType::VARCHAR: - return std::make_shared>(type, cap_rows); + return std::make_shared>( + type, nullable, cap_rows); case DataType::JSON: - return std::make_shared>(type, cap_rows); + return std::make_shared>(type, nullable, cap_rows); default: PanicInfo(DataTypeInvalid, "InitScalarFieldData not support data type " + diff --git a/internal/core/src/common/FieldData.h b/internal/core/src/common/FieldData.h index 60e0c74b3ad56..de796fa3c5e8a 100644 --- a/internal/core/src/common/FieldData.h +++ b/internal/core/src/common/FieldData.h @@ -30,14 +30,18 @@ template class FieldData : public FieldDataImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) + explicit FieldData(DataType data_type, + bool nullable, + int64_t buffered_num_rows = 0) : FieldDataImpl::FieldDataImpl( - 1, data_type, buffered_num_rows) { + 1, data_type, nullable, buffered_num_rows) { } static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type, FixedVector&& inner_data) + explicit FieldData(DataType data_type, + bool nullable, + FixedVector&& inner_data) : FieldDataImpl::FieldDataImpl( - 1, data_type, std::move(inner_data)) { + 1, data_type, nullable, std::move(inner_data)) { } }; @@ -45,8 +49,10 @@ template <> class FieldData : public FieldDataStringImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) - : FieldDataStringImpl(data_type, 
buffered_num_rows) { + explicit FieldData(DataType data_type, + bool nullable, + int64_t buffered_num_rows = 0) + : FieldDataStringImpl(data_type, nullable, buffered_num_rows) { } }; @@ -54,8 +60,10 @@ template <> class FieldData : public FieldDataJsonImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) - : FieldDataJsonImpl(data_type, buffered_num_rows) { + explicit FieldData(DataType data_type, + bool nullable, + int64_t buffered_num_rows = 0) + : FieldDataJsonImpl(data_type, nullable, buffered_num_rows) { } }; @@ -63,8 +71,10 @@ template <> class FieldData : public FieldDataArrayImpl { public: static_assert(IsScalar || std::is_same_v); - explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0) - : FieldDataArrayImpl(data_type, buffered_num_rows) { + explicit FieldData(DataType data_type, + bool nullable, + int64_t buffered_num_rows = 0) + : FieldDataArrayImpl(data_type, nullable, buffered_num_rows) { } }; @@ -75,7 +85,7 @@ class FieldData : public FieldDataImpl { DataType data_type, int64_t buffered_num_rows = 0) : FieldDataImpl::FieldDataImpl( - dim, data_type, buffered_num_rows) { + dim, data_type, false, buffered_num_rows) { } }; @@ -86,7 +96,7 @@ class FieldData : public FieldDataImpl { DataType data_type, int64_t buffered_num_rows = 0) : binary_dim_(dim), - FieldDataImpl(dim / 8, data_type, buffered_num_rows) { + FieldDataImpl(dim / 8, data_type, false, buffered_num_rows) { Assert(dim % 8 == 0); } @@ -106,7 +116,7 @@ class FieldData : public FieldDataImpl { DataType data_type, int64_t buffered_num_rows = 0) : FieldDataImpl::FieldDataImpl( - dim, data_type, buffered_num_rows) { + dim, data_type, false, buffered_num_rows) { } }; @@ -117,7 +127,7 @@ class FieldData : public FieldDataImpl { DataType data_type, int64_t buffered_num_rows = 0) : FieldDataImpl::FieldDataImpl( - dim, data_type, buffered_num_rows) { + dim, data_type, false, buffered_num_rows) { } }; @@ -134,6 
+144,6 @@ using FieldDataChannel = Channel; using FieldDataChannelPtr = std::shared_ptr; FieldDataPtr -InitScalarFieldData(const DataType& type, int64_t cap_rows); +InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows); } // namespace milvus \ No newline at end of file diff --git a/internal/core/src/common/FieldDataInterface.h b/internal/core/src/common/FieldDataInterface.h index 17916f08e6259..2fab8b8394193 100644 --- a/internal/core/src/common/FieldDataInterface.h +++ b/internal/core/src/common/FieldDataInterface.h @@ -25,6 +25,7 @@ #include #include +#include "Types.h" #include "arrow/api.h" #include "arrow/array/array_binary.h" #include "common/FieldMeta.h" @@ -40,7 +41,8 @@ using DataType = milvus::DataType; class FieldDataBase { public: - explicit FieldDataBase(DataType data_type) : data_type_(data_type) { + explicit FieldDataBase(DataType data_type, bool nullable) + : data_type_(data_type), nullable_(nullable) { } virtual ~FieldDataBase() = default; @@ -49,6 +51,11 @@ class FieldDataBase { virtual void FillFieldData(const void* source, ssize_t element_count) = 0; + virtual void + FillFieldData(const void* field_data, + const uint8_t* valid_data, + ssize_t element_count) = 0; + virtual void FillFieldData(const std::shared_ptr array) = 0; @@ -57,6 +64,9 @@ class FieldDataBase { virtual void* Data() = 0; + virtual uint8_t* + ValidData() = 0; + // For all FieldDataImpl subclasses, this method returns a Type* that points // at the offset-th row of this field data. virtual const void* @@ -66,9 +76,15 @@ class FieldDataBase { virtual int64_t Size() const = 0; + virtual int64_t + DataSize() const = 0; + + virtual int64_t + ValidDataSize() const = 0; + // Returns the serialized bytes size of the index-th row. 
virtual int64_t - Size(ssize_t index) const = 0; + DataSize(ssize_t index) const = 0; // Number of filled rows virtual size_t @@ -77,6 +93,9 @@ class FieldDataBase { virtual bool IsFull() const = 0; + virtual bool + IsNullable() const = 0; + virtual void Reserve(size_t cap) = 0; @@ -94,8 +113,15 @@ class FieldDataBase { return data_type_; } + virtual int64_t + get_null_count() const = 0; + + virtual bool + is_valid(ssize_t offset) const = 0; + protected: const DataType data_type_; + const bool nullable_; }; template @@ -112,25 +138,53 @@ class FieldDataImpl : public FieldDataBase { public: explicit FieldDataImpl(ssize_t dim, DataType data_type, + bool nullable, int64_t buffered_num_rows = 0) - : FieldDataBase(data_type), + : FieldDataBase(data_type, nullable), num_rows_(buffered_num_rows), dim_(is_type_entire_row ? 1 : dim) { - field_data_.resize(num_rows_ * dim_); + data_.resize(num_rows_ * dim_); + if (nullable) { + if (IsVectorDataType(data_type)) { + PanicInfo(NotImplemented, "vector type not support null"); + } + valid_data_.resize((num_rows_ + 7) / 8); + } } explicit FieldDataImpl(size_t dim, DataType type, - FixedVector&& field_data) - : FieldDataBase(type), dim_(is_type_entire_row ? 1 : dim) { - field_data_ = std::move(field_data); - Assert(field_data.size() % dim == 0); - num_rows_ = field_data.size() / dim; + bool nullable, + FixedVector&& data) + : FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) { + AssertInfo(!nullable, "need to fill valid_data when nullable is true"); + data_ = std::move(data); + Assert(data.size() % dim == 0); + num_rows_ = data.size() / dim; + } + + explicit FieldDataImpl(size_t dim, + DataType type, + bool nullable, + FixedVector&& data, + FixedVector&& valid_data) + : FieldDataBase(type, nullable), dim_(is_type_entire_row ? 
1 : dim) { + AssertInfo(nullable, + "no need to fill valid_data when nullable is false"); + data_ = std::move(data); + valid_data_ = std::move(valid_data); + Assert(data.size() % dim == 0); + num_rows_ = data.size() / dim; } void FillFieldData(const void* source, ssize_t element_count) override; + void + FillFieldData(const void* field_data, + const uint8_t* valid_data, + ssize_t element_count) override; + void FillFieldData(const std::shared_ptr array) override; @@ -155,7 +209,12 @@ class FieldDataImpl : public FieldDataBase { void* Data() override { - return field_data_.data(); + return data_.data(); + } + + uint8_t* + ValidData() override { + return valid_data_.data(); } const void* @@ -164,16 +223,36 @@ class FieldDataImpl : public FieldDataBase { "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); - return &field_data_[offset]; - } + return &data_[offset]; + } + + // std::optional + // Value(ssize_t offset) { + // if (!is_type_entire_row) { + // return RawValue(offset); + // } + // AssertInfo(offset < get_num_rows(), + // "field data subscript out of range"); + // AssertInfo(offset < length(), + // "subscript position don't has valid value"); + // if (nullable_ && !valid_data_[offset]) { + // return std::nullopt; + // } + // return &field_data_[offset]; + // } int64_t Size() const override { + return DataSize() + ValidDataSize(); + } + + int64_t + DataSize() const override { return sizeof(Type) * length() * dim_; } int64_t - Size(ssize_t offset) const override { + DataSize(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), @@ -181,6 +260,14 @@ class FieldDataImpl : public FieldDataBase { return sizeof(Type) * dim_; } + int64_t + ValidDataSize() const override { + if (nullable_) { + return sizeof(uint8_t) * (length() + 7) / 8; + } + return 0; + } + size_t Length() const override { return length_; @@ -193,12 +280,20 
@@ class FieldDataImpl : public FieldDataBase { return buffered_num_rows == filled_num_rows; } + bool + IsNullable() const override { + return nullable_; + } + void Reserve(size_t cap) override { std::lock_guard lck(num_rows_mutex_); if (cap > num_rows_) { num_rows_ = cap; - field_data_.resize(num_rows_ * dim_); + data_.resize(num_rows_ * dim_); + } + if (nullable_) { + valid_data_.resize((num_rows_ + 7) / 8); } } @@ -214,7 +309,10 @@ class FieldDataImpl : public FieldDataBase { std::lock_guard lck(num_rows_mutex_); if (num_rows > num_rows_) { num_rows_ = num_rows; - field_data_.resize(num_rows_ * dim_); + data_.resize(num_rows_ * dim_); + if (nullable_) { + valid_data_.resize((num_rows + 7) / 8); + } } } @@ -229,12 +327,34 @@ class FieldDataImpl : public FieldDataBase { return dim_; } + int64_t + get_null_count() const override { + std::shared_lock lck(tell_mutex_); + return null_count; + } + + bool + is_valid(ssize_t offset) const override { + std::shared_lock lck(tell_mutex_); + AssertInfo(offset < get_num_rows(), + "field data subscript out of range"); + AssertInfo(offset < length(), + "subscript position don't has valid value"); + if (!nullable_) { + return true; + } + auto bit = (valid_data_[offset >> 3] >> ((offset & 0x07))) & 1; + return bit; + } + protected: - FixedVector field_data_; - // number of elements field_data_ can hold + FixedVector data_{}; + FixedVector valid_data_{}; + // number of elements data_ can hold int64_t num_rows_; mutable std::shared_mutex num_rows_mutex_; - // number of actual elements in field_data_ + int64_t null_count{0}; + // number of actual elements in data_ size_t length_{}; mutable std::shared_mutex tell_mutex_; @@ -244,27 +364,30 @@ class FieldDataImpl : public FieldDataBase { class FieldDataStringImpl : public FieldDataImpl { public: - explicit FieldDataStringImpl(DataType data_type, int64_t total_num_rows = 0) - : FieldDataImpl(1, data_type, total_num_rows) { + explicit FieldDataStringImpl(DataType data_type, + bool 
nullable, + int64_t total_num_rows = 0) + : FieldDataImpl( + 1, data_type, nullable, total_num_rows) { } int64_t - Size() const override { + DataSize() const override { int64_t data_size = 0; for (size_t offset = 0; offset < length(); ++offset) { - data_size += field_data_[offset].size(); + data_size += data_[offset].size(); } return data_size; } int64_t - Size(ssize_t offset) const override { + DataSize(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); - return field_data_[offset].size(); + return data_[offset].size(); } void @@ -281,36 +404,46 @@ class FieldDataStringImpl : public FieldDataImpl { auto i = 0; for (const auto& str : *array) { - field_data_[length_ + i] = str.value(); + data_[length_ + i] = str.value(); i++; } + if (IsNullable()) { + auto valid_data = array->null_bitmap_data(); + if (valid_data == nullptr) { + valid_data_.resize((n + 7) / 8, 0xFF); + } else { + std::copy_n(valid_data, (n + 7) / 8, valid_data_.data()); + } + } length_ += n; } }; class FieldDataJsonImpl : public FieldDataImpl { public: - explicit FieldDataJsonImpl(DataType data_type, int64_t total_num_rows = 0) - : FieldDataImpl(1, data_type, total_num_rows) { + explicit FieldDataJsonImpl(DataType data_type, + bool nullable, + int64_t total_num_rows = 0) + : FieldDataImpl(1, data_type, nullable, total_num_rows) { } int64_t - Size() const override { + DataSize() const override { int64_t data_size = 0; for (size_t offset = 0; offset < length(); ++offset) { - data_size += field_data_[offset].data().size(); + data_size += data_[offset].data().size(); } return data_size; } int64_t - Size(ssize_t offset) const override { + DataSize(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); - return field_data_[offset].data().size(); + 
return data_[offset].data().size(); } void @@ -337,10 +470,17 @@ class FieldDataJsonImpl : public FieldDataImpl { auto i = 0; for (const auto& json : *array) { - field_data_[length_ + i] = - Json(simdjson::padded_string(json.value())); + data_[length_ + i] = Json(simdjson::padded_string(json.value())); i++; } + if (IsNullable()) { + auto valid_data = array->null_bitmap_data(); + if (valid_data == nullptr) { + valid_data_.resize((n + 7) / 8, 0xFF); + } else { + std::copy_n(valid_data, (n + 7) / 8, valid_data_.data()); + } + } length_ += n; } }; @@ -351,28 +491,28 @@ class FieldDataSparseVectorImpl explicit FieldDataSparseVectorImpl(DataType data_type, int64_t total_num_rows = 0) : FieldDataImpl, true>( - /*dim=*/1, data_type, total_num_rows), + /*dim=*/1, data_type, false, total_num_rows), vec_dim_(0) { AssertInfo(data_type == DataType::VECTOR_SPARSE_FLOAT, "invalid data type for sparse vector"); } int64_t - Size() const override { + DataSize() const override { int64_t data_size = 0; for (size_t i = 0; i < length(); ++i) { - data_size += field_data_[i].data_byte_size(); + data_size += data_[i].data_byte_size(); } return data_size; } int64_t - Size(ssize_t offset) const override { + DataSize(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); - return field_data_[offset].data_byte_size(); + return data_[offset].data_byte_size(); } // source is a pointer to element_count of @@ -393,7 +533,7 @@ class FieldDataSparseVectorImpl auto& row = ptr[i]; vec_dim_ = std::max(vec_dim_, row.dim()); } - std::copy_n(ptr, element_count, field_data_.data() + length_); + std::copy_n(ptr, element_count, data_.data() + length_); length_ += element_count; } @@ -412,7 +552,7 @@ class FieldDataSparseVectorImpl for (int64_t i = 0; i < array->length(); ++i) { auto view = array->GetView(i); - auto& row = field_data_[length_ + i]; + auto& row = data_[length_ + i]; 
row = CopyAndWrapSparseRow(view.data(), view.size()); vec_dim_ = std::max(vec_dim_, row.dim()); } @@ -430,27 +570,28 @@ class FieldDataSparseVectorImpl class FieldDataArrayImpl : public FieldDataImpl { public: - explicit FieldDataArrayImpl(DataType data_type, int64_t total_num_rows = 0) - : FieldDataImpl(1, data_type, total_num_rows) { + explicit FieldDataArrayImpl(DataType data_type, + bool nullable, + int64_t total_num_rows = 0) + : FieldDataImpl(1, data_type, nullable, total_num_rows) { } int64_t - Size() const { + DataSize() const override { int64_t data_size = 0; for (size_t offset = 0; offset < length(); ++offset) { - data_size += field_data_[offset].byte_size(); + data_size += data_[offset].byte_size(); } - return data_size; } int64_t - Size(ssize_t offset) const { + DataSize(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); - return field_data_[offset].byte_size(); + return data_[offset].byte_size(); } }; diff --git a/internal/core/src/common/FieldMeta.h b/internal/core/src/common/FieldMeta.h index b75df4ab9c268..42522e9b4f452 100644 --- a/internal/core/src/common/FieldMeta.h +++ b/internal/core/src/common/FieldMeta.h @@ -35,27 +35,34 @@ class FieldMeta { FieldMeta& operator=(FieldMeta&&) = default; - FieldMeta(const FieldName& name, FieldId id, DataType type) - : name_(name), id_(id), type_(type) { + FieldMeta(const FieldName& name, FieldId id, DataType type, bool nullable) + : name_(name), id_(id), type_(type), nullable_(nullable) { Assert(!IsVectorDataType(type_)); } FieldMeta(const FieldName& name, FieldId id, DataType type, - int64_t max_length) + int64_t max_length, + bool nullable) : name_(name), id_(id), type_(type), - string_info_(StringInfo{max_length}) { + string_info_(StringInfo{max_length}), + nullable_(nullable) { Assert(IsStringDataType(type_)); } FieldMeta(const FieldName& name, FieldId id, DataType type, - 
DataType element_type) - : name_(name), id_(id), type_(type), element_type_(element_type) { + DataType element_type, + bool nullable) + : name_(name), + id_(id), + type_(type), + element_type_(element_type), + nullable_(nullable) { Assert(IsArrayDataType(type_)); } @@ -65,12 +72,15 @@ class FieldMeta { FieldId id, DataType type, int64_t dim, - std::optional metric_type) + std::optional metric_type, + bool nullable) : name_(name), id_(id), type_(type), - vector_info_(VectorInfo{dim, std::move(metric_type)}) { + vector_info_(VectorInfo{dim, std::move(metric_type)}), + nullable_(nullable) { Assert(IsVectorDataType(type_)); + Assert(!nullable); } int64_t @@ -126,6 +136,11 @@ class FieldMeta { return IsStringDataType(type_); } + bool + is_nullable() const { + return nullable_; + } + size_t get_sizeof() const { AssertInfo(!IsSparseFloatVectorDataType(type_), @@ -157,6 +172,7 @@ class FieldMeta { FieldId id_; DataType type_ = DataType::NONE; DataType element_type_ = DataType::NONE; + bool nullable_; std::optional vector_info_; std::optional string_info_; }; diff --git a/internal/core/src/common/Schema.cpp b/internal/core/src/common/Schema.cpp index 7aa4fc1630bcb..d5eaa200920dd 100644 --- a/internal/core/src/common/Schema.cpp +++ b/internal/core/src/common/Schema.cpp @@ -38,7 +38,7 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) { schema_proto.fields()) { auto field_id = FieldId(child.fieldid()); auto name = FieldName(child.name()); - + auto nullable = child.nullable(); if (field_id.get() < 100) { // system field id auto is_system = @@ -60,22 +60,27 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) { dim = boost::lexical_cast(type_map.at("dim")); } if (!index_map.count("metric_type")) { - schema->AddField(name, field_id, data_type, dim, std::nullopt); + schema->AddField( + name, field_id, data_type, dim, std::nullopt, false); } else { auto metric_type = index_map.at("metric_type"); - schema->AddField(name, 
field_id, data_type, dim, metric_type); + schema->AddField( + name, field_id, data_type, dim, metric_type, false); } } else if (IsStringDataType(data_type)) { auto type_map = RepeatedKeyValToMap(child.type_params()); AssertInfo(type_map.count(MAX_LENGTH), "max_length not found"); auto max_len = boost::lexical_cast(type_map.at(MAX_LENGTH)); - schema->AddField(name, field_id, data_type, max_len); + schema->AddField(name, field_id, data_type, max_len, nullable); } else if (IsArrayDataType(data_type)) { - schema->AddField( - name, field_id, data_type, DataType(child.element_type())); + schema->AddField(name, + field_id, + data_type, + DataType(child.element_type()), + nullable); } else { - schema->AddField(name, field_id, data_type); + schema->AddField(name, field_id, data_type, nullable); } if (child.is_primary_key()) { @@ -93,6 +98,7 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) { const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"), RowFieldID, - DataType::INT64); + DataType::INT64, + false); } // namespace milvus diff --git a/internal/core/src/common/Schema.h b/internal/core/src/common/Schema.h index 754766f54388b..b6ae2065de6de 100644 --- a/internal/core/src/common/Schema.h +++ b/internal/core/src/common/Schema.h @@ -34,29 +34,35 @@ static int64_t debug_id = START_USER_FIELDID; class Schema { public: FieldId - AddDebugField(const std::string& name, DataType data_type) { + AddDebugField(const std::string& name, + DataType data_type, + bool nullable = false) { auto field_id = FieldId(debug_id); debug_id++; - this->AddField(FieldName(name), field_id, data_type); + this->AddField(FieldName(name), field_id, data_type, nullable); return field_id; } FieldId AddDebugField(const std::string& name, DataType data_type, - DataType element_type) { + DataType element_type, + bool nullable = false) { auto field_id = FieldId(debug_id); debug_id++; - this->AddField(FieldName(name), field_id, data_type, element_type); + this->AddField( + 
FieldName(name), field_id, data_type, element_type, nullable); return field_id; } FieldId - AddDebugArrayField(const std::string& name, DataType element_type) { + AddDebugArrayField(const std::string& name, + DataType element_type, + bool nullable) { auto field_id = FieldId(debug_id); debug_id++; this->AddField( - FieldName(name), field_id, DataType::ARRAY, element_type); + FieldName(name), field_id, DataType::ARRAY, element_type, nullable); return field_id; } @@ -68,16 +74,19 @@ class Schema { std::optional metric_type) { auto field_id = FieldId(debug_id); debug_id++; - auto field_meta = - FieldMeta(FieldName(name), field_id, data_type, dim, metric_type); + auto field_meta = FieldMeta( + FieldName(name), field_id, data_type, dim, metric_type, false); this->AddField(std::move(field_meta)); return field_id; } // scalar type void - AddField(const FieldName& name, const FieldId id, DataType data_type) { - auto field_meta = FieldMeta(name, id, data_type); + AddField(const FieldName& name, + const FieldId id, + DataType data_type, + bool nullable) { + auto field_meta = FieldMeta(name, id, data_type, nullable); this->AddField(std::move(field_meta)); } @@ -86,8 +95,10 @@ class Schema { AddField(const FieldName& name, const FieldId id, DataType data_type, - DataType element_type) { - auto field_meta = FieldMeta(name, id, data_type, element_type); + DataType element_type, + bool nullable) { + auto field_meta = + FieldMeta(name, id, data_type, element_type, nullable); this->AddField(std::move(field_meta)); } @@ -96,8 +107,9 @@ class Schema { AddField(const FieldName& name, const FieldId id, DataType data_type, - int64_t max_length) { - auto field_meta = FieldMeta(name, id, data_type, max_length); + int64_t max_length, + bool nullable) { + auto field_meta = FieldMeta(name, id, data_type, max_length, nullable); this->AddField(std::move(field_meta)); } @@ -107,8 +119,10 @@ class Schema { const FieldId id, DataType data_type, int64_t dim, - std::optional metric_type) { - auto 
field_meta = FieldMeta(name, id, data_type, dim, metric_type); + std::optional metric_type, + bool nullable) { + auto field_meta = + FieldMeta(name, id, data_type, dim, metric_type, false); this->AddField(std::move(field_meta)); } diff --git a/internal/core/src/common/Vector.h b/internal/core/src/common/Vector.h index dab66ffb18a31..bdffd67689cf9 100644 --- a/internal/core/src/common/Vector.h +++ b/internal/core/src/common/Vector.h @@ -65,7 +65,8 @@ class ColumnVector final : public BaseVector { size_t length, std::optional null_count = std::nullopt) : BaseVector(data_type, length, null_count) { - values_ = InitScalarFieldData(data_type, length); + //todo: support null expr + values_ = InitScalarFieldData(data_type, false, length); } // ColumnVector(FixedVector&& data) @@ -78,7 +79,7 @@ class ColumnVector final : public BaseVector { ColumnVector(TargetBitmap&& bitmap) : BaseVector(DataType::INT8, bitmap.size()) { values_ = std::make_shared>( - bitmap.size(), DataType::INT8, std::move(bitmap).into()); + bitmap.size(), DataType::INT8, false, std::move(bitmap).into()); } virtual ~ColumnVector() override { diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index 6d160a04c32ae..3052dce0cd5ea 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -117,8 +117,9 @@ BitmapIndex::BuildV2(const Config& config) { auto data = rec.ValueUnsafe(); auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); + // todo: support nullable index auto field_data = storage::CreateFieldData( - DataType(GetDType()), 0, total_num_rows); + DataType(GetDType()), false, 0, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index 4a7a38666523a..628cde37aa92e 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ 
b/internal/core/src/index/HybridScalarIndex.cpp @@ -295,8 +295,9 @@ HybridScalarIndex::BuildV2(const Config& config) { auto data = rec.ValueUnsafe(); auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); + // todo: support nullable index auto field_data = storage::CreateFieldData( - DataType(GetDType()), 0, total_num_rows); + DataType(GetDType()), false, 0, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index 6d4e4e6d1b8a7..22a71ed6373e7 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -169,8 +169,9 @@ InvertedIndexTantivy::BuildV2(const Config& config) { auto data = rec.ValueUnsafe(); auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); + // todo: support nullable index auto field_data = storage::CreateFieldData( - DataType(GetDType()), 0, total_num_rows); + DataType(GetDType()), false, 0, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/index/ScalarIndexSort.cpp b/internal/core/src/index/ScalarIndexSort.cpp index 2a37d9b096885..842cd13e8f8fd 100644 --- a/internal/core/src/index/ScalarIndexSort.cpp +++ b/internal/core/src/index/ScalarIndexSort.cpp @@ -72,8 +72,15 @@ ScalarIndexSort::BuildV2(const Config& config) { auto data = rec.ValueUnsafe(); auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); + auto nullable = + col_data->type()->id() == arrow::Type::NA ? 
true : false; + // will support build scalar index when nullable in the future just skip it + // now, not support to build index in nullable field_data + // todo: support nullable index + AssertInfo(!nullable, + "not support to build index in nullable field_data"); auto field_data = storage::CreateFieldData( - DataType(GetDType()), 0, total_num_rows); + DataType(GetDType()), nullable, 0, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index a5130b7615794..3d861793f048f 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -83,8 +83,15 @@ StringIndexMarisa::BuildV2(const Config& config) { auto data = rec.ValueUnsafe(); auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); - auto field_data = - storage::CreateFieldData(DataType::STRING, 0, total_num_rows); + auto nullable = + col_data->type()->id() == arrow::Type::NA ? 
true : false; + // will support build scalar index when nullable in the future just skip it + // now, not support to build index in nullable field_data + // todo: support nullable index + AssertInfo(!nullable, + "not support to build index in nullable field_data"); + auto field_data = storage::CreateFieldData( + DataType::STRING, nullable, 0, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/index/Utils.cpp b/internal/core/src/index/Utils.cpp index 9f7148428af06..dfd41298b44a3 100644 --- a/internal/core/src/index/Utils.cpp +++ b/internal/core/src/index/Utils.cpp @@ -249,9 +249,9 @@ AssembleIndexDatas(std::map& index_datas) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - + // todo: support nullable index auto new_field_data = - storage::CreateFieldData(DataType::INT8, 1, total_len); + storage::CreateFieldData(DataType::INT8, false, 1, total_len); for (auto i = 0; i < slice_num; ++i) { std::string file_name = GenSlicedFileName(prefix, i); @@ -288,9 +288,9 @@ AssembleIndexDatas(std::map& index_datas, std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - + // todo: support nullable index auto new_field_data = - storage::CreateFieldData(DataType::INT8, 1, total_len); + storage::CreateFieldData(DataType::INT8, false, 1, total_len); for (auto i = 0; i < slice_num; ++i) { std::string file_name = GenSlicedFileName(prefix, i); diff --git a/internal/core/src/index/VectorMemIndex.cpp b/internal/core/src/index/VectorMemIndex.cpp index 0b140cc82c1cc..9861222548276 100644 --- a/internal/core/src/index/VectorMemIndex.cpp +++ b/internal/core/src/index/VectorMemIndex.cpp @@ -259,9 +259,9 @@ VectorMemIndex::LoadV2(const Config& config) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - - auto new_field_data = - 
milvus::storage::CreateFieldData(DataType::INT8, 1, total_len); + // todo: support nullable index + auto new_field_data = milvus::storage::CreateFieldData( + DataType::INT8, false, 1, total_len); for (auto i = 0; i < slice_num; ++i) { std::string file_name = index_prefix + "/" + GenSlicedFileName(prefix, i); @@ -358,9 +358,9 @@ VectorMemIndex::Load(milvus::tracer::TraceContext ctx, std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); - + // todo: support nullable index auto new_field_data = milvus::storage::CreateFieldData( - DataType::INT8, 1, total_len); + DataType::INT8, false, 1, total_len); std::vector batch; batch.reserve(slice_num); @@ -462,8 +462,9 @@ VectorMemIndex::BuildV2(const Config& config) { } auto total_num_rows = data->num_rows(); auto col_data = data->GetColumnByName(field_name); + // todo: support nullable index auto field_data = - storage::CreateFieldData(field_type, dim, total_num_rows); + storage::CreateFieldData(field_type, false, dim, total_num_rows); field_data->FillFieldData(col_data); field_datas.push_back(field_data); } diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index c0f1cd967b7ee..3a5abd980b95a 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,6 @@ namespace milvus { */ constexpr size_t STRING_PADDING = 1; constexpr size_t ARRAY_PADDING = 1; - constexpr size_t BLOCK_SIZE = 8192; class ColumnBase { @@ -74,10 +74,10 @@ class ColumnBase { type_size_ = field_meta.get_sizeof(); - cap_size_ = type_size_ * reserve; + data_cap_size_ = field_meta.get_sizeof() * reserve; // use anon mapping so we are able to free these memory with munmap only - size_t mapped_size = cap_size_ + padding_; + size_t mapped_size = data_cap_size_ + padding_; data_ = static_cast(mmap(nullptr, mapped_size, PROT_READ | PROT_WRITE, @@ -89,6 +89,10 @@ 
class ColumnBase { strerror(errno), mapped_size); + if (field_meta.is_nullable()) { + nullable_ = true; + valid_data_.reserve(reserve); + } UpdateMetricWhenMmap(mapped_size); } @@ -97,24 +101,29 @@ class ColumnBase { int dim, const DataType& data_type, storage::MmapChunkManagerPtr mcm, - storage::MmapChunkDescriptorPtr descriptor) + storage::MmapChunkDescriptorPtr descriptor, + bool nullable) : mcm_(mcm), mmap_descriptor_(descriptor), type_size_(GetDataTypeSize(data_type, dim)), num_rows_(0), - size_(0), - cap_size_(reserve), - mapping_type_(MAP_WITH_MANAGER) { + data_size_(0), + data_cap_size_(reserve), + mapping_type_(MAP_WITH_MANAGER), + nullable_(nullable) { AssertInfo((mcm != nullptr) && descriptor != nullptr, "use wrong mmap chunk manager and mmap chunk descriptor to " "create column."); SetPaddingSize(data_type); - size_t mapped_size = cap_size_ + padding_; + size_t mapped_size = data_cap_size_ + padding_; data_ = (char*)mcm_->Allocate(mmap_descriptor_, (uint64_t)mapped_size); AssertInfo(data_ != nullptr, "fail to create with mmap manager: map_size = {}", mapped_size); + if (nullable_) { + valid_data_.reserve(reserve); + } } // mmap mode ctor @@ -128,11 +137,11 @@ class ColumnBase { num_rows_ = size / type_size_; } - size_ = size; - cap_size_ = size; + data_size_ = size; + data_cap_size_ = size; // use exactly same size of file, padding shall be written in file already // see also https://github.com/milvus-io/milvus/issues/34442 - size_t mapped_size = cap_size_; + size_t mapped_size = data_cap_size_; data_ = static_cast(mmap( nullptr, mapped_size, PROT_READ, MAP_SHARED, file.Descriptor(), 0)); AssertInfo(data_ != MAP_FAILED, @@ -140,6 +149,12 @@ class ColumnBase { strerror(errno)); madvise(data_, mapped_size, MADV_WILLNEED); + // valid_data store in memory + if (field_meta.is_nullable()) { + nullable_ = true; + valid_data_.reserve(num_rows_); + } + UpdateMetricWhenMmap(mapped_size); } @@ -148,15 +163,17 @@ class ColumnBase { ColumnBase(const File& file, 
size_t size, int dim, - const DataType& data_type) - : size_(size), - cap_size_(size), + const DataType& data_type, + bool nullable) + : data_size_(size), + data_cap_size_(size), + nullable_(nullable), mapping_type_(MappingType::MAP_WITH_FILE) { SetPaddingSize(data_type); // use exact same size of file, padding shall be written in file already // see also https://github.com/milvus-io/milvus/issues/34442 - size_t mapped_size = cap_size_; + size_t mapped_size = data_cap_size_; if (!IsVariableDataType(data_type)) { type_size_ = GetDataTypeSize(data_type, dim); num_rows_ = size / type_size_; @@ -167,35 +184,44 @@ class ColumnBase { "failed to create file-backed map, err: {}", strerror(errno)); + if (nullable) { + valid_data_.reserve(num_rows_); + } + UpdateMetricWhenMmap(mapped_size); } virtual ~ColumnBase() { if (data_ != nullptr) { if (mapping_type_ != MappingType::MAP_WITH_MANAGER) { - size_t mapped_size = cap_size_ + padding_; + size_t mapped_size = data_cap_size_ + padding_; if (munmap(data_, mapped_size)) { AssertInfo(true, "failed to unmap variable field, err={}", strerror(errno)); } } - UpdateMetricWhenMunmap(cap_size_ + padding_); + UpdateMetricWhenMunmap(data_cap_size_ + padding_); + } + if (nullable_) { + valid_data_.clear(); } } ColumnBase(ColumnBase&& column) noexcept : data_(column.data_), - cap_size_(column.cap_size_), + nullable_(column.nullable_), + valid_data_(column.valid_data_), padding_(column.padding_), type_size_(column.type_size_), num_rows_(column.num_rows_), - size_(column.size_) { + data_size_(column.data_size_) { column.data_ = nullptr; - column.cap_size_ = 0; + column.data_cap_size_ = 0; column.padding_ = 0; column.num_rows_ = 0; - column.size_ = 0; + column.data_size_ = 0; + column.nullable_ = false; } // Data() points at an addr that contains the elements @@ -210,6 +236,21 @@ class ColumnBase { return data_; } + bool + IsValid(size_t offset) const { + return valid_data_[offset]; + } + + bool + IsNullable() const { + return nullable_; + } 
+ + size_t + DataSize() const { + return data_size_; + } + size_t NumRows() const { return num_rows_; @@ -217,14 +258,15 @@ class ColumnBase { virtual size_t ByteSize() const { - return cap_size_ + padding_; + // folly::fbvector implemented with bit compression. + return data_cap_size_ + padding_ + (valid_data_.size() + 7) / 8; } // The capacity of the column, // DO NOT call this for variable length column(including SparseFloatColumn). virtual size_t Capacity() const { - return cap_size_ / type_size_; + return data_cap_size_ / type_size_; } virtual SpanBase @@ -245,28 +287,55 @@ class ColumnBase { virtual void AppendBatch(const FieldDataPtr data) { - size_t required_size = size_ + data->Size(); - if (required_size > cap_size_) { - Expand(required_size * 2 + padding_); + size_t required_size = data_size_ + data->DataSize(); + if (required_size > data_cap_size_) { + ExpandData(required_size * 2 + padding_); } std::copy_n(static_cast(data->Data()), - data->Size(), - data_ + size_); - size_ = required_size; + data->DataSize(), + data_ + data_size_); + data_size_ = required_size; + if (nullable_) { + size_t required_rows = num_rows_ + data->get_num_rows(); + if (required_rows > valid_data_.size()) { + valid_data_.reserve(required_rows * 2); + } + + for (size_t i = 0; i < data->get_num_rows(); i++) { + valid_data_.push_back(data->is_valid(i)); + } + } num_rows_ += data->Length(); } // Append one row virtual void Append(const char* data, size_t size) { - size_t required_size = size_ + size; - if (required_size > cap_size_) { - Expand(required_size * 2); + AssertInfo(!nullable_, + "no need to pass valid_data when nullable is false"); + size_t required_size = data_size_ + size; + if (required_size > data_cap_size_) { + ExpandData(required_size * 2); } - std::copy_n(data, size, data_ + size_); - size_ = required_size; + std::copy_n(data, size, data_ + data_size_); + data_size_ = required_size; + num_rows_++; + } + + // Append one row + virtual void + Append(const char* data, 
const bool valid_data, size_t size) { + AssertInfo(nullable_, "need to pass valid_data_ when nullable is true"); + size_t required_size = data_size_ + size; + if (required_size > data_cap_size_) { + ExpandData(required_size * 2); + } + + std::copy_n(data, size, data_ + data_size_); + valid_data_.push_back(valid_data); + data_size_ = required_size; num_rows_++; } @@ -290,10 +359,15 @@ class ColumnBase { } } + void + SetValidData(FixedVector&& valid_data) { + valid_data_ = std::move(valid_data); + } + protected: // only for memory mode and mmap manager mode, not mmap void - Expand(size_t new_size) { + ExpandData(size_t new_size) { if (new_size == 0) { return; } @@ -317,8 +391,8 @@ class ColumnBase { new_size + padding_); if (data_ != nullptr) { - std::memcpy(data, data_, size_); - if (munmap(data_, cap_size_ + padding_)) { + std::memcpy(data, data_, data_size_); + if (munmap(data_, data_cap_size_ + padding_)) { auto err = errno; size_t mapped_size = new_size + padding_; munmap(data, mapped_size); @@ -328,13 +402,13 @@ class ColumnBase { false, "failed to unmap while expanding: {}, old_map_size={}", strerror(err), - cap_size_ + padding_); + data_cap_size_ + padding_); } - UpdateMetricWhenMunmap(cap_size_ + padding_); + UpdateMetricWhenMunmap(data_cap_size_ + padding_); } data_ = data; - cap_size_ = new_size; + data_cap_size_ = new_size; mapping_type_ = MappingType::MAP_WITH_ANONYMOUS; } else if (mapping_type_ == MappingType::MAP_WITH_MANAGER) { size_t new_mapped_size = new_size + padding_; @@ -342,25 +416,30 @@ class ColumnBase { AssertInfo(data != nullptr, "fail to create with mmap manager: map_size = {}", new_mapped_size); - std::memcpy(data, data_, size_); + std::memcpy(data, data_, data_cap_size_); // allocate space only append in one growing segment, so no need to munmap() data_ = (char*)data; - cap_size_ = new_size; + data_cap_size_ = new_size; mapping_type_ = MappingType::MAP_WITH_MANAGER; } } char* data_{nullptr}; + bool nullable_{false}; + // When merging 
multiple valid_data, the bit operation logic is very complex + // for the reason that, FixedVector use bit granularity for storage and access + // so FixedVector is also used to store valid_data on the sealed segment. + FixedVector valid_data_; // capacity in bytes - size_t cap_size_{0}; + size_t data_cap_size_{0}; size_t padding_{0}; // type_size_ is not used for sparse float vector column. size_t type_size_{1}; size_t num_rows_{0}; // length in bytes - size_t size_{0}; storage::MmapChunkDescriptorPtr mmap_descriptor_ = nullptr; + size_t data_size_{0}; private: void @@ -413,16 +492,21 @@ class Column : public ColumnBase { } // mmap mode ctor - Column(const File& file, size_t size, int dim, DataType data_type) - : ColumnBase(file, size, dim, data_type) { + Column(const File& file, + size_t size, + int dim, + DataType data_type, + bool nullable) + : ColumnBase(file, size, dim, data_type, nullable) { } Column(size_t reserve, int dim, const DataType& data_type, storage::MmapChunkManagerPtr mcm, - storage::MmapChunkDescriptorPtr descriptor) - : ColumnBase(reserve, dim, data_type, mcm, descriptor) { + storage::MmapChunkDescriptorPtr descriptor, + bool nullable) + : ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) { } Column(Column&& column) noexcept : ColumnBase(std::move(column)) { @@ -432,7 +516,7 @@ class Column : public ColumnBase { SpanBase Span() const override { - return SpanBase(data_, num_rows_, cap_size_ / num_rows_); + return SpanBase(data_, num_rows_, data_cap_size_ / num_rows_); } }; @@ -453,7 +537,7 @@ class SparseFloatColumn : public ColumnBase { size_t size, int dim, const DataType& data_type) - : ColumnBase(file, size, dim, data_type) { + : ColumnBase(file, size, dim, data_type, false) { } // mmap with mmap manager SparseFloatColumn(size_t reserve, @@ -461,7 +545,7 @@ class SparseFloatColumn : public ColumnBase { const DataType& data_type, storage::MmapChunkManagerPtr mcm, storage::MmapChunkDescriptorPtr descriptor) - : ColumnBase(reserve, 
dim, data_type, mcm, descriptor) { + : ColumnBase(reserve, dim, data_type, mcm, descriptor, false) { } SparseFloatColumn(SparseFloatColumn&& column) noexcept @@ -477,6 +561,14 @@ class SparseFloatColumn : public ColumnBase { return static_cast(static_cast(vec_.data())); } + // This is used to advice mmap prefetch, we don't currently support mmap for + // sparse float vector thus not implemented for now. + size_t + ByteSize() const override { + PanicInfo(ErrorCode::Unsupported, + "ByteSize not supported for sparse float column"); + } + size_t Capacity() const override { PanicInfo(ErrorCode::Unsupported, @@ -524,7 +616,7 @@ class SparseFloatColumn : public ColumnBase { num_rows_ = indices.size(); // so that indices[num_rows_] - indices[num_rows_ - 1] is the size of // the last row. - indices.push_back(size_); + indices.push_back(data_size_); for (size_t i = 0; i < num_rows_; i++) { auto vec_size = indices[i + 1] - indices[i]; AssertInfo( @@ -564,8 +656,9 @@ class VariableColumn : public ColumnBase { int dim, const DataType& data_type, storage::MmapChunkManagerPtr mcm, - storage::MmapChunkDescriptorPtr descriptor) - : ColumnBase(reserve, dim, data_type, mcm, descriptor) { + storage::MmapChunkDescriptorPtr descriptor, + bool nullable) + : ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) { } VariableColumn(VariableColumn&& column) noexcept @@ -622,7 +715,7 @@ class VariableColumn : public ColumnBase { pos += sizeof(uint32_t) + size; } - return BufferView{pos, size_ - (pos - data_)}; + return BufferView{pos, data_size_ - (pos - data_)}; } ViewType @@ -654,9 +747,12 @@ class VariableColumn : public ColumnBase { void Append(FieldDataPtr chunk) { for (auto i = 0; i < chunk->get_num_rows(); i++) { - indices_.emplace_back(size_); + indices_.emplace_back(data_size_); auto data = static_cast(chunk->RawValue(i)); - size_ += sizeof(uint32_t) + data->size(); + data_size_ += sizeof(uint32_t) + data->size(); + if (nullable_) { + 
valid_data_.push_back(chunk->is_valid(i)); + } } load_buf_.emplace(std::move(chunk)); } @@ -671,9 +767,9 @@ class VariableColumn : public ColumnBase { // for variable length column in memory mode only if (data_ == nullptr) { - size_t total_size = size_; - size_ = 0; - Expand(total_size); + size_t total_data_size = data_size_; + data_size_ = 0; + ExpandData(total_data_size); while (!load_buf_.empty()) { auto chunk = std::move(load_buf_.front()); @@ -681,12 +777,19 @@ class VariableColumn : public ColumnBase { // data_ as: |size|data|size|data...... for (auto i = 0; i < chunk->get_num_rows(); i++) { - auto current_size = (uint32_t)chunk->Size(i); - std::memcpy(data_ + size_, ¤t_size, sizeof(uint32_t)); - size_ += sizeof(uint32_t); + auto current_size = (uint32_t)chunk->DataSize(i); + std::memcpy( + data_ + data_size_, ¤t_size, sizeof(uint32_t)); + data_size_ += sizeof(uint32_t); auto data = static_cast(chunk->RawValue(i)); - std::memcpy(data_ + size_, data->c_str(), data->size()); - size_ += data->size(); + std::memcpy( + data_ + data_size_, data->c_str(), data->size()); + data_size_ += data->size(); + } + if (nullable_) { + for (size_t i = 0; i < chunk->get_num_rows(); i++) { + valid_data_.push_back(chunk->is_valid(i)); + } } } } @@ -711,7 +814,6 @@ class VariableColumn : public ColumnBase { private: // loading states std::queue load_buf_{}; - // raw data index, record indices located 0, interval, 2 * interval, 3 * interval // ... 
just like page index, interval set to 8192 that matches search engine's batch size std::vector indices_{}; @@ -735,8 +837,9 @@ class ArrayColumn : public ColumnBase { int dim, const DataType& data_type, storage::MmapChunkManagerPtr mcm, - storage::MmapChunkDescriptorPtr descriptor) - : ColumnBase(reserve, dim, data_type, mcm, descriptor) { + storage::MmapChunkDescriptorPtr descriptor, + bool nullable) + : ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) { } ArrayColumn(ArrayColumn&& column) noexcept @@ -769,9 +872,14 @@ class ArrayColumn : public ColumnBase { } void - Append(const Array& array) { - indices_.emplace_back(size_); + Append(const Array& array, bool valid_data = false) { + indices_.emplace_back(data_size_); element_indices_.emplace_back(array.get_offsets()); + if (nullable_) { + return ColumnBase::Append(static_cast(array.data()), + array.byte_size(), + valid_data); + } ColumnBase::Append(static_cast(array.data()), array.byte_size()); } @@ -797,7 +905,7 @@ class ArrayColumn : public ColumnBase { std::move(element_indices_[i])); } views_.emplace_back(data_ + indices_.back(), - size_ - indices_.back(), + data_size_ - indices_.back(), element_type_, std::move(element_indices_[indices_.size() - 1])); element_indices_.clear(); @@ -810,4 +918,4 @@ class ArrayColumn : public ColumnBase { std::vector views_{}; DataType element_type_; }; -} // namespace milvus +} // namespace milvus \ No newline at end of file diff --git a/internal/core/src/mmap/Utils.h b/internal/core/src/mmap/Utils.h index 3cab2c3166f2e..2824a690d06c3 100644 --- a/internal/core/src/mmap/Utils.h +++ b/internal/core/src/mmap/Utils.h @@ -87,7 +87,8 @@ WriteFieldData(File& file, const FieldDataPtr& data, uint64_t& total_written, std::vector& indices, - std::vector>& element_indices) { + std::vector>& element_indices, + FixedVector& valid_data) { if (IsVariableDataType(data_type)) { switch (data_type) { case DataType::VARCHAR: @@ -168,13 +169,22 @@ WriteFieldData(File& file, } } else 
{ // write as: data|data|data|data|data|data...... - size_t written = file.Write(data->Data(), data->Size()); - if (written < data->Size()) { + size_t written = file.Write(data->Data(), data->DataSize()); + if (written < data->DataSize()) { THROW_FILE_WRITE_ERROR } for (auto i = 0; i < data->get_num_rows(); i++) { indices.emplace_back(total_written); - total_written += data->Size(i); + total_written += data->DataSize(i); + } + } + if (data->IsNullable()) { + size_t required_rows = valid_data.size() + data->get_num_rows(); + if (required_rows > valid_data.size()) { + valid_data.reserve(required_rows * 2); + } + for (size_t i = 0; i < data->get_num_rows(); i++) { + valid_data.push_back(data->is_valid(i)); } } } diff --git a/internal/core/src/query/SearchOnGrowing.cpp b/internal/core/src/query/SearchOnGrowing.cpp index f228529b1e64b..0b722c2fa8485 100644 --- a/internal/core/src/query/SearchOnGrowing.cpp +++ b/internal/core/src/query/SearchOnGrowing.cpp @@ -105,7 +105,7 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment, segment.get_chunk_mutex()); int32_t current_chunk_id = 0; // step 3: brute force search where small indexing is unavailable - auto vec_ptr = record.get_field_data_base(vecfield_id); + auto vec_ptr = record.get_data_base(vecfield_id); auto vec_size_per_chunk = vec_ptr->get_size_per_chunk(); auto max_chunk = upper_div(active_count, vec_size_per_chunk); diff --git a/internal/core/src/query/groupby/SearchGroupByOperator.h b/internal/core/src/query/groupby/SearchGroupByOperator.h index dc0ec4037ebb8..5b0c6e2e274bd 100644 --- a/internal/core/src/query/groupby/SearchGroupByOperator.h +++ b/internal/core/src/query/groupby/SearchGroupByOperator.h @@ -41,8 +41,7 @@ class GrowingDataGetter : public DataGetter { const segcore::ConcurrentVector* growing_raw_data_; GrowingDataGetter(const segcore::SegmentGrowingImpl& segment, FieldId fieldId) { - growing_raw_data_ = - segment.get_insert_record().get_field_data(fieldId); + growing_raw_data_ = 
segment.get_insert_record().get_data(fieldId); } GrowingDataGetter(const GrowingDataGetter& other) diff --git a/internal/core/src/segcore/ConcurrentVector.h b/internal/core/src/segcore/ConcurrentVector.h index a4cb72d986fc4..37167f232a3b4 100644 --- a/internal/core/src/segcore/ConcurrentVector.h +++ b/internal/core/src/segcore/ConcurrentVector.h @@ -326,7 +326,6 @@ class ConcurrentVectorImpl : public VectorBase { fill_chunk(chunk_id, 0, element_count, source, source_offset); } } - void fill_chunk(ssize_t chunk_id, ssize_t chunk_offset, diff --git a/internal/core/src/segcore/FieldIndexing.h b/internal/core/src/segcore/FieldIndexing.h index 19de3974747e1..2585e156f2c61 100644 --- a/internal/core/src/segcore/FieldIndexing.h +++ b/internal/core/src/segcore/FieldIndexing.h @@ -312,7 +312,7 @@ class IndexingRecord { } auto& indexing = field_indexings_.at(fieldId); auto type = indexing->get_field_meta().get_data_type(); - auto field_raw_data = record.get_field_data_base(fieldId); + auto field_raw_data = record.get_data_base(fieldId); if (type == DataType::VECTOR_FLOAT && reserved_offset + size >= indexing->get_build_threshold()) { indexing->AppendSegmentIndexDense( @@ -349,11 +349,11 @@ class IndexingRecord { if (type == DataType::VECTOR_FLOAT && reserved_offset + size >= indexing->get_build_threshold()) { - auto vec_base = record.get_field_data_base(fieldId); + auto vec_base = record.get_data_base(fieldId); indexing->AppendSegmentIndexDense( reserved_offset, size, vec_base, data->Data()); } else if (type == DataType::VECTOR_SPARSE_FLOAT) { - auto vec_base = record.get_field_data_base(fieldId); + auto vec_base = record.get_data_base(fieldId); indexing->AppendSegmentIndexSparse( reserved_offset, size, diff --git a/internal/core/src/segcore/InsertRecord.h b/internal/core/src/segcore/InsertRecord.h index d68acf1b67272..0f6c231393e64 100644 --- a/internal/core/src/segcore/InsertRecord.h +++ b/internal/core/src/segcore/InsertRecord.h @@ -12,6 +12,7 @@ #pragma once #include 
+#include #include #include #include @@ -407,6 +408,65 @@ class OffsetOrderedArray : public OffsetMap { std::vector> array_; }; +class ThreadSafeValidData { + public: + explicit ThreadSafeValidData() = default; + explicit ThreadSafeValidData(FixedVector data) + : data_(std::move(data)) { + } + + void + set_data_raw(const std::vector& datas) { + std::unique_lock lck(mutex_); + auto total = 0; + for (auto& field_data : datas) { + total += field_data->get_num_rows(); + } + if (length_ + total > data_.size()) { + data_.reserve(length_ + total); + } + length_ += total; + for (auto& field_data : datas) { + auto num_row = field_data->get_num_rows(); + for (size_t i = 0; i < num_row; i++) { + data_.push_back(field_data->is_valid(i)); + } + } + } + + void + set_data_raw(size_t num_rows, + const DataArray* data, + const FieldMeta& field_meta) { + std::unique_lock lck(mutex_); + if (field_meta.is_nullable()) { + if (length_ + num_rows > data_.size()) { + data_.reserve(length_ + num_rows); + } + + auto src = data->valid_data().data(); + for (size_t i = 0; i < num_rows; ++i) { + data_.push_back(src[i]); + // data_[length_ + i] = src[i]; + } + length_ += num_rows; + } + } + + bool + is_valid(size_t offset) { + std::shared_lock lck(mutex_); + Assert(offset < length_); + return data_[offset]; + } + + private: + mutable std::shared_mutex mutex_{}; + FixedVector data_; + // number of actual elements + size_t length_{0}; +}; + template struct InsertRecord { InsertRecord( @@ -419,6 +479,9 @@ struct InsertRecord { for (auto& field : schema) { auto field_id = field.first; auto& field_meta = field.second; + if (field_meta.is_nullable()) { + this->append_valid_data(field_id); + } if (pk2offset_ == nullptr && pk_field_id.has_value() && pk_field_id.value() == field_id) { switch (field_meta.get_data_type()) { @@ -451,28 +514,28 @@ struct InsertRecord { } if (field_meta.is_vector()) { if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) { - this->append_field_data( + this->append_data( 
field_id, field_meta.get_dim(), size_per_chunk); continue; } else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) { - this->append_field_data( + this->append_data( field_id, field_meta.get_dim(), size_per_chunk); continue; } else if (field_meta.get_data_type() == DataType::VECTOR_FLOAT16) { - this->append_field_data( + this->append_data( field_id, field_meta.get_dim(), size_per_chunk); continue; } else if (field_meta.get_data_type() == DataType::VECTOR_BFLOAT16) { - this->append_field_data( + this->append_data( field_id, field_meta.get_dim(), size_per_chunk); continue; } else if (field_meta.get_data_type() == DataType::VECTOR_SPARSE_FLOAT) { - this->append_field_data(field_id, - size_per_chunk); + this->append_data(field_id, + size_per_chunk); continue; } else { PanicInfo(DataTypeInvalid, @@ -482,44 +545,43 @@ struct InsertRecord { } switch (field_meta.get_data_type()) { case DataType::BOOL: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::INT8: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::INT16: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::INT32: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::INT64: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::FLOAT: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::DOUBLE: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::VARCHAR: { - this->append_field_data(field_id, - size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::JSON: { - 
this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } case DataType::ARRAY: { - this->append_field_data(field_id, size_per_chunk); + this->append_data(field_id, size_per_chunk); break; } default: { @@ -666,23 +728,22 @@ struct InsertRecord { pk2offset_->seal(); } - // get field data without knowing the type + // get data without knowing the type VectorBase* - get_field_data_base(FieldId field_id) const { - AssertInfo(fields_data_.find(field_id) != fields_data_.end(), + get_data_base(FieldId field_id) const { + AssertInfo(data_.find(field_id) != data_.end(), "Cannot find field_data with field_id: " + std::to_string(field_id.get())); - AssertInfo( - fields_data_.at(field_id) != nullptr, - "fields_data_ at i is null" + std::to_string(field_id.get())); - return fields_data_.at(field_id).get(); + AssertInfo(data_.at(field_id) != nullptr, + "data_ at i is null" + std::to_string(field_id.get())); + return data_.at(field_id).get(); } // get field data in given type, const version template const ConcurrentVector* - get_field_data(FieldId field_id) const { - auto base_ptr = get_field_data_base(field_id); + get_data(FieldId field_id) const { + auto base_ptr = get_data_base(field_id); auto ptr = dynamic_cast*>(base_ptr); Assert(ptr); return ptr; @@ -691,36 +752,58 @@ struct InsertRecord { // get field data in given type, non-const version template ConcurrentVector* - get_field_data(FieldId field_id) { - auto base_ptr = get_field_data_base(field_id); + get_data(FieldId field_id) { + auto base_ptr = get_data_base(field_id); auto ptr = dynamic_cast*>(base_ptr); Assert(ptr); return ptr; } + ThreadSafeValidData* + get_valid_data(FieldId field_id) const { + AssertInfo(valid_data_.find(field_id) != valid_data_.end(), + "Cannot find valid_data with field_id: " + + std::to_string(field_id.get())); + AssertInfo(valid_data_.at(field_id) != nullptr, + "valid_data_ at i is null" + std::to_string(field_id.get())); + return 
valid_data_.at(field_id).get(); + } + + bool + is_valid_data_exist(FieldId field_id) { + return valid_data_.find(field_id) != valid_data_.end(); + } + // append a column of scalar or sparse float vector type template void - append_field_data(FieldId field_id, int64_t size_per_chunk) { + append_data(FieldId field_id, int64_t size_per_chunk) { static_assert(IsScalar || IsSparse); - fields_data_.emplace(field_id, - std::make_unique>( - size_per_chunk, mmap_descriptor_)); + data_.emplace(field_id, + std::make_unique>( + size_per_chunk, mmap_descriptor_)); + } + + // append a column of scalar type + void + append_valid_data(FieldId field_id) { + valid_data_.emplace(field_id, std::make_unique()); } // append a column of vector type template void - append_field_data(FieldId field_id, int64_t dim, int64_t size_per_chunk) { + append_data(FieldId field_id, int64_t dim, int64_t size_per_chunk) { static_assert(std::is_base_of_v); - fields_data_.emplace(field_id, - std::make_unique>( - dim, size_per_chunk, mmap_descriptor_)); + data_.emplace(field_id, + std::make_unique>( + dim, size_per_chunk, mmap_descriptor_)); } void drop_field_data(FieldId field_id) { - fields_data_.erase(field_id); + data_.erase(field_id); + valid_data_.erase(field_id); } const ConcurrentVector& @@ -740,7 +823,7 @@ struct InsertRecord { ack_responder_.clear(); timestamp_index_ = TimestampIndex(); pk2offset_->clear(); - fields_data_.clear(); + data_.clear(); } bool @@ -762,7 +845,9 @@ struct InsertRecord { std::unique_ptr pk2offset_; private: - std::unordered_map> fields_data_{}; + std::unordered_map> data_{}; + std::unordered_map> + valid_data_{}; mutable std::shared_mutex shared_mutex_{}; storage::MmapChunkDescriptorPtr mmap_descriptor_; }; diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp index 9a15e530a053e..9e6529386838b 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.cpp +++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp @@ 
-57,11 +57,11 @@ SegmentGrowingImpl::try_remove_chunks(FieldId fieldId) { if (indexing_record_.SyncDataWithIndex(fieldId)) { VectorBase* vec_data_base = dynamic_cast*>( - insert_record_.get_field_data_base(fieldId)); + insert_record_.get_data_base(fieldId)); if (!vec_data_base) { vec_data_base = dynamic_cast*>( - insert_record_.get_field_data_base(fieldId)); + insert_record_.get_data_base(fieldId)); } if (vec_data_base && vec_data_base->num_chunk() > 0 && chunk_mutex_.try_lock()) { @@ -105,11 +105,17 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset, fmt::format("can't find field {}", field_id.get())); auto data_offset = field_id_to_offset[field_id]; if (!indexing_record_.SyncDataWithIndex(field_id)) { - insert_record_.get_field_data_base(field_id)->set_data_raw( + insert_record_.get_data_base(field_id)->set_data_raw( reserved_offset, num_rows, &insert_record_proto->fields_data(data_offset), field_meta); + if (field_meta.is_nullable()) { + insert_record_.get_valid_data(field_id)->set_data_raw( + num_rows, + &insert_record_proto->fields_data(data_offset), + field_meta); + } } //insert vector data into index if (segcore_config_.get_enable_interim_segment_index()) { @@ -230,8 +236,12 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) { } if (!indexing_record_.SyncDataWithIndex(field_id)) { - insert_record_.get_field_data_base(field_id)->set_data_raw( + insert_record_.get_data_base(field_id)->set_data_raw( reserved_offset, field_data); + if (insert_record_.is_valid_data_exist(field_id)) { + insert_record_.get_valid_data(field_id)->set_data_raw( + field_data); + } } if (segcore_config_.get_enable_interim_segment_index()) { auto offset = reserved_offset; @@ -318,7 +328,7 @@ SegmentGrowingImpl::LoadFieldDataV2(const LoadFieldDataInfo& infos) { } if (!indexing_record_.SyncDataWithIndex(field_id)) { - insert_record_.get_field_data_base(field_id)->set_data_raw( + insert_record_.get_data_base(field_id)->set_data_raw( reserved_offset, field_data); } if 
(segcore_config_.get_enable_interim_segment_index()) { @@ -420,7 +430,7 @@ SegmentGrowingImpl::LoadDeletedRecord(const LoadDeletedRecordInfo& info) { SpanBase SegmentGrowingImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const { - auto vec = get_insert_record().get_field_data_base(field_id); + auto vec = get_insert_record().get_data_base(field_id); return vec->get_span_base(chunk_id); } @@ -457,7 +467,7 @@ std::unique_ptr SegmentGrowingImpl::bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const { - auto vec_ptr = insert_record_.get_field_data_base(field_id); + auto vec_ptr = insert_record_.get_data_base(field_id); auto& field_meta = schema_->operator[](field_id); if (field_meta.is_vector()) { auto result = CreateVectorDataArray(count, field_meta); @@ -514,6 +524,14 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, AssertInfo(!field_meta.is_vector(), "Scalar field meta type is vector type"); auto result = CreateScalarDataArray(count, field_meta); + if (field_meta.is_nullable()) { + auto valid_data_ptr = insert_record_.get_valid_data(field_id); + auto res = result->mutable_valid_data()->mutable_data(); + for (int64_t i = 0; i < count; ++i) { + auto offset = seg_offsets[i]; + res[i] = valid_data_ptr->is_valid(offset); + } + } switch (field_meta.get_data_type()) { case DataType::BOOL: { bulk_subscript_impl(vec_ptr, diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 0245ee76da3e7..5c970429626c8 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -443,7 +443,12 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { auto rawValue = field_data->RawValue(i); auto array = static_cast(rawValue); - var_column->Append(*array); + if (field_data->IsNullable()) { + var_column->Append(*array, + field_data->is_valid(i)); + } else { + var_column->Append(*array); + } // we stores the offset 
for each array element, so there is a additional uint64_t for each array element field_data_size = @@ -480,7 +485,6 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { FieldDataPtr field_data; while (data.channel->pop(field_data)) { column->AppendBatch(field_data); - stats_.mem_size += field_data->Size(); } LoadPrimitiveSkipIndex( @@ -550,18 +554,19 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) { uint64_t total_written = 0; std::vector indices{}; std::vector> element_indices{}; + FixedVector valid_data{}; while (data.channel->pop(field_data)) { WriteFieldData(file, data_type, field_data, total_written, indices, - element_indices); + element_indices, + valid_data); } WriteFieldPadding(file, data_type, total_written); - - auto num_rows = data.row_count; std::shared_ptr column{}; + auto num_rows = data.row_count; if (IsVariableDataType(data_type)) { switch (data_type) { case milvus::DataType::STRING: @@ -604,6 +609,8 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) { column = std::make_shared(file, total_written, field_meta); } + column->SetValidData(std::move(valid_data)); + { std::unique_lock lck(mutex_); fields_.emplace(field_id, column); @@ -712,7 +719,7 @@ SegmentSealedImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const { auto& field_data = it->second; return field_data->Span(); } - auto field_data = insert_record_.get_field_data_base(field_id); + auto field_data = insert_record_.get_data_base(field_id); AssertInfo(field_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment"); return field_data->get_span_base(0); @@ -1236,6 +1243,13 @@ SegmentSealedImpl::get_raw_data(FieldId field_id, // to make sure it won't get released if segment released auto column = fields_.at(field_id); auto ret = fill_with_empty(field_id, count); + if (column->IsNullable()) { + auto dst = ret->mutable_valid_data()->mutable_data(); + for (int64_t i = 0; i < count; ++i) { + 
auto offset = seg_offsets[i]; + dst[i] = column->IsValid(offset); + } + } switch (field_meta.get_data_type()) { case DataType::VARCHAR: case DataType::STRING: { diff --git a/internal/core/src/segcore/Utils.cpp b/internal/core/src/segcore/Utils.cpp index ae914e93703ad..c874de7a84891 100644 --- a/internal/core/src/segcore/Utils.cpp +++ b/internal/core/src/segcore/Utils.cpp @@ -232,6 +232,10 @@ CreateScalarDataArray(int64_t count, const FieldMeta& field_meta) { data_array->set_type(static_cast( field_meta.get_data_type())); + if (field_meta.is_nullable()) { + data_array->mutable_valid_data()->Resize(count, false); + } + auto scalar_array = data_array->mutable_scalars(); switch (data_type) { case DataType::BOOL: { @@ -360,6 +364,7 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) { std::unique_ptr CreateScalarDataArrayFrom(const void* data_raw, + const void* valid_data, int64_t count, const FieldMeta& field_meta) { auto data_type = field_meta.get_data_type(); @@ -367,6 +372,11 @@ CreateScalarDataArrayFrom(const void* data_raw, data_array->set_field_id(field_meta.get_id().get()); data_array->set_type(static_cast( field_meta.get_data_type())); + if (field_meta.is_nullable()) { + auto valid_data_ = reinterpret_cast(valid_data); + auto obj = data_array->mutable_valid_data(); + obj->Add(valid_data_, valid_data_ + count); + } auto scalar_array = data_array->mutable_scalars(); switch (data_type) { @@ -517,12 +527,14 @@ CreateVectorDataArrayFrom(const void* data_raw, std::unique_ptr CreateDataArrayFrom(const void* data_raw, + const void* valid_data, int64_t count, const FieldMeta& field_meta) { auto data_type = field_meta.get_data_type(); if (!IsVectorDataType(data_type)) { - return CreateScalarDataArrayFrom(data_raw, count, field_meta); + return CreateScalarDataArrayFrom( + data_raw, valid_data, count, field_meta); } return CreateVectorDataArrayFrom(data_raw, count, field_meta); @@ -535,6 +547,7 @@ MergeDataArray(std::vector& merge_bases, auto data_type = 
field_meta.get_data_type(); auto data_array = std::make_unique(); data_array->set_field_id(field_meta.get_id().get()); + auto nullable = field_meta.is_nullable(); data_array->set_type(static_cast( field_meta.get_data_type())); @@ -588,6 +601,12 @@ MergeDataArray(std::vector& merge_bases, continue; } + if (nullable) { + auto data = src_field_data->valid_data().data(); + auto obj = data_array->mutable_valid_data(); + *(obj->Add()) = data[src_offset]; + } + auto scalar_array = data_array->mutable_scalars(); switch (data_type) { case DataType::BOOL: { @@ -781,6 +800,7 @@ LoadFieldDatasFromRemote2(std::shared_ptr space, data->GetColumnByName(field.second.get_name().get()); auto field_data = storage::CreateFieldData( field.second.get_data_type(), + field.second.is_nullable(), field.second.is_vector() ? field.second.get_dim() : 0, total_num_rows); field_data->FillFieldData(col_data); diff --git a/internal/core/src/segcore/Utils.h b/internal/core/src/segcore/Utils.h index 11bc53079838e..51e9cf0d1b04d 100644 --- a/internal/core/src/segcore/Utils.h +++ b/internal/core/src/segcore/Utils.h @@ -63,6 +63,7 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta); std::unique_ptr CreateScalarDataArrayFrom(const void* data_raw, + const void* valid_data, int64_t count, const FieldMeta& field_meta); @@ -73,6 +74,7 @@ CreateVectorDataArrayFrom(const void* data_raw, std::unique_ptr CreateDataArrayFrom(const void* data_raw, + const void* valid_data, int64_t count, const FieldMeta& field_meta); diff --git a/internal/core/src/segcore/segment_c.cpp b/internal/core/src/segcore/segment_c.cpp index f623b69214f9b..d8e8421f4611b 100644 --- a/internal/core/src/segcore/segment_c.cpp +++ b/internal/core/src/segcore/segment_c.cpp @@ -379,7 +379,8 @@ LoadFieldRawData(CSegmentInterface c_segment, dim = field_meta.get_dim(); } } - auto field_data = milvus::storage::CreateFieldData(data_type, dim); + auto field_data = + milvus::storage::CreateFieldData(data_type, false, dim); 
field_data->FillFieldData(data, row_count); milvus::FieldDataChannelPtr channel = std::make_shared(); diff --git a/internal/core/src/storage/ChunkCache.cpp b/internal/core/src/storage/ChunkCache.cpp index 365563f205357..a61111d6b20f5 100644 --- a/internal/core/src/storage/ChunkCache.cpp +++ b/internal/core/src/storage/ChunkCache.cpp @@ -112,7 +112,7 @@ ChunkCache::Mmap(const FieldDataPtr& field_data, uint64_t offset = 0; for (auto i = 0; i < field_data->get_num_rows(); ++i) { indices.push_back(offset); - offset += field_data->Size(i); + offset += field_data->DataSize(i); } auto sparse_column = std::make_shared( data_size, dim, data_type, mcm_, descriptor); @@ -123,7 +123,7 @@ ChunkCache::Mmap(const FieldDataPtr& field_data, false, "TODO: unimplemented for variable data type: {}", data_type); } else { column = std::make_shared( - data_size, dim, data_type, mcm_, descriptor); + data_size, dim, data_type, mcm_, descriptor,field_data->IsNullable()); } column->AppendBatch(field_data); return column; diff --git a/internal/core/src/storage/DataCodec.cpp b/internal/core/src/storage/DataCodec.cpp index 3d7af86051f1a..96f0aeac73570 100644 --- a/internal/core/src/storage/DataCodec.cpp +++ b/internal/core/src/storage/DataCodec.cpp @@ -31,6 +31,10 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { DescriptorEvent descriptor_event(reader); DataType data_type = DataType(descriptor_event.event_data.fix_part.data_type); + auto& extras = descriptor_event.event_data.extras; + bool nullable = (extras.find(NULLABLE) != extras.end()) + ? 
std::any_cast(extras[NULLABLE]) + : false; auto descriptor_fix_part = descriptor_event.event_data.fix_part; FieldDataMeta data_meta{descriptor_fix_part.collection_id, descriptor_fix_part.partition_id, @@ -42,7 +46,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { auto event_data_length = header.event_length_ - GetEventHeaderSize(header); auto insert_event_data = - InsertEventData(reader, event_data_length, data_type); + InsertEventData(reader, event_data_length, data_type, nullable); auto insert_data = std::make_unique(insert_event_data.field_data); insert_data->SetFieldDataMeta(data_meta); @@ -54,7 +58,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { auto event_data_length = header.event_length_ - GetEventHeaderSize(header); auto index_event_data = - IndexEventData(reader, event_data_length, data_type); + IndexEventData(reader, event_data_length, data_type, nullable); auto field_data = index_event_data.field_data; // for compatible with golang indexcode.Serialize, which set dataType to String if (data_type == DataType::STRING) { @@ -63,7 +67,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) { AssertInfo( field_data->get_num_rows() == 1, "wrong length of string num in old index binlog file"); - auto new_field_data = CreateFieldData(DataType::INT8); + auto new_field_data = CreateFieldData(DataType::INT8, nullable); new_field_data->FillFieldData( (*static_cast(field_data->RawValue(0))) .c_str(), diff --git a/internal/core/src/storage/DiskFileManagerImpl.cpp b/internal/core/src/storage/DiskFileManagerImpl.cpp index 844495ceb0bba..34332f5240b20 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.cpp +++ b/internal/core/src/storage/DiskFileManagerImpl.cpp @@ -411,7 +411,7 @@ DiskFileManagerImpl::CacheRawDataToDisk( num_rows += total_num_rows; auto col_data = data->GetColumnByName(index_meta_.field_name); auto field_data = storage::CreateFieldData( - index_meta_.field_type, index_meta_.dim, total_num_rows); + index_meta_.field_type, false, 
index_meta_.dim, total_num_rows); field_data->FillFieldData(col_data); dim = field_data->get_dim(); auto data_size = @@ -741,7 +741,7 @@ DiskFileManagerImpl::CacheOptFieldToDisk( } auto col_data = data->GetColumnByName(field_name); auto field_data = - storage::CreateFieldData(field_type, 1, total_num_rows); + storage::CreateFieldData(field_type, false, 1, total_num_rows); field_data->FillFieldData(col_data); field_datas.emplace_back(field_data); } diff --git a/internal/core/src/storage/Event.cpp b/internal/core/src/storage/Event.cpp index f27de8de30ee9..607191ab010f0 100644 --- a/internal/core/src/storage/Event.cpp +++ b/internal/core/src/storage/Event.cpp @@ -209,7 +209,8 @@ DescriptorEventData::Serialize() { BaseEventData::BaseEventData(BinlogReaderPtr reader, int event_length, - DataType data_type) { + DataType data_type, + bool nullable) { auto ast = reader->Read(sizeof(start_timestamp), &start_timestamp); AssertInfo(ast.ok(), "read start timestamp failed"); ast = reader->Read(sizeof(end_timestamp), &end_timestamp); @@ -220,7 +221,7 @@ BaseEventData::BaseEventData(BinlogReaderPtr reader, auto res = reader->Read(payload_length); AssertInfo(res.first.ok(), "read payload failed"); auto payload_reader = std::make_shared( - res.second.get(), payload_length, data_type); + res.second.get(), payload_length, data_type, nullable); field_data = payload_reader->get_field_data(); } @@ -230,10 +231,11 @@ BaseEventData::Serialize() { std::shared_ptr payload_writer; if (IsVectorDataType(data_type) && !IsSparseFloatVectorDataType(data_type)) { - payload_writer = - std::make_unique(data_type, field_data->get_dim()); + payload_writer = std::make_unique( + data_type, field_data->get_dim(), field_data->IsNullable()); } else { - payload_writer = std::make_unique(data_type); + payload_writer = std::make_unique( + data_type, field_data->IsNullable()); } switch (data_type) { case DataType::VARCHAR: @@ -242,8 +244,8 @@ BaseEventData::Serialize() { ++offset) { auto str = static_cast( 
field_data->RawValue(offset)); - payload_writer->add_one_string_payload(str->c_str(), - str->size()); + auto size = field_data->is_valid(offset) ? str->size() : -1; + payload_writer->add_one_string_payload(str->c_str(), size); } break; } @@ -253,10 +255,12 @@ BaseEventData::Serialize() { auto array = static_cast(field_data->RawValue(offset)); auto array_string = array->output_data().SerializeAsString(); + auto size = + field_data->is_valid(offset) ? array_string.size() : -1; payload_writer->add_one_binary_payload( reinterpret_cast(array_string.c_str()), - array_string.size()); + size); } break; } @@ -289,8 +293,10 @@ BaseEventData::Serialize() { auto payload = Payload{data_type, static_cast(field_data->Data()), + field_data->ValidData(), field_data->get_num_rows(), - field_data->get_dim()}; + field_data->get_dim(), + field_data->IsNullable()}; payload_writer->add_payload(payload); } } @@ -310,11 +316,13 @@ BaseEventData::Serialize() { return res; } -BaseEvent::BaseEvent(BinlogReaderPtr reader, DataType data_type) { +BaseEvent::BaseEvent(BinlogReaderPtr reader, + DataType data_type, + bool nullable) { event_header = EventHeader(reader); auto event_data_length = event_header.event_length_ - GetEventHeaderSize(event_header); - event_data = BaseEventData(reader, event_data_length, data_type); + event_data = BaseEventData(reader, event_data_length, data_type, nullable); } std::vector @@ -370,8 +378,9 @@ std::vector LocalInsertEvent::Serialize() { int row_num = field_data->get_num_rows(); int dimension = field_data->get_dim(); - int payload_size = field_data->Size(); - int len = sizeof(row_num) + sizeof(dimension) + payload_size; + int data_size = field_data->DataSize(); + int valid_data_size = field_data->ValidDataSize(); + int len = sizeof(row_num) + sizeof(dimension) + data_size + valid_data_size; std::vector res(len); int offset = 0; @@ -379,8 +388,9 @@ LocalInsertEvent::Serialize() { offset += sizeof(row_num); memcpy(res.data() + offset, &dimension, 
sizeof(dimension)); offset += sizeof(dimension); - memcpy(res.data() + offset, field_data->Data(), payload_size); - + memcpy(res.data() + offset, field_data->Data(), data_size); + offset += data_size; + memcpy(res.data() + offset, field_data->ValidData(), valid_data_size); return res; } @@ -393,7 +403,7 @@ LocalIndexEvent::LocalIndexEvent(BinlogReaderPtr reader) { auto res = reader->Read(index_size); AssertInfo(res.first.ok(), "read payload failed"); auto payload_reader = std::make_shared( - res.second.get(), index_size, DataType::INT8); + res.second.get(), index_size, DataType::INT8, false); field_data = payload_reader->get_field_data(); } diff --git a/internal/core/src/storage/Event.h b/internal/core/src/storage/Event.h index 2922e399f00bd..b974331394f9c 100644 --- a/internal/core/src/storage/Event.h +++ b/internal/core/src/storage/Event.h @@ -80,7 +80,8 @@ struct BaseEventData { BaseEventData() = default; explicit BaseEventData(BinlogReaderPtr reader, int event_length, - DataType data_type); + DataType data_type, + bool nullable); std::vector Serialize(); @@ -103,7 +104,9 @@ struct BaseEvent { int64_t event_offset; BaseEvent() = default; - explicit BaseEvent(BinlogReaderPtr reader, DataType data_type); + explicit BaseEvent(BinlogReaderPtr reader, + DataType data_type, + bool nullable); std::vector Serialize(); diff --git a/internal/core/src/storage/InsertData.cpp b/internal/core/src/storage/InsertData.cpp index d4b043c423baa..4b306dd3fe5ec 100644 --- a/internal/core/src/storage/InsertData.cpp +++ b/internal/core/src/storage/InsertData.cpp @@ -69,7 +69,7 @@ InsertData::serialize_to_remote_file() { } des_event_data.extras[ORIGIN_SIZE_KEY] = std::to_string(field_data_->Size()); - //(todo:smellthemoon) set nullable + des_event_data.extras[NULLABLE] = field_data_->IsNullable(); auto& des_event_header = descriptor_event.event_header; // TODO :: set timestamp diff --git a/internal/core/src/storage/PayloadReader.cpp b/internal/core/src/storage/PayloadReader.cpp index 
4d35aa493fa17..f468bd343d810 100644 --- a/internal/core/src/storage/PayloadReader.cpp +++ b/internal/core/src/storage/PayloadReader.cpp @@ -27,8 +27,9 @@ namespace milvus::storage { PayloadReader::PayloadReader(const uint8_t* data, int length, - DataType data_type) - : column_type_(data_type) { + DataType data_type, + bool nullable) + : column_type_(data_type), nullable(nullable) { auto input = std::make_shared(data, length); init(input); } @@ -72,7 +73,7 @@ PayloadReader::init(std::shared_ptr input) { st = arrow_reader->GetRecordBatchReader(&rb_reader); AssertInfo(st.ok(), "get record batch reader"); - field_data_ = CreateFieldData(column_type_, dim_, total_num_rows); + field_data_ = CreateFieldData(column_type_, nullable, dim_, total_num_rows); for (arrow::Result> maybe_batch : *rb_reader) { AssertInfo(maybe_batch.ok(), "get batch record success"); diff --git a/internal/core/src/storage/PayloadReader.h b/internal/core/src/storage/PayloadReader.h index b5fb22084dab4..39aa6420fd14d 100644 --- a/internal/core/src/storage/PayloadReader.h +++ b/internal/core/src/storage/PayloadReader.h @@ -26,7 +26,10 @@ namespace milvus::storage { class PayloadReader { public: - explicit PayloadReader(const uint8_t* data, int length, DataType data_type); + explicit PayloadReader(const uint8_t* data, + int length, + DataType data_type, + bool nullable); ~PayloadReader() = default; @@ -41,6 +44,7 @@ class PayloadReader { private: DataType column_type_; int dim_; + bool nullable; FieldDataPtr field_data_; }; diff --git a/internal/core/src/storage/PayloadStream.h b/internal/core/src/storage/PayloadStream.h index c23c7816367b5..8639ab9a97dab 100644 --- a/internal/core/src/storage/PayloadStream.h +++ b/internal/core/src/storage/PayloadStream.h @@ -32,8 +32,10 @@ class PayloadInputStream; struct Payload { DataType data_type; const uint8_t* raw_data; - int64_t rows; + const uint8_t* valid_data; + const int64_t rows; std::optional dimension; + bool nullable; }; class PayloadOutputStream : 
public arrow::io::OutputStream { diff --git a/internal/core/src/storage/PayloadWriter.cpp b/internal/core/src/storage/PayloadWriter.cpp index d9b1db7dc5cba..c7722c11b88c7 100644 --- a/internal/core/src/storage/PayloadWriter.cpp +++ b/internal/core/src/storage/PayloadWriter.cpp @@ -23,18 +23,19 @@ namespace milvus::storage { // create payload writer for numeric data type -PayloadWriter::PayloadWriter(const DataType column_type) - : column_type_(column_type) { +PayloadWriter::PayloadWriter(const DataType column_type, bool nullable) + : column_type_(column_type), nullable_(nullable) { builder_ = CreateArrowBuilder(column_type); - schema_ = CreateArrowSchema(column_type); + schema_ = CreateArrowSchema(column_type, nullable); } // create payload writer for vector data type -PayloadWriter::PayloadWriter(const DataType column_type, int dim) - : column_type_(column_type) { +PayloadWriter::PayloadWriter(const DataType column_type, int dim, bool nullable) + : column_type_(column_type), nullable_(nullable) { AssertInfo(column_type != DataType::VECTOR_SPARSE_FLOAT, "PayloadWriter for Sparse Float Vector should be created " "using the constructor without dimension"); + AssertInfo(nullable == false, "only scalcar type support null now"); init_dimension(dim); } @@ -48,7 +49,7 @@ PayloadWriter::init_dimension(int dim) { dimension_ = dim; builder_ = CreateArrowBuilder(column_type_, dim); - schema_ = CreateArrowSchema(column_type_, dim); + schema_ = CreateArrowSchema(column_type_, dim, nullable_); } void diff --git a/internal/core/src/storage/PayloadWriter.h b/internal/core/src/storage/PayloadWriter.h index 1bd2d652be9a8..86ca281bb6b62 100644 --- a/internal/core/src/storage/PayloadWriter.h +++ b/internal/core/src/storage/PayloadWriter.h @@ -25,8 +25,8 @@ namespace milvus::storage { class PayloadWriter { public: - explicit PayloadWriter(const DataType column_type); - explicit PayloadWriter(const DataType column_type, int dim); + explicit PayloadWriter(const DataType column_type, int 
dim, bool nullable); + explicit PayloadWriter(const DataType column_type, bool nullable); ~PayloadWriter() = default; void @@ -58,6 +58,7 @@ class PayloadWriter { private: DataType column_type_; + bool nullable_; std::shared_ptr builder_; std::shared_ptr schema_; std::shared_ptr output_; diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index 3a55f2258eff8..badfa00719610 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -75,6 +75,18 @@ std::map ReadAheadPolicy_Map = { {"willneed", MADV_WILLNEED}, {"dontneed", MADV_DONTNEED}}; +// in arrow, null_bitmap read from the least significant bit +std::vector +genValidIter(const uint8_t* valid_data, int length) { + std::vector valid_data_; + valid_data_.reserve(length); + for (size_t i = 0; i < length; ++i) { + auto bit = (valid_data[i >> 3] >> (i & 0x07)) & 1; + valid_data_.push_back(bit); + } + return valid_data_; +} + StorageType ReadMediumType(BinlogReaderPtr reader) { AssertInfo(reader->Tell() == 0, @@ -106,12 +118,22 @@ template void add_numeric_payload(std::shared_ptr builder, DT* start, + const uint8_t* valid_data, + bool nullable, int length) { AssertInfo(builder != nullptr, "empty arrow builder"); auto numeric_builder = std::dynamic_pointer_cast(builder); - auto ast = numeric_builder->AppendValues(start, start + length); - AssertInfo( - ast.ok(), "append value to arrow builder failed: {}", ast.ToString()); + arrow::Status ast; + if (nullable) { + // need iter to read valid_data when write + auto iter = genValidIter(valid_data, length); + ast = + numeric_builder->AppendValues(start, start + length, iter.begin()); + AssertInfo(ast.ok(), "append value to arrow builder failed"); + } else { + ast = numeric_builder->AppendValues(start, start + length); + AssertInfo(ast.ok(), "append value to arrow builder failed"); + } } void @@ -121,48 +143,49 @@ AddPayloadToArrowBuilder(std::shared_ptr builder, auto raw_data = const_cast(payload.raw_data); 
auto length = payload.rows; auto data_type = payload.data_type; + auto nullable = payload.nullable; switch (data_type) { case DataType::BOOL: { auto bool_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, bool_data, length); + builder, bool_data, payload.valid_data, nullable, length); break; } case DataType::INT8: { auto int8_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, int8_data, length); + builder, int8_data, payload.valid_data, nullable, length); break; } case DataType::INT16: { auto int16_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, int16_data, length); + builder, int16_data, payload.valid_data, nullable, length); break; } case DataType::INT32: { auto int32_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, int32_data, length); + builder, int32_data, payload.valid_data, nullable, length); break; } case DataType::INT64: { auto int64_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, int64_data, length); + builder, int64_data, payload.valid_data, nullable, length); break; } case DataType::FLOAT: { auto float_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, float_data, length); + builder, float_data, payload.valid_data, nullable, length); break; } case DataType::DOUBLE: { auto double_data = reinterpret_cast(raw_data); add_numeric_payload( - builder, double_data, length); + builder, double_data, payload.valid_data, nullable, length); break; } case DataType::VECTOR_FLOAT16: @@ -292,40 +315,50 @@ CreateArrowBuilder(DataType data_type, int dim) { } std::shared_ptr -CreateArrowSchema(DataType data_type) { +CreateArrowSchema(DataType data_type, bool nullable) { switch (static_cast(data_type)) { case DataType::BOOL: { - return arrow::schema({arrow::field("val", arrow::boolean())}); + return arrow::schema( + {arrow::field("val", arrow::boolean(), nullable)}); } case DataType::INT8: { - return arrow::schema({arrow::field("val", arrow::int8())}); + return 
arrow::schema( + {arrow::field("val", arrow::int8(), nullable)}); } case DataType::INT16: { - return arrow::schema({arrow::field("val", arrow::int16())}); + return arrow::schema( + {arrow::field("val", arrow::int16(), nullable)}); } case DataType::INT32: { - return arrow::schema({arrow::field("val", arrow::int32())}); + return arrow::schema( + {arrow::field("val", arrow::int32(), nullable)}); } case DataType::INT64: { - return arrow::schema({arrow::field("val", arrow::int64())}); + return arrow::schema( + {arrow::field("val", arrow::int64(), nullable)}); } case DataType::FLOAT: { - return arrow::schema({arrow::field("val", arrow::float32())}); + return arrow::schema( + {arrow::field("val", arrow::float32(), nullable)}); } case DataType::DOUBLE: { - return arrow::schema({arrow::field("val", arrow::float64())}); + return arrow::schema( + {arrow::field("val", arrow::float64(), nullable)}); } case DataType::VARCHAR: case DataType::STRING: { - return arrow::schema({arrow::field("val", arrow::utf8())}); + return arrow::schema( + {arrow::field("val", arrow::utf8(), nullable)}); } case DataType::ARRAY: case DataType::JSON: { - return arrow::schema({arrow::field("val", arrow::binary())}); + return arrow::schema( + {arrow::field("val", arrow::binary(), nullable)}); } // sparse float vector doesn't require a dim case DataType::VECTOR_SPARSE_FLOAT: { - return arrow::schema({arrow::field("val", arrow::binary())}); + return arrow::schema( + {arrow::field("val", arrow::binary(), nullable)}); } default: { PanicInfo( @@ -335,30 +368,37 @@ CreateArrowSchema(DataType data_type) { } std::shared_ptr -CreateArrowSchema(DataType data_type, int dim) { +CreateArrowSchema(DataType data_type, int dim, bool nullable) { switch (static_cast(data_type)) { case DataType::VECTOR_FLOAT: { AssertInfo(dim > 0, "invalid dim value: {}", dim); - return arrow::schema({arrow::field( - "val", arrow::fixed_size_binary(dim * sizeof(float)))}); + return arrow::schema( + {arrow::field("val", + 
arrow::fixed_size_binary(dim * sizeof(float)), + nullable)}); } case DataType::VECTOR_BINARY: { AssertInfo(dim % 8 == 0 && dim > 0, "invalid dim value: {}", dim); - return arrow::schema( - {arrow::field("val", arrow::fixed_size_binary(dim / 8))}); + return arrow::schema({arrow::field( + "val", arrow::fixed_size_binary(dim / 8), nullable)}); } case DataType::VECTOR_FLOAT16: { AssertInfo(dim > 0, "invalid dim value: {}", dim); - return arrow::schema({arrow::field( - "val", arrow::fixed_size_binary(dim * sizeof(float16)))}); + return arrow::schema( + {arrow::field("val", + arrow::fixed_size_binary(dim * sizeof(float16)), + nullable)}); } case DataType::VECTOR_BFLOAT16: { AssertInfo(dim > 0, "invalid dim value"); - return arrow::schema({arrow::field( - "val", arrow::fixed_size_binary(dim * sizeof(bfloat16)))}); + return arrow::schema( + {arrow::field("val", + arrow::fixed_size_binary(dim * sizeof(bfloat16)), + nullable)}); } case DataType::VECTOR_SPARSE_FLOAT: { - return arrow::schema({arrow::field("val", arrow::binary())}); + return arrow::schema( + {arrow::field("val", arrow::binary(), nullable)}); } default: { PanicInfo( @@ -499,7 +539,7 @@ EncodeAndUploadIndexSlice(ChunkManager* chunk_manager, IndexMeta index_meta, FieldDataMeta field_meta, std::string object_key) { - auto field_data = CreateFieldData(DataType::INT8); + auto field_data = CreateFieldData(DataType::INT8, false); field_data->FillFieldData(buf, batch_size); auto indexData = std::make_shared(field_data); indexData->set_index_meta(index_meta); @@ -518,7 +558,8 @@ EncodeAndUploadIndexSlice2(std::shared_ptr space, IndexMeta index_meta, FieldDataMeta field_meta, std::string object_key) { - auto field_data = CreateFieldData(DataType::INT8); + // todo: support nullable index + auto field_data = CreateFieldData(DataType::INT8, false); field_data->FillFieldData(buf, batch_size); auto indexData = std::make_shared(field_data); indexData->set_index_meta(index_meta); @@ -542,7 +583,8 @@ 
EncodeAndUploadFieldSlice(ChunkManager* chunk_manager, auto dim = IsSparseFloatVectorDataType(field_meta.get_data_type()) ? -1 : field_meta.get_dim(); - auto field_data = CreateFieldData(field_meta.get_data_type(), dim, 0); + auto field_data = CreateFieldData( + field_meta.get_data_type(), field_meta.is_nullable(), dim, 0); field_data->FillFieldData(buf, element_count); auto insertData = std::make_shared(field_data); insertData->SetFieldDataMeta(field_data_meta); @@ -779,30 +821,42 @@ CreateChunkManager(const StorageConfig& storage_config) { } FieldDataPtr -CreateFieldData(const DataType& type, int64_t dim, int64_t total_num_rows) { +CreateFieldData(const DataType& type, + bool nullable, + int64_t dim, + int64_t total_num_rows) { switch (type) { case DataType::BOOL: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::INT8: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::INT16: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::INT32: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::INT64: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::FLOAT: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::DOUBLE: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::STRING: case DataType::VARCHAR: - return std::make_shared>(type, - total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::JSON: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); 
case DataType::ARRAY: - return std::make_shared>(type, total_num_rows); + return std::make_shared>( + type, nullable, total_num_rows); case DataType::VECTOR_FLOAT: return std::make_shared>( dim, type, total_num_rows); @@ -859,11 +913,16 @@ MergeFieldData(std::vector& data_array) { for (const auto& data : data_array) { total_length += data->Length(); } - - auto merged_data = storage::CreateFieldData(data_array[0]->get_data_type()); + auto merged_data = storage::CreateFieldData(data_array[0]->get_data_type(), + data_array[0]->IsNullable()); merged_data->Reserve(total_length); for (const auto& data : data_array) { - merged_data->FillFieldData(data->Data(), data->Length()); + if (merged_data->IsNullable()) { + merged_data->FillFieldData( + data->Data(), data->ValidData(), data->Length()); + } else { + merged_data->FillFieldData(data->Data(), data->Length()); + } } return merged_data; } diff --git a/internal/core/src/storage/Util.h b/internal/core/src/storage/Util.h index b13d03fa42aad..d92bb7d577566 100644 --- a/internal/core/src/storage/Util.h +++ b/internal/core/src/storage/Util.h @@ -58,10 +58,10 @@ std::shared_ptr CreateArrowBuilder(DataType data_type, int dim); std::shared_ptr -CreateArrowSchema(DataType data_type); +CreateArrowSchema(DataType data_type, bool nullable); std::shared_ptr -CreateArrowSchema(DataType data_type, int dim); +CreateArrowSchema(DataType data_type, int dim, bool nullable); int GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema, @@ -156,6 +156,7 @@ CreateChunkManager(const StorageConfig& storage_config); FieldDataPtr CreateFieldData(const DataType& type, + bool nullable = false, int64_t dim = 1, int64_t total_num_rows = 0); diff --git a/internal/core/unittest/test_array_expr.cpp b/internal/core/unittest/test_array_expr.cpp index 06266f6e4a6a2..02f15bac7baf3 100644 --- a/internal/core/unittest/test_array_expr.cpp +++ b/internal/core/unittest/test_array_expr.cpp @@ -815,8 +815,11 @@ TEST(Expr, PraseArrayContainsExpr) { auto 
schema = std::make_shared(); schema->AddDebugField( "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); - schema->AddField( - FieldName("array"), FieldId(101), DataType::ARRAY, DataType::INT64); + schema->AddField(FieldName("array"), + FieldId(101), + DataType::ARRAY, + DataType::INT64, + false); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); } diff --git a/internal/core/unittest/test_array_inverted_index.cpp b/internal/core/unittest/test_array_inverted_index.cpp index cd4833b52bf38..d0be8976417ab 100644 --- a/internal/core/unittest/test_array_inverted_index.cpp +++ b/internal/core/unittest/test_array_inverted_index.cpp @@ -35,21 +35,21 @@ GenTestSchema() { schema_->set_primary_field_id(pk); if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::BOOL); + schema_->AddDebugArrayField("array", DataType::BOOL,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::INT8); + schema_->AddDebugArrayField("array", DataType::INT8,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::INT16); + schema_->AddDebugArrayField("array", DataType::INT16,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::INT32); + schema_->AddDebugArrayField("array", DataType::INT32,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::INT64); + schema_->AddDebugArrayField("array", DataType::INT64,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::FLOAT); + schema_->AddDebugArrayField("array", DataType::FLOAT,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::DOUBLE); + schema_->AddDebugArrayField("array", DataType::DOUBLE,false); } else if constexpr (std::is_same_v) { - schema_->AddDebugArrayField("array", DataType::VARCHAR); + schema_->AddDebugArrayField("array", 
DataType::VARCHAR,false); } return schema_; diff --git a/internal/core/unittest/test_binlog_index.cpp b/internal/core/unittest/test_binlog_index.cpp index 2e9dac8776f38..25cf12a2494e6 100644 --- a/internal/core/unittest/test_binlog_index.cpp +++ b/internal/core/unittest/test_binlog_index.cpp @@ -72,7 +72,7 @@ class BinlogIndexTest : public ::testing::TestWithParam { schema->AddDebugField("fakevec", data_type, data_d, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); - vec_field_data = storage::CreateFieldData(data_type, data_d); + vec_field_data = storage::CreateFieldData(data_type, false, data_d); if (data_type == DataType::VECTOR_FLOAT) { auto vec_data = GenRandomFloatVecData(data_n, data_d); @@ -123,9 +123,9 @@ class BinlogIndexTest : public ::testing::TestWithParam { // load id LoadFieldDataInfo row_id_info; FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); - auto field_data = - std::make_shared>(DataType::INT64); + FieldName("RowID"), RowFieldID, DataType::INT64, false); + auto field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.row_ids_.data(), data_n); auto field_data_info = FieldDataInfo{ RowFieldID.get(), data_n, std::vector{field_data}}; @@ -133,9 +133,9 @@ class BinlogIndexTest : public ::testing::TestWithParam { // load ts LoadFieldDataInfo ts_info; FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - field_data = - std::make_shared>(DataType::INT64); + FieldName("Timestamp"), TimestampFieldID, DataType::INT64, false); + field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.timestamps_.data(), data_n); field_data_info = FieldDataInfo{TimestampFieldID.get(), data_n, diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp index 8c177c0787f73..cb0d5bfe871e5 100644 --- a/internal/core/unittest/test_c_api.cpp 
+++ b/internal/core/unittest/test_c_api.cpp @@ -1011,6 +1011,74 @@ TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) { DeleteSegment(segment); } +TEST(CApiTest, SearcTestWhenNullable) { + auto c_collection = NewCollection(get_default_schema_config_nullable()); + CSegmentInterface segment; + auto status = NewSegment(c_collection, Growing, -1, &segment); + ASSERT_EQ(status.error_code, Success); + auto col = (milvus::segcore::Collection*)c_collection; + + int N = 10000; + auto dataset = DataGen(col->get_schema(), N); + int64_t ts_offset = 1000; + + int64_t offset; + PreInsert(segment, N, &offset); + + auto insert_data = serialize(dataset.raw_); + auto ins_res = Insert(segment, + offset, + N, + dataset.row_ids_.data(), + dataset.timestamps_.data(), + insert_data.data(), + insert_data.size()); + ASSERT_EQ(ins_res.error_code, Success); + + milvus::proto::plan::PlanNode plan_node; + auto vector_anns = plan_node.mutable_vector_anns(); + vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector); + vector_anns->set_placeholder_tag("$0"); + vector_anns->set_field_id(100); + auto query_info = vector_anns->mutable_query_info(); + query_info->set_topk(10); + query_info->set_round_decimal(3); + query_info->set_metric_type("L2"); + query_info->set_search_params(R"({"nprobe": 10})"); + auto plan_str = plan_node.SerializeAsString(); + + int num_queries = 10; + auto blob = generate_query_data(num_queries); + + void* plan = nullptr; + status = CreateSearchPlanByExpr( + c_collection, plan_str.data(), plan_str.size(), &plan); + ASSERT_EQ(status.error_code, Success); + + void* placeholderGroup = nullptr; + status = ParsePlaceholderGroup( + plan, blob.data(), blob.length(), &placeholderGroup); + ASSERT_EQ(status.error_code, Success); + + std::vector placeholderGroups; + placeholderGroups.push_back(placeholderGroup); + + CSearchResult search_result; + auto res = CSearch(segment, plan, placeholderGroup, {}, &search_result); + ASSERT_EQ(res.error_code, Success); + + 
CSearchResult search_result2; + auto res2 = CSearch(segment, plan, placeholderGroup, {}, &search_result2); + ASSERT_EQ(res2.error_code, Success); + + DeleteSearchPlan(plan); + DeletePlaceholderGroup(placeholderGroup); + DeleteSearchResult(search_result); + DeleteSearchResult(search_result2); + DeleteCollection(c_collection); + DeleteSegment(segment); +} + TEST(CApiTest, InsertSamePkAfterDeleteOnGrowingSegment) { auto collection = NewCollection(get_default_schema_config()); CSegmentInterface segment; @@ -4238,8 +4306,9 @@ TEST(CApiTest, SealedSegment_Update_Field_Size) { TEST(CApiTest, GrowingSegment_Load_Field_Data) { auto schema = std::make_shared(); - schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64); - schema->AddField(FieldName("Timestamp"), FieldId(1), DataType::INT64); + schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false); + schema->AddField( + FieldName("Timestamp"), FieldId(1), DataType::INT64, false); auto str_fid = schema->AddDebugField("string", DataType::VARCHAR); auto vec_fid = schema->AddDebugField( "vector_float", DataType::VECTOR_FLOAT, DIM, "L2"); diff --git a/internal/core/unittest/test_chunk_cache.cpp b/internal/core/unittest/test_chunk_cache.cpp index ee161cfa79f8e..6382430439bb5 100644 --- a/internal/core/unittest/test_chunk_cache.cpp +++ b/internal/core/unittest/test_chunk_cache.cpp @@ -66,7 +66,8 @@ TEST_F(ChunkCacheTest, Read) { fake_id, milvus::DataType::VECTOR_FLOAT, dim, - metric_type); + metric_type, + false); auto lcm = milvus::storage::LocalChunkManagerSingleton::GetInstance() .GetChunkManager(); @@ -114,7 +115,8 @@ TEST_F(ChunkCacheTest, TestMultithreads) { fake_id, milvus::DataType::VECTOR_FLOAT, dim, - metric_type); + metric_type, + false); auto lcm = milvus::storage::LocalChunkManagerSingleton::GetInstance() .GetChunkManager(); diff --git a/internal/core/unittest/test_data_codec.cpp b/internal/core/unittest/test_data_codec.cpp index 0a4e7b36ff657..716f3bb57499e 100644 --- 
a/internal/core/unittest/test_data_codec.cpp +++ b/internal/core/unittest/test_data_codec.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include +#include #include "storage/DataCodec.h" #include "storage/InsertData.h" @@ -22,6 +23,7 @@ #include "storage/Util.h" #include "common/Consts.h" #include "common/Json.h" +#include #include "test_utils/Constants.h" #include "test_utils/DataGen.h" @@ -29,7 +31,8 @@ using namespace milvus; TEST(storage, InsertDataBool) { FixedVector data = {true, false, true, false, true}; - auto field_data = milvus::storage::CreateFieldData(storage::DataType::BOOL); + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::BOOL, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -48,14 +51,51 @@ TEST(storage, InsertDataBool) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::BOOL); ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataBoolNullable) { + FixedVector data = {true, false, false, false, true}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::BOOL, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + 
serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::BOOL); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 2); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + // valid_data is 0001 0011, read from LSB, '1' means the according index is valid + ASSERT_EQ(data[0], new_data[0]); + ASSERT_EQ(data[1], new_data[1]); + ASSERT_EQ(data[4], new_data[4]); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataInt8) { FixedVector data = {1, 2, 3, 4, 5}; - auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8); + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT8, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -63,6 +103,35 @@ TEST(storage, InsertDataInt8) { insert_data.SetFieldDataMeta(field_data_meta); insert_data.SetTimestamps(0, 100); + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT8); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); + FixedVector new_data(data.size()); + memcpy(new_data.data(), 
new_payload->Data(), new_payload->DataSize()); + ASSERT_EQ(data, new_data); +} + +TEST(storage, InsertDataInt8Nullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT8, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); std::shared_ptr serialized_data_ptr(serialized_bytes.data(), [&](uint8_t*) {}); @@ -75,14 +144,18 @@ TEST(storage, InsertDataInt8) { ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT8); ASSERT_EQ(new_payload->get_num_rows(), data.size()); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; } TEST(storage, InsertDataInt16) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::INT16); + milvus::storage::CreateFieldData(storage::DataType::INT16, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -101,15 +174,48 @@ TEST(storage, InsertDataInt16) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT16); ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), 
new_payload->DataSize()); ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataInt16Nullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT16, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT16); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; + ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataInt32) { FixedVector data = {true, false, true, false, true}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::INT32); + milvus::storage::CreateFieldData(storage::DataType::INT32, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -128,15 +234,48 @@ TEST(storage, InsertDataInt32) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT32); ASSERT_EQ(new_payload->get_num_rows(), data.size()); + 
ASSERT_EQ(new_payload->get_null_count(), 0); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataInt32Nullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT32, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT32); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; + ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataInt64) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::INT64); + milvus::storage::CreateFieldData(storage::DataType::INT64, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -155,16 +294,49 @@ TEST(storage, 
InsertDataInt64) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT64); ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataInt64Nullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT64, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT64); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; + ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataString) { FixedVector data = { "test1", "test2", "test3", "test4", "test5"}; auto field_data = - 
milvus::storage::CreateFieldData(storage::DataType::VARCHAR); + milvus::storage::CreateFieldData(storage::DataType::VARCHAR, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -184,18 +356,56 @@ TEST(storage, InsertDataString) { ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VARCHAR); ASSERT_EQ(new_payload->get_num_rows(), data.size()); FixedVector new_data(data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); for (int i = 0; i < data.size(); ++i) { new_data[i] = *static_cast(new_payload->RawValue(i)); - ASSERT_EQ(new_payload->Size(i), data[i].size()); + ASSERT_EQ(new_payload->DataSize(i), data[i].size()); } ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataStringNullable) { + FixedVector data = { + "test1", "test2", "test3", "test4", "test5"}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::STRING, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::STRING); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {"test1", "test2", "", "", 
"test5"}; + for (int i = 0; i < data.size(); ++i) { + new_data[i] = + *static_cast(new_payload->RawValue(i)); + ASSERT_EQ(new_payload->DataSize(i), data[i].size()); + } + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataFloat) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::FLOAT); + milvus::storage::CreateFieldData(storage::DataType::FLOAT, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -203,6 +413,35 @@ TEST(storage, InsertDataFloat) { insert_data.SetFieldDataMeta(field_data_meta); insert_data.SetTimestamps(0, 100); + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::FLOAT); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + ASSERT_EQ(data, new_data); +} + +TEST(storage, InsertDataFloatNullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::FLOAT, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + 
insert_data.SetTimestamps(0, 100); + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); std::shared_ptr serialized_data_ptr(serialized_bytes.data(), [&](uint8_t*) {}); @@ -215,14 +454,17 @@ TEST(storage, InsertDataFloat) { ASSERT_EQ(new_payload->get_data_type(), storage::DataType::FLOAT); ASSERT_EQ(new_payload->get_num_rows(), data.size()); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); } TEST(storage, InsertDataDouble) { FixedVector data = {1.0, 2.0, 3.0, 4.2, 5.3}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::DOUBLE); + milvus::storage::CreateFieldData(storage::DataType::DOUBLE, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -241,16 +483,49 @@ TEST(storage, InsertDataDouble) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::DOUBLE); ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 0); FixedVector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); ASSERT_EQ(data, new_data); } +TEST(storage, InsertDataDoubleNullable) { + FixedVector data = {1, 2, 3, 4, 5}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::DOUBLE, true); + uint8_t* valid_data = new uint8_t[1]{0x13}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + 
auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::DOUBLE); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + FixedVector new_data(data.size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); + data = {1, 2, 0, 0, 5}; + ASSERT_EQ(data, new_data); + ASSERT_EQ(new_payload->get_null_count(), 2); + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} + TEST(storage, InsertDataFloatVector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; int DIM = 2; - auto field_data = - milvus::storage::CreateFieldData(storage::DataType::VECTOR_FLOAT, DIM); + auto field_data = milvus::storage::CreateFieldData( + storage::DataType::VECTOR_FLOAT, false, DIM); field_data->FillFieldData(data.data(), data.size() / DIM); storage::InsertData insert_data(field_data); @@ -269,6 +544,7 @@ TEST(storage, InsertDataFloatVector) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_FLOAT); ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM); + ASSERT_EQ(new_payload->get_null_count(), 0); std::vector new_data(data.size()); memcpy(new_data.data(), new_payload->Data(), @@ -281,7 +557,7 @@ TEST(storage, InsertDataSparseFloat) { auto vecs = milvus::segcore::GenerateRandomSparseFloatVector( n_rows, kTestSparseDim, kTestSparseVectorDensity); auto field_data = milvus::storage::CreateFieldData( - storage::DataType::VECTOR_SPARSE_FLOAT, kTestSparseDim, n_rows); + 
storage::DataType::VECTOR_SPARSE_FLOAT, false, kTestSparseDim, n_rows); field_data->FillFieldData(vecs.get(), n_rows); storage::InsertData insert_data(field_data); @@ -301,6 +577,7 @@ TEST(storage, InsertDataSparseFloat) { ASSERT_TRUE(new_payload->get_data_type() == storage::DataType::VECTOR_SPARSE_FLOAT); ASSERT_EQ(new_payload->get_num_rows(), n_rows); + ASSERT_EQ(new_payload->get_null_count(), 0); auto new_data = static_cast*>( new_payload->Data()); @@ -318,8 +595,8 @@ TEST(storage, InsertDataSparseFloat) { TEST(storage, InsertDataBinaryVector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; int DIM = 16; - auto field_data = - milvus::storage::CreateFieldData(storage::DataType::VECTOR_BINARY, DIM); + auto field_data = milvus::storage::CreateFieldData( + storage::DataType::VECTOR_BINARY, false, DIM); field_data->FillFieldData(data.data(), data.size() * 8 / DIM); storage::InsertData insert_data(field_data); @@ -338,8 +615,9 @@ TEST(storage, InsertDataBinaryVector) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_BINARY); ASSERT_EQ(new_payload->get_num_rows(), data.size() * 8 / DIM); + ASSERT_EQ(new_payload->get_null_count(), 0); std::vector new_data(data.size()); - memcpy(new_data.data(), new_payload->Data(), new_payload->Size()); + memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize()); ASSERT_EQ(data, new_data); } @@ -347,7 +625,7 @@ TEST(storage, InsertDataFloat16Vector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; int DIM = 2; auto field_data = milvus::storage::CreateFieldData( - storage::DataType::VECTOR_FLOAT16, DIM); + storage::DataType::VECTOR_FLOAT16, false, DIM); field_data->FillFieldData(data.data(), data.size() / DIM); storage::InsertData insert_data(field_data); @@ -366,6 +644,7 @@ TEST(storage, InsertDataFloat16Vector) { auto new_payload = new_insert_data->GetFieldData(); ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_FLOAT16); 
ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM); + ASSERT_EQ(new_payload->get_null_count(), 0); std::vector new_data(data.size()); memcpy(new_data.data(), new_payload->Data(), @@ -373,39 +652,10 @@ TEST(storage, InsertDataFloat16Vector) { ASSERT_EQ(data, new_data); } -TEST(storage, InsertDataBFloat16Vector) { - std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; - int DIM = 2; - auto field_data = milvus::storage::CreateFieldData( - storage::DataType::VECTOR_BFLOAT16, DIM); - field_data->FillFieldData(data.data(), data.size() / DIM); - - storage::InsertData insert_data(field_data); - storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; - insert_data.SetFieldDataMeta(field_data_meta); - insert_data.SetTimestamps(0, 100); - - auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); - std::shared_ptr serialized_data_ptr(serialized_bytes.data(), - [&](uint8_t*) {}); - auto new_insert_data = storage::DeserializeFileData( - serialized_data_ptr, serialized_bytes.size()); - ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); - ASSERT_EQ(new_insert_data->GetTimeRage(), - std::make_pair(Timestamp(0), Timestamp(100))); - auto new_payload = new_insert_data->GetFieldData(); - ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_BFLOAT16); - ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM); - std::vector new_data(data.size()); - memcpy(new_data.data(), - new_payload->Data(), - new_payload->get_num_rows() * sizeof(bfloat16) * DIM); - ASSERT_EQ(data, new_data); -} - TEST(storage, IndexData) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; - auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8); + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::INT8, false); field_data->FillFieldData(data.data(), data.size()); storage::IndexData index_data(field_data); @@ -427,7 +677,7 @@ TEST(storage, IndexData) { ASSERT_EQ(new_field_data->get_data_type(), storage::DataType::INT8); 
ASSERT_EQ(new_field_data->Size(), data.size()); std::vector new_data(data.size()); - memcpy(new_data.data(), new_field_data->Data(), new_field_data->Size()); + memcpy(new_data.data(), new_field_data->Data(), new_field_data->DataSize()); ASSERT_EQ(data, new_data); } @@ -441,7 +691,7 @@ TEST(storage, InsertDataStringArray) { auto string_array = Array(field_string_data); FixedVector data = {string_array}; auto field_data = - milvus::storage::CreateFieldData(storage::DataType::ARRAY); + milvus::storage::CreateFieldData(storage::DataType::ARRAY, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); @@ -463,7 +713,56 @@ TEST(storage, InsertDataStringArray) { FixedVector new_data(data.size()); for (int i = 0; i < data.size(); ++i) { new_data[i] = *static_cast(new_payload->RawValue(i)); - ASSERT_EQ(new_payload->Size(i), data[i].byte_size()); + ASSERT_EQ(new_payload->DataSize(i), data[i].byte_size()); ASSERT_TRUE(data[i].operator==(new_data[i])); } } + +TEST(storage, InsertDataStringArrayNullable) { + milvus::proto::schema::ScalarField field_string_data; + field_string_data.mutable_string_data()->add_data("test_array1"); + field_string_data.mutable_string_data()->add_data("test_array2"); + field_string_data.mutable_string_data()->add_data("test_array3"); + field_string_data.mutable_string_data()->add_data("test_array4"); + field_string_data.mutable_string_data()->add_data("test_array5"); + auto string_array = Array(field_string_data); + milvus::proto::schema::ScalarField field_int_data; + field_string_data.mutable_int_data()->add_data(1); + field_string_data.mutable_int_data()->add_data(2); + field_string_data.mutable_int_data()->add_data(3); + field_string_data.mutable_int_data()->add_data(4); + field_string_data.mutable_int_data()->add_data(5); + auto int_array = Array(field_int_data); + FixedVector data = {string_array, int_array}; + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::ARRAY, true); + 
uint8_t* valid_data = new uint8_t[1]{0x01}; + field_data->FillFieldData(data.data(), valid_data, data.size()); + + storage::InsertData insert_data(field_data); + storage::FieldDataMeta field_data_meta{100, 101, 102, 103}; + insert_data.SetFieldDataMeta(field_data_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote); + std::shared_ptr serialized_data_ptr(serialized_bytes.data(), + [&](uint8_t*) {}); + auto new_insert_data = storage::DeserializeFileData( + serialized_data_ptr, serialized_bytes.size()); + ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType); + ASSERT_EQ(new_insert_data->GetTimeRage(), + std::make_pair(Timestamp(0), Timestamp(100))); + auto new_payload = new_insert_data->GetFieldData(); + ASSERT_EQ(new_payload->get_data_type(), storage::DataType::ARRAY); + ASSERT_EQ(new_payload->get_num_rows(), data.size()); + ASSERT_EQ(new_payload->get_null_count(), 1); + FixedVector expected_data = {string_array, Array()}; + FixedVector new_data(data.size()); + for (int i = 0; i < data.size(); ++i) { + new_data[i] = *static_cast(new_payload->RawValue(i)); + ASSERT_EQ(new_payload->DataSize(i), data[i].byte_size()); + ASSERT_TRUE(expected_data[i].operator==(new_data[i])); + } + ASSERT_EQ(*new_payload->ValidData(), *valid_data); + delete[] valid_data; +} diff --git a/internal/core/unittest/test_disk_file_manager_test.cpp b/internal/core/unittest/test_disk_file_manager_test.cpp index 4c5b75001106c..9f2251baa4304 100644 --- a/internal/core/unittest/test_disk_file_manager_test.cpp +++ b/internal/core/unittest/test_disk_file_manager_test.cpp @@ -104,7 +104,8 @@ TEST_F(DiskAnnFileManagerTest, AddFilePositiveParallel) { auto buf = std::unique_ptr(new uint8_t[file_size]); lcm->Read(file, buf.get(), file_size); - auto index = milvus::storage::CreateFieldData(storage::DataType::INT8); + auto index = + milvus::storage::CreateFieldData(storage::DataType::INT8, false); 
index->FillFieldData(buf.get(), file_size); auto rows = index->get_num_rows(); auto rawData = (uint8_t*)(index->Data()); @@ -268,7 +269,7 @@ auto PrepareInsertData(const int64_t opt_field_data_range) -> std::string { std::vector data = PrepareRawFieldData(opt_field_data_range); - auto field_data = storage::CreateFieldData(DT, 1, kEntityCnt); + auto field_data = storage::CreateFieldData(DT, false, 1, kEntityCnt); field_data->FillFieldData(data.data(), kEntityCnt); storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(kOptVecFieldDataMeta); diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 204adef5eef10..302f86aac1623 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -1613,7 +1613,8 @@ TEST(Expr, TestExprPerformance) { TEST_P(ExprTest, test_term_pk) { auto schema = std::make_shared(); - schema->AddField(FieldName("Timestamp"), FieldId(1), DataType::INT64); + schema->AddField( + FieldName("Timestamp"), FieldId(1), DataType::INT64, false); auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); auto int64_fid = schema->AddDebugField("int64", DataType::INT64); diff --git a/internal/core/unittest/test_group_by.cpp b/internal/core/unittest/test_group_by.cpp index af64c78a1d395..9b9fd5fa1c254 100644 --- a/internal/core/unittest/test_group_by.cpp +++ b/internal/core/unittest/test_group_by.cpp @@ -32,7 +32,7 @@ prepareSegmentSystemFieldData(const std::unique_ptr& segment, size_t row_count, GeneratedData& data_set) { auto field_data = - std::make_shared>(DataType::INT64); + std::make_shared>(DataType::INT64, false); field_data->FillFieldData(data_set.row_ids_.data(), row_count); auto field_data_info = FieldDataInfo{RowFieldID.get(), @@ -40,7 +40,8 @@ prepareSegmentSystemFieldData(const std::unique_ptr& segment, std::vector{field_data}}; segment->LoadFieldData(RowFieldID, 
field_data_info); - field_data = std::make_shared>(DataType::INT64); + field_data = + std::make_shared>(DataType::INT64, false); field_data->FillFieldData(data_set.timestamps_.data(), row_count); field_data_info = FieldDataInfo{TimestampFieldID.get(), diff --git a/internal/core/unittest/test_growing.cpp b/internal/core/unittest/test_growing.cpp index 451123940cbd2..bb17265f9885f 100644 --- a/internal/core/unittest/test_growing.cpp +++ b/internal/core/unittest/test_growing.cpp @@ -323,5 +323,176 @@ TEST_P(GrowingTest, FillData) { num_inserted); EXPECT_EQ(float_array_result->scalars().array_data().data_size(), num_inserted); + + EXPECT_EQ(bool_result->valid_data_size(), 0); + EXPECT_EQ(int8_result->valid_data_size(), 0); + EXPECT_EQ(int16_result->valid_data_size(), 0); + EXPECT_EQ(int32_result->valid_data_size(), 0); + EXPECT_EQ(int64_result->valid_data_size(), 0); + EXPECT_EQ(float_result->valid_data_size(), 0); + EXPECT_EQ(double_result->valid_data_size(), 0); + EXPECT_EQ(varchar_result->valid_data_size(), 0); + EXPECT_EQ(json_result->valid_data_size(), 0); + EXPECT_EQ(int_array_result->valid_data_size(), 0); + EXPECT_EQ(long_array_result->valid_data_size(), 0); + EXPECT_EQ(bool_array_result->valid_data_size(), 0); + EXPECT_EQ(string_array_result->valid_data_size(), 0); + EXPECT_EQ(double_array_result->valid_data_size(), 0); + EXPECT_EQ(float_array_result->valid_data_size(), 0); + } +} + +TEST(Growing, FillNullableData) { + auto schema = std::make_shared(); + auto metric_type = knowhere::metric::L2; + auto bool_field = schema->AddDebugField("bool", DataType::BOOL, true); + auto int8_field = schema->AddDebugField("int8", DataType::INT8, true); + auto int16_field = schema->AddDebugField("int16", DataType::INT16, true); + auto int32_field = schema->AddDebugField("int32", DataType::INT32, true); + auto int64_field = schema->AddDebugField("int64", DataType::INT64); + auto float_field = schema->AddDebugField("float", DataType::FLOAT, true); + auto double_field = 
schema->AddDebugField("double", DataType::DOUBLE, true); + auto varchar_field = + schema->AddDebugField("varchar", DataType::VARCHAR, true); + auto json_field = schema->AddDebugField("json", DataType::JSON, true); + auto int_array_field = schema->AddDebugField( + "int_array", DataType::ARRAY, DataType::INT8, true); + auto long_array_field = schema->AddDebugField( + "long_array", DataType::ARRAY, DataType::INT64, true); + auto bool_array_field = schema->AddDebugField( + "bool_array", DataType::ARRAY, DataType::BOOL, true); + auto string_array_field = schema->AddDebugField( + "string_array", DataType::ARRAY, DataType::VARCHAR, true); + auto double_array_field = schema->AddDebugField( + "double_array", DataType::ARRAY, DataType::DOUBLE, true); + auto float_array_field = schema->AddDebugField( + "float_array", DataType::ARRAY, DataType::FLOAT, true); + auto vec = schema->AddDebugField( + "embeddings", DataType::VECTOR_FLOAT, 128, metric_type); + schema->set_primary_field_id(int64_field); + + std::map index_params = { + {"index_type", "IVF_FLAT"}, + {"metric_type", metric_type}, + {"nlist", "128"}}; + std::map type_params = {{"dim", "128"}}; + FieldIndexMeta fieldIndexMeta( + vec, std::move(index_params), std::move(type_params)); + auto config = SegcoreConfig::default_config(); + config.set_chunk_rows(1024); + config.set_enable_interim_segment_index(true); + std::map filedMap = {{vec, fieldIndexMeta}}; + IndexMetaPtr metaPtr = + std::make_shared(100000, std::move(filedMap)); + auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config); + auto segment = dynamic_cast(segment_growing.get()); + + int64_t per_batch = 1000; + int64_t n_batch = 3; + int64_t dim = 128; + for (int64_t i = 0; i < n_batch; i++) { + auto dataset = DataGen(schema, per_batch); + auto bool_values = dataset.get_col(bool_field); + auto int8_values = dataset.get_col(int8_field); + auto int16_values = dataset.get_col(int16_field); + auto int32_values = dataset.get_col(int32_field); + auto 
int64_values = dataset.get_col(int64_field); + auto float_values = dataset.get_col(float_field); + auto double_values = dataset.get_col(double_field); + auto varchar_values = dataset.get_col(varchar_field); + auto json_values = dataset.get_col(json_field); + auto int_array_values = dataset.get_col(int_array_field); + auto long_array_values = dataset.get_col(long_array_field); + auto bool_array_values = dataset.get_col(bool_array_field); + auto string_array_values = + dataset.get_col(string_array_field); + auto double_array_values = + dataset.get_col(double_array_field); + auto float_array_values = + dataset.get_col(float_array_field); + auto vector_values = dataset.get_col(vec); + + auto offset = segment->PreInsert(per_batch); + segment->Insert(offset, + per_batch, + dataset.row_ids_.data(), + dataset.timestamps_.data(), + dataset.raw_); + auto num_inserted = (i + 1) * per_batch; + auto ids_ds = GenRandomIds(num_inserted); + auto bool_result = + segment->bulk_subscript(bool_field, ids_ds->GetIds(), num_inserted); + auto int8_result = + segment->bulk_subscript(int8_field, ids_ds->GetIds(), num_inserted); + auto int16_result = segment->bulk_subscript( + int16_field, ids_ds->GetIds(), num_inserted); + auto int32_result = segment->bulk_subscript( + int32_field, ids_ds->GetIds(), num_inserted); + auto int64_result = segment->bulk_subscript( + int64_field, ids_ds->GetIds(), num_inserted); + auto float_result = segment->bulk_subscript( + float_field, ids_ds->GetIds(), num_inserted); + auto double_result = segment->bulk_subscript( + double_field, ids_ds->GetIds(), num_inserted); + auto varchar_result = segment->bulk_subscript( + varchar_field, ids_ds->GetIds(), num_inserted); + auto json_result = + segment->bulk_subscript(json_field, ids_ds->GetIds(), num_inserted); + auto int_array_result = segment->bulk_subscript( + int_array_field, ids_ds->GetIds(), num_inserted); + auto long_array_result = segment->bulk_subscript( + long_array_field, ids_ds->GetIds(), num_inserted); + 
auto bool_array_result = segment->bulk_subscript( + bool_array_field, ids_ds->GetIds(), num_inserted); + auto string_array_result = segment->bulk_subscript( + string_array_field, ids_ds->GetIds(), num_inserted); + auto double_array_result = segment->bulk_subscript( + double_array_field, ids_ds->GetIds(), num_inserted); + auto float_array_result = segment->bulk_subscript( + float_array_field, ids_ds->GetIds(), num_inserted); + auto vec_result = + segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted); + + EXPECT_EQ(bool_result->scalars().bool_data().data_size(), num_inserted); + EXPECT_EQ(int8_result->scalars().int_data().data_size(), num_inserted); + EXPECT_EQ(int16_result->scalars().int_data().data_size(), num_inserted); + EXPECT_EQ(int32_result->scalars().int_data().data_size(), num_inserted); + EXPECT_EQ(int64_result->scalars().long_data().data_size(), + num_inserted); + EXPECT_EQ(float_result->scalars().float_data().data_size(), + num_inserted); + EXPECT_EQ(double_result->scalars().double_data().data_size(), + num_inserted); + EXPECT_EQ(varchar_result->scalars().string_data().data_size(), + num_inserted); + EXPECT_EQ(json_result->scalars().json_data().data_size(), num_inserted); + EXPECT_EQ(vec_result->vectors().float_vector().data_size(), + num_inserted * dim); + EXPECT_EQ(int_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(long_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(bool_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(string_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(double_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(float_array_result->scalars().array_data().data_size(), + num_inserted); + EXPECT_EQ(bool_result->valid_data_size(), num_inserted); + EXPECT_EQ(int8_result->valid_data_size(), num_inserted); + EXPECT_EQ(int16_result->valid_data_size(), num_inserted); + 
EXPECT_EQ(int32_result->valid_data_size(), num_inserted); + EXPECT_EQ(float_result->valid_data_size(), num_inserted); + EXPECT_EQ(double_result->valid_data_size(), num_inserted); + EXPECT_EQ(varchar_result->valid_data_size(), num_inserted); + EXPECT_EQ(json_result->valid_data_size(), num_inserted); + EXPECT_EQ(int_array_result->valid_data_size(), num_inserted); + EXPECT_EQ(long_array_result->valid_data_size(), num_inserted); + EXPECT_EQ(bool_array_result->valid_data_size(), num_inserted); + EXPECT_EQ(string_array_result->valid_data_size(), num_inserted); + EXPECT_EQ(double_array_result->valid_data_size(), num_inserted); + EXPECT_EQ(float_array_result->valid_data_size(), num_inserted); } } diff --git a/internal/core/unittest/test_growing_index.cpp b/internal/core/unittest/test_growing_index.cpp index 7d619182b650d..eb8edcfde683d 100644 --- a/internal/core/unittest/test_growing_index.cpp +++ b/internal/core/unittest/test_growing_index.cpp @@ -150,10 +150,10 @@ TEST_P(GrowingIndexTest, Correctness) { const VectorBase* field_data = nullptr; if (is_sparse) { field_data = segmentImplPtr->get_insert_record() - .get_field_data(vec); + .get_data(vec); } else { field_data = segmentImplPtr->get_insert_record() - .get_field_data(vec); + .get_data(vec); } auto inserted = (i + 1) * per_batch; diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp index 83d3a65673174..d0cbd0e80c7de 100644 --- a/internal/core/unittest/test_inverted_index.cpp +++ b/internal/core/unittest/test_inverted_index.cpp @@ -330,7 +330,7 @@ test_string() { data.push_back(std::to_string(rand())); } - auto field_data = storage::CreateFieldData(dtype); + auto field_data = storage::CreateFieldData(dtype, false); field_data->FillFieldData(data.data(), data.size()); storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); diff --git a/internal/core/unittest/test_kmeans_clustering.cpp 
b/internal/core/unittest/test_kmeans_clustering.cpp index e51c5048cf6d5..b81b8c3452b71 100644 --- a/internal/core/unittest/test_kmeans_clustering.cpp +++ b/internal/core/unittest/test_kmeans_clustering.cpp @@ -194,7 +194,7 @@ test_run() { for (int64_t i = 0; i < nb * dim; ++i) { data_gen[i] = rand(); } - auto field_data = storage::CreateFieldData(dtype, dim); + auto field_data = storage::CreateFieldData(dtype, false, dim); field_data->FillFieldData(data_gen.data(), data_gen.size() / dim); storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); diff --git a/internal/core/unittest/test_query.cpp b/internal/core/unittest/test_query.cpp index 81abab1586b1e..ff5b8a5b48576 100644 --- a/internal/core/unittest/test_query.cpp +++ b/internal/core/unittest/test_query.cpp @@ -555,6 +555,7 @@ TEST(Query, FillSegment) { { auto field = proto.add_fields(); field->set_name("fakevec"); + field->set_nullable(false); field->set_is_primary_key(false); field->set_description("asdgfsagf"); field->set_fieldid(100); @@ -570,6 +571,7 @@ TEST(Query, FillSegment) { { auto field = proto.add_fields(); field->set_name("the_key"); + field->set_nullable(false); field->set_fieldid(101); field->set_is_primary_key(true); field->set_description("asdgfsagf"); @@ -579,6 +581,7 @@ TEST(Query, FillSegment) { { auto field = proto.add_fields(); field->set_name("the_value"); + field->set_nullable(true); field->set_fieldid(102); field->set_is_primary_key(false); field->set_description("asdgfsagf"); @@ -595,6 +598,7 @@ TEST(Query, FillSegment) { dataset.get_col(FieldId(100)); // vector field const auto std_i32_vec = dataset.get_col(FieldId(102)); // scalar field + const auto i32_vec_valid_data = dataset.get_col_valid(FieldId(102)); std::vector> segments; segments.emplace_back([&] { @@ -659,6 +663,8 @@ TEST(Query, FillSegment) { auto output_i32_field_data = fields_data.at(i32_field_id)->scalars().int_data().data(); ASSERT_EQ(output_i32_field_data.size(), topk * num_queries); + 
auto output_i32_valid_data = fields_data.at(i32_field_id)->valid_data(); + ASSERT_EQ(output_i32_valid_data.size(), topk * num_queries); for (int i = 0; i < topk * num_queries; i++) { int64_t val = std::get(result->primary_keys_[i]); @@ -666,6 +672,7 @@ TEST(Query, FillSegment) { auto internal_offset = result->seg_offsets_[i]; auto std_val = std_vec[internal_offset]; auto std_i32 = std_i32_vec[internal_offset]; + auto std_i32_valid = i32_vec_valid_data[internal_offset]; std::vector std_vfloat(dim); std::copy_n(std_vfloat_vec.begin() + dim * internal_offset, dim, @@ -684,6 +691,10 @@ TEST(Query, FillSegment) { int i32; memcpy(&i32, &output_i32_field_data[i], sizeof(int32_t)); ASSERT_EQ(i32, std_i32); + // check int32 valid field + bool i32_valid; + memcpy(&i32_valid, &output_i32_valid_data[i], sizeof(bool)); + ASSERT_EQ(i32_valid, std_i32_valid); } } } diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index 046f9e7948f0f..f4fbec25c024a 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -862,9 +862,9 @@ TEST(Sealed, LoadScalarIndex) { LoadFieldDataInfo row_id_info; FieldMeta row_id_field_meta( - FieldName("RowID"), RowFieldID, DataType::INT64); + FieldName("RowID"), RowFieldID, DataType::INT64, false); auto field_data = - std::make_shared>(DataType::INT64); + std::make_shared>(DataType::INT64, false); field_data->FillFieldData(dataset.row_ids_.data(), N); auto field_data_info = FieldDataInfo{ RowFieldID.get(), N, std::vector{field_data}}; @@ -872,8 +872,9 @@ TEST(Sealed, LoadScalarIndex) { LoadFieldDataInfo ts_info; FieldMeta ts_field_meta( - FieldName("Timestamp"), TimestampFieldID, DataType::INT64); - field_data = std::make_shared>(DataType::INT64); + FieldName("Timestamp"), TimestampFieldID, DataType::INT64, false); + field_data = + std::make_shared>(DataType::INT64, false); field_data->FillFieldData(dataset.timestamps_.data(), N); field_data_info = FieldDataInfo{ 
TimestampFieldID.get(), N, std::vector{field_data}}; @@ -1142,7 +1143,8 @@ TEST(Sealed, BF) { SealedLoadFieldData(dataset, *segment, {fake_id.get()}); auto vec_data = GenRandomFloatVecs(N, dim); - auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim); + auto field_data = + storage::CreateFieldData(DataType::VECTOR_FLOAT, false, dim); field_data->FillFieldData(vec_data.data(), N); auto field_data_info = FieldDataInfo{fake_id.get(), N, std::vector{field_data}}; @@ -1196,7 +1198,8 @@ TEST(Sealed, BF_Overflow) { SealedLoadFieldData(dataset, *segment, {fake_id.get()}); auto vec_data = GenMaxFloatVecs(N, dim); - auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim); + auto field_data = + storage::CreateFieldData(DataType::VECTOR_FLOAT, false, dim); field_data->FillFieldData(vec_data.data(), N); auto field_data_info = FieldDataInfo{fake_id.get(), N, std::vector{field_data}}; @@ -1874,7 +1877,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { //test for int64 std::vector pks = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10); + auto pk_field_data = + storage::CreateFieldData(DataType::INT64, false, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); @@ -1915,7 +1919,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { //test for int32 std::vector int32s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12}; - auto int32_field_data = storage::CreateFieldData(DataType::INT32, 1, 10); + auto int32_field_data = + storage::CreateFieldData(DataType::INT32, false, 1, 10); int32_field_data->FillFieldData(int32s.data(), N); segment->LoadPrimitiveSkipIndex( i32_fid, 0, DataType::INT32, int32_field_data->Data(), N); @@ -1925,7 +1930,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { //test for int16 std::vector int16s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12}; - auto int16_field_data = storage::CreateFieldData(DataType::INT16, 1, 10); + auto 
int16_field_data = + storage::CreateFieldData(DataType::INT16, false, 1, 10); int16_field_data->FillFieldData(int16s.data(), N); segment->LoadPrimitiveSkipIndex( i16_fid, 0, DataType::INT16, int16_field_data->Data(), N); @@ -1935,7 +1941,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { //test for int8 std::vector int8s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12}; - auto int8_field_data = storage::CreateFieldData(DataType::INT8, 1, 10); + auto int8_field_data = + storage::CreateFieldData(DataType::INT8, false, 1, 10); int8_field_data->FillFieldData(int8s.data(), N); segment->LoadPrimitiveSkipIndex( i8_fid, 0, DataType::INT8, int8_field_data->Data(), N); @@ -1946,7 +1953,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { // test for float std::vector floats = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - auto float_field_data = storage::CreateFieldData(DataType::FLOAT, 1, 10); + auto float_field_data = + storage::CreateFieldData(DataType::FLOAT, false, 1, 10); float_field_data->FillFieldData(floats.data(), N); segment->LoadPrimitiveSkipIndex( float_fid, 0, DataType::FLOAT, float_field_data->Data(), N); @@ -1957,7 +1965,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { // test for double std::vector doubles = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - auto double_field_data = storage::CreateFieldData(DataType::DOUBLE, 1, 10); + auto double_field_data = + storage::CreateFieldData(DataType::DOUBLE, false, 1, 10); double_field_data->FillFieldData(doubles.data(), N); segment->LoadPrimitiveSkipIndex( double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N); @@ -1980,7 +1989,8 @@ TEST(Sealed, SkipIndexSkipBinaryRange) { //test for int64 std::vector pks = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10); + auto pk_field_data = + storage::CreateFieldData(DataType::INT64, false, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); 
@@ -2015,7 +2025,8 @@ TEST(Sealed, SkipIndexSkipStringRange) { //test for string std::vector strings = {"e", "f", "g", "g", "j"}; - auto string_field_data = storage::CreateFieldData(DataType::VARCHAR, 1, N); + auto string_field_data = + storage::CreateFieldData(DataType::VARCHAR, false, 1, N); string_field_data->FillFieldData(strings.data(), N); auto string_field_data_info = FieldDataInfo{ string_fid.get(), N, std::vector{string_field_data}}; @@ -2191,4 +2202,174 @@ TEST(Sealed, QueryAllFields) { dataset_size); EXPECT_EQ(float_array_result->scalars().array_data().data_size(), dataset_size); + + EXPECT_EQ(bool_result->valid_data_size(), 0); + EXPECT_EQ(int8_result->valid_data_size(), 0); + EXPECT_EQ(int16_result->valid_data_size(), 0); + EXPECT_EQ(int32_result->valid_data_size(), 0); + EXPECT_EQ(int64_result->valid_data_size(), 0); + EXPECT_EQ(float_result->valid_data_size(), 0); + EXPECT_EQ(double_result->valid_data_size(), 0); + EXPECT_EQ(varchar_result->valid_data_size(), 0); + EXPECT_EQ(json_result->valid_data_size(), 0); + EXPECT_EQ(int_array_result->valid_data_size(), 0); + EXPECT_EQ(long_array_result->valid_data_size(), 0); + EXPECT_EQ(bool_array_result->valid_data_size(), 0); + EXPECT_EQ(string_array_result->valid_data_size(), 0); + EXPECT_EQ(double_array_result->valid_data_size(), 0); + EXPECT_EQ(float_array_result->valid_data_size(), 0); +} + +TEST(Sealed, QueryAllNullableFields) { + auto schema = std::make_shared(); + auto metric_type = knowhere::metric::L2; + auto bool_field = schema->AddDebugField("bool", DataType::BOOL, true); + auto int8_field = schema->AddDebugField("int8", DataType::INT8, true); + auto int16_field = schema->AddDebugField("int16", DataType::INT16, true); + auto int32_field = schema->AddDebugField("int32", DataType::INT32, true); + auto int64_field = schema->AddDebugField("int64", DataType::INT64, false); + auto float_field = schema->AddDebugField("float", DataType::FLOAT, true); + auto double_field = schema->AddDebugField("double", 
DataType::DOUBLE, true); + auto varchar_field = + schema->AddDebugField("varchar", DataType::VARCHAR, true); + auto json_field = schema->AddDebugField("json", DataType::JSON, true); + auto int_array_field = schema->AddDebugField( + "int_array", DataType::ARRAY, DataType::INT8, true); + auto long_array_field = schema->AddDebugField( + "long_array", DataType::ARRAY, DataType::INT64, true); + auto bool_array_field = schema->AddDebugField( + "bool_array", DataType::ARRAY, DataType::BOOL, true); + auto string_array_field = schema->AddDebugField( + "string_array", DataType::ARRAY, DataType::VARCHAR, true); + auto double_array_field = schema->AddDebugField( + "double_array", DataType::ARRAY, DataType::DOUBLE, true); + auto float_array_field = schema->AddDebugField( + "float_array", DataType::ARRAY, DataType::FLOAT, true); + auto vec = schema->AddDebugField( + "embeddings", DataType::VECTOR_FLOAT, 128, metric_type); + schema->set_primary_field_id(int64_field); + + std::map index_params = { + {"index_type", "IVF_FLAT"}, + {"metric_type", metric_type}, + {"nlist", "128"}}; + std::map type_params = {{"dim", "128"}}; + FieldIndexMeta fieldIndexMeta( + vec, std::move(index_params), std::move(type_params)); + std::map filedMap = {{vec, fieldIndexMeta}}; + IndexMetaPtr metaPtr = + std::make_shared(100000, std::move(filedMap)); + auto segment_sealed = CreateSealedSegment(schema, metaPtr); + auto segment = dynamic_cast(segment_sealed.get()); + + int64_t dataset_size = 1000; + int64_t dim = 128; + auto dataset = DataGen(schema, dataset_size); + SealedLoadFieldData(dataset, *segment); + + auto bool_values = dataset.get_col(bool_field); + auto int8_values = dataset.get_col(int8_field); + auto int16_values = dataset.get_col(int16_field); + auto int32_values = dataset.get_col(int32_field); + auto int64_values = dataset.get_col(int64_field); + auto float_values = dataset.get_col(float_field); + auto double_values = dataset.get_col(double_field); + auto varchar_values = 
dataset.get_col(varchar_field); + auto json_values = dataset.get_col(json_field); + auto int_array_values = dataset.get_col(int_array_field); + auto long_array_values = dataset.get_col(long_array_field); + auto bool_array_values = dataset.get_col(bool_array_field); + auto string_array_values = dataset.get_col(string_array_field); + auto double_array_values = dataset.get_col(double_array_field); + auto float_array_values = dataset.get_col(float_array_field); + auto vector_values = dataset.get_col(vec); + + auto bool_valid_values = dataset.get_col_valid(bool_field); + auto int8_valid_values = dataset.get_col_valid(int8_field); + auto int16_valid_values = dataset.get_col_valid(int16_field); + auto int32_valid_values = dataset.get_col_valid(int32_field); + auto float_valid_values = dataset.get_col_valid(float_field); + auto double_valid_values = dataset.get_col_valid(double_field); + auto varchar_valid_values = dataset.get_col_valid(varchar_field); + auto json_valid_values = dataset.get_col_valid(json_field); + auto int_array_valid_values = dataset.get_col_valid(int_array_field); + auto long_array_valid_values = dataset.get_col_valid(long_array_field); + auto bool_array_valid_values = dataset.get_col_valid(bool_array_field); + auto string_array_valid_values = dataset.get_col_valid(string_array_field); + auto double_array_valid_values = dataset.get_col_valid(double_array_field); + auto float_array_valid_values = dataset.get_col_valid(float_array_field); + + auto ids_ds = GenRandomIds(dataset_size); + auto bool_result = + segment->bulk_subscript(bool_field, ids_ds->GetIds(), dataset_size); + auto int8_result = + segment->bulk_subscript(int8_field, ids_ds->GetIds(), dataset_size); + auto int16_result = + segment->bulk_subscript(int16_field, ids_ds->GetIds(), dataset_size); + auto int32_result = + segment->bulk_subscript(int32_field, ids_ds->GetIds(), dataset_size); + auto int64_result = + segment->bulk_subscript(int64_field, ids_ds->GetIds(), dataset_size); + auto 
float_result = + segment->bulk_subscript(float_field, ids_ds->GetIds(), dataset_size); + auto double_result = + segment->bulk_subscript(double_field, ids_ds->GetIds(), dataset_size); + auto varchar_result = + segment->bulk_subscript(varchar_field, ids_ds->GetIds(), dataset_size); + auto json_result = + segment->bulk_subscript(json_field, ids_ds->GetIds(), dataset_size); + auto int_array_result = segment->bulk_subscript( + int_array_field, ids_ds->GetIds(), dataset_size); + auto long_array_result = segment->bulk_subscript( + long_array_field, ids_ds->GetIds(), dataset_size); + auto bool_array_result = segment->bulk_subscript( + bool_array_field, ids_ds->GetIds(), dataset_size); + auto string_array_result = segment->bulk_subscript( + string_array_field, ids_ds->GetIds(), dataset_size); + auto double_array_result = segment->bulk_subscript( + double_array_field, ids_ds->GetIds(), dataset_size); + auto float_array_result = segment->bulk_subscript( + float_array_field, ids_ds->GetIds(), dataset_size); + auto vec_result = + segment->bulk_subscript(vec, ids_ds->GetIds(), dataset_size); + + EXPECT_EQ(bool_result->scalars().bool_data().data_size(), dataset_size); + EXPECT_EQ(int8_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int16_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int32_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int64_result->scalars().long_data().data_size(), dataset_size); + EXPECT_EQ(float_result->scalars().float_data().data_size(), dataset_size); + EXPECT_EQ(double_result->scalars().double_data().data_size(), dataset_size); + EXPECT_EQ(varchar_result->scalars().string_data().data_size(), + dataset_size); + EXPECT_EQ(json_result->scalars().json_data().data_size(), dataset_size); + EXPECT_EQ(vec_result->vectors().float_vector().data_size(), + dataset_size * dim); + EXPECT_EQ(int_array_result->scalars().array_data().data_size(), + dataset_size); + 
EXPECT_EQ(long_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(bool_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(string_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(double_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(float_array_result->scalars().array_data().data_size(), + dataset_size); + + EXPECT_EQ(bool_result->valid_data_size(), dataset_size); + EXPECT_EQ(int8_result->valid_data_size(), dataset_size); + EXPECT_EQ(int16_result->valid_data_size(), dataset_size); + EXPECT_EQ(int32_result->valid_data_size(), dataset_size); + EXPECT_EQ(float_result->valid_data_size(), dataset_size); + EXPECT_EQ(double_result->valid_data_size(), dataset_size); + EXPECT_EQ(varchar_result->valid_data_size(), dataset_size); + EXPECT_EQ(json_result->valid_data_size(), dataset_size); + EXPECT_EQ(int_array_result->valid_data_size(), dataset_size); + EXPECT_EQ(long_array_result->valid_data_size(), dataset_size); + EXPECT_EQ(bool_array_result->valid_data_size(), dataset_size); + EXPECT_EQ(string_array_result->valid_data_size(), dataset_size); + EXPECT_EQ(double_array_result->valid_data_size(), dataset_size); + EXPECT_EQ(float_array_result->valid_data_size(), dataset_size); } diff --git a/internal/core/unittest/test_utils.cpp b/internal/core/unittest/test_utils.cpp index e75a45d810051..58d8de7cf308d 100644 --- a/internal/core/unittest/test_utils.cpp +++ b/internal/core/unittest/test_utils.cpp @@ -148,7 +148,7 @@ TEST(Util, get_common_prefix) { EXPECT_STREQ(common_prefix.c_str(), ""); } -TEST(Util, dis_closer){ +TEST(Util, dis_closer) { EXPECT_TRUE(milvus::query::dis_closer(0.1, 0.2, "L2")); EXPECT_FALSE(milvus::query::dis_closer(0.2, 0.1, "L2")); EXPECT_FALSE(milvus::query::dis_closer(0.1, 0.1, "L2")); diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index 48047e5e0400c..842529a492cb8 100644 --- 
a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -215,6 +215,21 @@ struct GeneratedData { return std::move(ret); } + FixedVector + get_col_valid(FieldId field_id) const { + for (const auto& target_field_data : raw_->fields_data()) { + if (field_id.get() == target_field_data.field_id()) { + auto& field_meta = schema_->operator[](field_id); + Assert(field_meta.is_nullable()); + FixedVector ret(raw_->num_rows()); + auto src_data = target_field_data.valid_data().data(); + std::copy_n(src_data, raw_->num_rows(), ret.data()); + return ret; + } + } + PanicInfo(FieldIDInvalid, "field id not find"); + } + std::unique_ptr get_col(FieldId field_id) const { for (const auto& target_field_data : raw_->fields_data()) { @@ -318,8 +333,14 @@ inline GeneratedData DataGen(SchemaPtr schema, auto insert_data = std::make_unique(); auto insert_cols = [&insert_data]( auto& data, int64_t count, auto& field_meta) { + FixedVector valid_data(count); + if (field_meta.is_nullable()) { + for (int i = 0; i < count; ++i) { + valid_data[i] = i % 2 == 0 ? 
true : false; + } + } auto array = milvus::segcore::CreateDataArrayFrom( - data.data(), count, field_meta); + data.data(), valid_data.data(), count, field_meta); insert_data->mutable_fields_data()->AddAllocated(array.release()); }; @@ -378,7 +399,7 @@ inline GeneratedData DataGen(SchemaPtr schema, auto res = GenerateRandomSparseFloatVector( N, kTestSparseDim, kTestSparseVectorDensity, seed); auto array = milvus::segcore::CreateDataArrayFrom( - res.get(), N, field_meta); + res.get(), nullptr, N, field_meta); insert_data->mutable_fields_data()->AddAllocated( array.release()); break; @@ -647,7 +668,7 @@ DataGenForJsonArray(SchemaPtr schema, auto insert_cols = [&insert_data]( auto& data, int64_t count, auto& field_meta) { auto array = milvus::segcore::CreateDataArrayFrom( - data.data(), count, field_meta); + data.data(), nullptr, count, field_meta); insert_data->mutable_fields_data()->AddAllocated(array.release()); }; for (auto field_id : schema->get_field_ids()) { @@ -953,9 +974,30 @@ CreateFieldDataFromDataArray(ssize_t raw_count, auto createFieldData = [&field_data, &raw_count](const void* raw_data, DataType data_type, int64_t dim) { - field_data = storage::CreateFieldData(data_type, dim); + field_data = storage::CreateFieldData(data_type, false, dim); field_data->FillFieldData(raw_data, raw_count); }; + auto createNullableFieldData = [&field_data, &raw_count]( + const void* raw_data, + const bool* raw_valid_data, + DataType data_type, + int64_t dim) { + field_data = storage::CreateFieldData(data_type, true, dim); + int byteSize = (raw_count + 7) / 8; + uint8_t* valid_data = new uint8_t[byteSize]; + for (int i = 0; i < raw_count; i++) { + bool value = raw_valid_data[i]; + int byteIndex = i / 8; + int bitIndex = i % 8; + if (value) { + valid_data[byteIndex] |= (1 << bitIndex); + } else { + valid_data[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(raw_data, valid_data, raw_count); + delete[] valid_data; + }; if (field_meta.is_vector()) { switch 
(field_meta.get_data_type()) { @@ -998,48 +1040,98 @@ CreateFieldDataFromDataArray(ssize_t raw_count, switch (field_meta.get_data_type()) { case DataType::BOOL: { auto raw_data = data->scalars().bool_data().data().data(); - createFieldData(raw_data, DataType::BOOL, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + raw_data, raw_valid_data, DataType::BOOL, dim); + } else { + createFieldData(raw_data, DataType::BOOL, dim); + } break; } case DataType::INT8: { auto src_data = data->scalars().int_data().data(); std::vector data_raw(src_data.size()); std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - createFieldData(data_raw.data(), DataType::INT8, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + data_raw.data(), raw_valid_data, DataType::INT8, dim); + } else { + createFieldData(data_raw.data(), DataType::INT8, dim); + } break; } case DataType::INT16: { auto src_data = data->scalars().int_data().data(); std::vector data_raw(src_data.size()); std::copy_n(src_data.data(), src_data.size(), data_raw.data()); - createFieldData(data_raw.data(), DataType::INT16, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + data_raw.data(), raw_valid_data, DataType::INT16, dim); + } else { + createFieldData(data_raw.data(), DataType::INT16, dim); + } break; } case DataType::INT32: { auto raw_data = data->scalars().int_data().data().data(); - createFieldData(raw_data, DataType::INT32, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + raw_data, raw_valid_data, DataType::INT32, dim); + } else { + createFieldData(raw_data, DataType::INT32, dim); + } break; } case DataType::INT64: { auto raw_data = data->scalars().long_data().data().data(); - createFieldData(raw_data, DataType::INT64, dim); + if 
(field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + raw_data, raw_valid_data, DataType::INT64, dim); + } else { + createFieldData(raw_data, DataType::INT64, dim); + } break; } case DataType::FLOAT: { auto raw_data = data->scalars().float_data().data().data(); - createFieldData(raw_data, DataType::FLOAT, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + raw_data, raw_valid_data, DataType::FLOAT, dim); + } else { + createFieldData(raw_data, DataType::FLOAT, dim); + } break; } case DataType::DOUBLE: { auto raw_data = data->scalars().double_data().data().data(); - createFieldData(raw_data, DataType::DOUBLE, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + raw_data, raw_valid_data, DataType::DOUBLE, dim); + } else { + createFieldData(raw_data, DataType::DOUBLE, dim); + } break; } case DataType::VARCHAR: { auto begin = data->scalars().string_data().data().begin(); auto end = data->scalars().string_data().data().end(); std::vector data_raw(begin, end); - createFieldData(data_raw.data(), DataType::VARCHAR, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData(data_raw.data(), + raw_valid_data, + DataType::VARCHAR, + dim); + } else { + createFieldData(data_raw.data(), DataType::VARCHAR, dim); + } break; } case DataType::JSON: { @@ -1049,7 +1141,13 @@ CreateFieldDataFromDataArray(ssize_t raw_count, auto str = src_data.Get(i); data_raw[i] = Json(simdjson::padded_string(str)); } - createFieldData(data_raw.data(), DataType::JSON, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + data_raw.data(), raw_valid_data, DataType::JSON, dim); + } else { + createFieldData(data_raw.data(), DataType::JSON, dim); + } break; } case DataType::ARRAY: { @@ -1058,7 
+1156,13 @@ CreateFieldDataFromDataArray(ssize_t raw_count, for (int i = 0; i < src_data.size(); i++) { data_raw[i] = Array(src_data.at(i)); } - createFieldData(data_raw.data(), DataType::ARRAY, dim); + if (field_meta.is_nullable()) { + auto raw_valid_data = data->valid_data().data(); + createNullableFieldData( + data_raw.data(), raw_valid_data, DataType::ARRAY, dim); + } else { + createFieldData(data_raw.data(), DataType::ARRAY, dim); + } break; } default: { @@ -1077,8 +1181,8 @@ SealedLoadFieldData(const GeneratedData& dataset, bool with_mmap = false) { auto row_count = dataset.row_ids_.size(); { - auto field_data = - std::make_shared>(DataType::INT64); + auto field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.row_ids_.data(), row_count); auto field_data_info = FieldDataInfo(RowFieldID.get(), @@ -1087,8 +1191,8 @@ SealedLoadFieldData(const GeneratedData& dataset, seg.LoadFieldData(RowFieldID, field_data_info); } { - auto field_data = - std::make_shared>(DataType::INT64); + auto field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.timestamps_.data(), row_count); auto field_data_info = FieldDataInfo(TimestampFieldID.get(), diff --git a/internal/core/unittest/test_utils/c_api_test_utils.h b/internal/core/unittest/test_utils/c_api_test_utils.h index cf5eb02eb8a3c..b500b635a56b5 100644 --- a/internal/core/unittest/test_utils/c_api_test_utils.h +++ b/internal/core/unittest/test_utils/c_api_test_utils.h @@ -119,29 +119,62 @@ CheckSearchResultDuplicate(const std::vector& results, const char* get_default_schema_config() { static std::string conf = R"(name: "default-collection" - fields: < - fieldID: 100 - name: "fakevec" - data_type: FloatVector - type_params: < - key: "dim" - value: "16" - > - index_params: < - key: "metric_type" - value: "L2" - > - > - fields: < - fieldID: 101 - name: "age" - data_type: Int64 - is_primary_key: true - >)"; + fields: < + fieldID: 100 + name: "fakevec" + 
data_type: FloatVector + type_params: < + key: "dim" + value: "16" + > + index_params: < + key: "metric_type" + value: "L2" + > + > + fields: < + fieldID: 101 + name: "age" + data_type: Int64 + is_primary_key: true + >)"; static std::string fake_conf = ""; return conf.c_str(); } +const char* +get_default_schema_config_nullable() { + static std::string conf = R"(name: "default-collection" + fields: < + fieldID: 100 + name: "fakevec" + data_type: FloatVector + type_params: < + key: "dim" + value: "16" + > + index_params: < + key: "metric_type" + value: "L2" + > + > + fields: < + fieldID: 101 + name: "age" + data_type: Int64 + is_primary_key: true + > + fields: < + fieldID: 102 + name: "nullable" + data_type: Int32 + nullable:true + >)"; + static std::string fake_conf = ""; + return conf.c_str(); +} + + CStatus CSearch(CSegmentInterface c_segment, CSearchPlan c_plan, diff --git a/internal/core/unittest/test_utils/storage_test_utils.h b/internal/core/unittest/test_utils/storage_test_utils.h index 05f6e864ec66e..688be00ac8281 100644 --- a/internal/core/unittest/test_utils/storage_test_utils.h +++ b/internal/core/unittest/test_utils/storage_test_utils.h @@ -89,15 +89,15 @@ PrepareInsertBinlog(int64_t collection_id, }; { - auto field_data = - std::make_shared>(DataType::INT64); + auto field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.row_ids_.data(), row_count); auto path = prefix + "/" + std::to_string(RowFieldID.get()); SaveFieldData(field_data, path, RowFieldID.get()); } { - auto field_data = - std::make_shared>(DataType::INT64); + auto field_data = std::make_shared>( + DataType::INT64, false); field_data->FillFieldData(dataset.timestamps_.data(), row_count); auto path = prefix + "/" + std::to_string(TimestampFieldID.get()); SaveFieldData(field_data, path, TimestampFieldID.get()); diff --git a/internal/storage/index_data_codec.go b/internal/storage/index_data_codec.go index a3dba549e06fd..5baf695283288 100644 --- 
a/internal/storage/index_data_codec.go +++ b/internal/storage/index_data_codec.go @@ -221,7 +221,7 @@ func (codec *IndexFileBinlogCodec) DeserializeImpl(blobs []*Blob) ( switch dataType { // just for backward compatibility case schemapb.DataType_Int8: - // todo: smellthemoon, valid_data may need to check when create index + // todo: valid_data may need to be checked when creating the index content, _, err := eventReader.GetByteFromPayload() if err != nil { log.Warn("failed to get byte from payload",