Skip to content

Commit

Permalink
enhance: support null in c data_datacodec and load null value (#32183)
Browse files Browse the repository at this point in the history
1. support read and write null in segcore
    will store valid_data(use uint8_t type to save memory) in fieldData.
2. support load null
The binlog reader reads data and writes it into a column (sealed segment) or
an insertRecord (growing segment). In a sealed segment, valid_data is stored
directly. In a growing segment, considering the prior implementation and ease
of code reading, it converts uint8_t to fbvector<bool>, which may be optimized
in the future.
3.  retrieve valid_data.
    parse valid_data in search/query.
#31728

---------

Signed-off-by: lixinguo <[email protected]>
Co-authored-by: lixinguo <[email protected]>
  • Loading branch information
smellthemoon and lixinguo authored Jul 23, 2024
1 parent 92de49e commit 5616b7e
Show file tree
Hide file tree
Showing 60 changed files with 2,010 additions and 496 deletions.
2 changes: 1 addition & 1 deletion configs/milvus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -871,4 +871,4 @@ trace:
#maxMemSize will be the whole available GPU memory.
gpu:
initMemSize: # Gpu Memory Pool init size
maxMemSize: # Gpu Memory Pool Max size
maxMemSize: # Gpu Memory Pool Max size
114 changes: 102 additions & 12 deletions internal/core/src/common/FieldData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ template <typename Type, bool is_type_entire_row>
void
FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
ssize_t element_count) {
AssertInfo(!nullable_,
"need to fill valid_data, use the 3-argument version instead");

if (element_count == 0) {
return;
}
Expand All @@ -40,7 +43,38 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
}
std::copy_n(static_cast<const Type*>(source),
element_count * dim_,
field_data_.data() + length_ * dim_);
data_.data() + length_ * dim_);
length_ += element_count;
}

// Append `element_count` rows, together with their validity bits, to a
// nullable field.
//
// field_data    : points to at least `element_count * dim_` values of `Type`;
//                 they are copied to row offset `length_`.
// valid_data    : packed bitmap, one bit per appended row, where bit 0 of
//                 valid_data[0] describes the first appended row. Bits are
//                 read least-significant-bit first, which is the Arrow
//                 validity-bitmap layout; the callers in this file pass
//                 `null_bitmap_data()` of an Arrow array.
//                 NOTE(review): an Arrow array with a non-zero offset would
//                 need that offset applied by the caller - confirm.
//                 May be nullptr, meaning "no nulls": every appended row is
//                 marked valid.
// element_count : number of rows to append; 0 is a no-op.
//
// Asserts that the field is nullable; non-nullable fields must use the
// 2-argument overload. `tell_mutex_` is held for the whole append.
template <typename Type, bool is_type_entire_row>
void
FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
    const void* field_data, const uint8_t* valid_data, ssize_t element_count) {
    AssertInfo(
        nullable_,
        "no need to fill valid_data, use the 2-argument version instead");
    if (element_count == 0) {
        return;
    }

    std::lock_guard lck(tell_mutex_);
    if (length_ + element_count > get_num_rows()) {
        resize_field_data(length_ + element_count);
    }
    std::copy_n(static_cast<const Type*>(field_data),
                element_count * dim_,
                data_.data() + length_ * dim_);

    // Make sure the bitmap can hold every row up to the new length. Whether
    // resize_field_data() also grows valid_data_ is not visible here, so the
    // size is ensured explicitly; new bytes start out as "all null".
    const auto required_bytes =
        static_cast<size_t>((length_ + element_count + 7) / 8);
    if (static_cast<size_t>(valid_data_.size()) < required_bytes) {
        valid_data_.resize(required_bytes, 0);
    }
    auto* bitmap = valid_data_.data();

    if (length_ % 8 == 0) {
        // Byte-aligned append: whole bytes can be written at the byte that
        // corresponds to row `length_` (the old code always wrote at byte 0
        // and therefore clobbered the validity of previously appended rows).
        const ssize_t byte_count = (element_count + 7) / 8;
        auto* dst = bitmap + length_ / 8;
        if (valid_data == nullptr) {
            // fill_n overwrites existing bytes too; resize(n, 0xFF) would
            // only initialise bytes that did not exist before.
            std::fill_n(dst, byte_count, 0xFF);
        } else {
            std::copy_n(valid_data, byte_count, dst);
        }
    } else {
        // Unaligned append: rows start in the middle of a byte, so the bits
        // have to be moved one by one to their shifted position.
        for (ssize_t i = 0; i < element_count; ++i) {
            const bool is_valid =
                valid_data == nullptr ||
                ((valid_data[i >> 3] >> (i & 7)) & 1U) != 0;
            const auto pos = static_cast<size_t>(length_ + i);
            const auto mask = static_cast<uint8_t>(1U << (pos & 7));
            if (is_valid) {
                bitmap[pos >> 3] |= mask;
            } else {
                bitmap[pos >> 3] &= static_cast<uint8_t>(~mask);
            }
        }
    }

    length_ += element_count;
}

Expand All @@ -66,6 +100,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
if (element_count == 0) {
return;
}
null_count = array->null_count();
switch (data_type_) {
case DataType::BOOL: {
AssertInfo(array->type()->id() == arrow::Type::type::BOOL,
Expand All @@ -76,42 +111,71 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
for (size_t index = 0; index < element_count; ++index) {
values[index] = bool_array->Value(index);
}
if (nullable_) {
return FillFieldData(values.data(),
bool_array->null_bitmap_data(),
element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::INT8: {
auto array_info =
GetDataInfoFromArray<arrow::Int8Array, arrow::Type::type::INT8>(
array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT16: {
auto array_info =
GetDataInfoFromArray<arrow::Int16Array,
arrow::Type::type::INT16>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT32: {
auto array_info =
GetDataInfoFromArray<arrow::Int32Array,
arrow::Type::type::INT32>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT64: {
auto array_info =
GetDataInfoFromArray<arrow::Int64Array,
arrow::Type::type::INT64>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::FLOAT: {
auto array_info =
GetDataInfoFromArray<arrow::FloatArray,
arrow::Type::type::FLOAT>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::DOUBLE: {
auto array_info =
GetDataInfoFromArray<arrow::DoubleArray,
arrow::Type::type::DOUBLE>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::STRING:
Expand All @@ -124,6 +188,10 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
for (size_t index = 0; index < element_count; ++index) {
values[index] = string_array->GetString(index);
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::JSON: {
Expand All @@ -136,17 +204,33 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
values[index] =
Json(simdjson::padded_string(json_array->GetString(index)));
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::ARRAY: {
auto array_array =
std::dynamic_pointer_cast<arrow::BinaryArray>(array);
std::vector<Array> values(element_count);
int null_number = 0;
for (size_t index = 0; index < element_count; ++index) {
ScalarArray field_data;
field_data.ParseFromString(array_array->GetString(index));
if (array_array->GetString(index) == "") {
null_number++;
continue;
}
auto success =
field_data.ParseFromString(array_array->GetString(index));
AssertInfo(success, "parse from string failed");
values[index] = Array(field_data);
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
AssertInfo(null_number == 0, "get empty string when not nullable");
return FillFieldData(values.data(), element_count);
}
case DataType::VECTOR_FLOAT:
Expand Down Expand Up @@ -201,27 +285,33 @@ template class FieldDataImpl<bfloat16, false>;
template class FieldDataImpl<knowhere::sparse::SparseRow<float>, true>;

FieldDataPtr
InitScalarFieldData(const DataType& type, int64_t cap_rows) {
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows) {
switch (type) {
case DataType::BOOL:
return std::make_shared<FieldData<bool>>(type, cap_rows);
return std::make_shared<FieldData<bool>>(type, nullable, cap_rows);
case DataType::INT8:
return std::make_shared<FieldData<int8_t>>(type, cap_rows);
return std::make_shared<FieldData<int8_t>>(
type, nullable, cap_rows);
case DataType::INT16:
return std::make_shared<FieldData<int16_t>>(type, cap_rows);
return std::make_shared<FieldData<int16_t>>(
type, nullable, cap_rows);
case DataType::INT32:
return std::make_shared<FieldData<int32_t>>(type, cap_rows);
return std::make_shared<FieldData<int32_t>>(
type, nullable, cap_rows);
case DataType::INT64:
return std::make_shared<FieldData<int64_t>>(type, cap_rows);
return std::make_shared<FieldData<int64_t>>(
type, nullable, cap_rows);
case DataType::FLOAT:
return std::make_shared<FieldData<float>>(type, cap_rows);
return std::make_shared<FieldData<float>>(type, nullable, cap_rows);
case DataType::DOUBLE:
return std::make_shared<FieldData<double>>(type, cap_rows);
return std::make_shared<FieldData<double>>(
type, nullable, cap_rows);
case DataType::STRING:
case DataType::VARCHAR:
return std::make_shared<FieldData<std::string>>(type, cap_rows);
return std::make_shared<FieldData<std::string>>(
type, nullable, cap_rows);
case DataType::JSON:
return std::make_shared<FieldData<Json>>(type, cap_rows);
return std::make_shared<FieldData<Json>>(type, nullable, cap_rows);
default:
PanicInfo(DataTypeInvalid,
"InitScalarFieldData not support data type " +
Expand Down
40 changes: 25 additions & 15 deletions internal/core/src/common/FieldData.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,41 +30,51 @@ template <typename Type>
// Primary FieldData template: a thin wrapper over FieldDataImpl<Type, true>
// whose constructors only forward their arguments to the base class.
//
// NOTE(review): this text interleaves the pre-change constructor lines
// (without `nullable`) and the post-change lines (with `nullable`), as a
// rendered diff would - confirm which lines are live before editing.
class FieldData : public FieldDataImpl<Type, true> {
public:
// Compile-time guard: Type must satisfy IsScalar or be PkType.
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
// Construct from a data type, a nullable flag and an optional row count
// (defaults to 0). The base constructor receives the literal 1 as its first
// argument; the vector specializations in this file pass `dim` (or dim / 8)
// in that position.
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, buffered_num_rows) {
1, data_type, nullable, buffered_num_rows) {
}
// NOTE(review): same assertion as the one above, repeated - looks redundant.
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
// Construct from an existing FixedVector<Type>, which is moved into the
// base class together with the data type and nullable flag.
explicit FieldData(DataType data_type, FixedVector<Type>&& inner_data)
explicit FieldData(DataType data_type,
bool nullable,
FixedVector<Type>&& inner_data)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, std::move(inner_data)) {
1, data_type, nullable, std::move(inner_data)) {
}
};

// FieldData specialization for std::string, built on FieldDataStringImpl.
// The constructor forwards data_type, nullable and buffered_num_rows
// (default 0) to the base class and does nothing else.
//
// NOTE(review): the old constructor lines (without `nullable`) and the new
// ones (with `nullable`) both appear below, as in a rendered diff - confirm
// which lines are live before editing.
template <>
class FieldData<std::string> : public FieldDataStringImpl {
public:
// Compile-time guard: std::string must satisfy IsScalar or be PkType.
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataStringImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataStringImpl(data_type, nullable, buffered_num_rows) {
}
};

// FieldData specialization for Json, built on FieldDataJsonImpl.
// The constructor forwards data_type, nullable and buffered_num_rows
// (default 0) to the base class and does nothing else.
//
// NOTE(review): the old constructor lines (without `nullable`) and the new
// ones (with `nullable`) both appear below, as in a rendered diff - confirm
// which lines are live before editing.
template <>
class FieldData<Json> : public FieldDataJsonImpl {
public:
// NOTE(review): this assertion tests std::string, not Json, so it does not
// check anything about this specialization's own type - looks copied from
// the std::string specialization; confirm whether IsScalar<Json> was meant.
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataJsonImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataJsonImpl(data_type, nullable, buffered_num_rows) {
}
};

// FieldData specialization for Array, built on FieldDataArrayImpl.
// The constructor forwards data_type, nullable and buffered_num_rows
// (default 0) to the base class and does nothing else.
//
// NOTE(review): the old constructor lines (without `nullable`) and the new
// ones (with `nullable`) both appear below, as in a rendered diff - confirm
// which lines are live before editing.
template <>
class FieldData<Array> : public FieldDataArrayImpl {
public:
// Compile-time guard on Array. NOTE(review): the second operand compares
// std::string (not Array) with PkType - presumably a leftover from the
// std::string specialization; confirm.
static_assert(IsScalar<Array> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataArrayImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataArrayImpl(data_type, nullable, buffered_num_rows) {
}
};

Expand All @@ -75,7 +85,7 @@ class FieldData<FloatVector> : public FieldDataImpl<float, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<float, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

Expand All @@ -86,7 +96,7 @@ class FieldData<BinaryVector> : public FieldDataImpl<uint8_t, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: binary_dim_(dim),
FieldDataImpl(dim / 8, data_type, buffered_num_rows) {
FieldDataImpl(dim / 8, data_type, false, buffered_num_rows) {
Assert(dim % 8 == 0);
}

Expand All @@ -106,7 +116,7 @@ class FieldData<Float16Vector> : public FieldDataImpl<float16, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<float16, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

Expand All @@ -117,7 +127,7 @@ class FieldData<BFloat16Vector> : public FieldDataImpl<bfloat16, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<bfloat16, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

Expand All @@ -134,6 +144,6 @@ using FieldDataChannel = Channel<FieldDataPtr>;
using FieldDataChannelPtr = std::shared_ptr<FieldDataChannel>;

FieldDataPtr
InitScalarFieldData(const DataType& type, int64_t cap_rows);
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows);

} // namespace milvus
Loading

0 comments on commit 5616b7e

Please sign in to comment.