Skip to content

Commit

Permalink
change data format
Browse files Browse the repository at this point in the history
Signed-off-by: lixinguo <[email protected]>
  • Loading branch information
lixinguo committed Jul 22, 2024
1 parent e689477 commit 0167033
Show file tree
Hide file tree
Showing 20 changed files with 412 additions and 478 deletions.
2 changes: 1 addition & 1 deletion configs/milvus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -871,4 +871,4 @@ trace:
#maxMemSize will be the whole available GPU memory.
gpu:
initMemSize: # Gpu Memory Pool init size
maxMemSize: # Gpu Memory Pool Max size
maxMemSize: # Gpu Memory Pool Max size
8 changes: 4 additions & 4 deletions internal/core/src/common/FieldData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
}
std::copy_n(static_cast<const Type*>(source),
element_count * dim_,
field_data_.data() + length_ * dim_);
data_.data() + length_ * dim_);
length_ += element_count;
}

Expand All @@ -64,15 +64,15 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
}
std::copy_n(static_cast<const Type*>(field_data),
element_count * dim_,
field_data_.data() + length_ * dim_);
data_.data() + length_ * dim_);

ssize_t byte_count = (element_count + 7) / 8;
// Note: if `nullable == true` and valid_data is nullptr, it means
// null_count == 0, so the validity buffer is filled with 0xFF.
if (valid_data == nullptr) {
std::fill_n(valid_data_.get(), byte_count, 0xFF);
valid_data_.resize(byte_count, 0xFF);
} else {
std::copy_n(valid_data, byte_count, valid_data_.get());
std::copy_n(valid_data, byte_count, valid_data_.data());
}

length_ += element_count;
Expand Down
8 changes: 5 additions & 3 deletions internal/core/src/common/FieldData.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ class FieldData : public FieldDataImpl<Type, true> {
1, data_type, nullable, buffered_num_rows) {
}
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
explicit FieldData(DataType data_type, FixedVector<Type>&& inner_data)
explicit FieldData(DataType data_type,
bool nullable,
FixedVector<Type>&& inner_data)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, std::move(inner_data)) {
1, data_type, nullable, std::move(inner_data)) {
}
};

Expand Down Expand Up @@ -125,7 +127,7 @@ class FieldData<BFloat16Vector> : public FieldDataImpl<bfloat16, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<bfloat16, false>::FieldDataImpl(
dim,data_type, false,buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

Expand Down
158 changes: 93 additions & 65 deletions internal/core/src/common/FieldDataInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ class FieldDataBase {
virtual void*
Data() = 0;

virtual const uint8_t*
ValidData() const = 0;
virtual uint8_t*
ValidData() = 0;

// For all FieldDataImpl subclasses, this method returns a Type* that points
// at the offset-th row of this field data.
Expand Down Expand Up @@ -117,7 +117,7 @@ class FieldDataBase {
get_null_count() const = 0;

virtual bool
is_null(ssize_t offset) const = 0;
is_valid(ssize_t offset) const = 0;

protected:
const DataType data_type_;
Expand All @@ -143,25 +143,38 @@ class FieldDataImpl : public FieldDataBase {
: FieldDataBase(data_type, nullable),
num_rows_(buffered_num_rows),
dim_(is_type_entire_row ? 1 : dim) {
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
if (nullable) {
if (IsVectorDataType(data_type)){
PanicInfo(NotImplemented,
"vector type not support null");
if (IsVectorDataType(data_type)) {
PanicInfo(NotImplemented, "vector type not support null");
}
valid_data_ =
std::shared_ptr<uint8_t[]>(new uint8_t[(num_rows_ + 7) / 8]);
valid_data_.resize((num_rows_ + 7) / 8);
}
}

explicit FieldDataImpl(size_t dim,
DataType type,
bool nullable,
FixedVector<Type>&& field_data)
FixedVector<Type>&& data)
: FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
field_data_ = std::move(field_data);
Assert(field_data.size() % dim == 0);
num_rows_ = field_data.size() / dim;
AssertInfo(!nullable, "need to fill valid_data when nullable is true");
data_ = std::move(data);
Assert(data.size() % dim == 0);
num_rows_ = data.size() / dim;
}

// Constructs nullable field data by taking ownership of a pre-built value
// buffer and its validity buffer.
//
// dim        - number of elements per row, used to derive num_rows_ from
//              data.size(); dim_ itself is forced to 1 when
//              is_type_entire_row is true.
// type       - data type forwarded to FieldDataBase.
// nullable   - must be true (asserted); use the overload without valid_data
//              for non-nullable fields.
// data       - row values; moved into data_.
// valid_data - validity buffer; moved into valid_data_.
//              NOTE(review): presumably a packed bitmap, one bit per row --
//              confirm against callers.
explicit FieldDataImpl(size_t dim,
                       DataType type,
                       bool nullable,
                       FixedVector<Type>&& data,
                       FixedVector<uint8_t>&& valid_data)
    : FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
    AssertInfo(nullable,
               "no need to fill valid_data when nullable is false");
    // Inspect `data` BEFORE moving from it: a moved-from vector is left in a
    // valid but unspecified (typically empty) state, so reading its size
    // afterwards would make the check vacuous and yield num_rows_ == 0.
    Assert(data.size() % dim == 0);
    num_rows_ = data.size() / dim;
    data_ = std::move(data);
    valid_data_ = std::move(valid_data);
}

void
Expand Down Expand Up @@ -196,12 +209,12 @@ class FieldDataImpl : public FieldDataBase {

void*
Data() override {
return field_data_.data();
return data_.data();
}

const uint8_t*
ValidData() const override {
return valid_data_.get();
uint8_t*
ValidData() override {
return valid_data_.data();
}

const void*
Expand All @@ -210,23 +223,23 @@ class FieldDataImpl : public FieldDataBase {
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return &field_data_[offset];
}

std::optional<const void*>
Value(ssize_t offset) {
if (!is_type_entire_row) {
return RawValue(offset);
}
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
if (nullable_ && !valid_data_[offset]) {
return std::nullopt;
}
return &field_data_[offset];
}
return &data_[offset];
}

// std::optional<const void*>
// Value(ssize_t offset) {
// if (!is_type_entire_row) {
// return RawValue(offset);
// }
// AssertInfo(offset < get_num_rows(),
// "field data subscript out of range");
// AssertInfo(offset < length(),
// "subscript position don't has valid value");
// if (nullable_ && !valid_data_[offset]) {
// return std::nullopt;
// }
// return &field_data_[offset];
// }

int64_t
Size() const override {
Expand All @@ -250,8 +263,7 @@ class FieldDataImpl : public FieldDataBase {
int64_t
ValidDataSize() const override {
if (nullable_) {
int byteSize = (length() + 7) / 8;
return sizeof(uint8_t) * byteSize;
return sizeof(uint8_t) * (length() + 7) / 8;
}
return 0;
}
Expand All @@ -278,10 +290,10 @@ class FieldDataImpl : public FieldDataBase {
std::lock_guard lck(num_rows_mutex_);
if (cap > num_rows_) {
num_rows_ = cap;
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
}
if (nullable_) {
valid_data_ = std::shared_ptr<uint8_t[]>(new uint8_t[num_rows_]);
valid_data_.resize((num_rows_ + 7) / 8);
}
}

Expand All @@ -297,11 +309,9 @@ class FieldDataImpl : public FieldDataBase {
std::lock_guard lck(num_rows_mutex_);
if (num_rows > num_rows_) {
num_rows_ = num_rows;
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
if (nullable_) {
ssize_t byte_count = (num_rows + 7) / 8;
valid_data_ =
std::shared_ptr<uint8_t[]>(new uint8_t[byte_count]);
valid_data_.resize((num_rows + 7) / 8);
}
}
}
Expand All @@ -324,23 +334,27 @@ class FieldDataImpl : public FieldDataBase {
}

bool
is_null(ssize_t offset) const override {
is_valid(ssize_t offset) const override {
std::shared_lock lck(tell_mutex_);
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
if (!nullable_) {
return false;
return true;
}
auto bit = (valid_data_[offset >> 3] >> ((offset & 0x07))) & 1;
return !bit;
return bit;
}

protected:
FixedVector<Type> field_data_;
std::shared_ptr<uint8_t[]> valid_data_;
// number of elements field_data_ can hold
FixedVector<Type> data_{};
FixedVector<uint8_t> valid_data_{};
// number of elements data_ can hold
int64_t num_rows_;
mutable std::shared_mutex num_rows_mutex_;
int64_t null_count;
// number of actual elements in field_data_
int64_t null_count{0};
// number of actual elements in data_
size_t length_{};
mutable std::shared_mutex tell_mutex_;

Expand All @@ -361,7 +375,7 @@ class FieldDataStringImpl : public FieldDataImpl<std::string, true> {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].size();
data_size += data_[offset].size();
}

return data_size;
Expand All @@ -373,7 +387,7 @@ class FieldDataStringImpl : public FieldDataImpl<std::string, true> {
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].size();
return data_[offset].size();
}

void
Expand All @@ -390,9 +404,17 @@ class FieldDataStringImpl : public FieldDataImpl<std::string, true> {

auto i = 0;
for (const auto& str : *array) {
field_data_[length_ + i] = str.value();
data_[length_ + i] = str.value();
i++;
}
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}
}
length_ += n;
}
};
Expand All @@ -409,7 +431,7 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].data().size();
data_size += data_[offset].data().size();
}

return data_size;
Expand All @@ -421,7 +443,7 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].data().size();
return data_[offset].data().size();
}

void
Expand All @@ -448,10 +470,17 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {

auto i = 0;
for (const auto& json : *array) {
field_data_[length_ + i] =
Json(simdjson::padded_string(json.value()));
data_[length_ + i] = Json(simdjson::padded_string(json.value()));
i++;
}
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}
}
length_ += n;
}
};
Expand All @@ -472,7 +501,7 @@ class FieldDataSparseVectorImpl
DataSize() const override {
int64_t data_size = 0;
for (size_t i = 0; i < length(); ++i) {
data_size += field_data_[i].data_byte_size();
data_size += data_[i].data_byte_size();
}
return data_size;
}
Expand All @@ -483,7 +512,7 @@ class FieldDataSparseVectorImpl
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].data_byte_size();
return data_[offset].data_byte_size();
}

// source is a pointer to element_count of
Expand All @@ -504,7 +533,7 @@ class FieldDataSparseVectorImpl
auto& row = ptr[i];
vec_dim_ = std::max(vec_dim_, row.dim());
}
std::copy_n(ptr, element_count, field_data_.data() + length_);
std::copy_n(ptr, element_count, data_.data() + length_);
length_ += element_count;
}

Expand All @@ -523,7 +552,7 @@ class FieldDataSparseVectorImpl

for (int64_t i = 0; i < array->length(); ++i) {
auto view = array->GetView(i);
auto& row = field_data_[length_ + i];
auto& row = data_[length_ + i];
row = CopyAndWrapSparseRow(view.data(), view.size());
vec_dim_ = std::max(vec_dim_, row.dim());
}
Expand All @@ -548,12 +577,11 @@ class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
}

int64_t
DataSize() const override {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].byte_size();
data_size += data_[offset].byte_size();
}

return data_size;
}

Expand All @@ -563,7 +591,7 @@ class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].byte_size();
return data_[offset].byte_size();
}
};

Expand Down
Loading

0 comments on commit 0167033

Please sign in to comment.