Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement empty index for 0-rowed columns #1429

Merged
merged 26 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c81df09
New test cases
Feb 21, 2024
4b05988
Make default column type empty for 0 rowed columns
Feb 23, 2024
28aa366
Add additional empty-type tests
Feb 27, 2024
1450ccf
Cosmetic changes to the empty-type tests
Feb 27, 2024
7365c6c
Remove a test testing that arctic preserves the type of 0-rowed colum…
Feb 28, 2024
dac194c
Preserve the default behaviour so that empty dataframes get datetime i…
Feb 28, 2024
47abdbd
Remove non-reg tests because the desired behavior is different. Xfail…
Feb 28, 2024
e0886a6
Xfail tests for older python/pandas versions
Feb 28, 2024
b3762fa
Merge branch 'master' into dev/vasil.pashov/fix-empty-column-default-…
vasil-pashov Feb 29, 2024
8494f3f
Make fixture deterministic so that the CI can distribute tests in par…
Feb 29, 2024
b667c71
Create an empty index class and assign it to 0-rowed DFs
Mar 6, 2024
408d9be
Make empty index compatible with other index types
Mar 12, 2024
be8ab30
Make it possible to append to and update dfs with empty index type
Mar 14, 2024
ec8b050
Fix comments and reduce test count
Mar 14, 2024
4fa0659
Merge branch 'master' into empty-index-type
Mar 15, 2024
9601f70
Merge branch 'master' into empty-index-type
Mar 15, 2024
733934a
Roll back vcpkg version to fix failing abseil build
Mar 18, 2024
fd30bda
Merge branch 'master' into empty-index-type
Mar 18, 2024
a2ce829
Fix errors in tests
Mar 18, 2024
7b0895f
Rename is_not_ranged_index proto field in python and C++
Mar 21, 2024
81871d9
Fix failing tests
Mar 27, 2024
f86640d
Resolve review comments
Mar 29, 2024
4e5a0eb
Merge branch 'master' into dev/vasil.pashov/empty-index
vasil-pashov Mar 29, 2024
35ca8bc
Handle multiindex
Apr 1, 2024
42db980
Merge branch 'dev/vasil.pashov/empty-index' of github.com:man-group/A…
Apr 1, 2024
292fe48
Address review comment
Apr 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions cpp/arcticdb/entity/merge_descriptors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ StreamDescriptor merge_descriptors(
// Merge all the fields for all slices, apart from the index which we already have from the first descriptor.
// Note that we preserve the ordering as we see columns, especially the index which needs to be column 0.
for (const auto &fields : entries) {
if(has_index)
util::variant_match(index, [&fields] (const auto& idx) { idx.check(*fields); });
if (has_index) {
util::variant_match(index,
[](const EmptyIndex&) {},
[](const RowCountIndex&) {},
[&fields] (const auto& idx) { idx.check(*fields); }
);
}

for (size_t idx = has_index ? 1u : 0u; idx < static_cast<size_t>(fields->size()); ++idx) {
const auto& field = fields->at(idx);
Expand Down
3 changes: 1 addition & 2 deletions cpp/arcticdb/entity/stream_descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ struct StreamDescriptor {

std::shared_ptr<Proto> data_ = std::make_shared<Proto>();
std::shared_ptr<FieldCollection> fields_ = std::make_shared<FieldCollection>();
;

StreamDescriptor() = default;
~StreamDescriptor() = default;
Expand Down Expand Up @@ -65,7 +64,7 @@ struct StreamDescriptor {
data_->set_sorted(sorted_value_to_proto(sorted));
}

SortedValue get_sorted() {
SortedValue get_sorted() const {
return sorted_value_from_proto(data_->sorted());
}

Expand Down
65 changes: 65 additions & 0 deletions cpp/arcticdb/entity/types_proto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,69 @@ namespace arcticdb::entity {
}, id);
}

/// Builds a descriptor by writing both arguments into the wrapped proto message.
/// @param field_count Value stored as the proto's field count; it is converted
///        to a 32-bit unsigned integer before being stored.
/// @param type Value stored as the proto's index kind.
IndexDescriptor::IndexDescriptor(size_t field_count, Type type) {
    const auto stored_field_count = static_cast<uint32_t>(field_count);
    data_.set_field_count(stored_field_count);
    data_.set_kind(type);
}

/// Wraps an already populated proto message.
/// @param data Proto message taken by value and moved into the descriptor.
IndexDescriptor::IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data) : data_(std::move(data)) {}

/// Reports whether the descriptor still has no fields and an UNKNOWN kind.
/// @return true only when the field count is zero and the kind is UNKNOWN.
bool IndexDescriptor::uninitialized() const {
    // A non-zero field count settles the answer without looking at the kind.
    if (data_.field_count() != 0) {
        return false;
    }
    return data_.kind() == Type::IndexDescriptor_Type_UNKNOWN;
}

/// Read-only access to the wrapped proto message.
/// @return Reference to the proto message held by this descriptor.
const IndexDescriptor::Proto& IndexDescriptor::proto() const {
    return this->data_;
}

/// Field count stored in the wrapped proto message.
/// @return The proto's field count converted to size_t.
size_t IndexDescriptor::field_count() const {
    const auto stored_count = data_.field_count();
    return static_cast<size_t>(stored_count);
}

/// Index kind stored in the wrapped proto message.
/// @return The proto's kind value.
IndexDescriptor::Type IndexDescriptor::type() const {
    const Type stored_kind = data_.kind();
    return stored_kind;
}

/// Overwrites the index kind stored in the wrapped proto message.
/// @param type New kind value; the field count is left untouched.
void IndexDescriptor::set_type(Type type) {
    this->data_.set_kind(type);
}

/// Equality for index descriptors.
/// Only the index kind is compared; the field count does not take part.
/// @return true when both descriptors report the same type().
bool operator==(const IndexDescriptor& left, const IndexDescriptor& right) {
    const auto left_kind = left.type();
    const auto right_kind = right.type();
    return left_kind == right_kind;
}

/// Maps an index kind to its single-character code.
/// EMPTY -> 'E', TIMESTAMP -> 'T', ROWCOUNT -> 'R', STRING -> 'S', UNKNOWN -> 'U'.
/// Any other value is reported through util::raise_rte.
/// @param type Index kind to encode.
/// @return The character code for `type`.
IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type) {
    if (type == IndexDescriptor::EMPTY) {
        return 'E';
    }
    if (type == IndexDescriptor::TIMESTAMP) {
        return 'T';
    }
    if (type == IndexDescriptor::ROWCOUNT) {
        return 'R';
    }
    if (type == IndexDescriptor::STRING) {
        return 'S';
    }
    if (type == IndexDescriptor::UNKNOWN) {
        return 'U';
    }
    // None of the known kinds matched.
    util::raise_rte("Unknown index type: {}", static_cast<int>(type));
}

/// Maps a single-character code back to an index kind.
/// 'E' -> EMPTY, 'T' -> TIMESTAMP, 'R' -> ROWCOUNT, 'S' -> STRING, 'U' -> UNKNOWN.
/// Any other character is reported through util::raise_rte, which receives the
/// character's integer value.
/// @param type Character code to decode.
/// @return The index kind for `type`.
IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type) {
    if (type == 'E') {
        return IndexDescriptor::EMPTY;
    }
    if (type == 'T') {
        return IndexDescriptor::TIMESTAMP;
    }
    if (type == 'R') {
        return IndexDescriptor::ROWCOUNT;
    }
    if (type == 'S') {
        return IndexDescriptor::STRING;
    }
    if (type == 'U') {
        return IndexDescriptor::UNKNOWN;
    }
    // None of the known codes matched.
    util::raise_rte("Unknown index type: {}", static_cast<int>(type));
}

/// Gives a human-readable name for an index kind.
/// EMPTY -> "Empty", TIMESTAMP -> "Timestamp", ROWCOUNT -> "Row count",
/// STRING -> "String", UNKNOWN -> "Unknown".
/// Any other value is reported through util::raise_rte.
/// @param type Index kind to describe.
/// @return Pointer to a string literal naming `type`.
const char* index_type_to_str(IndexDescriptor::Type type) {
    const char* label = nullptr;
    switch (type) {
        case IndexDescriptor::EMPTY:
            label = "Empty";
            break;
        case IndexDescriptor::TIMESTAMP:
            label = "Timestamp";
            break;
        case IndexDescriptor::ROWCOUNT:
            label = "Row count";
            break;
        case IndexDescriptor::STRING:
            label = "String";
            break;
        case IndexDescriptor::UNKNOWN:
            label = "Unknown";
            break;
        default:
            util::raise_rte("Unknown index type: {}", static_cast<int>(type));
    }
    return label;
}
} // namespace arcticdb
72 changes: 16 additions & 56 deletions cpp/arcticdb/entity/types_proto.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,69 +49,29 @@ namespace arcticdb::entity {
Proto data_;
using Type = arcticdb::proto::descriptors::IndexDescriptor::Type;

static const Type UNKNOWN = arcticdb::proto::descriptors::IndexDescriptor_Type_UNKNOWN;
static const Type ROWCOUNT = arcticdb::proto::descriptors::IndexDescriptor_Type_ROWCOUNT;
static const Type STRING = arcticdb::proto::descriptors::IndexDescriptor_Type_STRING;
static const Type TIMESTAMP = arcticdb::proto::descriptors::IndexDescriptor_Type_TIMESTAMP;
static constexpr Type UNKNOWN = arcticdb::proto::descriptors::IndexDescriptor_Type_UNKNOWN;
static constexpr Type EMPTY = arcticdb::proto::descriptors::IndexDescriptor_Type_EMPTY;
static constexpr Type ROWCOUNT = arcticdb::proto::descriptors::IndexDescriptor_Type_ROWCOUNT;
static constexpr Type STRING = arcticdb::proto::descriptors::IndexDescriptor_Type_STRING;
static constexpr Type TIMESTAMP = arcticdb::proto::descriptors::IndexDescriptor_Type_TIMESTAMP;

using TypeChar = char;

IndexDescriptor() = default;
IndexDescriptor(size_t field_count, Type type) {
data_.set_kind(type);
data_.set_field_count(static_cast<uint32_t>(field_count));
}

explicit IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data)
: data_(std::move(data)) {
}

bool uninitialized() const {
return data_.field_count() == 0 && data_.kind() == Type::IndexDescriptor_Type_UNKNOWN;
}

const Proto& proto() const {
return data_;
}

size_t field_count() const {
return static_cast<size_t>(data_.field_count());
}

Type type() const {
return data_.kind();
}

void set_type(Type type) {
data_.set_kind(type);
}

ARCTICDB_MOVE_COPY_DEFAULT(IndexDescriptor)

friend bool operator==(const IndexDescriptor& left, const IndexDescriptor& right) {
return left.type() == right.type();
}
IndexDescriptor(size_t field_count, Type type);
explicit IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data);
bool uninitialized() const;
const Proto& proto() const;
size_t field_count() const;
Type type() const;
void set_type(Type type);
friend bool operator==(const IndexDescriptor& left, const IndexDescriptor& right);
};

constexpr IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type) {
switch (type) {
case IndexDescriptor::TIMESTAMP:return 'T';
case IndexDescriptor::ROWCOUNT:return 'R';
case IndexDescriptor::STRING:return 'S';
case IndexDescriptor::UNKNOWN:return 'U';
default:util::raise_rte("Unknown index type: {}", int(type));
}
}

constexpr IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type) {
switch (type) {
case 'T': return IndexDescriptor::TIMESTAMP;
case 'R': return IndexDescriptor::ROWCOUNT;
case 'S': return IndexDescriptor::STRING;
case 'U': return IndexDescriptor::UNKNOWN;
default:util::raise_rte("Unknown index type: {}", int(type));
}
}
IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type);
IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type);
const char* index_type_to_str(IndexDescriptor::Type type);

void set_id(arcticdb::proto::descriptors::StreamDescriptor& pb_desc, StreamId id);

Expand Down
4 changes: 2 additions & 2 deletions cpp/arcticdb/pipeline/frame_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ std::pair<size_t, size_t> offset_and_row_count(const std::shared_ptr<pipelines::
return std::make_pair(offset, row_count);
}

bool index_is_not_timeseries_or_is_sorted_ascending(const std::shared_ptr<pipelines::InputTensorFrame>& frame) {
return !std::holds_alternative<stream::TimeseriesIndex>(frame->index) || frame->desc.get_sorted() == SortedValue::ASCENDING;
bool index_is_not_timeseries_or_is_sorted_ascending(const pipelines::InputTensorFrame& frame) {
return !std::holds_alternative<stream::TimeseriesIndex>(frame.index) || frame.desc.get_sorted() == SortedValue::ASCENDING;
}

}
2 changes: 1 addition & 1 deletion cpp/arcticdb/pipeline/frame_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,6 @@ size_t get_slice_rowcounts(
std::pair<size_t, size_t> offset_and_row_count(
const std::shared_ptr<pipelines::PipelineContext>& context);

bool index_is_not_timeseries_or_is_sorted_ascending(const std::shared_ptr<pipelines::InputTensorFrame>& frame);
bool index_is_not_timeseries_or_is_sorted_ascending(const pipelines::InputTensorFrame& frame);

} //namespace arcticdb
2 changes: 1 addition & 1 deletion cpp/arcticdb/pipeline/index_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

namespace arcticdb::pipelines::index {
// TODO: change the name - something like KeysSegmentWriter or KeyAggregator or better
template<class Index, std::enable_if_t<InputTensorFrame::is_valid_index_v<Index>, bool> = 0>
template<ValidIndex Index>
class IndexWriter {
// All index segments are row-count indexed in the sense that the keys are
// already ordered - they don't need an additional index
Expand Down
18 changes: 12 additions & 6 deletions cpp/arcticdb/pipeline/input_tensor_frame.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,20 @@ namespace arcticdb::pipelines {

using namespace arcticdb::entity;

struct InputTensorFrame {
/// Satisfied when T is the same type as at least one of the types in U.
/// The check is a fold of std::same_as over the pack, so with an empty pack
/// the concept is not satisfied.
/// @TODO Move to a separate "util" header
template <typename T, typename... U>
concept is_any_of = (std::same_as<T, U> || ...);

/// Satisfied when IndexT names one of the listed stream index types:
/// TimeseriesIndex, RowCountIndex, TableIndex or EmptyIndex.
/// Before the comparison IndexT is decayed, then one level of pointer is
/// removed, then reference and cv qualifiers are removed, so references,
/// const-qualified types and pointers to those types are accepted as well.
template <typename IndexT>
concept ValidIndex = is_any_of<
std::remove_cvref_t<std::remove_pointer_t<std::decay_t<IndexT>>>,
stream::TimeseriesIndex,
stream::RowCountIndex,
stream::TableIndex,
stream::EmptyIndex>;

template<class T>
static constexpr bool is_valid_index_v =
std::is_same_v<T, stream::TimeseriesIndex> ||
std::is_same_v<T, stream::RowCountIndex> ||
std::is_same_v<T, stream::TableIndex>;

struct InputTensorFrame {
InputTensorFrame() :
index(stream::empty_index()) {}

Expand Down
Loading
Loading