Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/vasil.pashov/index file refactor #1472

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c81df09
New test cases
Feb 21, 2024
4b05988
Make default column type empty for 0 rowed columns
Feb 23, 2024
28aa366
Add additional empty-type tests
Feb 27, 2024
1450ccf
Cosmetic changes to the empty-type tests
Feb 27, 2024
7365c6c
Remove a test testing that arctic preserves the type of 0-rowed colum…
Feb 28, 2024
dac194c
Preserve the default behaviour so that empty dataframes get datetime i…
Feb 28, 2024
47abdbd
Remove non-reg tests because the desired behavior is different. Xfail…
Feb 28, 2024
e0886a6
Xfail tests for older python/pandas versions
Feb 28, 2024
b3762fa
Merge branch 'master' into dev/vasil.pashov/fix-empty-column-default-…
vasil-pashov Feb 29, 2024
8494f3f
Make fixture deterministic so that the CI can distribute tests in par…
Feb 29, 2024
b667c71
Create an empty index class and assign it to 0-rowed DFs
Mar 6, 2024
408d9be
Make empty index compatible with other index types
Mar 12, 2024
be8ab30
Make it possible to append to and update dfs with empty index type
Mar 14, 2024
ec8b050
Fix comments and reduce test count
Mar 14, 2024
4fa0659
Merge branch 'master' into empty-index-type
Mar 15, 2024
9601f70
Merge branch 'master' into empty-index-type
Mar 15, 2024
733934a
Roll back vcpkg version to fix failing abseil build
Mar 18, 2024
fd30bda
Merge branch 'master' into empty-index-type
Mar 18, 2024
a2ce829
Fix errors in tests
Mar 18, 2024
7b0895f
Rename is_not_ranged_index proto field in python and C++
Mar 21, 2024
81871d9
Fix failing tests
Mar 27, 2024
f86640d
Resolve review comments
Mar 29, 2024
4e5a0eb
Merge branch 'master' into dev/vasil.pashov/empty-index
vasil-pashov Mar 29, 2024
35ca8bc
Handle multiindex
Apr 1, 2024
42db980
Merge branch 'dev/vasil.pashov/empty-index' of github.com:man-group/A…
Apr 1, 2024
5406651
Split index.hpp
Apr 2, 2024
b59b24d
Merge remote-tracking branch 'origin/feature/empty_index' into refact…
Apr 2, 2024
86f2e8e
Clean headers
Apr 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ set(arcticdb_srcs
storage/storage_factory.cpp
stream/aggregator.cpp
stream/append_map.cpp
stream/index.cpp
stream/piloted_clock.cpp
toolbox/library_tool.cpp
util/allocator.cpp
Expand Down
252 changes: 252 additions & 0 deletions cpp/arcticdb/stream/index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
/* Copyright 2024 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
* will be governed by the Apache License, version 2.0.
*/

#include <arcticdb/stream/index.hpp>
#include <arcticdb/column_store/memory_segment.hpp>
#include <arcticdb/pipeline/index_fields.hpp>
#include <arcticdb/entity/type_utils.hpp>


namespace arcticdb::stream {

// Derives the index kind from the start index stored in `key`:
// TIMESTAMP when the start index currently holds a `timestamp`, STRING otherwise.
IndexDescriptor::Type get_index_value_type(const AtomKey& key) {
    if (std::holds_alternative<timestamp>(key.start_index())) {
        return IndexDescriptor::TIMESTAMP;
    }
    return IndexDescriptor::STRING;
}

// Convenience overload: copies the initializer list into a vector and forwards it,
// wrapped as a folly range, to the range-based create_stream_descriptor overload.
template <typename Derived>
StreamDescriptor BaseIndex<Derived>::create_stream_descriptor(
    StreamId stream_id,
    std::initializer_list<FieldRef> fields
) const {
    std::vector<FieldRef> field_refs(fields.begin(), fields.end());
    auto field_range = folly::range(field_refs);
    return create_stream_descriptor(stream_id, field_range);
}

// CRTP accessor: views this base object as the concrete index type `Derived`.
template <typename Derived>
const Derived* BaseIndex<Derived>::derived() const {
    const auto* self = static_cast<const Derived*>(this);
    return self;
}

// Converts the index to an IndexDescriptor built from the derived type's
// static field count and index type.
template <typename Derived>
BaseIndex<Derived>::operator IndexDescriptor() const {
    return IndexDescriptor{Derived::field_count(), Derived::type()};
}

// Describes the index field: its type descriptor comes from Derived::TypeDescTag and
// its name from the derived index's name(). The position argument is ignored.
template <typename Derived>
FieldRef BaseIndex<Derived>::field(size_t) const {
    using Tag = typename Derived::TypeDescTag;
    const auto index_type = static_cast<TypeDescriptor>(Tag{});
    const std::string_view index_name(derived()->name());
    return {index_type, index_name};
}

// Builds a timeseries index whose index field is called `name`.
TimeseriesIndex::TimeseriesIndex(const std::string& name)
    : name_(name) {
}

// Creates a timeseries index named with the class-level DefaultName.
TimeseriesIndex TimeseriesIndex::default_index() {
    TimeseriesIndex default_named(DefaultName);
    return default_named;
}

// Validates that `fields` can back this timeseries index.
//
// Two conditions are reported through util::check_arg:
//   1. `fields` holds at least field_count() entries;
//   2. the type of fields[0] is compatible with this index's field type, i.e. either
//      has_valid_type_promotion() yields a value or trivially_compatible_types() holds.
//
// The count is validated first so that fields[0] is never read from a collection
// that is too short.
void TimeseriesIndex::check(const FieldCollection& fields) const {
    const size_t fields_size = fields.size();
    // Kept unsigned so that the comparison below does not mix signed and unsigned operands.
    constexpr size_t current_fields_size = static_cast<size_t>(field_count());

    util::check_arg(
        fields_size >= current_fields_size,
        "expected at least {} fields, actual {}",
        current_fields_size,
        fields_size
    );

    const TypeDescriptor& first_field_type = fields[0].type();
    // Stored by value: field(0) returns a temporary FieldRef, so nothing obtained from it
    // should be held by reference beyond this statement.
    const TypeDescriptor current_first_field_type = this->field(0).type();

    const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value();
    const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type);

    const bool compatible_types = valid_type_promotion || trivial_type_compatibility;

    util::check_arg(compatible_types, "expected field[0]={}, actual {}", this->field(0), fields[0]);
}

// Index value of the first row of `segment`: the timestamp at row 0, column 0.
// A segment with no rows yields NumericIndex{0}.
IndexValue TimeseriesIndex::start_value_for_segment(const SegmentInMemory& segment) {
    const bool no_rows = segment.row_count() == 0;
    if (no_rows) {
        return {NumericIndex{0}};
    }
    const auto earliest = segment.scalar_at<timestamp>(0, 0).value();
    return {earliest};
}

// Index value of the last row of `segment`: the timestamp at the final row, column 0.
// A segment with no rows yields NumericIndex{0}.
IndexValue TimeseriesIndex::end_value_for_segment(const SegmentInMemory& segment) {
    const auto total_rows = segment.row_count();
    if (total_rows == 0) {
        return {NumericIndex{0}};
    }
    const auto latest = segment.scalar_at<timestamp>(total_rows - 1, 0).value();
    return {latest};
}

// Start index of a keys segment: the timestamp stored in the first row of the
// `start_index` column. A segment with no rows yields NumericIndex{0}.
IndexValue TimeseriesIndex::start_value_for_keys_segment(const SegmentInMemory& segment) {
    if (segment.row_count() == 0) {
        return {NumericIndex{0}};
    }
    const auto start_column = static_cast<int>(pipelines::index::Fields::start_index);
    const auto earliest = segment.scalar_at<timestamp>(0, start_column).value();
    return {earliest};
}

// End index of a keys segment: the timestamp stored in the last row of the
// `end_index` column. A segment with no rows yields NumericIndex{0}.
IndexValue TimeseriesIndex::end_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto total_rows = segment.row_count();
    if (total_rows == 0) {
        return {NumericIndex{0}};
    }
    const auto end_column = static_cast<int>(pipelines::index::Fields::end_index);
    const auto latest = segment.scalar_at<timestamp>(total_rows - 1, end_column).value();
    return {latest};
}

// Name of the index field as a C string. The pointer refers to storage owned by
// this object's name_ member.
const char* TimeseriesIndex::name() const {
    const char* const raw_name = name_.c_str();
    return raw_name;
}

// Builds a timeseries index named after the first field of `desc`;
// a descriptor without fields falls back to DefaultName.
TimeseriesIndex TimeseriesIndex::make_from_descriptor(const StreamDescriptor& desc) {
    const bool has_fields = desc.field_count() > 0;
    if (!has_fields) {
        return TimeseriesIndex(DefaultName);
    }
    std::string index_name(desc.fields(0).name());
    return TimeseriesIndex(index_name);
}


// Builds a table (string-keyed) index whose index field is called `name`.
TableIndex::TableIndex(const std::string& name)
    : name_(name) {}

// Creates a table index named with the class-level DefaultName.
TableIndex TableIndex::default_index() {
    TableIndex default_named(DefaultName);
    return default_named;
}

// Validates that `fields` can back this table index: there must be at least
// field_count() fields, and the first field must equal this index's field(0).
void TableIndex::check(const FieldCollection& fields) const {
    const bool has_enough_fields = fields.size() >= int(field_count());
    util::check_arg(
        has_enough_fields,
        "expected at least {} fields, actual {}",
        field_count(),
        fields.size()
    );

    const bool first_field_matches = fields.ref_at(0) == field(0);
    util::check(
        first_field_matches,
        "Field descriptor mismatch {} != {}",
        fields.ref_at(0),
        field(0)
    );
}

// Index value of the first row of `segment`: the string at row 0, column 0.
//
// A segment with no rows yields NumericIndex{0}. This mirrors the empty-segment handling
// already present in TableIndex::start_value_for_keys_segment and avoids reading row 0
// of a segment that has no rows.
IndexValue TableIndex::start_value_for_segment(const SegmentInMemory& segment) {
    if (segment.row_count() == 0)
        return {NumericIndex{0}};
    auto string_index = segment.string_at(0, 0).value();
    return {std::string{string_index}};
}

// Index value of the last row of `segment`: the string at the final row, column 0.
//
// A segment with no rows yields NumericIndex{0}. This mirrors the empty-segment handling
// already present in TableIndex::end_value_for_keys_segment; without the guard,
// `row_count() - 1` would be evaluated for a zero row count and produce an invalid row id.
IndexValue TableIndex::end_value_for_segment(const SegmentInMemory& segment) {
    const auto row_count = segment.row_count();
    if (row_count == 0)
        return {NumericIndex{0}};
    auto string_index = segment.string_at(row_count - 1, 0).value();
    return {std::string{string_index}};
}

// Start index of a keys segment: the string stored in the first row of the
// `start_index` column. A segment with no rows yields NumericIndex{0}.
IndexValue TableIndex::start_value_for_keys_segment(const SegmentInMemory& segment) {
    const bool no_rows = segment.row_count() == 0;
    if (no_rows) {
        return {NumericIndex{0}};
    }
    const auto start_column = static_cast<int>(pipelines::index::Fields::start_index);
    auto first_key = segment.string_at(0, start_column).value();
    return {std::string{first_key}};
}

// End index of a keys segment: the string stored in the last row of the
// `end_index` column. A segment with no rows yields NumericIndex{0}.
IndexValue TableIndex::end_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto total_rows = segment.row_count();
    if (total_rows == 0) {
        return {NumericIndex{0}};
    }
    const auto end_column = static_cast<int>(pipelines::index::Fields::end_index);
    auto last_key = segment.string_at(total_rows - 1, end_column).value();
    return {std::string{last_key}};
}

// Builds a table index named after the first field of `desc`;
// a descriptor without fields falls back to DefaultName.
TableIndex TableIndex::make_from_descriptor(const StreamDescriptor& desc) {
    const bool has_fields = desc.field_count() > 0;
    if (!has_fields) {
        return TableIndex(DefaultName);
    }
    std::string index_name(desc.field(0).name());
    return TableIndex(index_name);
}

// Name of the index field as a C string. The pointer refers to storage owned by
// this object's name_ member.
const char* TableIndex::name() const {
    const char* const raw_name = name_.c_str();
    return raw_name;
}

// Creates a value-initialised row-count index.
RowCountIndex RowCountIndex::default_index() {
    RowCountIndex row_index{};
    return row_index;
}


// Start index of a row-count-indexed segment: the segment's offset, as a timestamp value.
IndexValue RowCountIndex::start_value_for_segment(const SegmentInMemory& segment) {
    const auto first_row = static_cast<timestamp>(segment.offset());
    return first_row;
}

// End index of a row-count-indexed segment: offset + (row_count - 1), as a timestamp value.
IndexValue RowCountIndex::end_value_for_segment(const SegmentInMemory& segment) {
    const auto last_row = static_cast<timestamp>(segment.offset() + (segment.row_count() - 1));
    return last_row;
}

// Start index of a keys segment under row-count indexing: the segment's offset,
// as a timestamp value.
IndexValue RowCountIndex::start_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto first_row = static_cast<timestamp>(segment.offset());
    return first_row;
}

// End index of a keys segment under row-count indexing: offset + (row_count - 1),
// as a timestamp value.
IndexValue RowCountIndex::end_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto last_row = static_cast<timestamp>(segment.offset() + (segment.row_count() - 1));
    return last_row;
}

// A row-count index carries no information taken from the descriptor, so the
// argument is ignored and the default index is returned.
RowCountIndex RowCountIndex::make_from_descriptor(const StreamDescriptor&) const {
    RowCountIndex row_index = RowCountIndex::default_index();
    return row_index;
}

// Start index for an empty-indexed segment: the segment's offset as a NumericIndex.
IndexValue EmptyIndex::start_value_for_segment(const SegmentInMemory& segment) {
    const auto offset_value = static_cast<NumericIndex>(segment.offset());
    return offset_value;
}

// End index for an empty-indexed segment: the segment's offset as a NumericIndex
// (the same value as the start index).
IndexValue EmptyIndex::end_value_for_segment(const SegmentInMemory& segment) {
    const auto offset_value = static_cast<NumericIndex>(segment.offset());
    return offset_value;
}

// Start index of a keys segment under the empty index: the segment's offset as a NumericIndex.
IndexValue EmptyIndex::start_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto offset_value = static_cast<NumericIndex>(segment.offset());
    return offset_value;
}

// End index of a keys segment under the empty index: the segment's offset as a NumericIndex
// (the same value as the start index).
IndexValue EmptyIndex::end_value_for_keys_segment(const SegmentInMemory& segment) {
    const auto offset_value = static_cast<NumericIndex>(segment.offset());
    return offset_value;
}

// Reconstructs the concrete index object described by a stream descriptor.
// TIMESTAMP and STRING indexes are built from the descriptor itself (via make_from_descriptor);
// EMPTY and ROWCOUNT are value-initialised. Any other kind is reported through util::raise_rte.
Index index_type_from_descriptor(const StreamDescriptor& desc) {
    const auto kind = desc.index().proto().kind();
    switch (kind) {
    case IndexDescriptor::EMPTY: {
        return EmptyIndex{};
    }
    case IndexDescriptor::TIMESTAMP: {
        return TimeseriesIndex::make_from_descriptor(desc);
    }
    case IndexDescriptor::STRING: {
        return TableIndex::make_from_descriptor(desc);
    }
    case IndexDescriptor::ROWCOUNT: {
        return RowCountIndex{};
    }
    default:
        util::raise_rte(
            "Data obtained from storage refers to an index type that this build of ArcticDB doesn't understand ({}).",
            static_cast<int>(kind)
        );
    }
}

// Returns the default-constructed index object matching the kind recorded in `desc`.
// An unrecognised kind is reported through util::raise_rte.
Index default_index_type_from_descriptor(const IndexDescriptor::Proto& desc) {
    const auto kind = desc.kind();
    switch (kind) {
    case IndexDescriptor::EMPTY: {
        return EmptyIndex{};
    }
    case IndexDescriptor::TIMESTAMP: {
        return TimeseriesIndex::default_index();
    }
    case IndexDescriptor::STRING: {
        return TableIndex::default_index();
    }
    case IndexDescriptor::ROWCOUNT: {
        return RowCountIndex::default_index();
    }
    default:
        util::raise_rte("Unknown index type {} trying to generate index type", static_cast<int>(kind));
    }
}

// Overload taking the IndexDescriptor wrapper: unwraps its proto and delegates
// to the Proto-based overload.
Index default_index_type_from_descriptor(const IndexDescriptor& desc) {
    Index default_index = default_index_type_from_descriptor(desc.proto());
    return default_index;
}

// Produces the IndexDescriptor for whichever concrete index type `index` currently holds,
// using each alternative's conversion to IndexDescriptor.
IndexDescriptor get_descriptor_from_index(const Index& index) {
    return util::variant_match(
        index,
        [](const auto& concrete_index) -> IndexDescriptor {
            return static_cast<IndexDescriptor>(concrete_index);
        }
    );
}

// Returns the default RowCountIndex wrapped in the Index variant.
Index empty_index() {
    Index result = RowCountIndex::default_index();
    return result;
}

// Explicit instantiations of BaseIndex for every concrete index type. The BaseIndex member
// functions are defined in this translation unit rather than in the header, so they are
// instantiated here for each index type used with the CRTP base.
template class BaseIndex<TimeseriesIndex>;
template class BaseIndex<TableIndex>;
template class BaseIndex<RowCountIndex>;
template class BaseIndex<EmptyIndex>;
}
Loading
Loading