diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 63d8766c70..1bdc311874 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,5 +1,8 @@ # .git-blame-ignore-revs +# Build time improvements +9d2c8ef41589a4e8635b972911d25c0125dc3728 + # Moving implementation from column.hpp to column.cpp # Reordering/removing some inclusions 801cf4b6f0f9ec0a997311fbdc14639537d3bbb6 diff --git a/.github/workflows/analysis_workflow.yml b/.github/workflows/analysis_workflow.yml index f554b641d8..41f260be4d 100644 --- a/.github/workflows/analysis_workflow.yml +++ b/.github/workflows/analysis_workflow.yml @@ -75,7 +75,7 @@ jobs: - name: Install ArcticDB[Testing] shell: bash -el {0} run: | - pip install arcticdb[Testing] + pip install arcticdb[Testing] "protobuf<5" - name: Publish results to Github Pages shell: bash -el {0} diff --git a/.github/workflows/benchmark_commits.yml b/.github/workflows/benchmark_commits.yml index 0c260d2f71..8984a49051 100644 --- a/.github/workflows/benchmark_commits.yml +++ b/.github/workflows/benchmark_commits.yml @@ -13,6 +13,7 @@ jobs: job_type: start benchmark_commit: + timeout-minutes: 1200 needs: [start_ec2_runner] if: | always() && @@ -92,4 +93,4 @@ jobs: with: job_type: stop label: ${{ needs.start_ec2_runner.outputs.label }} - ec2-instance-id: ${{ needs.start_ec2_runner.outputs.ec2-instance-id }} \ No newline at end of file + ec2-instance-id: ${{ needs.start_ec2_runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/persistent_storage.yml b/.github/workflows/persistent_storage.yml index 242e5a8ce1..9b8cbe743d 100644 --- a/.github/workflows/persistent_storage.yml +++ b/.github/workflows/persistent_storage.yml @@ -53,7 +53,7 @@ jobs: - name: Install latest release if: inputs.arcticdb_version == 'latest' run: | - pip install pytest arcticdb + pip install pytest arcticdb "protobuf<5" # Currently, the oldest "supported" release is 3.0.0 # We use this version to test forwards/barwards compatibility @@ -61,7 +61,7 @@ jobs: # Change this value, if we need to support a newer one in the future - name: Install oldest supported release run: | - pip install pytest arcticdb=="3.0.0" + pip install pytest arcticdb=="3.0.0" "protobuf<5" - name: Set persistent storage variables uses: ./.github/actions/set_persistent_storage_env_vars @@ -96,7 +96,7 @@ jobs: - name: Install latest release run: | - pip install pytest arcticdb + pip install pytest arcticdb "protobuf<5" - name: Set persistent storage variables uses: ./.github/actions/set_persistent_storage_env_vars diff --git a/README.md b/README.md index 27387b527c..47c0ef9cb5 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,3 @@ - - - - - -
- -![Alt text](static/happybdayarcticdb.png) - - - -# 🚀 ArcticDB Hits 1,000 GitHub Stars on Our First Open Source Anniversary! 🚀 - -Exciting news as we celebrate two milestones: ArcticDB's first year in the open-source community -and reaching 1,000 GitHub stars! ⭐ - -A huge thank you to all the contributors, supporters, and community members whose involvement has been -pivotal to our success! 🙌 - -
-

diff --git a/cpp/arcticdb/codec/codec-inl.hpp b/cpp/arcticdb/codec/codec-inl.hpp index cb170cf6a8..4adcfc6f76 100644 --- a/cpp/arcticdb/codec/codec-inl.hpp +++ b/cpp/arcticdb/codec/codec-inl.hpp @@ -91,6 +91,7 @@ std::size_t decode_ndarray( util::check(type_desc_tag.data_type() == DataType::EMPTYVAL, "NDArray of type {} should not be of size 0!", datatype_to_str(type_desc_tag.data_type())); + read_bytes = encoding_sizes::data_compressed_size(field); return; } diff --git a/cpp/arcticdb/column_store/chunked_buffer.hpp b/cpp/arcticdb/column_store/chunked_buffer.hpp index 2cc35f6251..4daa91b1ca 100644 --- a/cpp/arcticdb/column_store/chunked_buffer.hpp +++ b/cpp/arcticdb/column_store/chunked_buffer.hpp @@ -271,7 +271,11 @@ class ChunkedBufferImpl { } [[nodiscard]] const uint8_t* data() const { - util::check(blocks_.size() == 1, "Taking a pointer to the beginning of a non-contiguous buffer"); + if (blocks_.empty()) { + return nullptr; + } + internal::check(blocks_.size() == 1, + "Taking a pointer to the beginning of a non-contiguous buffer"); blocks_[0]->magic_.check(); return blocks_[0]->data(); } diff --git a/cpp/arcticdb/column_store/column_data.hpp b/cpp/arcticdb/column_store/column_data.hpp index bba7fa81c5..5a1955e2a7 100644 --- a/cpp/arcticdb/column_store/column_data.hpp +++ b/cpp/arcticdb/column_store/column_data.hpp @@ -213,7 +213,7 @@ struct ColumnData { } // Used to construct [c]end iterators - explicit ColumnDataIterator(ColumnData* parent, typename TDT::DataTypeTag::raw_type* end_ptr): + explicit ColumnDataIterator(ColumnData* parent, RawType* end_ptr): parent_(parent) { data_.ptr_ = end_ptr; } @@ -304,7 +304,7 @@ struct ColumnData { if(!data_->blocks().empty()) { auto block = data_->blocks().at(num_blocks() - 1); auto typed_block_data = next_typed_block(block); - end_ptr = typed_block_data.data() + typed_block_data.row_count(); + end_ptr = const_cast(typed_block_data.data() + typed_block_data.row_count()); } return ColumnDataIterator(this, end_ptr); } diff --git a/cpp/arcticdb/column_store/memory_segment.hpp b/cpp/arcticdb/column_store/memory_segment.hpp index 561feaee0f..73e6d587c7 100644 --- a/cpp/arcticdb/column_store/memory_segment.hpp +++ b/cpp/arcticdb/column_store/memory_segment.hpp @@ -385,10 +385,10 @@ class SegmentInMemory { impl_->set_string_pool(string_pool); } - SegmentInMemory filter(const util::BitSet& filter_bitset, + SegmentInMemory filter(util::BitSet&& filter_bitset, bool filter_down_stringpool=false, bool validate=false) const{ - return SegmentInMemory(impl_->filter(filter_bitset, filter_down_stringpool, validate)); + return SegmentInMemory(impl_->filter(std::move(filter_bitset), filter_down_stringpool, validate)); } /// @see SegmentInMemoryImpl::truncate diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index 0bb0fbe763..8f8acf55df 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -168,9 +168,10 @@ void SegmentInMemoryImpl::drop_column(std::string_view name) { column_map_->erase(name); } -std::shared_ptr SegmentInMemoryImpl::filter(const util::BitSet& filter_bitset, +std::shared_ptr SegmentInMemoryImpl::filter(util::BitSet&& filter_bitset, bool filter_down_stringpool, bool validate) const { + filter_bitset.resize(row_count()); bool is_input_sparse = is_sparse(); auto num_values = filter_bitset.count(); if(num_values == 0) @@ -210,18 +211,19 @@ std::shared_ptr SegmentInMemoryImpl::filter(const util::Bit } else { 
bitset_including_sparse.resize((*column)->row_count()); } - if (bitset_including_sparse.count() == 0) { - // No values are set in the sparse column, skip it - return; - } output_col_idx = output->add_column(field(column.index), bitset_including_sparse.count(), true); final_bitset = &bitset_including_sparse; } else { final_bitset = &filter_bitset; } auto& output_col = output->column(position_t(output_col_idx)); - if (sparse_map) + if (sparse_map) { output_col.opt_sparse_map() = std::make_optional(); + if (final_bitset->count() == 0) { + // No values are set in the sparse column, no more work to do + return; + } + } auto output_ptr = reinterpret_cast(output_col.ptr()); auto input_data = (*column)->data(); @@ -585,7 +587,7 @@ std::vector> SegmentInMemoryImpl::split(siz util::BitSetSizeType end = std::min(start + rows, total_rows); // set_range is close interval on [left, right] bitset.set_range(start, end - 1, true); - output.emplace_back(filter(bitset)); + output.emplace_back(filter(std::move(bitset))); } return output; } diff --git a/cpp/arcticdb/column_store/memory_segment_impl.hpp b/cpp/arcticdb/column_store/memory_segment_impl.hpp index b09b726d56..9747be880b 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.hpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.hpp @@ -758,7 +758,7 @@ class SegmentInMemoryImpl { std::shared_ptr get_output_segment(size_t num_values, bool pre_allocate=true) const; - std::shared_ptr filter(const util::BitSet& filter_bitset, + std::shared_ptr filter(util::BitSet&& filter_bitset, bool filter_down_stringpool=false, bool validate=false) const; diff --git a/cpp/arcticdb/column_store/test/test_memory_segment.cpp b/cpp/arcticdb/column_store/test/test_memory_segment.cpp index 3416242e75..d49e02168c 100644 --- a/cpp/arcticdb/column_store/test/test_memory_segment.cpp +++ b/cpp/arcticdb/column_store/test/test_memory_segment.cpp @@ -481,7 +481,7 @@ TEST(MemSegment, Filter) { filter_bitset.set_bit(retained_row); } - auto filtered_seg = seg.filter(filter_bitset); + auto filtered_seg = seg.filter(std::move(filter_bitset)); for (auto&& [idx, row]: folly::enumerate(filtered_seg)) { ASSERT_EQ(static_cast(retained_rows[idx]), row.scalar_at(0)); diff --git a/cpp/arcticdb/entity/metrics.cpp b/cpp/arcticdb/entity/metrics.cpp index 6f92314fc2..1480e251a5 100644 --- a/cpp/arcticdb/entity/metrics.cpp +++ b/cpp/arcticdb/entity/metrics.cpp @@ -27,45 +27,39 @@ namespace arcticdb { std::shared_ptr PrometheusInstance::instance_; std::once_flag PrometheusInstance::init_flag_; - std::shared_ptr PrometheusConfigInstance::instance(){ - std::call_once(PrometheusConfigInstance::init_flag_, &PrometheusConfigInstance::init); - return PrometheusConfigInstance::instance_; + PrometheusInstance::PrometheusInstance() : configured_(false) { + arcticdb::log::version().debug("PrometheusInstance created"); } - std::shared_ptr PrometheusConfigInstance::instance_; - std::once_flag PrometheusConfigInstance::init_flag_; - - PrometheusInstance::PrometheusInstance() { - - auto cfg = PrometheusConfigInstance::instance()->config; - - if (cfg.prometheus_model() == PrometheusConfigInstance::Proto::PUSH) { - // PUSH MODE - if (cfg.instance().empty() || cfg.host().empty() || cfg.port().empty() || cfg.job_name().empty()) { - util::raise_rte( "Invalid Push PrometheusConfig {}", arcticdb::util::format(cfg)); - } + void PrometheusInstance::configure(const MetricsConfig& config, const bool reconfigure) { + if (configured_ && !reconfigure) { + arcticdb::log::version().warn("Prometheus already configured"); + 
return; + } + + cfg_ = config; + if (cfg_.model_ == MetricsConfig::Model::PUSH) { // IMP: This is the GROUPING_KEY - every push overwrites the previous grouping key auto labels = prometheus::Gateway::GetInstanceLabel(getHostName()); - mongo_instance_ = cfg.instance(); + mongo_instance_ = cfg_.instance; labels.try_emplace(MONGO_INSTANCE_LABEL, mongo_instance_); - labels.try_emplace(PROMETHEUS_ENV_LABEL, cfg.prometheus_env()); - gateway_= std::make_shared(cfg.host(), cfg.port(), cfg.job_name(), labels); + labels.try_emplace(PROMETHEUS_ENV_LABEL, cfg_.prometheus_env); + gateway_= std::make_shared(cfg_.host, cfg_.port, cfg_.job_name, labels); registry_ = std::make_shared(); gateway_->RegisterCollectable(registry_); - arcticdb::log::version().info("Prometheus Push created with settings {}", arcticdb::util::format(cfg)); + arcticdb::log::version().info("Prometheus Push created with settings {}", cfg_); - } else if (cfg.prometheus_model() == PrometheusConfigInstance::Proto::WEB) { - - // WEB SERVER MODE - if (cfg.port().empty()) { - util::raise_rte( "PrometheusConfig web mode port not set {}", arcticdb::util::format(cfg)); - } + } else if (cfg_.model_ == MetricsConfig::Model::PULL) { // create an http server ie "http://hostname:"+port()+"/metrics" - std::string hostname = getHostName(); - std::string endpoint = hostname + ":" + cfg.port(); + std::string endpoint = cfg_.host + ":" + cfg_.port; + + if (exposer_.use_count() > 0) { + exposer_->RemoveCollectable(registry_, "/metrics"); + exposer_.reset(); + } // default to 2 threads exposer_ = std::make_shared(endpoint, 2); @@ -79,8 +73,10 @@ namespace arcticdb { arcticdb::log::version().info("Prometheus endpoint created on {}/metrics", endpoint); } else { - arcticdb::log::version().info("Prometheus not configured {}", arcticdb::util::format(cfg)); + arcticdb::log::version().info("Prometheus not configured {}", cfg_); } + + configured_ = true; } // new mechanism, labels at runtime diff --git a/cpp/arcticdb/entity/metrics.hpp b/cpp/arcticdb/entity/metrics.hpp index a370f75032..b3366048b0 100644 --- a/cpp/arcticdb/entity/metrics.hpp +++ b/cpp/arcticdb/entity/metrics.hpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace arcticdb { @@ -34,6 +35,42 @@ const std::string PROMETHEUS_ENV_LABEL = "env"; const int SUMMARY_MAX_AGE = 30; const int SUMMARY_AGE_BUCKETS = 5; +class MetricsConfig { +public: + enum class Model { + NO_INIT, + PUSH, + PULL + }; + MetricsConfig() : model_(Model::NO_INIT) {} + + MetricsConfig(const std::string& host, + const std::string& port, + const std::string& job_name, + const std::string& instance, + const std::string& prometheus_env, + const Model model) + : host(host) + , port(port) + , job_name(job_name) + , instance(instance) + , prometheus_env(prometheus_env) + , model_(model) { + util::check(!host.empty(), "MetricsConfig: host is empty"); + util::check(!port.empty(), "MetricsConfig: port is empty"); + util::check(!job_name.empty(), "MetricsConfig: job_name is empty"); + util::check(!instance.empty(), "MetricsConfig: instance is empty"); + util::check(!prometheus_env.empty(), "MetricsConfig: instance is empty"); + util::check(!prometheus_env.empty(), "MetricsConfig: prometheus_env is empty"); + } + + std::string host; + std::string port; + std::string job_name; + std::string instance; + std::string prometheus_env; + Model model_; +}; class PrometheusInstance { public: @@ -67,6 +104,10 @@ class PrometheusInstance { int push(); + void configure(const MetricsConfig& config, const bool reconfigure = false); + + 
MetricsConfig cfg_; + private: struct HistogramInfo { @@ -88,20 +129,8 @@ class PrometheusInstance { // push gateway std::string mongo_instance_; std::shared_ptr gateway_; -}; + bool configured_; -class PrometheusConfigInstance { -public: - static std::shared_ptr instance(); - - using Proto = arcticdb::proto::utils::PrometheusConfig; - Proto config; - static std::shared_ptr instance_; - static std::once_flag init_flag_; - - static void init(){ - instance_ = std::make_shared(); - } }; inline void log_prometheus_gauge(const std::string& metric_name, const std::string& metric_desc, size_t val) { @@ -121,3 +150,16 @@ inline void log_prometheus_counter(const std::string& metric_name, const std::st } } // Namespace arcticdb + +template<> +struct fmt::formatter { + + template + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template + auto format(const arcticdb::MetricsConfig k, FormatContext &ctx) const { + return fmt::format_to(ctx.out(), "MetricsConfig: host={}, port={}, job_name={}, instance={}, prometheus_env={}, model={}", + k.host, k.port, k.job_name, k.instance, k.prometheus_env, static_cast(k.model_)); + } +}; \ No newline at end of file diff --git a/cpp/arcticdb/entity/performance_tracing.cpp b/cpp/arcticdb/entity/performance_tracing.cpp index 5d16bb8d16..c2587820a2 100644 --- a/cpp/arcticdb/entity/performance_tracing.cpp +++ b/cpp/arcticdb/entity/performance_tracing.cpp @@ -8,9 +8,9 @@ #include #include -#include #include #include +#include std::shared_ptr RemoteryInstance::instance(){ std::call_once(RemoteryInstance::init_flag_, &RemoteryInstance::init); @@ -63,7 +63,7 @@ namespace arcticdb::detail { std::unordered_map fqn_by_task_name_; std::string thread_name_; - ThreadNameCache():fqn_by_task_name_(),thread_name_(folly::getCurrentThreadName().value_or("Thread")){} + ThreadNameCache():fqn_by_task_name_(),thread_name_(fmt::format("{}", arcticdb::get_thread_id())){} std::string_view get_thread_name(const char * task_name){ if(auto fqn_it = fqn_by_task_name_.find(task_name); fqn_it != fqn_by_task_name_.end()){ diff --git a/cpp/arcticdb/entity/type_utils.cpp b/cpp/arcticdb/entity/type_utils.cpp index 33b0d0a6b9..83df12de1d 100644 --- a/cpp/arcticdb/entity/type_utils.cpp +++ b/cpp/arcticdb/entity/type_utils.cpp @@ -128,9 +128,7 @@ namespace arcticdb { return std::nullopt; } - return entity::TypeDescriptor{ - combine_data_type(slice_value_type(target_type), target_size), - target.dimension()}; + return target; } std::optional has_valid_type_promotion( @@ -148,6 +146,36 @@ namespace arcticdb { if (!maybe_common_type) { maybe_common_type = has_valid_type_promotion(right, left); } + // has_valid_type_promotion checks if the second type can represent all values of the first type + // We also want to handle cases where there is a third type that can represent both + // In practice, this is only the case when there is one signed and one unsigned integer right now + if (!maybe_common_type && + left.dimension() == right.dimension() && + is_integer_type(left.data_type()) && + is_integer_type(right.data_type())) { + auto dimension = left.dimension(); + auto left_type = left.data_type(); + auto right_type = right.data_type(); + auto left_size = slice_bit_size(left_type); + auto right_size = slice_bit_size(right_type); + // To get here we must have one signed and one unsigned type, with the width of the signed type <= the width of the unsigned type + internal::check(is_signed_type(left_type) ^ is_signed_type(right_type), + "Expected one signed and one unsigned int in 
has_valid_common_type"); + if (is_signed_type(left_type)) { + internal::check(left_size <= right_size, + "Expected left_size <= right_size in has_valid_common_type"); + } else { + // is_signed_type(right_type) + internal::check(right_size <= left_size, + "Expected right_size <= left_size in has_valid_common_type"); + } + auto target_size = entity::SizeBits(uint8_t(std::max(left_size, right_size)) + 1); + if (target_size < entity::SizeBits::COUNT) { + maybe_common_type = entity::TypeDescriptor{ + combine_data_type(entity::ValueType::INT, target_size), + dimension}; + } + } return maybe_common_type; } diff --git a/cpp/arcticdb/entity/types-inl.hpp b/cpp/arcticdb/entity/types-inl.hpp index 3e7b1e87da..8c939a0aed 100644 --- a/cpp/arcticdb/entity/types-inl.hpp +++ b/cpp/arcticdb/entity/types-inl.hpp @@ -41,7 +41,7 @@ constexpr auto visit_dim(DataType dt, Callable &&c) { DT_CASE(EMPTYVAL) DT_CASE(BOOL_OBJECT8) #undef DT_CASE - default: util::raise_rte("Invalid dtype '{}' in visit dim", datatype_to_str(dt)); + default: util::raise_rte("Invalid dtype {}:{} - '{}' in visit dim", int(slice_value_type(dt)), int(slice_bit_size(dt)), datatype_to_str(dt)); } } @@ -69,7 +69,7 @@ auto visit_type(DataType dt, Callable &&c) { DT_CASE(EMPTYVAL) DT_CASE(BOOL_OBJECT8) #undef DT_CASE - default: util::raise_rte("Invalid dtype '{}' in visit type", datatype_to_str(dt)); + default: util::raise_rte("Invalid dtype {}:{} '{}' in visit type", int(slice_value_type(dt)), int(slice_bit_size(dt)), datatype_to_str(dt)); } } diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp index 65632df494..b158ae91a4 100644 --- a/cpp/arcticdb/entity/types.hpp +++ b/cpp/arcticdb/entity/types.hpp @@ -303,9 +303,9 @@ constexpr ValueType get_value_type(char specifier) noexcept { // TODO: for the support of Pandas>=2.0, convert any `datetime` to `datetime64[ns]` before-hand and do not // rely uniquely on the resolution-less 'M' specifier if it this doable. 
case 'M': return ValueType::NANOSECONDS_UTC; // datetime // numpy doesn't support the buffer protocol for datetime64 - case 'U': return ValueType::UTF8_FIXED; // Unicode + case 'U': return ValueType::UTF8_FIXED; // Unicode fixed-width case 'S': return ValueType::ASCII_FIXED; // (byte-)string - case 'O': return ValueType::BYTES; // Fishy, an actual type might be better + case 'O': return ValueType::UTF_DYNAMIC; // Unicode dynamic width default: return ValueType::UNKNOWN_VALUE_TYPE; // Unknown } diff --git a/cpp/arcticdb/pipeline/filter_segment.hpp b/cpp/arcticdb/pipeline/filter_segment.hpp index 142d132bd0..d9069830aa 100644 --- a/cpp/arcticdb/pipeline/filter_segment.hpp +++ b/cpp/arcticdb/pipeline/filter_segment.hpp @@ -13,10 +13,10 @@ namespace arcticdb { inline SegmentInMemory filter_segment(const SegmentInMemory& input, - const util::BitSet& filter_bitset, + util::BitSet&& filter_bitset, bool filter_down_stringpool=false, bool validate=false) { - return input.filter(filter_bitset, filter_down_stringpool, validate); + return input.filter(std::move(filter_bitset), filter_down_stringpool, validate); } inline std::vector partition_segment(const SegmentInMemory& input, diff --git a/cpp/arcticdb/pipeline/frame_utils.hpp b/cpp/arcticdb/pipeline/frame_utils.hpp index d95b5bf8ef..c303c67c3d 100644 --- a/cpp/arcticdb/pipeline/frame_utils.hpp +++ b/cpp/arcticdb/pipeline/frame_utils.hpp @@ -244,7 +244,7 @@ std::optional aggregator_set_data( Column arr_col{column_type_descriptor, true}; for (auto en = values_bitset.first(); en < values_bitset.end(); ++en) { const auto arr_pos = *en; - const auto row_tensor = convert::obj_to_tensor(ptr_data[arr_pos]); + const auto row_tensor = convert::obj_to_tensor(ptr_data[arr_pos], false); const auto row_type_descriptor = TypeDescriptor{row_tensor.data_type(), Dimension::Dim1}; const std::optional& common_type = has_valid_common_type(row_type_descriptor, secondary_type); normalization::check( diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp index a7d63f3407..7fc44cd37c 100644 --- a/cpp/arcticdb/pipeline/read_frame.cpp +++ b/cpp/arcticdb/pipeline/read_frame.cpp @@ -626,7 +626,9 @@ class StringReducer { column_width_(alloc_width), dest_buffer_(ChunkedBuffer::presized(frame_.row_count() * column_width_)), dst_(dest_buffer_.data()) { - std::memset(dest_buffer_.data(), 0, dest_buffer_.bytes()); + if (dest_buffer_.bytes() > 0) { + std::memset(dest_buffer_.data(), 0, dest_buffer_.bytes()); + } } virtual void finalize() { diff --git a/cpp/arcticdb/processing/aggregation.cpp b/cpp/arcticdb/processing/aggregation.cpp index b52dd4b971..2d27449960 100644 --- a/cpp/arcticdb/processing/aggregation.cpp +++ b/cpp/arcticdb/processing/aggregation.cpp @@ -15,15 +15,16 @@ namespace arcticdb void MinMaxAggregatorData::aggregate(const ColumnWithStrings& input_column) { details::visit_type(input_column.column_->type().data_type(), [&] (auto col_tag) { using type_info = ScalarTypeInfo; + using RawType = typename type_info::RawType; if constexpr(!is_sequence_type(type_info::data_type)) { Column::for_each(*input_column.column_, [this](auto value) { - const auto& curr = static_cast(value); + const auto& curr = static_cast(value); if (ARCTICDB_UNLIKELY(!min_.has_value())) { min_ = std::make_optional(curr, type_info::data_type); max_ = std::make_optional(curr, type_info::data_type); } else { - min_->set(std::min(min_->get(), curr)); - max_->set(std::max(max_->get(), curr)); + min_->set(std::min(min_->get(), curr)); + max_->set(std::max(max_->get(), 
curr)); } }); } else { @@ -168,21 +169,24 @@ void SumAggregatorData::aggregate(const std::optional& input_ data_type_ = DataType::FLOAT64; } details::visit_type(*data_type_, [&input_column, unique_values, &groups, this] (auto global_tag) { - using GlobalInputType = decltype(global_tag); - if constexpr(!is_sequence_type(GlobalInputType::DataTypeTag::data_type)) { - using GlobalTypeDescriptorTag = typename OutputType::type; - using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type; - aggregated_.resize(sizeof(GlobalRawType)* unique_values); - auto out_ptr = reinterpret_cast(aggregated_.data()); + using global_type_info = ScalarTypeInfo; + using RawType = typename global_type_info::RawType; + if constexpr(!is_sequence_type(global_type_info::data_type)) { + aggregated_.resize(sizeof(RawType) * unique_values); + auto out_ptr = reinterpret_cast(aggregated_.data()); if (input_column.has_value()) { details::visit_type(input_column->column_->type().data_type(), [&input_column, &groups, &out_ptr] (auto col_tag) { using col_type_info = ScalarTypeInfo; if constexpr(!is_sequence_type(col_type_info::data_type)) { Column::for_each_enumerated(*input_column->column_, [&out_ptr, &groups](auto enumerating_it) { - if constexpr (std::is_same_v) { - out_ptr[groups[enumerating_it.idx()]] |= GlobalRawType(enumerating_it.value()); + if constexpr (is_bool_type(global_type_info::data_type)) { + out_ptr[groups[enumerating_it.idx()]] |= RawType(enumerating_it.value()); + } else if constexpr (is_floating_point_type(col_type_info::data_type)) { + if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) { + out_ptr[groups[enumerating_it.idx()]] += RawType(enumerating_it.value()); + } } else { - out_ptr[groups[enumerating_it.idx()]] += GlobalRawType(enumerating_it.value()); + out_ptr[groups[enumerating_it.idx()]] += RawType(enumerating_it.value()); } }); } else { @@ -198,8 +202,8 @@ SegmentInMemory SumAggregatorData::finalize(const ColumnName& output_column_name SegmentInMemory res; if(!aggregated_.empty()) { details::visit_type(*data_type_, [that=this, &res, &output_column_name, unique_values] (auto col_tag) { - using RawType = typename decltype(col_tag)::DataTypeTag::raw_type; - that->aggregated_.resize(sizeof(RawType)* unique_values); + using col_type_info = ScalarTypeInfo; + that->aggregated_.resize(sizeof(typename col_type_info::RawType)* unique_values); auto col = std::make_shared(make_scalar_type(that->data_type_.value()), unique_values, true, false); memcpy(col->ptr(), that->aggregated_.data(), that->aggregated_.size()); res.add_column(scalar_field(that->data_type_.value(), output_column_name.value), col); @@ -248,10 +252,9 @@ namespace ) { if(data_type_.has_value() && *data_type_ != DataType::EMPTYVAL && input_column.has_value()) { details::visit_type(*data_type_, [&aggregated_, &input_column, unique_values, &groups] (auto global_tag) { - using GlobalInputType = decltype(global_tag); - if constexpr(!is_sequence_type(GlobalInputType::DataTypeTag::data_type)) { - using GlobalTypeDescriptorTag = typename OutputType::type; - using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type; + using global_type_info = ScalarTypeInfo; + using GlobalRawType = typename global_type_info::RawType; + if constexpr(!is_sequence_type(global_type_info::data_type)) { using MaybeValueType = MaybeValue; auto prev_size = aggregated_.size() / sizeof(MaybeValueType); aggregated_.resize(sizeof(MaybeValueType) * unique_values); @@ -259,15 +262,16 @@ namespace std::fill(out_ptr + prev_size, out_ptr + 
unique_values, MaybeValueType{}); details::visit_type(input_column->column_->type().data_type(), [&input_column, &groups, &out_ptr] (auto col_tag) { using col_type_info = ScalarTypeInfo; + using ColRawType = typename col_type_info::RawType; if constexpr(!is_sequence_type(col_type_info::data_type)) { Column::for_each_enumerated(*input_column->column_, [&groups, &out_ptr](auto enumerating_it) { auto& val = out_ptr[groups[enumerating_it.idx()]]; - if constexpr(std::is_floating_point_v) { + if constexpr(std::is_floating_point_v) { const auto& curr = GlobalRawType(enumerating_it.value()); - if (!val.written_ || std::isnan(static_cast(val.value_))) { + if (!val.written_ || std::isnan(static_cast(val.value_))) { val.value_ = curr; val.written_ = true; - } else if (!std::isnan(static_cast(curr))) { + } else if (!std::isnan(static_cast(curr))) { if constexpr(T == Extremum::MAX) { val.value_ = std::max(val.value_, curr); } else { @@ -302,36 +306,32 @@ namespace ) { SegmentInMemory res; if(!aggregated_.empty()) { - if(dynamic_schema) { - details::visit_type(*data_type_, [&aggregated_, &res, &output_column_name, unique_values] (auto col_tag) { - using RawType = typename decltype(col_tag)::DataTypeTag::raw_type; - using MaybeValueType = MaybeValue; + constexpr auto dynamic_schema_data_type = DataType::FLOAT64; + using DynamicSchemaTDT = ScalarTagType>; + auto col = std::make_shared(make_scalar_type(dynamic_schema ? dynamic_schema_data_type: data_type_.value()), unique_values, true, false); + auto column_data = col->data(); + col->set_row_data(unique_values - 1); + res.add_column(scalar_field(dynamic_schema ? dynamic_schema_data_type : data_type_.value(), output_column_name.value), col); + details::visit_type(*data_type_, [&aggregated_, &column_data, unique_values, dynamic_schema] (auto col_tag) { + using col_type_info = ScalarTypeInfo; + using MaybeValueType = MaybeValue; + if(dynamic_schema) { auto prev_size = aggregated_.size() / sizeof(MaybeValueType); auto new_size = sizeof(MaybeValueType) * unique_values; aggregated_.resize(new_size); - auto in_ptr = reinterpret_cast(aggregated_.data()); + auto in_ptr = reinterpret_cast(aggregated_.data()); std::fill(in_ptr + prev_size, in_ptr + unique_values, MaybeValueType{}); - auto col = std::make_shared(make_scalar_type(DataType::FLOAT64), unique_values, true, false); - auto out_ptr = reinterpret_cast(col->ptr()); - for(auto i = 0u; i < unique_values; ++i, ++in_ptr, ++out_ptr) { - *out_ptr = in_ptr->written_ ? static_cast(in_ptr->value_) : std::numeric_limits::quiet_NaN(); } - - col->set_row_data(unique_values - 1); - res.add_column(scalar_field(DataType::FLOAT64, output_column_name.value), col); - }); - } else { - details::visit_type(*data_type_, [&aggregated_, &data_type_, &res, output_column_name, unique_values] (auto col_tag) { - using RawType = typename decltype(col_tag)::DataTypeTag::raw_type; - auto col = std::make_shared(make_scalar_type(data_type_.value()), unique_values, true, false); - const auto* in_ptr = reinterpret_cast*>(aggregated_.data()); - auto out_ptr = reinterpret_cast(col->ptr()); - for(auto i = 0u; i < unique_values; ++i, ++in_ptr, ++out_ptr) { - *out_ptr = in_ptr->value_; + for (auto it = column_data.begin(); it != column_data.end(); ++it, ++in_ptr) { + *it = in_ptr->written_ ? 
static_cast(in_ptr->value_) + : std::numeric_limits::quiet_NaN(); } - col->set_row_data(unique_values - 1); - res.add_column(scalar_field(data_type_.value(), output_column_name.value), col); - }); - } + } else { + auto in_ptr = reinterpret_cast(aggregated_.data()); + for (auto it = column_data.begin(); it != column_data.end(); ++it, ++in_ptr) { + *it = in_ptr->value_; + } + } + }); } return res; } @@ -387,8 +387,15 @@ void MeanAggregatorData::aggregate(const std::optional& input if constexpr(!is_sequence_type(col_type_info::data_type)) { Column::for_each_enumerated(*input_column->column_, [&groups, this](auto enumerating_it) { auto& fraction = fractions_[groups[enumerating_it.idx()]]; - fraction.numerator_ += double(enumerating_it.value()); - ++fraction.denominator_; + if constexpr ((is_floating_point_type(col_type_info ::data_type))) { + if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) { + fraction.numerator_ += double(enumerating_it.value()); + ++fraction.denominator_; + } + } else { + fraction.numerator_ += double(enumerating_it.value()); + ++fraction.denominator_; + } }); } else { util::raise_rte("String aggregations not currently supported"); @@ -401,14 +408,13 @@ SegmentInMemory MeanAggregatorData::finalize(const ColumnName& output_column_nam SegmentInMemory res; if(!fractions_.empty()) { fractions_.resize(unique_values); - auto pos = res.add_column(scalar_field(DataType::FLOAT64, output_column_name.value), fractions_.size(), true); - auto& column = res.column(pos); - auto ptr = reinterpret_cast(column.ptr()); - column.set_row_data(fractions_.size() - 1); - - for (auto idx = 0u; idx < fractions_.size(); ++idx) { - ptr[idx] = fractions_[idx].to_double(); - } + auto col = std::make_shared(make_scalar_type(DataType::FLOAT64), fractions_.size(), true, false); + auto column_data = col->data(); + std::transform(fractions_.cbegin(), fractions_.cend(), column_data.begin>>(), [](auto fraction) { + return fraction.to_double(); + }); + col->set_row_data(fractions_.size() - 1); + res.add_column(scalar_field(DataType::FLOAT64, output_column_name.value), col); } return res; } @@ -429,7 +435,7 @@ void CountAggregatorData::aggregate(const std::optional& inpu using col_type_info = ScalarTypeInfo; Column::for_each_enumerated(*input_column->column_, [&groups, this](auto enumerating_it) { if constexpr (is_floating_point_type(col_type_info::data_type)) { - if (!std::isnan(static_cast(enumerating_it.value()))) { + if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) { auto& val = aggregated_[groups[enumerating_it.idx()]]; ++val; } diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 5d38a013c4..a4889f5e32 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -267,9 +267,9 @@ Composite FilterClause::process( proc.set_expression_context(expression_context_); auto variant_data = proc.get(expression_context_->root_node_name_); util::variant_match(variant_data, - [&proc, &output, this](const util::BitSet& bitset) { + [&proc, &output, this](util::BitSet& bitset) { if (bitset.count() > 0) { - proc.apply_filter(bitset, optimisation_); + proc.apply_filter(std::move(bitset), optimisation_); output.push_back(push_entities(component_manager_, std::move(proc))); } else { log::version().debug("Filter returned empty result"); @@ -400,103 +400,94 @@ Composite AggregationClause::process(Composite&& entity_id auto partitioning_column = proc.get(ColumnName(grouping_column_)); if (std::holds_alternative(partitioning_column)) { 
ColumnWithStrings col = std::get(partitioning_column); - entity::details::visit_type(col.column_->type().data_type(), - [&proc_=proc, &grouping_map, &next_group_id, &aggregators_data, &string_pool, &col, - &num_unique, &grouping_data_type, this](auto data_type_tag) { - using DataTypeTagType = decltype(data_type_tag); - using RawType = typename DataTypeTagType::raw_type; - constexpr auto data_type = DataTypeTagType::data_type; - grouping_data_type = data_type; - std::vector row_to_group; - row_to_group.reserve(col.column_->row_count()); - auto input_data = col.column_->data(); - auto hash_to_group = grouping_map.get(); - // For string grouping columns, keep a local map within this ProcessingUnit - // from offsets to groups, to avoid needless calls to col.string_at_offset and - // string_pool->get - // This could be slower in cases where there aren't many repeats in string - // grouping columns. Maybe track hit ratio of finds and stop using it if it is - // too low? - // Tested with 100,000,000 row dataframe with 100,000 unique values in the grouping column. Timings: - // 11.14 seconds without caching - // 11.01 seconds with caching - // Not worth worrying about right now - ankerl::unordered_dense::map offset_to_group; - - const bool is_sparse = col.column_->is_sparse(); - using optional_iter_type = std::optionalfirst())>; - optional_iter_type iter = std::nullopt; - size_t previous_value_index = 0; - constexpr size_t missing_value_group_id = 0; - - if (is_sparse) - { - iter = std::make_optional(input_data.bit_vector()->first()); - // We use 0 for the missing value group id - next_group_id++; - } - - while (auto block = input_data.next>()) { - const auto row_count = block->row_count(); - auto ptr = block->data(); - for (size_t i = 0; i < row_count; ++i, ++ptr) { - RawType val; - if constexpr(is_sequence_type(data_type)) { - auto offset = *ptr; - if (auto it = offset_to_group.find(offset); it != offset_to_group.end()) { - val = it->second; - } else { - std::optional str = col.string_at_offset(offset); - if (str.has_value()) { - val = string_pool->get(*str, true).offset(); - } else { - val = offset; - } - RawType val_copy(val); - offset_to_group.insert(std::make_pair(std::forward(offset), std::forward(val_copy))); - } + details::visit_type(col.column_->type().data_type(), + [&proc_=proc, &grouping_map, &next_group_id, &aggregators_data, &string_pool, &col, + &num_unique, &grouping_data_type, this](auto data_type_tag) { + using col_type_info = ScalarTypeInfo; + grouping_data_type = col_type_info::data_type; + std::vector row_to_group; + row_to_group.reserve(col.column_->row_count()); + auto hash_to_group = grouping_map.get(); + // For string grouping columns, keep a local map within this ProcessingUnit + // from offsets to groups, to avoid needless calls to col.string_at_offset and + // string_pool->get + // This could be slower in cases where there aren't many repeats in string + // grouping columns. Maybe track hit ratio of finds and stop using it if it is + // too low? + // Tested with 100,000,000 row dataframe with 100,000 unique values in the grouping column. 
Timings: + // 11.14 seconds without caching + // 11.01 seconds with caching + // Not worth worrying about right now + ankerl::unordered_dense::map offset_to_group; + + const bool is_sparse = col.column_->is_sparse(); + if (is_sparse && next_group_id == 0) { + // We use 0 for the missing value group id + ++next_group_id; + } + ssize_t previous_value_index = 0; + constexpr size_t missing_value_group_id = 0; + + Column::for_each_enumerated( + *col.column_, + [&](auto enumerating_it) { + typename col_type_info::RawType val; + if constexpr(is_sequence_type(col_type_info::data_type)) { + auto offset = enumerating_it.value(); + if (auto it = offset_to_group.find(offset); it != offset_to_group.end()) { + val = it->second; } else { - val = *ptr; - } - if (is_sparse) { - for (size_t j = previous_value_index; j != *(iter.value()); ++j) { - row_to_group.emplace_back(missing_value_group_id); + std::optional str = col.string_at_offset(offset); + if (str.has_value()) { + val = string_pool->get(*str, true).offset(); + } else { + val = offset; } - previous_value_index = *(iter.value()) + 1; - ++(iter.value()); + typename col_type_info::RawType val_copy(val); + offset_to_group.insert(std::make_pair(std::forward(offset), std::forward(val_copy))); } + } else { + val = enumerating_it.value(); + } - if (auto it = hash_to_group->find(val); it == hash_to_group->end()) { - row_to_group.emplace_back(next_group_id); - auto group_id = next_group_id++; - hash_to_group->insert(std::make_pair(std::forward(val), std::forward(group_id))); - } else { - row_to_group.emplace_back(it->second); + if (is_sparse) { + for (auto j = previous_value_index; j != enumerating_it.idx(); ++j) { + row_to_group.emplace_back(missing_value_group_id); } + previous_value_index = enumerating_it.idx() + 1; } - } - - // Marking all the last non-represented values as missing. - for (size_t i = row_to_group.size(); i <= size_t(col.column_->last_row()); ++i) { - row_to_group.emplace_back(missing_value_group_id); - } - num_unique = next_group_id; - util::check(num_unique != 0, "Got zero unique values"); - for (auto agg_data: folly::enumerate(aggregators_data)) { - auto input_column_name = aggregators_.at(agg_data.index).get_input_column_name(); - auto input_column = proc_.get(input_column_name); - std::optional opt_input_column; - if (std::holds_alternative(input_column)) { - auto column_with_strings = std::get(input_column); - // Empty columns don't contribute to aggregations - if (!is_empty_type(column_with_strings.column_->type().data_type())) { - opt_input_column.emplace(std::move(column_with_strings)); - } + if (auto it = hash_to_group->find(val); it == hash_to_group->end()) { + row_to_group.emplace_back(next_group_id); + auto group_id = next_group_id++; + hash_to_group->insert(std::make_pair(std::forward(val), std::forward(group_id))); + } else { + row_to_group.emplace_back(it->second); } - agg_data->aggregate(opt_input_column, row_to_group, num_unique); } - }); + ); + + // Marking all the last non-represented values as missing. 
+ for (size_t i = row_to_group.size(); i <= size_t(col.column_->last_row()); ++i) { + row_to_group.emplace_back(missing_value_group_id); + } + + num_unique = next_group_id; + util::check(num_unique != 0, "Got zero unique values"); + for (auto agg_data: folly::enumerate(aggregators_data)) { + auto input_column_name = aggregators_.at(agg_data.index).get_input_column_name(); + auto input_column = proc_.get(input_column_name); + std::optional opt_input_column; + if (std::holds_alternative(input_column)) { + auto column_with_strings = std::get(input_column); + // Empty columns don't contribute to aggregations + if (!is_empty_type(column_with_strings.column_->type().data_type())) { + opt_input_column.emplace(std::move(column_with_strings)); + } + } + agg_data->aggregate(opt_input_column, row_to_group, num_unique); + } + }); } else { util::raise_rte("Expected single column from expression"); } @@ -504,26 +495,26 @@ Composite AggregationClause::process(Composite&& entity_id SegmentInMemory seg; auto index_col = std::make_shared(make_scalar_type(grouping_data_type), grouping_map.size(), true, false); - auto index_pos = seg.add_column(scalar_field(grouping_data_type, grouping_column_), index_col); + seg.add_column(scalar_field(grouping_data_type, grouping_column_), index_col); seg.descriptor().set_index(IndexDescriptor(0, IndexDescriptor::ROWCOUNT)); - entity::details::visit_type(grouping_data_type, [&seg, &grouping_map, index_pos](auto data_type_tag) { - using DataTypeTagType = decltype(data_type_tag); - using RawType = typename DataTypeTagType::raw_type; - auto hashes = grouping_map.get(); - auto index_ptr = reinterpret_cast(seg.column(index_pos).ptr()); - std::vector> elements; + details::visit_type(grouping_data_type, [&grouping_map, &index_col](auto data_type_tag) { + using col_type_info = ScalarTypeInfo; + auto hashes = grouping_map.get(); + std::vector> elements; for (const auto &hash : *hashes) elements.push_back(std::make_pair(hash.first, hash.second)); std::sort(std::begin(elements), std::end(elements), - [](const std::pair &l, const std::pair &r) { + [](const std::pair &l, const std::pair &r) { return l.second < r.second; }); - for (const auto &element : elements) - *index_ptr++ = element.first; + auto column_data = index_col->data(); + std::transform(elements.cbegin(), elements.cend(), column_data.begin(), [](const auto& element) { + return element.first; + }); }); index_col->set_row_data(grouping_map.size() - 1); diff --git a/cpp/arcticdb/processing/processing_unit.cpp b/cpp/arcticdb/processing/processing_unit.cpp index f0f98c4eed..2430022b6c 100644 --- a/cpp/arcticdb/processing/processing_unit.cpp +++ b/cpp/arcticdb/processing/processing_unit.cpp @@ -10,7 +10,7 @@ namespace arcticdb { void ProcessingUnit::apply_filter( - const util::BitSet& bitset, + util::BitSet&& bitset, PipelineOptimisation optimisation) { internal::check(segments_.has_value() && row_ranges_.has_value() && col_ranges_.has_value(), "ProcessingUnit::apply_filter requires all of segments, row_ranges, and col_ranges to be present"); @@ -18,7 +18,7 @@ void ProcessingUnit::apply_filter( for (auto&& [idx, segment]: folly::enumerate(*segments_)) { auto seg = filter_segment(*segment, - bitset, + std::move(bitset), filter_down_stringpool); auto num_rows = seg.is_null() ? 
0 : seg.row_count(); row_ranges_->at(idx) = std::make_shared(row_ranges_->at(idx)->first, row_ranges_->at(idx)->first + num_rows); diff --git a/cpp/arcticdb/processing/processing_unit.hpp b/cpp/arcticdb/processing/processing_unit.hpp index b9a499bd51..eb7f6494d6 100644 --- a/cpp/arcticdb/processing/processing_unit.hpp +++ b/cpp/arcticdb/processing/processing_unit.hpp @@ -93,7 +93,7 @@ namespace arcticdb { return *this; } - void apply_filter(const util::BitSet& bitset, PipelineOptimisation optimisation); + void apply_filter(util::BitSet&& bitset, PipelineOptimisation optimisation); void truncate(size_t start_row, size_t end_row); diff --git a/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp b/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp index 39f94cd552..4c0689a328 100644 --- a/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp +++ b/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp @@ -148,3 +148,22 @@ TEST(HasValidTypePromotion, EverythingToEmpty) { } } } + +TEST(HasValidCommonType, UintInt) { + using namespace arcticdb; + using namespace arcticdb::entity; + TypeDescriptor uint32(ValueType::UINT, SizeBits::S32, Dimension::Dim0); + TypeDescriptor uint64(ValueType::UINT, SizeBits::S64, Dimension::Dim0); + TypeDescriptor int16(ValueType::INT, SizeBits::S16, Dimension::Dim0); + TypeDescriptor int32(ValueType::INT, SizeBits::S32, Dimension::Dim0); + TypeDescriptor int64(ValueType::INT, SizeBits::S64, Dimension::Dim0); + ASSERT_EQ(has_valid_common_type(uint32, int16), int64); + ASSERT_EQ(has_valid_common_type(uint32, int32), int64); + ASSERT_EQ(has_valid_common_type(uint32, int64), int64); + EXPECT_FALSE(has_valid_common_type(uint64, int64)); + // has_valid_common_type should be symmetric, these assertions are as above with the arguments reversed + ASSERT_EQ(has_valid_common_type(int16, uint32), int64); + ASSERT_EQ(has_valid_common_type(int32, uint32), int64); + ASSERT_EQ(has_valid_common_type(int64, uint32), int64); + EXPECT_FALSE(has_valid_common_type(int64, uint64)); +} diff --git a/cpp/arcticdb/python/python_module.cpp b/cpp/arcticdb/python/python_module.cpp index 7596ee1583..cb8be77a8d 100644 --- a/cpp/arcticdb/python/python_module.cpp +++ b/cpp/arcticdb/python/python_module.cpp @@ -28,10 +28,9 @@ #include #include #include +#include #include -#include -#include #include #include @@ -175,7 +174,7 @@ void register_termination_handler() { std::rethrow_exception(eptr); } catch (const std::exception &e) { arcticdb::log::root().error("Terminate called in thread {}: {}\n Aborting", - folly::getCurrentThreadName().value_or("Unknown"), e.what()); + arcticdb::get_thread_id(), e.what()); std::abort(); } }); @@ -276,13 +275,19 @@ void register_instrumentation(py::module && m){ } void register_metrics(py::module && m){ + auto prometheus = m.def_submodule("prometheus"); py::class_>(prometheus, "Instance"); - prometheus.def("configure", [](const py::object & py_config){ - arcticdb::proto::utils::PrometheusConfig config; - arcticdb::python_util::pb_from_python(py_config, config); - arcticdb::PrometheusConfigInstance::instance()->config.CopyFrom(config); - }); + + py::class_>(prometheus, "MetricsConfig") + .def(py::init()); + + py::enum_(prometheus, "MetricsConfigModel") + .value("NO_INIT", arcticdb::MetricsConfig::Model::NO_INIT) + .value("PUSH", arcticdb::MetricsConfig::Model::PUSH) + .value("PULL", arcticdb::MetricsConfig::Model::PULL) + .export_values() + ; } /// Register handling of non-trivial types. 
For more information @see arcticdb::TypeHandlerRegistry and diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index 15ca454eae..f1a54fd8d5 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -110,7 +110,7 @@ std::variant py_unicode_to_buffer(PyObject } } -NativeTensor obj_to_tensor(PyObject *ptr) { +NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types) { auto& api = pybind11::detail::npy_api::get(); util::check(api.PyArray_Check_(ptr), "Expected Python array"); const auto arr = pybind11::detail::array_proxy(ptr); @@ -120,7 +120,7 @@ NativeTensor obj_to_tensor(PyObject *ptr) { // In Pandas < 2, empty series dtype is `"float"`, but as of Pandas 2.0, empty series dtype is `"object"` // The Normalizer in Python cast empty `"float"` series to `"object"` so `EMPTY` is used here. // See: https://github.com/man-group/ArcticDB/pull/1049 - auto val_type = size == 0 ? ValueType::EMPTY : get_value_type(descr->kind); + auto val_type = size == 0 && empty_types ? ValueType::EMPTY : get_value_type(descr->kind); auto val_bytes = static_cast(descr->elsize); const int64_t element_count = ndim == 1 ? int64_t(arr->dimensions[0]) : int64_t(arr->dimensions[0]) * int64_t(arr->dimensions[1]); const auto c_style = arr->strides[0] == val_bytes; @@ -173,7 +173,7 @@ NativeTensor obj_to_tensor(PyObject *ptr) { sample = *current_object; } if (empty && descr->kind == 'O') { - val_type = ValueType::EMPTY; + val_type = empty_types ? ValueType::EMPTY : ValueType::UTF_DYNAMIC; } else if(all_nans || is_unicode(sample)){ val_type = ValueType::UTF_DYNAMIC; } else if (PYBIND11_BYTES_CHECK(sample)) { @@ -204,7 +204,8 @@ std::shared_ptr py_ndf_to_frame( const StreamId& stream_name, const py::tuple &item, const py::object &norm_meta, - const py::object &user_meta) { + const py::object &user_meta, + bool empty_types) { ARCTICDB_SUBSAMPLE_DEFAULT(NormalizeFrame) auto res = std::make_shared(); res->desc.set_id(stream_name); @@ -222,7 +223,7 @@ std::shared_ptr py_ndf_to_frame( if (!idx_names.empty()) { util::check(idx_names.size() == 1, "Multi-indexed dataframes not handled"); - auto index_tensor = obj_to_tensor(idx_vals[0].ptr()); + auto index_tensor = obj_to_tensor(idx_vals[0].ptr(), empty_types); util::check(index_tensor.ndim() == 1, "Multi-dimensional indexes not handled"); util::check(index_tensor.shape() != nullptr, "Index tensor expected to contain shapes"); std::string index_column_name = !idx_names.empty() ? 
idx_names[0] : "index"; @@ -251,7 +252,7 @@ std::shared_ptr py_ndf_to_frame( res->set_sorted(sorted); for (auto i = 0u; i < col_vals.size(); ++i) { - auto tensor = obj_to_tensor(col_vals[i].ptr()); + auto tensor = obj_to_tensor(col_vals[i].ptr(), empty_types); res->num_rows = std::max(res->num_rows, static_cast(tensor.shape(0))); if(tensor.expanded_dim() == 1) { res->desc.add_field(scalar_field(tensor.data_type(), col_names[i])); diff --git a/cpp/arcticdb/python/python_to_tensor_frame.hpp b/cpp/arcticdb/python/python_to_tensor_frame.hpp index d7344593a8..4ff566262a 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.hpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.hpp @@ -80,13 +80,14 @@ std::variant py_unicode_to_buffer( PyObject *obj, std::optional& scoped_gil_lock); -NativeTensor obj_to_tensor(PyObject *ptr); +NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types); std::shared_ptr py_ndf_to_frame( const StreamId& stream_name, const py::tuple &item, const py::object &norm_meta, - const py::object &user_meta); + const py::object &user_meta, + bool empty_types); std::shared_ptr py_none_to_frame(); diff --git a/cpp/arcticdb/storage/library.hpp b/cpp/arcticdb/storage/library.hpp index a347277158..bc02a2f653 100644 --- a/cpp/arcticdb/storage/library.hpp +++ b/cpp/arcticdb/storage/library.hpp @@ -122,6 +122,10 @@ class Library { return storages_->fast_delete(); } + void cleanup() { + storages_->cleanup(); + } + bool key_exists(const VariantKey& key) { return storages_->key_exists(key); } diff --git a/cpp/arcticdb/storage/library_manager.cpp b/cpp/arcticdb/storage/library_manager.cpp index e77510ea12..8072f413bb 100644 --- a/cpp/arcticdb/storage/library_manager.cpp +++ b/cpp/arcticdb/storage/library_manager.cpp @@ -129,8 +129,10 @@ void LibraryManager::remove_library_config(const LibraryPath& path) const { store_->remove_key(RefKey{StreamId(path.to_delim_path()), entity::KeyType::LIBRARY_CONFIG}).wait(); } -std::shared_ptr LibraryManager::get_library(const LibraryPath& path, const StorageOverride& storage_override) { - { +std::shared_ptr LibraryManager::get_library(const LibraryPath& path, + const StorageOverride& storage_override, + const bool ignore_cache) { + if (!ignore_cache) { // Check global cache first, important for LMDB and RocksDB to only open once from a given process std::lock_guard lock{open_libraries_mutex_}; if (auto cached = open_libraries_.find(path); cached != open_libraries_.end()) { @@ -156,9 +158,12 @@ std::shared_ptr LibraryManager::get_library(const LibraryPath& path, co return lib; } -void LibraryManager::close_library_if_open(const LibraryPath &path) { +void LibraryManager::cleanup_library_if_open(const LibraryPath &path) { std::lock_guard lock{open_libraries_mutex_}; - open_libraries_.erase(path); + if (auto it = open_libraries_.find(path); it != open_libraries_.end()) { + it->second->cleanup(); + open_libraries_.erase(it); + } } std::vector LibraryManager::get_library_paths() const { diff --git a/cpp/arcticdb/storage/library_manager.hpp b/cpp/arcticdb/storage/library_manager.hpp index 673b1acf3a..ddb32edaa8 100644 --- a/cpp/arcticdb/storage/library_manager.hpp +++ b/cpp/arcticdb/storage/library_manager.hpp @@ -30,9 +30,12 @@ namespace arcticdb::storage { void remove_library_config(const LibraryPath& path) const; - [[nodiscard]] std::shared_ptr get_library(const LibraryPath& path, const StorageOverride& storage_override = StorageOverride{}); + [[nodiscard]] std::shared_ptr get_library( + const LibraryPath& path, + const StorageOverride& storage_override, 
+ bool ignore_cache); - void close_library_if_open(const LibraryPath& path); + void cleanup_library_if_open(const LibraryPath& path); [[nodiscard]] std::vector get_library_paths() const; diff --git a/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp b/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp index cfb6eb6c09..94c6f62d54 100644 --- a/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp +++ b/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp @@ -259,6 +259,43 @@ void LmdbStorage::do_iterate_type(KeyType key_type, const IterateTypeVisitor& vi } } +void remove_db_files(const fs::path& lib_path) { + std::vector files = {"lock.mdb", "data.mdb"}; + + for (const auto& file : files) { + fs::path file_path = lib_path / file; + try { + if (fs::exists(file_path)) { + fs::remove(file_path); + } + } catch (const std::filesystem::filesystem_error& e) { + raise( + fmt::format("Unexpected LMDB Error: Failed to remove LMDB file at path: {} error: {}", + file_path.string(), e.what())); + } + } + + if (fs::exists(lib_path)) { + if (!fs::is_empty(lib_path)) { + log::storage().warn(fmt::format("Skipping deletion of directory holding LMDB library during " + "library deletion as it contains files unrelated to LMDB")); + } else { + try { + fs::remove_all(lib_path); + } catch (const fs::filesystem_error& e) { + raise( + fmt::format("Unexpected LMDB Error: Failed to remove directory: {} error: {}", + lib_path.string(), e.what())); + } + } + } +} + +void LmdbStorage::cleanup() { + env_.reset(); + remove_db_files(lib_dir_); +} + namespace { template diff --git a/cpp/arcticdb/storage/lmdb/lmdb_storage.hpp b/cpp/arcticdb/storage/lmdb/lmdb_storage.hpp index 8d251188f6..704472db71 100644 --- a/cpp/arcticdb/storage/lmdb/lmdb_storage.hpp +++ b/cpp/arcticdb/storage/lmdb/lmdb_storage.hpp @@ -46,11 +46,19 @@ class LmdbStorage final : public Storage { inline bool do_fast_delete() final; + void cleanup() override; + void do_iterate_type(KeyType key_type, const IterateTypeVisitor& visitor, const std::string &prefix) final; bool do_key_exists(const VariantKey & key) final; - ::lmdb::env& env() { return *env_; } + ::lmdb::env& env() { + if (!env_) { + raise("Unexpected LMDB Error: Invalid operation: LMDB environment has been removed. 
" + "Possibly because the library has been deleted"); + } + return *env_; + } std::string do_key_path(const VariantKey&) const final { return {}; }; diff --git a/cpp/arcticdb/storage/python_bindings.cpp b/cpp/arcticdb/storage/python_bindings.cpp index 62d852705d..368afebb54 100644 --- a/cpp/arcticdb/storage/python_bindings.cpp +++ b/cpp/arcticdb/storage/python_bindings.cpp @@ -153,11 +153,18 @@ void register_bindings(py::module& storage) { .def("remove_library_config", [](const LibraryManager& library_manager, std::string_view library_path){ return library_manager.remove_library_config(LibraryPath{library_path, '.'}); }, py::call_guard()) - .def("get_library", [](LibraryManager& library_manager, std::string_view library_path, const StorageOverride& storage_override){ - return library_manager.get_library(LibraryPath{library_path, '.'}, storage_override); - }, py::arg("library_path"), py::arg("storage_override") = StorageOverride{}) - .def("close_library_if_open", [](LibraryManager& library_manager, std::string_view library_path) { - return library_manager.close_library_if_open(LibraryPath{library_path, '.'}); + .def("get_library", []( + LibraryManager& library_manager, std::string_view library_path, + const StorageOverride& storage_override, + const bool ignore_cache) { + return library_manager.get_library(LibraryPath{library_path, '.'}, storage_override, ignore_cache); + }, + py::arg("library_path"), + py::arg("storage_override") = StorageOverride{}, + py::arg("ignore_cache") = false + ) + .def("cleanup_library_if_open", [](LibraryManager& library_manager, std::string_view library_path) { + return library_manager.cleanup_library_if_open(LibraryPath{library_path, '.'}); }) .def("has_library", [](const LibraryManager& library_manager, std::string_view library_path){ return library_manager.has_library(LibraryPath{library_path, '.'}); diff --git a/cpp/arcticdb/storage/storage.hpp b/cpp/arcticdb/storage/storage.hpp index 4d1078fe82..c0da8795c0 100644 --- a/cpp/arcticdb/storage/storage.hpp +++ b/cpp/arcticdb/storage/storage.hpp @@ -152,6 +152,8 @@ class Storage { return do_fast_delete(); } + virtual void cleanup() { } + inline bool key_exists(const VariantKey &key) { return do_key_exists(key); } diff --git a/cpp/arcticdb/storage/storages.hpp b/cpp/arcticdb/storage/storages.hpp index dc88a3f3a5..8199a3746e 100644 --- a/cpp/arcticdb/storage/storages.hpp +++ b/cpp/arcticdb/storage/storages.hpp @@ -59,6 +59,10 @@ class Storages { return primary().fast_delete(); } + void cleanup() { + primary().cleanup(); + } + bool key_exists(const VariantKey& key) { return primary().key_exists(key); } diff --git a/cpp/arcticdb/storage/test/test_storage_exceptions.cpp b/cpp/arcticdb/storage/test/test_storage_exceptions.cpp index fe0618b0bf..a7d422ec68 100644 --- a/cpp/arcticdb/storage/test/test_storage_exceptions.cpp +++ b/cpp/arcticdb/storage/test/test_storage_exceptions.cpp @@ -45,25 +45,30 @@ class LMDBStorageFactory : public StorageFactory { private: uint64_t map_size; bool use_mock; + fs::path db_path; + std::string lib_name; public: - explicit LMDBStorageFactory(bool use_mock = false) : map_size(128ULL * (1ULL << 20) /* 128MB */), use_mock(use_mock) { } + explicit LMDBStorageFactory(uint64_t map_size, bool use_mock = false) : map_size(map_size), use_mock(use_mock), db_path(TEST_DATABASES_PATH / "test_lmdb"), lib_name("test_lib") { } - explicit LMDBStorageFactory(uint64_t map_size, bool use_mock = false) : map_size(map_size), use_mock(use_mock) { } + explicit LMDBStorageFactory(bool use_mock = false) : 
LMDBStorageFactory(128ULL * (1ULL << 20) /* 128MB */, use_mock) { } std::unique_ptr create() override { arcticdb::proto::lmdb_storage::Config cfg; - fs::path db_name = "test_lmdb"; - cfg.set_path((TEST_DATABASES_PATH / db_name).generic_string()); + cfg.set_path((db_path).generic_string()); cfg.set_map_size(map_size); cfg.set_recreate_if_exists(true); cfg.set_use_mock_storage_for_testing(use_mock); - arcticdb::storage::LibraryPath library_path{"a", "b"}; + arcticdb::storage::LibraryPath library_path(lib_name, '/'); return std::make_unique(library_path, arcticdb::storage::OpenMode::DELETE, cfg); } + fs::path get_lib_path() const { + return db_path / lib_name; + } + void setup() override { if (!fs::exists(TEST_DATABASES_PATH)) { fs::create_directories(TEST_DATABASES_PATH); @@ -311,6 +316,40 @@ TEST_F(LMDBStorageTestBase, MockUnexpectedLMDBErrorException) { ASSERT_EQ(list_in_store(*storage), symbols); } +TEST_F(LMDBStorageTestBase, RemoveLibPath) { + LMDBStorageFactory factory; + auto storage = factory.create(); + auto path = factory.get_lib_path(); + + storage->cleanup(); + ASSERT_FALSE(fs::exists(path)); + // Once we call cleanup(), any subsequent operation should throw UnexpectedLMDBErrorException as the lmdb env has been closed + ASSERT_THROW({ + write_in_store(*storage, "sym1"); + }, UnexpectedLMDBErrorException); + + ASSERT_THROW({ + update_in_store(*storage, "sym1"); + }, UnexpectedLMDBErrorException); + + ASSERT_THROW({ + remove_in_store(*storage, {"sym1"}); + }, UnexpectedLMDBErrorException); + + ASSERT_THROW({ + read_in_store(*storage, "sym1"); + }, UnexpectedLMDBErrorException); + + ASSERT_THROW({ + exists_in_store(*storage, "sym1"); + }, UnexpectedLMDBErrorException); + + ASSERT_THROW({ + list_in_store(*storage); + }, UnexpectedLMDBErrorException); + +} + // S3 error handling with mock client // Note: Exception handling is different for S3 as compared to other storages. // S3 does not return an error if you rewrite an existing key. It overwrites it.
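The cleanup() hook introduced above is what Arctic.delete_library relies on (via cleanup_library_if_open) to physically remove an LMDB library's files. A minimal sketch of the user-visible effect, assuming a local LMDB URI and an illustrative library name (neither taken from this diff):

    import os
    import pandas as pd
    from arcticdb import Arctic

    ac = Arctic("lmdb://arctic_demo")  # illustrative local path
    lib = ac.create_library("demo")
    lib.write("sym", pd.DataFrame({"a": [1, 2, 3]}))

    lib_dir = os.path.join("arctic_demo", "demo")
    assert os.path.exists(os.path.join(lib_dir, "data.mdb"))

    ac.delete_library("demo")
    # LmdbStorage::cleanup() closes the environment and deletes lock.mdb/data.mdb;
    # the directory itself is removed only if nothing unrelated to LMDB remains in it.
    assert not os.path.exists(lib_dir)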
diff --git a/cpp/arcticdb/util/storage_lock.hpp b/cpp/arcticdb/util/storage_lock.hpp index 2574cc098f..dd43f6daa0 100644 --- a/cpp/arcticdb/util/storage_lock.hpp +++ b/cpp/arcticdb/util/storage_lock.hpp @@ -14,9 +14,9 @@ #include #include -#include - +#include #include +#include namespace arcticdb { @@ -66,8 +66,8 @@ struct StorageLockTimeout : public std::runtime_error { using std::runtime_error::runtime_error; }; -inline uint64_t get_thread_id() { - return folly::getCurrentThreadID(); +inline std::thread::id get_thread_id() noexcept { + return std::this_thread::get_id(); } template diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index e86f41f028..244b7c8fae 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1714,10 +1714,6 @@ void LocalVersionedEngine::configure(const storage::LibraryDescriptor::VariantSt if(cfg.write_options().has_sync_passive()) { version_map->set_log_changes(cfg.write_options().sync_passive().enabled()); } - if (cfg.has_prometheus_config()) { - PrometheusConfigInstance::instance()->config.CopyFrom(cfg.prometheus_config()); - ARCTICDB_DEBUG(log::version(), "prometheus configured"); - } }, [](const auto& conf){ util::raise_rte( diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 3a24f1b0a2..5f3f3a56ae 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -825,16 +825,26 @@ void read_incompletes_to_pipeline( pipeline_context->total_rows_ = pipeline_context->calc_rows(); } -void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr& pipeline_context) { - // Does nothing if the symbol is not timestamp-indexed - // Checks both that the index ranges of incomplete segments do not overlap with one another, and that the earliest - // timestamp in an incomplete segment is greater than the latest timestamp existing in the symbol in the case of a - // parallel append +void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr& pipeline_context, + const std::optional& previous_sorted_value) { + /* + Does nothing if the symbol is not timestamp-indexed + Checks: + - that the existing data is not known to be unsorted in the case of a parallel append + - that the index ranges of incomplete segments do not overlap with one another + - that the earliest timestamp in an incomplete segment is greater than the latest timestamp existing in the + symbol in the case of a parallel append + */ if (pipeline_context->descriptor().index().type() == IndexDescriptor::TIMESTAMP) { std::optional last_existing_index_value; // Beginning of incomplete segments == beginning of all segments implies all segments are incompletes, so we are // writing, not appending if (pipeline_context->incompletes_begin() != pipeline_context->begin()) { + sorting::check( + !previous_sorted_value.has_value() || + *previous_sorted_value == SortedValue::ASCENDING || + *previous_sorted_value == SortedValue::UNKNOWN, + "Cannot append staged segments to existing data as existing data is not sorted in ascending order"); auto last_indexed_slice_and_key = std::prev(pipeline_context->incompletes_begin())->slice_and_key(); // -1 as end_time is stored as 1 greater than the last index value in the segment last_existing_index_value = last_indexed_slice_and_key.key().end_time() - 1; @@ -868,6 +878,8 @@ void copy_frame_data_to_buffer(const SegmentInMemory& destination, size_t target return; } auto& 
src_column = source.column(static_cast(source_index)); + schema::check(!src_column.opt_sparse_map().has_value(), + "QueryBuilder not yet supported with sparse data"); auto& dst_column = destination.column(static_cast(target_index)); auto& buffer = dst_column.data().buffer(); auto dst_rawtype_size = sizeof_datatype(dst_column.type()); @@ -1326,7 +1338,7 @@ VersionedItem compact_incomplete_impl( if (pipeline_context->slice_and_keys_.size() == prev_size) { util::raise_rte("No incomplete segments found for {}", stream_id); } - check_incompletes_index_ranges_dont_overlap(pipeline_context); + check_incompletes_index_ranges_dont_overlap(pipeline_context, previous_sorted_value); const auto& first_seg = pipeline_context->slice_and_keys_.begin()->segment(store); std::vector delete_keys; diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 9472028a8e..ff4d4d0ad4 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -60,7 +60,7 @@ VersionedItem PythonVersionStore::write_dataframe_specific_version( auto versioned_item = write_dataframe_impl( store(), VersionId(version_id), - convert::py_ndf_to_frame(stream_id, item, norm, user_meta), + convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()), get_write_options()); version_map()->write_version(store(), versioned_item.key_, std::nullopt); @@ -74,11 +74,12 @@ std::vector> create_input_tensor_frames( const std::vector& stream_ids, const std::vector &items, const std::vector &norms, - const std::vector &user_metas) { + const std::vector &user_metas, + bool empty_types) { std::vector> output; output.reserve(stream_ids.size()); for (size_t idx = 0; idx < stream_ids.size(); idx++) { - output.emplace_back(convert::py_ndf_to_frame(stream_ids[idx], items[idx], norms[idx], user_metas[idx])); + output.emplace_back(convert::py_ndf_to_frame(stream_ids[idx], items[idx], norms[idx], user_metas[idx], empty_types)); } return output; } @@ -92,7 +93,7 @@ std::vector> PythonVersionStore::batch_wr bool validate_index, bool throw_on_error) { - auto frames = create_input_tensor_frames(stream_ids, items, norms, user_metas); + auto frames = create_input_tensor_frames(stream_ids, items, norms, user_metas, cfg().write_options().empty_types()); return batch_write_versioned_dataframe_internal(stream_ids, std::move(frames), prune_previous_versions, validate_index, throw_on_error); } @@ -105,7 +106,7 @@ std::vector> PythonVersionStore::batch_ap bool validate_index, bool upsert, bool throw_on_error) { - auto frames = create_input_tensor_frames(stream_ids, items, norms, user_metas); + auto frames = create_input_tensor_frames(stream_ids, items, norms, user_metas, cfg().write_options().empty_types()); return batch_append_internal(stream_ids, std::move(frames), prune_previous_versions, validate_index, upsert, throw_on_error); } @@ -525,7 +526,7 @@ VersionedItem PythonVersionStore::write_partitioned_dataframe( auto versioned_item = write_dataframe_impl( store(), version_id, - convert::py_ndf_to_frame(subkeyname, partitioned_dfs[idx], norm_meta, py::none()), + convert::py_ndf_to_frame(subkeyname, partitioned_dfs[idx], norm_meta, py::none(), cfg().write_options().empty_types()), write_options, de_dup_map, false); @@ -579,7 +580,7 @@ VersionedItem PythonVersionStore::write_versioned_composite_data( de_dup_maps.emplace_back(de_dup_map); } - auto frames = create_input_tensor_frames(sub_keys, items, norm_metas, user_metas); + auto frames = 
create_input_tensor_frames(sub_keys, items, norm_metas, user_metas, cfg().write_options().empty_types()); auto index_keys = folly::collect(batch_write_internal(std::move(version_ids), sub_keys, std::move(frames), std::move(de_dup_maps), false)).get(); auto multi_key = write_multi_index_entry(store(), index_keys, stream_id, metastruct, user_meta, version_id); auto versioned_item = VersionedItem(to_atom(std::move(multi_key))); @@ -600,7 +601,7 @@ VersionedItem PythonVersionStore::write_versioned_dataframe( bool sparsify_floats, bool validate_index) { ARCTICDB_SAMPLE(WriteVersionedDataframe, 0) - auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta); + auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()); auto versioned_item = write_versioned_dataframe_internal(stream_id, frame, prune_previous_versions, sparsify_floats, validate_index); return versioned_item; @@ -614,7 +615,7 @@ VersionedItem PythonVersionStore::append( bool upsert, bool prune_previous_versions, bool validate_index) { - return append_internal(stream_id, convert::py_ndf_to_frame(stream_id, item, norm, user_meta), upsert, + return append_internal(stream_id, convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()), upsert, prune_previous_versions, validate_index); } @@ -628,7 +629,7 @@ VersionedItem PythonVersionStore::update( bool dynamic_schema, bool prune_previous_versions) { return update_internal(stream_id, query, - convert::py_ndf_to_frame(stream_id, item, norm, user_meta), upsert, + convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()), upsert, dynamic_schema, prune_previous_versions); } @@ -650,7 +651,7 @@ void PythonVersionStore::append_incomplete( using namespace arcticdb::pipelines; // Turn the input into a standardised frame object - auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta); + auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()); append_incomplete_frame(stream_id, frame); } @@ -736,7 +737,7 @@ void PythonVersionStore::write_parallel( using namespace arcticdb::stream; using namespace arcticdb::pipelines; - auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta); + auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta, cfg().write_options().empty_types()); write_parallel_frame(stream_id, frame); } @@ -1158,7 +1159,7 @@ void write_dataframe_to_file( const py::object& norm, const py::object& user_meta) { ARCTICDB_SAMPLE(WriteDataframeToFile, 0) - auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta); + auto frame = convert::py_ndf_to_frame(stream_id, item, norm, user_meta, false); write_dataframe_to_file_internal(stream_id, frame, path, WriteOptions{}, codec::default_lz4_codec(), EncodingVersion::V2); } diff --git a/cpp/proto/arcticc/pb2/storage.proto b/cpp/proto/arcticc/pb2/storage.proto index 5dbc34b5a7..8c62ae4139 100644 --- a/cpp/proto/arcticc/pb2/storage.proto +++ b/cpp/proto/arcticc/pb2/storage.proto @@ -66,6 +66,7 @@ message VersionStoreConfig { bool fast_tombstone_all = 56; bool bucketize_dynamic = 57; uint32 max_num_buckets = 58; + bool empty_types = 59; message SyncDisabled { bool enabled = 1; diff --git a/python/arcticdb/adapters/arctic_library_adapter.py b/python/arcticdb/adapters/arctic_library_adapter.py index 232788cadc..ee10396290 100644 --- a/python/arcticdb/adapters/arctic_library_adapter.py +++ 
b/python/arcticdb/adapters/arctic_library_adapter.py @@ -9,13 +9,17 @@ from abc import ABC, abstractmethod from typing import Iterable, List -from arcticdb.options import DEFAULT_ENCODING_VERSION, LibraryOptions +from arcticc.pb2.storage_pb2 import EnvironmentConfigsMap, LibraryConfig +from arcticdb.config import _DEFAULT_ENV +from arcticdb.version_store._store import NativeVersionStore +from arcticdb.options import DEFAULT_ENCODING_VERSION, LibraryOptions, EnterpriseLibraryOptions from arcticc.pb2.storage_pb2 import LibraryConfig from arcticdb_ext.storage import Library, StorageOverride, CONFIG_LIBRARY_NAME from arcticdb.encoding_version import EncodingVersion -def set_library_options(lib_desc: "LibraryConfig", options: LibraryOptions): +def set_library_options(lib_desc: "LibraryConfig", options: LibraryOptions, + enterprise_library_options: EnterpriseLibraryOptions): write_options = lib_desc.version.write_options write_options.dynamic_strings = True @@ -38,6 +42,9 @@ def set_library_options(lib_desc: "LibraryConfig", options: LibraryOptions): options.encoding_version if options.encoding_version is not None else DEFAULT_ENCODING_VERSION ) + write_options.sync_passive.enabled = enterprise_library_options.replication + write_options.delayed_deletes = enterprise_library_options.background_deletion + class ArcticLibraryAdapter(ABC): @abstractmethod @@ -58,13 +65,26 @@ def supports_uri(uri: str) -> bool: def config_library(self) -> Library: raise NotImplementedError + def get_library_config(self, name: str, library_options: LibraryOptions, + enterprise_library_options: EnterpriseLibraryOptions): + env_cfg = EnvironmentConfigsMap() + + self.add_library_to_env(env_cfg, name) + + library_options.encoding_version = ( + library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version + ) + set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options, + enterprise_library_options) + + return NativeVersionStore.create_library_config( + env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version + ) + @abstractmethod - def get_library_config(self, name: str, library_options: LibraryOptions): + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): raise NotImplementedError - def cleanup_library(self, library_name: str): - pass - def get_storage_override(self) -> StorageOverride: return StorageOverride() diff --git a/python/arcticdb/adapters/azure_library_adapter.py b/python/arcticdb/adapters/azure_library_adapter.py index e067cb7909..1f14717953 100644 --- a/python/arcticdb/adapters/azure_library_adapter.py +++ b/python/arcticdb/adapters/azure_library_adapter.py @@ -10,7 +10,6 @@ from typing import Optional import platform -from arcticdb.options import LibraryOptions from arcticc.pb2.storage_pb2 import EnvironmentConfigsMap, LibraryConfig from arcticdb.version_store.helper import add_azure_library_to_env from arcticdb.config import _DEFAULT_ENV @@ -123,9 +122,7 @@ def get_masking_override(self) -> StorageOverride: storage_override.set_azure_override(azure_override) return storage_override - def get_library_config(self, name, library_options: LibraryOptions): - env_cfg = EnvironmentConfigsMap() - + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): if self._query_params.Path_prefix: # add time to prefix - so that the azure root folder is unique and we can delete and recreate fast with_prefix = f"{self._query_params.Path_prefix}/{name}{time.time() * 1e9:.0f}" @@ -142,15 +139,6 
@@ def get_library_config(self, name, library_options: LibraryOptions): ca_cert_path=self._ca_cert_path, ) - library_options.encoding_version = ( - library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version - ) - set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options) - - return NativeVersionStore.create_library_config( - env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version - ) - @property def path_prefix(self): return self._query_params.Path_prefix diff --git a/python/arcticdb/adapters/in_memory_library_adapter.py b/python/arcticdb/adapters/in_memory_library_adapter.py index 73892a8297..0453fffa01 100644 --- a/python/arcticdb/adapters/in_memory_library_adapter.py +++ b/python/arcticdb/adapters/in_memory_library_adapter.py @@ -47,16 +47,5 @@ def config_library(self): return lib._library - def get_library_config(self, name, library_options: LibraryOptions): - env_cfg = EnvironmentConfigsMap() - + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): add_memory_library_to_env(env_cfg, lib_name=name, env_name=_DEFAULT_ENV) - - library_options.encoding_version = ( - library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version - ) - set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options) - - return NativeVersionStore.create_library_config( - env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version - ) diff --git a/python/arcticdb/adapters/lmdb_library_adapter.py b/python/arcticdb/adapters/lmdb_library_adapter.py index b87d6fe42d..40346b0677 100644 --- a/python/arcticdb/adapters/lmdb_library_adapter.py +++ b/python/arcticdb/adapters/lmdb_library_adapter.py @@ -147,9 +147,7 @@ def config_library(self): return lib._library - def get_library_config(self, name, library_options: LibraryOptions): - env_cfg = EnvironmentConfigsMap() - + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): lmdb_config = {} if self._query_params.map_size: lmdb_config["map_size"] = self._query_params.map_size @@ -158,35 +156,6 @@ def get_library_config(self, name, library_options: LibraryOptions): env_cfg, lib_name=name, env_name=_DEFAULT_ENV, db_dir=self._path, lmdb_config=lmdb_config ) - library_options.encoding_version = ( - library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version - ) - set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options) - - return NativeVersionStore.create_library_config( - env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version - ) - - def cleanup_library(self, library_name: str): - lmdb_files_removed = True - for file in ("lock.mdb", "data.mdb"): - path = os.path.join(self._path, library_name, file) - try: - os.remove(path) - except Exception as e: - lmdb_files_removed = False - _rm_errorhandler(None, path, e) - dir_path = os.path.join(self._path, library_name) - if os.path.exists(dir_path): - if os.listdir(dir_path): - log.warn( - "Skipping deletion of directory holding LMDB library during library deletion as it contains " - f"files unrelated to LMDB. LMDB files {'have' if lmdb_files_removed else 'have not'} been " - f"removed. 
directory=[{dir_path}]" - ) - else: - shutil.rmtree(os.path.join(self._path, library_name), onerror=_rm_errorhandler) - def get_storage_override(self) -> StorageOverride: lmdb_override = LmdbOverride() lmdb_override.path = self._path diff --git a/python/arcticdb/adapters/mongo_library_adapter.py b/python/arcticdb/adapters/mongo_library_adapter.py index 6443e0dea2..dfc0ce169d 100644 --- a/python/arcticdb/adapters/mongo_library_adapter.py +++ b/python/arcticdb/adapters/mongo_library_adapter.py @@ -63,15 +63,5 @@ def config_library(self): return lib._library - def get_library_config(self, name, library_options: LibraryOptions): - env_cfg = EnvironmentConfigsMap() - + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): add_mongo_library_to_env(cfg=env_cfg, lib_name=name, env_name=_DEFAULT_ENV, uri=self._uri) - library_options.encoding_version = ( - library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version - ) - set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options) - - return NativeVersionStore.create_library_config( - env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version - ) diff --git a/python/arcticdb/adapters/prefixing_library_adapter_decorator.py b/python/arcticdb/adapters/prefixing_library_adapter_decorator.py index 6e0602aa8b..5e7db5ab86 100644 --- a/python/arcticdb/adapters/prefixing_library_adapter_decorator.py +++ b/python/arcticdb/adapters/prefixing_library_adapter_decorator.py @@ -55,13 +55,9 @@ def get_name_for_library_manager(self, user_facing_name: str) -> str: def library_manager_names_to_user_facing(self, names: Iterable[str]) -> List[str]: return [name[len(self._prefix) :] for name in names if name.startswith(self._prefix)] - def get_library_config(self, name: str, library_options: LibraryOptions): + def get_library_config(self, name: str, library_options: LibraryOptions, enterprise_library_options): lib_mgr_name = self.get_name_for_library_manager(name) - return self._inner.get_library_config(lib_mgr_name, library_options) - - def cleanup_library(self, library_name: str): - lib_mgr_name = self.get_name_for_library_manager(library_name) - return self._inner.cleanup_library(lib_mgr_name) + return self._inner.get_library_config(lib_mgr_name, library_options, enterprise_library_options) def __repr__(self): return repr(self._inner) @@ -74,7 +70,7 @@ def __getattr__(self, name: str): sig = inspect.signature(inner_attr) for param in sig.parameters: if "name" in param: - raise NotImplementedError(f"{type(self).__name__} need to be updated to wrap {inner_attr}") + raise NotImplementedError(f"{type(self).__name__} needs to be updated to wrap {inner_attr}") try: setattr(inner_attr, marker, True) except: diff --git a/python/arcticdb/adapters/s3_library_adapter.py b/python/arcticdb/adapters/s3_library_adapter.py index 6b4df51429..0bd061804f 100644 --- a/python/arcticdb/adapters/s3_library_adapter.py +++ b/python/arcticdb/adapters/s3_library_adapter.py @@ -174,9 +174,7 @@ def get_masking_override(self) -> StorageOverride: storage_override.set_s3_override(s3_override) return storage_override - def get_library_config(self, name, library_options: LibraryOptions): - env_cfg = EnvironmentConfigsMap() - + def add_library_to_env(self, env_cfg: EnvironmentConfigsMap, name: str): _name = self._query_params.access if not self._query_params.aws_auth else USE_AWS_CRED_PROVIDERS_TOKEN _key = self._query_params.secret if not self._query_params.aws_auth else 
USE_AWS_CRED_PROVIDERS_TOKEN @@ -200,15 +198,6 @@ def get_library_config(self, name, library_options: LibraryOptions): use_virtual_addressing=self._query_params.use_virtual_addressing, ) - library_options.encoding_version = ( - library_options.encoding_version if library_options.encoding_version is not None else self._encoding_version - ) - set_library_options(env_cfg.env_by_id[_DEFAULT_ENV].lib_by_path[name], library_options) - - return NativeVersionStore.create_library_config( - env_cfg, _DEFAULT_ENV, name, encoding_version=library_options.encoding_version - ) - def _configure_aws(self): if not self._query_params.region: match = re.match(r"s3\.(?P[a-z0-9-]+)\.amazonaws.*", self._endpoint) diff --git a/python/arcticdb/arctic.py b/python/arcticdb/arctic.py index 5556a73e42..e17a2bf6d6 100644 --- a/python/arcticdb/arctic.py +++ b/python/arcticdb/arctic.py @@ -5,10 +5,10 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ +import logging +from typing import List, Optional, Any, Union -from typing import List, Optional - -from arcticdb.options import DEFAULT_ENCODING_VERSION, LibraryOptions +from arcticdb.options import DEFAULT_ENCODING_VERSION, LibraryOptions, EnterpriseLibraryOptions from arcticdb_ext.storage import LibraryManager from arcticdb.exceptions import LibraryNotFound, MismatchingLibraryOptions from arcticdb.version_store.library import ArcticInvalidApiUsageException, Library @@ -20,6 +20,11 @@ from arcticdb.adapters.mongo_library_adapter import MongoLibraryAdapter from arcticdb.adapters.in_memory_library_adapter import InMemoryLibraryAdapter from arcticdb.encoding_version import EncodingVersion +from arcticdb.exceptions import UnsupportedLibraryOptionValue, UnknownLibraryOption +from arcticdb.options import ModifiableEnterpriseLibraryOption, ModifiableLibraryOption + + +logger = logging.getLogger(__name__) class Arctic: @@ -158,7 +163,10 @@ def get_library( else: raise e - def create_library(self, name: str, library_options: Optional[LibraryOptions] = None) -> Library: + def create_library(self, + name: str, + library_options: Optional[LibraryOptions] = None, + enterprise_library_options: Optional[EnterpriseLibraryOptions] = None) -> Library: """ Creates the library named ``name``. @@ -176,7 +184,11 @@ def create_library(self, name: str, library_options: Optional[LibraryOptions] = The name of the library that you wish to create. library_options: Optional[LibraryOptions] - Options to use in configuring the library. Defaults if not provided are the same as are documented in LibraryOptions. + Options to use in configuring the library. Defaults if not provided are the same as documented in LibraryOptions. + + enterprise_library_options: Optional[EnterpriseLibraryOptions] + Enterprise options to use in configuring the library. Defaults if not provided are the same as documented in + EnterpriseLibraryOptions. These options are only relevant to ArcticDB enterprise users. 
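For illustration (not part of the diff itself), a sketch of how these options flow through the public API once this change lands: enterprise options are supplied at library creation and can later be changed with modify_library_option. The URI and library name below are placeholders:

    from arcticdb import Arctic
    from arcticdb.options import EnterpriseLibraryOptions, ModifiableEnterpriseLibraryOption

    ac = Arctic("lmdb://arctic_demo")  # placeholder URI
    lib = ac.create_library(
        "ent_lib",
        enterprise_library_options=EnterpriseLibraryOptions(background_deletion=True),
    )
    assert lib.enterprise_options().background_deletion

    # Flip replication on after the fact; processes that already hold the library
    # open only see the new value once they reopen it.
    ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, True)
    assert lib.enterprise_options().replication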
Examples -------- @@ -194,7 +206,10 @@ def create_library(self, name: str, library_options: Optional[LibraryOptions] = if library_options is None: library_options = LibraryOptions() - cfg = self._library_adapter.get_library_config(name, library_options) + if enterprise_library_options is None: + enterprise_library_options = EnterpriseLibraryOptions() + + cfg = self._library_adapter.get_library_config(name, library_options, enterprise_library_options) lib_mgr_name = self._library_adapter.get_name_for_library_manager(name) self._library_manager.write_library_config(cfg, lib_mgr_name, self._library_adapter.get_masking_override()) if self._created_lib_names is not None: @@ -218,16 +233,12 @@ def delete_library(self, name: str) -> None: except LibraryNotFound: return lib._nvs.version_store.clear() - del lib lib_mgr_name = self._library_adapter.get_name_for_library_manager(name) - self._library_manager.close_library_if_open(lib_mgr_name) - try: - self._library_adapter.cleanup_library(name) - finally: - self._library_manager.remove_library_config(lib_mgr_name) + self._library_manager.cleanup_library_if_open(lib_mgr_name) + self._library_manager.remove_library_config(lib_mgr_name) - if self._created_lib_names and name in self._created_lib_names: - self._created_lib_names.remove(name) + if self._created_lib_names and name in self._created_lib_names: + self._created_lib_names.remove(name) def has_library(self, name: str) -> bool: """ @@ -275,3 +286,80 @@ def get_uri(self) -> str: s3://MY_ENDPOINT:MY_BUCKET """ return self._uri + + def modify_library_option(self, + library: Library, + option: Union[ModifiableLibraryOption, ModifiableEnterpriseLibraryOption], + option_value: Any): + """ + Modify an option for a library. + + See `LibraryOptions` and `EnterpriseLibraryOptions` for descriptions of the meanings of the various options. + + After the modification, this process and other processes that open the library will use the new value. Processes + that already have the library open will not see the configuration change until they restart. + + Parameters + ---------- + library + The library to modify. + + option + The library option to change. + + option_value + The new setting for the library option. + """ + def check_bool(): + if not isinstance(option_value, bool): + raise UnsupportedLibraryOptionValue(f"{option} only supports bool values but received {option_value}. " + f"Not changing library option.") + + def check_positive_int(): + if not isinstance(option_value, int) or option_value <= 0: + raise UnsupportedLibraryOptionValue(f"{option} only supports positive integer values but received " + f"{option_value}. Not changing library option.") + + cfg = library._nvs.lib_cfg() + write_options = cfg.lib_desc.version.write_options + + # Core options + if option == ModifiableLibraryOption.DEDUP: + check_bool() + write_options.de_duplication = option_value + + elif option == ModifiableLibraryOption.ROWS_PER_SEGMENT: + check_positive_int() + write_options.segment_row_size = option_value + + elif option == ModifiableLibraryOption.COLUMNS_PER_SEGMENT: + check_positive_int() + write_options.column_group_size = option_value + + # Enterprise options + elif option == ModifiableEnterpriseLibraryOption.REPLICATION: + check_bool() + write_options.sync_passive.enabled = option_value + + elif option == ModifiableEnterpriseLibraryOption.BACKGROUND_DELETION: + check_bool() + write_options.delayed_deletes = option_value + + # Unknown + else: + raise UnknownLibraryOption(f"Unknown library option {option} cannot be modified. 
This is a bug " + f"in ArcticDB. Please raise an issue on github.com/ArcticDB") + + self._library_manager.write_library_config(cfg, library.name, self._library_adapter.get_masking_override()) + + lib_mgr_name = self._library_adapter.get_name_for_library_manager(library.name) + storage_override = self._library_adapter.get_storage_override() + library._nvs._initialize( + self._library_manager.get_library(lib_mgr_name, storage_override, ignore_cache=True), + library._nvs.env, + cfg, + library._nvs._custom_normalizer, + library._nvs._open_mode + ) + + logger.info(f"Set option=[{option}] to value=[{option_value}] for Arctic=[{self}] Library=[{library}]") diff --git a/python/arcticdb/exceptions.py b/python/arcticdb/exceptions.py index 1c42b9d5b5..f9bc3675cf 100644 --- a/python/arcticdb/exceptions.py +++ b/python/arcticdb/exceptions.py @@ -27,5 +27,13 @@ class MismatchingLibraryOptions(ArcticException): pass +class UnknownLibraryOption(ArcticException): + pass + + +class UnsupportedLibraryOptionValue(ArcticException): + pass + + class LmdbOptionsError(ArcticException): pass diff --git a/python/arcticdb/options.py b/python/arcticdb/options.py index 2b25d1edca..04787ac823 100644 --- a/python/arcticdb/options.py +++ b/python/arcticdb/options.py @@ -7,6 +7,7 @@ """ from typing import Optional +from enum import Enum from arcticdb.encoding_version import EncodingVersion @@ -16,7 +17,7 @@ class LibraryOptions: """ - Configuration options that can be applied when libraries are created. + Configuration options for ArcticDB libraries. Attributes ---------- @@ -142,3 +143,90 @@ def __repr__(self): f" rows_per_segment={self.rows_per_segment}, columns_per_segment={self.columns_per_segment}," f" encoding_version={self.encoding_version if self.encoding_version is not None else 'Default'})" ) + + +class EnterpriseLibraryOptions: + """ + Configuration options for ArcticDB libraries, that should only be used when you are using the ArcticDB enterprise + features. + + Contact `arcticdb@man.com` for more information about the Enterprise features. + + Attributes + ---------- + + replication: bool + See `__init__` for details. + + background_deletion: bool + See `__init__` for details. + """ + + def __init__( + self, + *, + replication: bool = False, + background_deletion: bool = False, + ): + """ + Parameters + ---------- + + replication: bool, default False + + Whether to use the replication feature for this library. The replication tool is an enterprise tool that + is used for one-way replication of the contents of a library to an arbitrary number of secondary storages. + + When enabled, modifications to this library will also write an oplog message that will be consumed by the + replication tool. + + When using this option, replication target libraries _must_ have background_deletion=True. We recommend + that source libraries also have background_deletion=True as this improves replication performance. + + background_deletion: bool, default False + Whether to use background deletion for this library. The background deletion tool is an enterprise tool + that performs data deletion in the background, so that writers to an ArcticDB library do not need to spend + time doing the deletion themselves. + + When enabled, deletions and version pruning on this library will only logically delete data. The background + deletion tool is responsible for the physical deletion. + + This flag does not affect the logical semantics of the ArcticDB APIs. 
Readers will see data in the same + state following a `delete` or `prune_previous_versions` call regardless of whether this flag is set or not. + """ + self.replication = replication + self.background_deletion = background_deletion + + def __eq__(self, right): + return ( + self.replication == right.replication + and self.background_deletion == right.background_deletion + ) + + def __repr__(self): + return ( + f"EnterpriseLibraryOptions(replication={self.replication}, background_deletion={self.background_deletion})" + ) + + +class ModifiableLibraryOption(Enum): + """Library options that can be modified after library creation. + + See also `ModifiableEnterpriseLibraryOption` for enterprise options that can be modified. + + See `LibraryOptions` for a description of each option. + """ + DEDUP = 1 + ROWS_PER_SEGMENT = 2 + COLUMNS_PER_SEGMENT = 3 + + +class ModifiableEnterpriseLibraryOption(Enum): + """Enterprise library options that can be modified after library creation. + + See also `ModifiableLibraryOption` for non-enterprise options that can be modified. + + See `EnterpriseLibraryOptions` for a description of each option. + """ + REPLICATION = 1 + BACKGROUND_DELETION = 2 diff --git a/python/arcticdb/storage_fixtures/s3.py b/python/arcticdb/storage_fixtures/s3.py index 3b89bb9045..77dc0da7fc 100644 --- a/python/arcticdb/storage_fixtures/s3.py +++ b/python/arcticdb/storage_fixtures/s3.py @@ -200,14 +200,21 @@ def run_server(port): from moto.server import DomainDispatcherApplication, create_backend_app class _HostDispatcherApplication(DomainDispatcherApplication): - """The stand-alone server needs a way to distinguish between S3 and IAM. We use the host for that""" - - _MAP = {"s3.us-east-1.amazonaws.com": "s3", "localhost": "s3", "127.0.0.1": "iam"} _reqs_till_rate_limit = -1 def get_backend_for_host(self, host): - return self._MAP.get(host, host) + """The stand-alone server needs a way to distinguish between S3 and IAM. We use the host for that""" + if host is None: + return None + if "s3" in host or host == "localhost": + return "s3" + elif host == "127.0.0.1": + return "iam" + elif host == "moto_api": + return "moto_api" + else: + raise RuntimeError(f"Unknown host {host}") def __call__(self, environ, start_response): path_info: bytes = environ.get("PATH_INFO", "") diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index a7c666767d..8dfbefda85 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -244,8 +244,9 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co return arr else: raise ArcticDbNotYetImplemented( - "Support for arbitrary objects in an array is not implemented apart from string, unicode, Timestamp. " - "Column type={} for column={}. Do you have mixed dtypes in your column?".format(arr.dtype, arr_name) + f"Failed to normalize column '{arr_name}' with dtype '{arr.dtype}'. Found first non-null value of type " + f"'{type(sample)}', but only strings, unicode, and Timestamps are supported. " + f"Do you have mixed dtypes in your column?" ) # Pick any unwanted data conversions (e.g. 
np.NaN to 'nan') or None to the string 'None' @@ -1241,14 +1242,8 @@ def normalize( if pickle_on_failure: log.debug("pickle_on_failure flag set, normalizing the item with MsgPackNormalizer", type(item), ex) return self.fallback_normalizer.normalize(item) - # Could not normalize with the default handler, pickle_on_failure - error_message = ( - "Could not normalize item of type: {} with any normalizer." - "You can set pickle_on_failure param to force pickling of this object instead." - "(Note: Pickling has worse performance and stricter memory limitations)" - ) - log.error(error_message, type(item), ex) - raise + else: + raise def denormalize(self, item, norm_meta): # type: (_FrameData, NormalizationMetadata)->_SUPPORTED_TYPES diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 20eb0a2041..68abc3571e 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -204,6 +204,11 @@ class NativeVersionStore: """ _warned_about_list_version_latest_only_and_snapshot: bool = False + norm_failure_options_msg_write = \ + "Setting the pickle_on_failure parameter to True will allow the object to be written. However, many " \ + "operations (such as date_range filtering and column selection) will not work on pickled data." + norm_failure_options_msg_append = "Data must be normalizable to be appended to existing data." + norm_failure_options_msg_update = "Data must be normalizable to be used to update existing data." def __init__(self, library, env, lib_cfg=None, open_mode=OpenMode.DELETE): # type: (_Library, Optional[str], Optional[LibraryConfig], OpenMode)->None @@ -313,7 +318,17 @@ def get_backing_store(self): log.error("Could not get primary backing store for lib due to: {}".format(e)) return backing_store - def _try_normalize(self, symbol, dataframe, metadata, pickle_on_failure, dynamic_strings, coerce_columns, **kwargs): + def _try_normalize( + self, + symbol, + dataframe, + metadata, + pickle_on_failure, + dynamic_strings, + coerce_columns, + norm_failure_options_msg="", + **kwargs + ): dynamic_schema = self.resolve_defaults( "dynamic_schema", self._lib_cfg.lib_desc.version.write_options, False, **kwargs ) @@ -335,8 +350,17 @@ def _try_normalize(self, symbol, dataframe, metadata, pickle_on_failure, dynamic **kwargs, ) except ArcticDbNotYetImplemented as ex: - log.error("Not supported: normalizing symbol={}, data={}, metadata={}, {}", symbol, dataframe, metadata, ex) - raise + raise ArcticDbNotYetImplemented( + f"Not supported: normalizing\n" + f"symbol: {symbol}\n" + f"data:\n" + f"{dataframe}\n" + f"metadata:\n" + f"{metadata}\n" + f"Reason:\n" + f"{ex}\n" + f"{norm_failure_options_msg}" + ) except Exception as ex: log.error("Error while normalizing symbol={}, data={}, metadata={}, {}", symbol, dataframe, metadata, ex) raise ArcticNativeException(str(ex)) @@ -523,6 +547,7 @@ def write( coerce_columns = kwargs.get("coerce_columns", None) sparsify_floats = kwargs.get("sparsify_floats", False) + norm_failure_options_msg = kwargs.get("norm_failure_options_msg", self.norm_failure_options_msg_write) _handle_categorical_columns(symbol, data, False) @@ -542,7 +567,13 @@ def write( return vit udm, item, norm_meta = self._try_normalize( - symbol, data, metadata, pickle_on_failure, dynamic_strings, coerce_columns + symbol, + data, + metadata, + pickle_on_failure, + dynamic_strings, + coerce_columns, + norm_failure_options_msg, ) # TODO: allow_sparse for write_parallel / recursive normalizers as well. 
if isinstance(item, NPDDataFrame): @@ -557,15 +588,7 @@ def write( symbol, item, norm_meta, udm, prune_previous_version, sparsify_floats, validate_index ) - return VersionedItem( - symbol=vit.symbol, - library=self._library.library_path, - version=vit.version, - metadata=metadata, - data=None, - host=self.env, - timestamp=vit.timestamp - ) + return self._convert_thin_cxx_item_to_python(vit, metadata) def _resolve_dynamic_strings(self, kwargs): proto_cfg = self._lib_cfg.lib_desc.version.write_options @@ -665,7 +688,15 @@ def append( _handle_categorical_columns(symbol, dataframe) - udm, item, norm_meta = self._try_normalize(symbol, dataframe, metadata, False, dynamic_strings, coerce_columns) + udm, item, norm_meta = self._try_normalize( + symbol, + dataframe, + metadata, + False, + dynamic_strings, + coerce_columns, + self.norm_failure_options_msg_append, + ) write_if_missing = kwargs.get("write_if_missing", True) @@ -677,15 +708,7 @@ def append( vit = self.version_store.append( symbol, item, norm_meta, udm, write_if_missing, prune_previous_version, validate_index ) - return VersionedItem( - symbol=vit.symbol, - library=self._library.library_path, - version=vit.version, - metadata=metadata, - data=None, - host=self.env, - timestamp=vit.timestamp - ) + return self._convert_thin_cxx_item_to_python(vit, metadata) def update( self, @@ -769,22 +792,22 @@ def update( _handle_categorical_columns(symbol, data) - udm, item, norm_meta = self._try_normalize(symbol, data, metadata, False, dynamic_strings, coerce_columns) + udm, item, norm_meta = self._try_normalize( + symbol, + data, + metadata, + False, + dynamic_strings, + coerce_columns, + self.norm_failure_options_msg_update, + ) if isinstance(item, NPDDataFrame): with _diff_long_stream_descriptor_mismatch(self): vit = self.version_store.update( symbol, update_query, item, norm_meta, udm, upsert, dynamic_schema, prune_previous_version ) - return VersionedItem( - symbol=vit.symbol, - library=self._library.library_path, - version=vit.version, - metadata=metadata, - data=None, - host=self.env, - timestamp=vit.timestamp - ) + return self._convert_thin_cxx_item_to_python(vit, metadata) def create_column_stats( self, symbol: str, column_stats: Dict[str, Set[str]], as_of: Optional[VersionQueryInput] = None @@ -1110,16 +1133,16 @@ def batch_read_metadata_multi( return results_dict - def _convert_thin_cxx_item_to_python(self, v) -> VersionedItem: + def _convert_thin_cxx_item_to_python(self, cxx_versioned_item, metadata) -> VersionedItem: """Convert a cxx versioned item that does not contain data or metadata to a Python equivalent.""" return VersionedItem( - symbol=v.symbol, + symbol=cxx_versioned_item.symbol, library=self._library.library_path, data=None, - version=v.version, - metadata=None, + version=cxx_versioned_item.version, + metadata=metadata, host=self.env, - timestamp=v.timestamp + timestamp=cxx_versioned_item.timestamp ) def batch_write( @@ -1212,10 +1235,13 @@ def _batch_write_internal( pickle_on_failure = self.resolve_defaults( "pickle_on_failure", proto_cfg, global_default=False, existing_value=pickle_on_failure, **kwargs ) + norm_failure_options_msg = kwargs.get("norm_failure_options_msg", self.norm_failure_options_msg_write) + + # metadata_vector used to be type-hinted as an Iterable, so handle this case in case anyone is relying on it if metadata_vector is None: - metadata_itr = itertools.repeat(None) + metadata_vector = len(symbols) * [None] else: - metadata_itr = iter(metadata_vector) + metadata_vector = list(metadata_vector) for idx in 
range(len(symbols)): _handle_categorical_columns(symbols[idx], data_vector[idx], False) @@ -1224,10 +1250,11 @@ def _batch_write_internal( self._try_normalize( symbols[idx], data_vector[idx], - next(metadata_itr), - pickle_on_failure=pickle_on_failure, - dynamic_strings=dynamic_strings, - coerce_columns=None, + metadata_vector[idx], + pickle_on_failure, + dynamic_strings, + None, + norm_failure_options_msg, ) for idx in range(len(symbols)) ] @@ -1238,11 +1265,11 @@ def _batch_write_internal( symbols, items, norm_metas, udms, prune_previous_version, validate_index, throw_on_error ) write_results = [] - for result in cxx_versioned_items: + for idx, result in enumerate(cxx_versioned_items): if isinstance(result, DataError): write_results.append(result) else: - write_results.append(self._convert_thin_cxx_item_to_python(result)) + write_results.append(self._convert_thin_cxx_item_to_python(result, metadata_vector[idx])) return write_results def _batch_write_metadata_to_versioned_items( @@ -1257,11 +1284,11 @@ def _batch_write_metadata_to_versioned_items( symbols, normalized_meta, prune_previous_version, throw_on_error ) write_metadata_results = [] - for result in cxx_versioned_items: + for idx, result in enumerate(cxx_versioned_items): if isinstance(result, DataError): write_metadata_results.append(result) else: - write_metadata_results.append(self._convert_thin_cxx_item_to_python(result)) + write_metadata_results.append(self._convert_thin_cxx_item_to_python(result, metadata_vector[idx])) return write_metadata_results def batch_write_metadata( @@ -1371,10 +1398,11 @@ def _batch_append_to_versioned_items( ) dynamic_strings = self._resolve_dynamic_strings(kwargs) + # metadata_vector used to be type-hinted as an Iterable, so handle this case in case anyone is relying on it if metadata_vector is None: - metadata_itr = itertools.repeat(None) + metadata_vector = len(symbols) * [None] else: - metadata_itr = iter(metadata_vector) + metadata_vector = list(metadata_vector) for idx in range(len(symbols)): _handle_categorical_columns(symbols[idx], data_vector[idx]) @@ -1383,10 +1411,11 @@ def _batch_append_to_versioned_items( self._try_normalize( symbols[idx], data_vector[idx], - next(metadata_itr), - dynamic_strings=dynamic_strings, - pickle_on_failure=False, - coerce_columns=None, + metadata_vector[idx], + False, + dynamic_strings, + None, + self.norm_failure_options_msg_append, ) for idx in range(len(symbols)) ] @@ -1407,11 +1436,11 @@ def _batch_append_to_versioned_items( throw_on_error, ) append_results = [] - for result in cxx_versioned_items: + for idx, result in enumerate(cxx_versioned_items): if isinstance(result, DataError): append_results.append(result) else: - append_results.append(self._convert_thin_cxx_item_to_python(result)) + append_results.append(self._convert_thin_cxx_item_to_python(result, metadata_vector[idx])) return append_results def batch_restore_version( @@ -2686,7 +2715,7 @@ def write_metadata( ) udm = normalize_metadata(metadata) if metadata is not None else None v = self.version_store.write_metadata(symbol, udm, prune_previous_version) - return self._convert_thin_cxx_item_to_python(v) + return self._convert_thin_cxx_item_to_python(v, metadata) def is_symbol_fragmented(self, symbol: str, segment_size: Optional[int] = None) -> bool: """ diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index a50d0d8fd4..f3eb1a55f9 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -10,10 +10,11 @@ 
import pytz from enum import Enum, auto -from typing import Optional, Any, Tuple, Dict, AnyStr, Union, List, Iterable, NamedTuple +from typing import Optional, Any, Tuple, Dict, Union, List, Iterable, NamedTuple from numpy import datetime64 -from arcticdb.options import LibraryOptions +from arcticdb.options import \ + LibraryOptions, EnterpriseLibraryOptions, ModifiableLibraryOption, ModifiableEnterpriseLibraryOption from arcticdb.supported_types import Timestamp from arcticdb.util._versions import IS_PANDAS_TWO @@ -335,6 +336,7 @@ def __contains__(self, symbol: str): return self.has_symbol(symbol) def options(self) -> LibraryOptions: + """Library options set on this library. See also `enterprise_options`.""" write_options = self._nvs.lib_cfg().lib_desc.version.write_options return LibraryOptions( dynamic_schema=write_options.dynamic_schema, @@ -344,6 +346,14 @@ def options(self) -> LibraryOptions: encoding_version=self._nvs.lib_cfg().lib_desc.version.encoding_version, ) + def enterprise_options(self) -> EnterpriseLibraryOptions: + """Enterprise library options set on this library. See also `options` for non-enterprise options.""" + write_options = self._nvs.lib_cfg().lib_desc.version.write_options + return EnterpriseLibraryOptions( + replication=write_options.sync_passive.enabled, + background_deletion=write_options.delayed_deletes + ) + def write( self, symbol: str, @@ -450,6 +460,8 @@ def write( pickle_on_failure=False, parallel=staged, validate_index=validate_index, + norm_failure_options_msg="Using write_pickle will allow the object to be written. However, many operations " + "(such as date_range filtering and column selection) will not work on pickled data.", ) def write_pickle( @@ -596,6 +608,9 @@ def write_batch( pickle_on_failure=False, validate_index=validate_index, throw_on_error=throw_on_error, + norm_failure_options_msg="Using write_pickle_batch will allow the object to be written. However, many " + "operations (such as date_range filtering and column selection) will not work on " + "pickled data.", ) def write_pickle_batch( @@ -809,6 +824,11 @@ def update( prune_previous_versions Removes previous (non-snapshotted) versions from the database when True. + Returns + ------- + VersionedItem + Structure containing metadata and version number of the written symbol in the store. 
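Since _convert_thin_cxx_item_to_python now receives the metadata, write/append/update (and their batch variants) return a VersionedItem whose metadata field echoes what was passed in, rather than None. A small sketch of the expected behaviour, assuming lib is an existing Library handle and the symbol name is illustrative:

    import pandas as pd

    meta = {"source": "example"}
    vit = lib.write("sym_meta", pd.DataFrame({"a": [1, 2, 3]}), metadata=meta)
    # The returned VersionedItem now carries the metadata that was written.
    assert vit.metadata == meta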
+ Examples -------- diff --git a/python/benchmarks/basic_functions.py b/python/benchmarks/basic_functions.py index aa3d244b82..fc64e0396c 100644 --- a/python/benchmarks/basic_functions.py +++ b/python/benchmarks/basic_functions.py @@ -12,17 +12,24 @@ from .common import * +# Common parameters between BasicFunctions and ModificationFunctions +WIDE_DF_ROWS = 5_000 +WIDE_DF_COLS = 30_000 +# PARAMS = ([1_000, 1_500], [50, 100]) +PARAMS = ([100_000, 150_000], [500, 1000]) +PARAM_NAMES = ["rows", "num_symbols"] + + class BasicFunctions: number = 5 timeout = 6000 CONNECTION_STRING = "lmdb://basic_functions?map_size=20GB" - WIDE_DF_ROWS = 5_000 - WIDE_DF_COLS = 30_000 + WIDE_DF_ROWS = WIDE_DF_ROWS + WIDE_DF_COLS = WIDE_DF_COLS DATE_RANGE = pd.date_range("2023-01-01", "2023-01-01") - # params = ([1_000, 15_00], [50, 100]) - params = ([100_000, 150_000], [500, 1000]) - param_names = ["rows", "num_symbols"] + params = PARAMS + param_names = PARAM_NAMES def setup_cache(self): self.ac = Arctic(BasicFunctions.CONNECTION_STRING) @@ -179,3 +186,107 @@ def peakmem_read_batch_with_date_ranges(self, rows, num_symbols): for sym in range(num_symbols) ] self.lib.read_batch(read_reqs) + + +from shutil import copytree, rmtree +class ModificationFunctions: + """ + Modification functions (update, append, delete) need a different setup/teardown process, thus we place them in a + separate group. + """ + number = 1 # We do a single run between setup and teardown because we e.g. can't delete a symbol twice + timeout = 6000 + ARCTIC_DIR = "modification_functions" + ARCTIC_DIR_ORIGINAL = "modification_functions_original" + CONNECTION_STRING = f"lmdb://{ARCTIC_DIR}?map_size=20GB" + WIDE_DF_ROWS = WIDE_DF_ROWS + WIDE_DF_COLS = WIDE_DF_COLS + + params = PARAMS + param_names = PARAM_NAMES + + def setup_cache(self): + self.ac = Arctic(ModificationFunctions.CONNECTION_STRING) + num_rows, num_symbols = ModificationFunctions.params + + self.init_dfs = {rows: generate_pseudo_random_dataframe(rows) for rows in num_rows} + for rows in num_rows: + lib_name = get_prewritten_lib_name(rows) + self.ac.delete_library(lib_name) + lib = self.ac.create_library(lib_name) + payloads = [WritePayload(f"{sym}_sym", self.init_dfs[rows]) for sym in range(num_symbols[-1])] + lib.write_batch(payloads) + + lib_name = get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS) + self.ac.delete_library(lib_name) + lib = self.ac.create_library(lib_name) + lib.write( + "short_wide_sym", + generate_random_floats_dataframe_with_index( + ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS + ), + ) + + # We use the fact that we're running on LMDB to store a copy of the initial arctic directory. + # Then on each teardown we restore the initial state by overwriting the modified with the original. 
+ copytree(ModificationFunctions.ARCTIC_DIR, ModificationFunctions.ARCTIC_DIR_ORIGINAL) + + + def setup(self, rows, num_symbols): + def get_time_at_fraction_of_df(fraction, rows=rows): + end_time = pd.Timestamp("1/1/2023") + time_delta = pd.tseries.offsets.DateOffset(seconds=round(rows * (fraction-1))) + return end_time + time_delta + + self.df_update_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(0.5)) + self.df_update_half = generate_pseudo_random_dataframe(rows//2, "s", get_time_at_fraction_of_df(0.75)) + self.df_update_upsert = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(1.5)) + self.df_append_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(1.1)) + self.df_append_large = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(2)) + + self.df_update_short_wide = generate_random_floats_dataframe_with_index( + ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS + ) + self.df_append_short_wide = generate_random_floats_dataframe_with_index( + ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS, "s", get_time_at_fraction_of_df(2, rows=ModificationFunctions.WIDE_DF_ROWS) + ) + + self.ac = Arctic(ModificationFunctions.CONNECTION_STRING) + self.lib = self.ac[get_prewritten_lib_name(rows)] + self.lib_short_wide = self.ac[get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS)] + + + def teardown(self, rows, num_symbols): + # After the modification functions clean up the changes by replacing the modified ARCTIC_DIR with the original ARCTIC_DIR_ORIGINAL + # TODO: We can use dirs_exist_ok=True on copytree instead of removing first if we run with python version >=3.8 + rmtree(ModificationFunctions.ARCTIC_DIR) + copytree(ModificationFunctions.ARCTIC_DIR_ORIGINAL, ModificationFunctions.ARCTIC_DIR) + del self.ac + + + def time_update_single(self, rows, num_symbols): + [self.lib.update(f"{sym}_sym", self.df_update_single) for sym in range(num_symbols)] + + def time_update_half(self, rows, num_symbols): + [self.lib.update(f"{sym}_sym", self.df_update_half) for sym in range(num_symbols)] + + def time_update_upsert(self, rows, num_symbols): + [self.lib.update(f"{sym}_sym", self.df_update_upsert, upsert=True) for sym in range(num_symbols)] + + def time_update_short_wide(self, rows, num_symbols): + self.lib_short_wide.update("short_wide_sym", self.df_update_short_wide) + + def time_append_single(self, rows, num_symbols): + [self.lib.append(f"{sym}_sym", self.df_append_single) for sym in range(num_symbols)] + + def time_append_large(self, rows, num_symbols): + [self.lib.append(f"{sym}_sym", self.df_append_large) for sym in range(num_symbols)] + + def time_append_short_wide(self, rows, num_symbols): + self.lib_short_wide.append("short_wide_sym", self.df_append_short_wide) + + def time_delete(self, rows, num_symbols): + [self.lib.delete(f"{sym}_sym") for sym in range(num_symbols)] + + def time_delete_short_wide(self, rows, num_symbols): + self.lib_short_wide.delete("short_wide_sym") diff --git a/python/benchmarks/common.py b/python/benchmarks/common.py index d05f888acd..e538309b27 100644 --- a/python/benchmarks/common.py +++ b/python/benchmarks/common.py @@ -37,6 +37,13 @@ def generate_random_floats_dataframe(num_rows, num_cols): return pd.DataFrame(data, columns=columns) +def generate_random_floats_dataframe_with_index(num_rows, num_cols, freq="s", end_timestamp="1/1/2023"): + timestamps = pd.date_range(end=end_timestamp, periods=num_rows, freq=freq) + df = 
generate_random_floats_dataframe(num_rows, num_cols) + df.index = timestamps + return df + + def generate_benchmark_df(n, freq="min", end_timestamp="1/1/2023"): timestamps = pd.date_range(end=end_timestamp, periods=n, freq=freq) k = n // 10 diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 4efda54a8a..89d20a3f23 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -23,16 +23,16 @@ from arcticdb.storage_fixtures.api import StorageFixture from arcticdb.storage_fixtures.azure import AzuriteStorageFixtureFactory from arcticdb.storage_fixtures.lmdb import LmdbStorageFixture -from arcticdb.storage_fixtures.s3 import MotoS3StorageFixtureFactory, real_s3_from_environment_variables, mock_s3_with_error_simulation +from arcticdb.storage_fixtures.s3 import ( + MotoS3StorageFixtureFactory, + real_s3_from_environment_variables, + mock_s3_with_error_simulation, +) from arcticdb.storage_fixtures.mongo import auto_detect_server from arcticdb.storage_fixtures.in_memory import InMemoryStorageFixture from arcticdb.version_store._normalization import MsgPackNormalizer from arcticdb.util.test import create_df -from tests.util.mark import ( - AZURE_TESTS_MARK, - MONGO_TESTS_MARK, - REAL_S3_TESTS_MARK, -) +from tests.util.mark import AZURE_TESTS_MARK, MONGO_TESTS_MARK, REAL_S3_TESTS_MARK # region =================================== Misc. Constants & Setup ==================================== hypothesis.settings.register_profile("ci_linux", max_examples=100) @@ -211,6 +211,7 @@ def arctic_client_no_lmdb(request, encoding_version): assert not ac.list_libraries() return ac + @pytest.fixture def arctic_library(arctic_client, lib_name): return arctic_client.create_library(lib_name) @@ -363,11 +364,7 @@ def object_version_store_prune_previous(object_store_factory): @pytest.fixture( - scope="function", - params=[ - "s3_store_factory", - pytest.param("azure_store_factory", marks=AZURE_TESTS_MARK), - ], + scope="function", params=["s3_store_factory", pytest.param("azure_store_factory", marks=AZURE_TESTS_MARK)] ) def local_object_store_factory(request): """ @@ -396,12 +393,7 @@ def local_object_version_store_prune_previous(local_object_store_factory): return local_object_store_factory(prune_previous_version=True) -@pytest.fixture( - params=[ - "version_store_factory", - pytest.param("real_s3_store_factory", marks=REAL_S3_TESTS_MARK), - ], -) +@pytest.fixture(params=["version_store_factory", pytest.param("real_s3_store_factory", marks=REAL_S3_TESTS_MARK)]) def version_store_and_real_s3_basic_store_factory(request): """ Just the version_store and real_s3 specifically for the test test_interleaved_store_read @@ -415,7 +407,7 @@ def version_store_and_real_s3_basic_store_factory(request): "version_store_factory", "in_memory_store_factory", pytest.param("real_s3_store_factory", marks=REAL_S3_TESTS_MARK), - ], + ] ) def basic_store_factory(request): store_factory = request.getfixturevalue(request.param) @@ -459,13 +451,7 @@ def lmdb_version_store_v2(version_store_factory, lib_name): return version_store_factory(dynamic_strings=True, encoding_version=int(EncodingVersion.V2), name=library_name) -@pytest.fixture( - scope="function", - params=( - "lmdb_version_store_v1", - "lmdb_version_store_v2", - ) -) +@pytest.fixture(scope="function", params=("lmdb_version_store_v1", "lmdb_version_store_v2")) def lmdb_version_store(request): yield request.getfixturevalue(request.param) @@ -510,16 +496,54 @@ def lmdb_version_store_dynamic_schema( raise ValueError(f"Unexpected encoding version: 
{encoding_version}") +@pytest.fixture +def lmdb_version_store_empty_types_v1(version_store_factory, lib_name): + library_name = lib_name + "_v1" + return version_store_factory(dynamic_strings=True, empty_types=True, name=library_name) + + +@pytest.fixture +def lmdb_version_store_empty_types_v2(version_store_factory, lib_name): + library_name = lib_name + "_v2" + return version_store_factory( + dynamic_strings=True, empty_types=True, encoding_version=int(EncodingVersion.V2), name=library_name + ) + + +@pytest.fixture +def lmdb_version_store_empty_types_dynamic_schema_v1(version_store_factory, lib_name): + library_name = lib_name + "_v1" + return version_store_factory(dynamic_strings=True, empty_types=True, dynamic_schema=True, name=library_name) + + +@pytest.fixture +def lmdb_version_store_empty_types_dynamic_schema_v2(version_store_factory, lib_name): + library_name = lib_name + "_v2" + return version_store_factory( + dynamic_strings=True, + empty_types=True, + dynamic_schema=True, + encoding_version=int(EncodingVersion.V2), + name=library_name, + ) + + @pytest.fixture def lmdb_version_store_delayed_deletes_v1(version_store_factory): - return version_store_factory(delayed_deletes=True, dynamic_strings=True, prune_previous_version=True) + return version_store_factory( + delayed_deletes=True, dynamic_strings=True, empty_types=True, prune_previous_version=True + ) @pytest.fixture def lmdb_version_store_delayed_deletes_v2(version_store_factory, lib_name): library_name = lib_name + "_v2" return version_store_factory( - dynamic_strings=True, delayed_deletes=True, encoding_version=int(EncodingVersion.V2), name=library_name + dynamic_strings=True, + delayed_deletes=True, + empty_types=True, + encoding_version=int(EncodingVersion.V2), + name=library_name, ) @@ -717,10 +741,10 @@ def get_df(ts, width, max_col_width): @pytest.fixture( scope="function", params=( - "lmdb_version_store_v1", - "lmdb_version_store_v2", - "lmdb_version_store_dynamic_schema_v1", - "lmdb_version_store_dynamic_schema_v2", + "lmdb_version_store_empty_types_v1", + "lmdb_version_store_empty_types_v2", + "lmdb_version_store_empty_types_dynamic_schema_v1", + "lmdb_version_store_empty_types_dynamic_schema_v2", ), ) def lmdb_version_store_static_and_dynamic(request): diff --git a/python/tests/integration/arcticdb/test_arctic.py b/python/tests/integration/arcticdb/test_arctic.py index ff1c7b6784..b202685095 100644 --- a/python/tests/integration/arcticdb/test_arctic.py +++ b/python/tests/integration/arcticdb/test_arctic.py @@ -9,7 +9,6 @@ import sys import pytz import math -import re import pytest import pandas as pd import numpy as np @@ -19,10 +18,8 @@ from arcticdb_ext.storage import NoDataFoundException from arcticdb.exceptions import ArcticDbNotYetImplemented, LibraryNotFound, MismatchingLibraryOptions from arcticdb.arctic import Arctic -from arcticdb.options import LibraryOptions -from arcticdb.encoding_version import EncodingVersion +from arcticdb.options import LibraryOptions, EnterpriseLibraryOptions from arcticdb import QueryBuilder -from arcticc.pb2.s3_storage_pb2 import Config as S3Config from arcticdb.storage_fixtures.api import StorageFixture, ArcticUriFields, StorageFixtureFactory from arcticdb.storage_fixtures.mongo import MongoDatabase from arcticdb.util.test import assert_frame_equal @@ -37,262 +34,6 @@ from tests.util.mark import AZURE_TESTS_MARK, MONGO_TESTS_MARK, REAL_S3_TESTS_MARK -def test_library_creation_deletion(arctic_client): - ac = arctic_client - assert ac.list_libraries() == [] - 
ac.create_library("pytest_test_lib") - with pytest.raises(ValueError): - ac.create_library("pytest_test_lib") - - assert ac.list_libraries() == ["pytest_test_lib"] - assert ac.has_library("pytest_test_lib") - assert "pytest_test_lib" in ac - if "mongo" in arctic_client.get_uri(): - # The mongo fixture uses PrefixingLibraryAdapterDecorator which leaks in this one case - assert ac["pytest_test_lib"].name.endswith(".pytest_test_lib") - else: - assert ac["pytest_test_lib"].name == "pytest_test_lib" - - ac.delete_library("pytest_test_lib") - # Want this to be silent. - ac.delete_library("library_that_does_not_exist") - - assert not ac.list_libraries() - with pytest.raises(LibraryNotFound): - _lib = ac["pytest_test_lib"] - assert not ac.has_library("pytest_test_lib") - assert "pytest_test_lib" not in ac - - -def test_get_library(arctic_client): - ac = arctic_client - assert ac.list_libraries() == [] - # Throws if library doesn't exist - with pytest.raises(LibraryNotFound): - _ = ac.get_library("pytest_test_lib") - # Creates library with default options if just create_if_missing set to True - lib = ac.get_library("pytest_test_lib_default_options", create_if_missing=True) - - assert lib.options() == LibraryOptions(encoding_version=ac._encoding_version) - # Creates library with the specified options if create_if_missing set to True and options provided - library_options = LibraryOptions( - dynamic_schema=True, - dedup=True, - rows_per_segment=10, - columns_per_segment=10, - encoding_version=EncodingVersion.V1 if ac._encoding_version == EncodingVersion.V2 else EncodingVersion.V2, - ) - lib = ac.get_library("pytest_test_lib_specified_options", create_if_missing=True, library_options=library_options) - assert lib.options() == library_options - # If the library already exists, create_if_missing is True, and options are provided, then the provided options must match the existing library - library_options.dynamic_schema = False - with pytest.raises(MismatchingLibraryOptions): - _ = ac.get_library("pytest_test_lib_specified_options", create_if_missing=True, library_options=library_options) - # Throws if library_options are provided but create_if_missing is False - with pytest.raises(ArcticInvalidApiUsageException): - _ = ac.get_library("pytest_test_lib", create_if_missing=False, library_options=library_options) - - -def test_create_library_with_invalid_name(arctic_client): - ac = arctic_client - - # These should succeed because the names are valid - valid_names = ["lib", "lib/with/slash", "lib-with-dash", "lib.with.dot", "lib123"] - for lib_name in valid_names: - ac.create_library(lib_name) - - # These should fail because the names are invalid - invalid_names = [chr(0), "lib>", "lib<", "lib*", "/lib", "lib...lib", "lib"*1000] - for lib_name in invalid_names: - with pytest.raises(UserInputException): - ac.create_library(lib_name) - - # Verify that library list is not corrupted - assert set(ac.list_libraries()) == set(valid_names) - - -# TODO: Fix issue #1247, then use "arcitc_client" instead of "arctic_client_no_lmdb" -@pytest.mark.parametrize("prefix", ["", "prefix"]) -@pytest.mark.parametrize("suffix", ["", "suffix"]) -def test_create_library_with_all_chars(arctic_client_no_lmdb, prefix, suffix): - # Create library names with each character (except '\' because Azure replaces it with '/' in some cases) - names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != '\\'] - - ac = arctic_client_no_lmdb - - created_libraries = set() - for name in names: - try: - ac.create_library(name) - 
created_libraries.add(name) - # We should only fail with UserInputException (indicating that name validation failed) - except UserInputException: - pass - - assert set(ac.list_libraries()) == created_libraries - -def test_do_not_persist_s3_details(s3_storage): - """We apply an in-memory overlay for these instead. In particular we should absolutely not persist credentials - in the storage.""" - - def _get_s3_storage_config(cfg): - primary_storage_name = cfg.lib_desc.storage_ids[0] - primary_any = cfg.storage_by_id[primary_storage_name] - s3_config = S3Config() - primary_any.config.Unpack(s3_config) - return s3_config - - ac = Arctic(s3_storage.arctic_uri) - lib = ac.create_library("test") - lib.write("sym", pd.DataFrame()) - - config = ac._library_manager.get_library_config("test") - s3_storage = _get_s3_storage_config(config) - assert s3_storage.bucket_name == "" - assert s3_storage.credential_name == "" - assert s3_storage.credential_key == "" - assert s3_storage.endpoint == "" - assert s3_storage.max_connections == 0 - assert s3_storage.connect_timeout == 0 - assert s3_storage.request_timeout == 0 - assert not s3_storage.ssl - assert s3_storage.prefix.startswith("test") - assert not s3_storage.https - assert s3_storage.region == "" - assert not s3_storage.use_virtual_addressing - - assert "sym" in ac["test"].list_symbols() - - -def test_library_options(arctic_client): - ac = arctic_client - assert ac.list_libraries() == [] - ac.create_library("pytest_default_options") - lib = ac["pytest_default_options"] - assert lib.options() == LibraryOptions(encoding_version=ac._encoding_version) - write_options = lib._nvs._lib_cfg.lib_desc.version.write_options - assert not write_options.dynamic_schema - assert not write_options.de_duplication - assert write_options.segment_row_size == 100_000 - assert write_options.column_group_size == 127 - assert lib._nvs._lib_cfg.lib_desc.version.encoding_version == ac._encoding_version - - library_options = LibraryOptions( - dynamic_schema=True, dedup=True, rows_per_segment=20, columns_per_segment=3, encoding_version=EncodingVersion.V2 - ) - ac.create_library( - "pytest_explicit_options", - library_options, - ) - lib = ac["pytest_explicit_options"] - assert lib.options() == library_options - write_options = lib._nvs._lib_cfg.lib_desc.version.write_options - assert write_options.dynamic_schema - assert write_options.de_duplication - assert write_options.segment_row_size == 20 - assert write_options.column_group_size == 3 - assert write_options.dynamic_strings - assert lib._nvs._lib_cfg.lib_desc.version.encoding_version == EncodingVersion.V2 - - -def test_separation_between_libraries(arctic_client): - # This fails for mem-backed without the library caching implemented in - # issue #520 then re-implemented in issue #889 - """Validate that symbols in one library are not exposed in another.""" - ac = arctic_client - assert ac.list_libraries() == [] - - ac.create_library("pytest_test_lib") - ac.create_library("pytest_test_lib_2") - - assert set(ac.list_libraries()) == {"pytest_test_lib", "pytest_test_lib_2"} - - ac["pytest_test_lib"].write("test_1", pd.DataFrame()) - ac["pytest_test_lib_2"].write("test_2", pd.DataFrame()) - assert ac["pytest_test_lib"].list_symbols() == ["test_1"] - assert ac["pytest_test_lib_2"].list_symbols() == ["test_2"] - - -def add_path_prefix(uri, prefix): - if "path_prefix" in uri: - return uri + prefix - - if "azure" in uri: # azure connection string has a different format - return f"{uri};Path_prefix={prefix}" - else: - return 
f"{uri}&path_prefix={prefix}" - - -@pytest.mark.parametrize( - "fixture", - [ - "s3_storage", - pytest.param("azurite_storage", marks=AZURE_TESTS_MARK), - pytest.param("real_s3_storage", marks=REAL_S3_TESTS_MARK), - ], -) -def test_separation_between_libraries_with_prefixes(fixture, request): - """The motivation for the prefix feature is that separate users want to be able to create libraries - with the same name in the same bucket without over-writing each other's work. This can be useful when - creating a new bucket is time-consuming, for example due to organizational issues. - """ - storage_fixture: StorageFixture = request.getfixturevalue(fixture) - - mercury_uri = add_path_prefix(storage_fixture.arctic_uri, "/planet/mercury") - ac_mercury = Arctic(mercury_uri) - - mars_uri = add_path_prefix(storage_fixture.arctic_uri, "/planet/mars") - ac_mars = Arctic(mars_uri) - - assert ac_mars.list_libraries() == [] - ac_mercury.create_library("pytest_test_lib") - ac_mercury.create_library("pytest_test_lib_2") - ac_mars.create_library("pytest_test_lib") - ac_mars.create_library("pytest_test_lib_2") - assert ac_mercury.list_libraries() == ["pytest_test_lib", "pytest_test_lib_2"] - assert ac_mars.list_libraries() == ["pytest_test_lib", "pytest_test_lib_2"] - - ac_mercury["pytest_test_lib"].write("test_1", pd.DataFrame()) - ac_mars["pytest_test_lib"].write("test_2", pd.DataFrame()) - - assert ac_mercury["pytest_test_lib"].list_symbols() == ["test_1"] - assert ac_mars["pytest_test_lib"].list_symbols() == ["test_2"] - - ac_mercury.delete_library("pytest_test_lib") - ac_mercury.delete_library("pytest_test_lib_2") - - ac_mars.delete_library("pytest_test_lib") - ac_mars.delete_library("pytest_test_lib_2") - - -@pytest.mark.parametrize("fixture", ["s3_storage", pytest.param("azurite_storage", marks=AZURE_TESTS_MARK)]) -def test_library_management_path_prefix(fixture, request): - storage_fixture: StorageFixture = request.getfixturevalue(fixture) - uri = add_path_prefix(storage_fixture.arctic_uri, "hello/world") - ac = Arctic(uri) - assert ac.list_libraries() == [] - - ac.create_library("pytest_test_lib") - - ac["pytest_test_lib"].write("test_1", pd.DataFrame()) - ac["pytest_test_lib"].write("test_2", pd.DataFrame()) - - assert sorted(ac["pytest_test_lib"].list_symbols()) == ["test_1", "test_2"] - - ac["pytest_test_lib"].snapshot("test_snapshot") - assert ac["pytest_test_lib"].list_snapshots() == {"test_snapshot": None} - - keys = list(storage_fixture.iter_underlying_object_names()) - assert all(k.startswith("hello/world") for k in keys) - assert any(k.startswith("hello/world/_arctic_cfg") for k in keys) - assert any(k.startswith("hello/world/pytest_test_lib") for k in keys) - - ac.delete_library("pytest_test_lib") - - assert not ac.list_libraries() - with pytest.raises(LibraryNotFound): - _lib = ac["pytest_test_lib"] - def test_basic_write_read_update_and_append(arctic_library): lib = arctic_library @@ -1118,3 +859,28 @@ def test_s3_force_uri_lib_config_handling(s3_storage): with pytest.raises(ValueError): Arctic(s3_storage.arctic_uri + "&force_uri_lib_config=false") + + +# See test of same name in test_normalization.py for V1 API equivalent +def test_norm_failure_error_message(arctic_library): + lib = arctic_library + sym = "test_norm_failure_error_message" + col_name = "My unnormalizable column" + df = pd.DataFrame({col_name: [1, [1, 2]]}) + with pytest.raises(ArcticDbNotYetImplemented) as write_exception: + lib.write(sym, df) + with pytest.raises(ArcticDbNotYetImplemented) as write_batch_exception: + 
lib.write_batch([WritePayload(sym, df)]) + with pytest.raises(ArcticDbNotYetImplemented) as append_exception: + lib.append(sym, df) + with pytest.raises(ArcticDbNotYetImplemented) as append_batch_exception: + lib.append_batch([WritePayload(sym, df)]) + with pytest.raises(ArcticDbNotYetImplemented) as update_exception: + lib.update(sym, df) + + assert all(col_name in str(e.value) for e in + [write_exception, write_batch_exception, append_exception, append_batch_exception, update_exception]) + assert "write_pickle" in str(write_exception.value) and "pickle_on_failure" not in str(write_exception.value) + assert "write_pickle_batch" in str(write_batch_exception.value) and "pickle_on_failure" not in str(write_batch_exception.value) + assert all("write_pickle" not in str(e.value) for e in + [append_exception, append_batch_exception, update_exception]) diff --git a/python/tests/integration/arcticdb/test_arctic_library_management.py b/python/tests/integration/arcticdb/test_arctic_library_management.py new file mode 100644 index 0000000000..b9cf9125df --- /dev/null +++ b/python/tests/integration/arcticdb/test_arctic_library_management.py @@ -0,0 +1,442 @@ +""" +Copyright 2024 Man Group Operations Limited + +Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + +As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. +""" +import pytest +import pandas as pd + +from arcticdb_ext.exceptions import InternalException, UserInputException +from arcticdb_ext.storage import KeyType +from arcticdb.exceptions import ArcticDbNotYetImplemented, LibraryNotFound, MismatchingLibraryOptions +from arcticdb.arctic import Arctic +from arcticdb.options import LibraryOptions, EnterpriseLibraryOptions +from arcticdb.encoding_version import EncodingVersion +from arcticc.pb2.s3_storage_pb2 import Config as S3Config +from arcticdb.storage_fixtures.api import StorageFixture, ArcticUriFields, StorageFixtureFactory +from arcticdb.version_store.library import ( + WritePayload, + ArcticUnsupportedDataTypeException, + ReadRequest, + StagedDataFinalizeMethod, + ArcticInvalidApiUsageException, +) + +from tests.util.mark import AZURE_TESTS_MARK, MONGO_TESTS_MARK, REAL_S3_TESTS_MARK + +from arcticdb.options import ModifiableEnterpriseLibraryOption, ModifiableLibraryOption + + +def test_library_creation_deletion(arctic_client): + ac = arctic_client + assert ac.list_libraries() == [] + ac.create_library("pytest_test_lib") + with pytest.raises(ValueError): + ac.create_library("pytest_test_lib") + + assert ac.list_libraries() == ["pytest_test_lib"] + assert ac.has_library("pytest_test_lib") + assert "pytest_test_lib" in ac + if "mongo" in arctic_client.get_uri(): + # The mongo fixture uses PrefixingLibraryAdapterDecorator which leaks in this one case + assert ac["pytest_test_lib"].name.endswith(".pytest_test_lib") + else: + assert ac["pytest_test_lib"].name == "pytest_test_lib" + + ac.delete_library("pytest_test_lib") + # Want this to be silent. 
+ ac.delete_library("library_that_does_not_exist") + + assert not ac.list_libraries() + with pytest.raises(LibraryNotFound): + _lib = ac["pytest_test_lib"] + assert not ac.has_library("pytest_test_lib") + assert "pytest_test_lib" not in ac + + +def test_get_library(arctic_client): + ac = arctic_client + assert ac.list_libraries() == [] + # Throws if library doesn't exist + with pytest.raises(LibraryNotFound): + _ = ac.get_library("pytest_test_lib") + # Creates library with default options if just create_if_missing set to True + lib = ac.get_library("pytest_test_lib_default_options", create_if_missing=True) + + assert lib.options() == LibraryOptions(encoding_version=ac._encoding_version) + # Creates library with the specified options if create_if_missing set to True and options provided + library_options = LibraryOptions( + dynamic_schema=True, + dedup=True, + rows_per_segment=10, + columns_per_segment=10, + encoding_version=EncodingVersion.V1 if ac._encoding_version == EncodingVersion.V2 else EncodingVersion.V2, + ) + lib = ac.get_library("pytest_test_lib_specified_options", create_if_missing=True, library_options=library_options) + assert lib.options() == library_options + + # If the library already exists, create_if_missing is True, and options are provided, then the provided options must match the existing library + library_options.dynamic_schema = False + with pytest.raises(MismatchingLibraryOptions): + _ = ac.get_library("pytest_test_lib_specified_options", create_if_missing=True, library_options=library_options) + # Throws if library_options are provided but create_if_missing is False + with pytest.raises(ArcticInvalidApiUsageException): + _ = ac.get_library("pytest_test_lib", create_if_missing=False, library_options=library_options) + + +def test_create_library_enterprise_options_defaults(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + enterprise_options = lib.enterprise_options() + assert not enterprise_options.replication + assert not enterprise_options.background_deletion + + +def test_create_library_enterprise_options_set(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib", enterprise_library_options=EnterpriseLibraryOptions(replication=True, + background_deletion=True)) + + enterprise_options = lib.enterprise_options() + assert enterprise_options.replication + assert enterprise_options.background_deletion + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.sync_passive.enabled + assert proto_options.delayed_deletes + + +def test_create_library_replication_option_set_writes_logs(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib", enterprise_library_options=EnterpriseLibraryOptions(replication=True)) + lt = lib._nvs.library_tool() + assert not lt.find_keys(KeyType.LOG) + + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) + lib.write("abc", df) + + assert len(lt.find_keys(KeyType.LOG)) + + +def test_create_library_background_deletion_option_set_does_not_delete(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib", enterprise_library_options=EnterpriseLibraryOptions(background_deletion=True)) + lt = lib._nvs.library_tool() + + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) + lib.write("abc", df) + lib.delete("abc") + + assert len(lt.find_keys(KeyType.TABLE_DATA)) + + +def test_modify_options_affect_in_memory_lib(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = 
ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, True) + ac.modify_library_option(lib, ModifiableLibraryOption.DEDUP, False) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.sync_passive.enabled + assert not proto_options.de_duplication + + +def test_modify_options_affect_persistent_lib_config(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, True) + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.BACKGROUND_DELETION, True) + + new_client = Arctic(ac.get_uri()) + new_lib = new_client["lib"] + proto_options = new_lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.sync_passive.enabled + assert proto_options.delayed_deletes + + +def test_modify_options_dedup(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableLibraryOption.DEDUP, False) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert not proto_options.de_duplication + + ac.modify_library_option(lib, ModifiableLibraryOption.DEDUP, True) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.de_duplication + + +def test_modify_options_rows_per_segment(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableLibraryOption.ROWS_PER_SEGMENT, 100) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.segment_row_size == 100 + + ac.modify_library_option(lib, ModifiableLibraryOption.ROWS_PER_SEGMENT, 200) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.segment_row_size == 200 + + +def test_modify_options_cols_per_segment(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableLibraryOption.COLUMNS_PER_SEGMENT, 100) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.column_group_size == 100 + + ac.modify_library_option(lib, ModifiableLibraryOption.COLUMNS_PER_SEGMENT, 200) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.column_group_size == 200 + + +def test_modify_options_replication(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, True) + + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert proto_options.sync_passive.enabled + + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) + lib.write("abc", df) + + lt = lib._nvs.library_tool() + assert len(lt.find_keys(KeyType.LOG)) == 1 + + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, False) + proto_options = lib._nvs.lib_cfg().lib_desc.version.write_options + assert not proto_options.sync_passive.enabled + + lib.write("def", df) + assert len(lt.find_keys(KeyType.LOG)) == 1 + + +def test_modify_options_background_deletion(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("lib") + lt = lib._nvs.library_tool() + + ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.BACKGROUND_DELETION, True) + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) + lib.write("abc", df) + lib.delete("abc") + + 
assert len(lt.find_keys(KeyType.TABLE_DATA)) + + +def test_create_library_with_invalid_name(arctic_client): + ac = arctic_client + + # These should succeed because the names are valid + valid_names = ["lib", "lib/with/slash", "lib-with-dash", "lib.with.dot", "lib123"] + for lib_name in valid_names: + ac.create_library(lib_name) + + # These should fail because the names are invalid + invalid_names = [chr(0), "lib>", "lib<", "lib*", "/lib", "lib...lib", "lib"*1000] + for lib_name in invalid_names: + with pytest.raises(UserInputException): + ac.create_library(lib_name) + + # Verify that library list is not corrupted + assert set(ac.list_libraries()) == set(valid_names) + + +# TODO: Fix issue #1247, then use "arctic_client" instead of "arctic_client_no_lmdb" +@pytest.mark.parametrize("prefix", ["", "prefix"]) +@pytest.mark.parametrize("suffix", ["", "suffix"]) +def test_create_library_with_all_chars(arctic_client_no_lmdb, prefix, suffix): + # Create library names with each character (except '\' because Azure replaces it with '/' in some cases) + names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != '\\'] + + ac = arctic_client_no_lmdb + + created_libraries = set() + for name in names: + try: + ac.create_library(name) + created_libraries.add(name) + # We should only fail with UserInputException (indicating that name validation failed) + except UserInputException: + pass + + assert set(ac.list_libraries()) == created_libraries + + +def test_do_not_persist_s3_details(s3_storage): + """We apply an in-memory overlay for these instead. In particular we should absolutely not persist credentials + in the storage.""" + + def _get_s3_storage_config(cfg): + primary_storage_name = cfg.lib_desc.storage_ids[0] + primary_any = cfg.storage_by_id[primary_storage_name] + s3_config = S3Config() + primary_any.config.Unpack(s3_config) + return s3_config + + ac = Arctic(s3_storage.arctic_uri) + lib = ac.create_library("test") + lib.write("sym", pd.DataFrame()) + + config = ac._library_manager.get_library_config("test") + s3_storage = _get_s3_storage_config(config) + assert s3_storage.bucket_name == "" + assert s3_storage.credential_name == "" + assert s3_storage.credential_key == "" + assert s3_storage.endpoint == "" + assert s3_storage.max_connections == 0 + assert s3_storage.connect_timeout == 0 + assert s3_storage.request_timeout == 0 + assert not s3_storage.ssl + assert s3_storage.prefix.startswith("test") + assert not s3_storage.https + assert s3_storage.region == "" + assert not s3_storage.use_virtual_addressing + + assert "sym" in ac["test"].list_symbols() + + +def test_library_options(arctic_client): + ac = arctic_client + assert ac.list_libraries() == [] + ac.create_library("pytest_default_options") + lib = ac["pytest_default_options"] + assert lib.options() == LibraryOptions(encoding_version=ac._encoding_version) + write_options = lib._nvs._lib_cfg.lib_desc.version.write_options + assert not write_options.dynamic_schema + assert not write_options.de_duplication + assert write_options.segment_row_size == 100_000 + assert write_options.column_group_size == 127 + assert lib._nvs._lib_cfg.lib_desc.version.encoding_version == ac._encoding_version + + library_options = LibraryOptions( + dynamic_schema=True, dedup=True, rows_per_segment=20, columns_per_segment=3, encoding_version=EncodingVersion.V2 + ) + ac.create_library( + "pytest_explicit_options", + library_options, + ) + lib = ac["pytest_explicit_options"] + assert lib.options() == library_options + write_options = 
lib._nvs._lib_cfg.lib_desc.version.write_options + assert write_options.dynamic_schema + assert write_options.de_duplication + assert write_options.segment_row_size == 20 + assert write_options.column_group_size == 3 + assert write_options.dynamic_strings + assert lib._nvs._lib_cfg.lib_desc.version.encoding_version == EncodingVersion.V2 + + +def test_separation_between_libraries(arctic_client): + # This fails for mem-backed without the library caching implemented in + # issue #520 then re-implemented in issue #889 + """Validate that symbols in one library are not exposed in another.""" + ac = arctic_client + assert ac.list_libraries() == [] + + ac.create_library("pytest_test_lib") + ac.create_library("pytest_test_lib_2") + + assert set(ac.list_libraries()) == {"pytest_test_lib", "pytest_test_lib_2"} + + ac["pytest_test_lib"].write("test_1", pd.DataFrame()) + ac["pytest_test_lib_2"].write("test_2", pd.DataFrame()) + assert ac["pytest_test_lib"].list_symbols() == ["test_1"] + assert ac["pytest_test_lib_2"].list_symbols() == ["test_2"] + + +def add_path_prefix(uri, prefix): + if "path_prefix" in uri: + return uri + prefix + + if "azure" in uri: # azure connection string has a different format + return f"{uri};Path_prefix={prefix}" + else: + return f"{uri}&path_prefix={prefix}" + + +@pytest.mark.parametrize( + "fixture", + [ + "s3_storage", + pytest.param("azurite_storage", marks=AZURE_TESTS_MARK), + pytest.param("real_s3_storage", marks=REAL_S3_TESTS_MARK), + ], +) +def test_separation_between_libraries_with_prefixes(fixture, request): + """The motivation for the prefix feature is that separate users want to be able to create libraries + with the same name in the same bucket without over-writing each other's work. This can be useful when + creating a new bucket is time-consuming, for example due to organizational issues. 
+ """ + storage_fixture: StorageFixture = request.getfixturevalue(fixture) + + mercury_uri = add_path_prefix(storage_fixture.arctic_uri, "/planet/mercury") + ac_mercury = Arctic(mercury_uri) + + mars_uri = add_path_prefix(storage_fixture.arctic_uri, "/planet/mars") + ac_mars = Arctic(mars_uri) + + assert ac_mars.list_libraries() == [] + ac_mercury.create_library("pytest_test_lib") + ac_mercury.create_library("pytest_test_lib_2") + ac_mars.create_library("pytest_test_lib") + ac_mars.create_library("pytest_test_lib_2") + assert ac_mercury.list_libraries() == ["pytest_test_lib", "pytest_test_lib_2"] + assert ac_mars.list_libraries() == ["pytest_test_lib", "pytest_test_lib_2"] + + ac_mercury["pytest_test_lib"].write("test_1", pd.DataFrame()) + ac_mars["pytest_test_lib"].write("test_2", pd.DataFrame()) + + assert ac_mercury["pytest_test_lib"].list_symbols() == ["test_1"] + assert ac_mars["pytest_test_lib"].list_symbols() == ["test_2"] + + ac_mercury.delete_library("pytest_test_lib") + ac_mercury.delete_library("pytest_test_lib_2") + + ac_mars.delete_library("pytest_test_lib") + ac_mars.delete_library("pytest_test_lib_2") + + +@pytest.mark.parametrize("fixture", ["s3_storage", pytest.param("azurite_storage", marks=AZURE_TESTS_MARK)]) +def test_library_management_path_prefix(fixture, request): + storage_fixture: StorageFixture = request.getfixturevalue(fixture) + uri = add_path_prefix(storage_fixture.arctic_uri, "hello/world") + ac = Arctic(uri) + assert ac.list_libraries() == [] + + ac.create_library("pytest_test_lib") + + ac["pytest_test_lib"].write("test_1", pd.DataFrame()) + ac["pytest_test_lib"].write("test_2", pd.DataFrame()) + + assert sorted(ac["pytest_test_lib"].list_symbols()) == ["test_1", "test_2"] + + ac["pytest_test_lib"].snapshot("test_snapshot") + assert ac["pytest_test_lib"].list_snapshots() == {"test_snapshot": None} + + keys = list(storage_fixture.iter_underlying_object_names()) + assert all(k.startswith("hello/world") for k in keys) + assert any(k.startswith("hello/world/_arctic_cfg") for k in keys) + assert any(k.startswith("hello/world/pytest_test_lib") for k in keys) + + ac.delete_library("pytest_test_lib") + + assert not ac.list_libraries() + with pytest.raises(LibraryNotFound): + _lib = ac["pytest_test_lib"] diff --git a/python/tests/integration/arcticdb/test_lmdb.py b/python/tests/integration/arcticdb/test_lmdb.py index 476c93415b..75f52ff3b8 100644 --- a/python/tests/integration/arcticdb/test_lmdb.py +++ b/python/tests/integration/arcticdb/test_lmdb.py @@ -164,6 +164,12 @@ def test_map_size_bad_input(options): assert "Incorrect format for map_size" in str(e.value) +def test_delete_library(lmdb_storage): + ac = lmdb_storage.create_arctic() + lib = ac.create_library("library") + ac.delete_library("library") + with pytest.raises(StorageException) as e: + lib.write("sym1", pd.DataFrame()) @pytest.mark.parametrize("options", ["MAP_SIZE=1GB", "atlas_shape=1GB"]) def test_lmdb_options_unknown_option(options): diff --git a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py index 4987a5066e..e524c8b773 100644 --- a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py +++ b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py @@ -1947,6 +1947,13 @@ def test_get_index(object_version_store): assert idx.iloc[0]["version_id"] == 0 +def test_read_empty_index(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = 
"test_read_empty_index" + lib.write(sym, pd.DataFrame()) + assert len(lib.read_index(sym)) == 0 + + def test_snapshot_empty_segment(basic_store): lib = basic_store diff --git a/python/tests/integration/arcticdb/version_store/test_metadata_support.py b/python/tests/integration/arcticdb/version_store/test_metadata_support.py index cb9457ff30..83f3215a8d 100644 --- a/python/tests/integration/arcticdb/version_store/test_metadata_support.py +++ b/python/tests/integration/arcticdb/version_store/test_metadata_support.py @@ -6,6 +6,7 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ import numpy as np +import pandas as pd from pandas import DataFrame, Timestamp import pytest @@ -157,3 +158,79 @@ def test_write_metadata_preexisting_symbol_no_pruning(basic_store, sym): assert lib.read(sym).data == 1 assert lib.read(sym, as_of=0).metadata == metadata_v0 assert lib.read(sym, as_of=0).data == 1 + + +def timestamp_indexed_df(): + return pd.DataFrame({"col": [0]}, index=[pd.Timestamp("2024-01-01")]) + + +def test_rv_contains_metadata_write(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = "test_write_rv_contains_metadata" + assert lib.write(sym, 1).metadata is None + metadata = {"some": "metadata"} + assert lib.write(sym, 1, metadata).metadata == metadata + + +def test_rv_contains_metadata_append(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = "test_rv_contains_metadata_append" + assert lib.append(sym, timestamp_indexed_df(), write_if_missing=True).metadata is None + metadata = {"some": "metadata"} + assert lib.append(sym, timestamp_indexed_df(), metadata, write_if_missing=True).metadata == metadata + + +def test_rv_contains_metadata_update(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = "test_rv_contains_metadata_update" + assert lib.update(sym, timestamp_indexed_df(), upsert=True).metadata is None + metadata = {"some": "metadata"} + assert lib.update(sym, timestamp_indexed_df(), metadata, upsert=True).metadata == metadata + + +def test_rv_contains_metadata_write_metadata(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = "test_rv_contains_metadata_write_metadata" + metadata = {"some": "metadata"} + assert lib.write_metadata(sym, metadata).metadata == metadata + + +def test_rv_contains_metadata_batch_write(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym_0 = "test_rv_contains_metadata_batch_write_0" + sym_1 = "test_rv_contains_metadata_batch_write_1" + sym_2 = "test_rv_contains_metadata_batch_write_2" + vits = lib.batch_write([sym_0, sym_1, sym_2], 3 * [1]) + assert all(vit.metadata is None for vit in vits) + metadata_0 = {"some": "metadata_0"} + metadata_2 = {"some": "metadata_2"} + vits = lib.batch_write([sym_0, sym_1, sym_2], 3 * [1], [metadata_0, None, metadata_2]) + assert vits[0].metadata == metadata_0 + assert vits[1].metadata is None + assert vits[2].metadata == metadata_2 + + +def test_rv_contains_metadata_batch_append(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym_0 = "test_rv_contains_metadata_batch_append_0" + sym_1 = "test_rv_contains_metadata_batch_append_1" + sym_2 = "test_rv_contains_metadata_batch_append_2" + vits = lib.batch_append([sym_0, sym_1, sym_2], 3 * [timestamp_indexed_df()], write_if_missing=True) + assert all(vit.metadata is None for vit in vits) + metadata_0 = {"some": "metadata_0"} + metadata_2 = {"some": "metadata_2"} + vits = lib.batch_append([sym_0, sym_1, sym_2], 3 * 
[timestamp_indexed_df()], [metadata_0, None, metadata_2], write_if_missing=True) + assert vits[0].metadata == metadata_0 + assert vits[1].metadata is None + assert vits[2].metadata == metadata_2 + + +def test_rv_contains_metadata_batch_write_metadata(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym_0 = "test_rv_contains_metadata_batch_write_metadata_0" + sym_1 = "test_rv_contains_metadata_batch_write_metadata_1" + metadata_0 = {"some": "metadata_0"} + metadata_1 = {"some": "metadata_1"} + vits = lib.batch_write_metadata([sym_0, sym_1], [metadata_0, metadata_1]) + assert vits[0].metadata == metadata_0 + assert vits[1].metadata == metadata_1 diff --git a/python/tests/integration/toolbox/test_library_tool.py b/python/tests/integration/toolbox/test_library_tool.py index 4921cf4398..33ecda29eb 100644 --- a/python/tests/integration/toolbox/test_library_tool.py +++ b/python/tests/integration/toolbox/test_library_tool.py @@ -5,7 +5,6 @@ """ import pandas as pd import numpy as np -import pytest from arcticdb.util.test import sample_dataframe, populate_db from arcticdb_ext.storage import KeyType diff --git a/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py b/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py index acd525563a..34a58fbc25 100644 --- a/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py +++ b/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py @@ -204,9 +204,9 @@ def test_batch_write_unicode_strings(lmdb_version_store): (pd.Series, assert_series_equal), (pd.DataFrame, assert_frame_equal), ]) -def test_update_with_empty_series_or_dataframe(lmdb_version_store, PandasType, assert_pandas_container_equal): +def test_update_with_empty_series_or_dataframe(lmdb_version_store_empty_types_v1, PandasType, assert_pandas_container_equal): # Non-regression test for https://github.com/man-group/ArcticDB/issues/892 - lib = lmdb_version_store + lib = lmdb_version_store_empty_types_v1 kwargs = { "name": "a" } if PandasType == pd.Series else { "columns": ["a"] } data = np.array([1.0]) if PandasType == pd.Series else np.array([[1.0]]) diff --git a/python/tests/scripts/test_update_storage.py b/python/tests/scripts/test_update_storage.py index 20aba42522..919a6bb9d0 100644 --- a/python/tests/scripts/test_update_storage.py +++ b/python/tests/scripts/test_update_storage.py @@ -5,7 +5,7 @@ from arcticdb import Arctic from arcticdb.scripts.update_storage import run -from arcticdb.options import LibraryOptions +from arcticdb.options import LibraryOptions, EnterpriseLibraryOptions from arcticc.pb2.s3_storage_pb2 import Config as S3Config from arcticc.pb2.azure_storage_pb2 import Config as AzureConfig from arcticdb.storage_fixtures.api import ArcticUriFields @@ -18,8 +18,7 @@ def create_library_config(ac: Arctic, name: str): - opts = LibraryOptions() - cfg = ac._library_adapter.get_library_config(name, opts) + cfg = ac._library_adapter.get_library_config(name, LibraryOptions(), EnterpriseLibraryOptions()) ac._library_manager.write_library_config(cfg, name, test_only_validation_toggle=False) diff --git a/python/tests/unit/arcticdb/test_column_stats.py b/python/tests/unit/arcticdb/test_column_stats.py index 52d47289a3..4cdd80b320 100644 --- a/python/tests/unit/arcticdb/test_column_stats.py +++ b/python/tests/unit/arcticdb/test_column_stats.py @@ -384,8 +384,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm { "int_widening": np.arange(0, 2, dtype=np.uint8), "int_narrowing": np.arange(-1002, -1000, 
dtype=np.int16), - "unsigned_to_signed_int": np.arange(1000, 1002, dtype=np.uint16), - "signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32), + "unsigned_to_wider_signed_int": np.arange(1000, 1002, dtype=np.uint16), + "wider_signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32), + "unsigned_to_signed_int_same_width": np.arange(1000, 1002, dtype=np.uint16), "int_to_float": np.arange(1000, 1002, dtype=np.uint16), "float_to_int": [1.5, 2.5], }, @@ -396,8 +397,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm { "int_widening": np.arange(1000, 1002, dtype=np.uint16), "int_narrowing": np.arange(-2, 0, dtype=np.int8), - "unsigned_to_signed_int": np.arange(-1002, -1000, dtype=np.int32), - "signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16), + "unsigned_to_wider_signed_int": np.arange(-1002, -1000, dtype=np.int32), + "wider_signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16), + "unsigned_to_signed_int_same_width": np.arange(-1002, -1000, dtype=np.int16), "int_to_float": [1.5, 2.5], "float_to_int": np.arange(1000, 1002, dtype=np.uint16), }, @@ -420,22 +422,31 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm expected_column_stats["v1.0_MIN(int_narrowing)"] = [df0["int_narrowing"].min(), df1["int_narrowing"].min()] expected_column_stats["v1.0_MAX(int_narrowing)"] = [df0["int_narrowing"].max(), df1["int_narrowing"].max()] - expected_column_stats["v1.0_MIN(unsigned_to_signed_int)"] = [ - df0["unsigned_to_signed_int"].min(), - df1["unsigned_to_signed_int"].min(), + expected_column_stats["v1.0_MIN(unsigned_to_wider_signed_int)"] = [ + df0["unsigned_to_wider_signed_int"].min(), + df1["unsigned_to_wider_signed_int"].min(), ] - expected_column_stats["v1.0_MAX(unsigned_to_signed_int)"] = [ - df0["unsigned_to_signed_int"].max(), - df1["unsigned_to_signed_int"].max(), + expected_column_stats["v1.0_MAX(unsigned_to_wider_signed_int)"] = [ + df0["unsigned_to_wider_signed_int"].max(), + df1["unsigned_to_wider_signed_int"].max(), ] - expected_column_stats["v1.0_MIN(signed_to_unsigned_int)"] = [ - df0["signed_to_unsigned_int"].min(), - df1["signed_to_unsigned_int"].min(), + expected_column_stats["v1.0_MIN(wider_signed_to_unsigned_int)"] = [ + df0["wider_signed_to_unsigned_int"].min(), + df1["wider_signed_to_unsigned_int"].min(), ] - expected_column_stats["v1.0_MAX(signed_to_unsigned_int)"] = [ - df0["signed_to_unsigned_int"].max(), - df1["signed_to_unsigned_int"].max(), + expected_column_stats["v1.0_MAX(wider_signed_to_unsigned_int)"] = [ + df0["wider_signed_to_unsigned_int"].max(), + df1["wider_signed_to_unsigned_int"].max(), + ] + + expected_column_stats["v1.0_MIN(unsigned_to_signed_int_same_width)"] = [ + df0["unsigned_to_signed_int_same_width"].min(), + df1["unsigned_to_signed_int_same_width"].min(), + ] + expected_column_stats["v1.0_MAX(unsigned_to_signed_int_same_width)"] = [ + df0["unsigned_to_signed_int_same_width"].max(), + df1["unsigned_to_signed_int_same_width"].max(), ] expected_column_stats["v1.0_MIN(int_to_float)"] = [df0["int_to_float"].min(), df1["int_to_float"].min()] @@ -446,8 +457,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm column_stats_dict = { "int_widening": {"MINMAX"}, "int_narrowing": {"MINMAX"}, - "unsigned_to_signed_int": {"MINMAX"}, - "signed_to_unsigned_int": {"MINMAX"}, + "unsigned_to_wider_signed_int": {"MINMAX"}, + "wider_signed_to_unsigned_int": {"MINMAX"}, + "unsigned_to_signed_int_same_width": {"MINMAX"}, "int_to_float": 
{"MINMAX"}, "float_to_int": {"MINMAX"}, } @@ -462,11 +474,14 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm assert column_stats.dtypes["v1.0_MIN(int_narrowing)"] == np.int16 assert column_stats.dtypes["v1.0_MAX(int_narrowing)"] == np.int16 - assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int)"] == np.int32 - assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int)"] == np.int32 + assert column_stats.dtypes["v1.0_MIN(unsigned_to_wider_signed_int)"] == np.int32 + assert column_stats.dtypes["v1.0_MAX(unsigned_to_wider_signed_int)"] == np.int32 + + assert column_stats.dtypes["v1.0_MIN(wider_signed_to_unsigned_int)"] == np.int32 + assert column_stats.dtypes["v1.0_MAX(wider_signed_to_unsigned_int)"] == np.int32 - assert column_stats.dtypes["v1.0_MIN(signed_to_unsigned_int)"] == np.int32 - assert column_stats.dtypes["v1.0_MAX(signed_to_unsigned_int)"] == np.int32 + assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int_same_width)"] == np.int32 + assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int_same_width)"] == np.int32 assert column_stats.dtypes["v1.0_MIN(int_to_float)"] == np.float64 assert column_stats.dtypes["v1.0_MAX(int_to_float)"] == np.float64 diff --git a/python/tests/unit/arcticdb/version_store/test_aggregation.py b/python/tests/unit/arcticdb/version_store/test_aggregation.py index ed4dbdf674..e811cae25d 100644 --- a/python/tests/unit/arcticdb/version_store/test_aggregation.py +++ b/python/tests/unit/arcticdb/version_store/test_aggregation.py @@ -36,6 +36,29 @@ def test_group_on_float_column_with_nans(lmdb_version_store): assert_frame_equal(expected, received) +# TODO: Add first and last once un-feature flagged +@pytest.mark.parametrize("aggregator", ("sum", "min", "max", "mean", "count")) +def test_aggregate_float_columns_with_nans(lmdb_version_store, aggregator): + lib = lmdb_version_store + sym = "test_aggregate_float_columns_with_nans" + df = pd.DataFrame( + { + "grouping_column": 3 * ["some nans", "only nans"], + "agg_column": [1.0, np.nan, 2.0, np.nan, np.nan, np.nan], + } + ) + lib.write(sym, df) + expected = df.groupby("grouping_column").agg({"agg_column": aggregator}) + # We count in unsigned integers for obvious reasons + if aggregator == "count": + expected = expected.astype(np.uint64) + q = QueryBuilder() + q = q.groupby("grouping_column").agg({"agg_column": aggregator}) + received = lib.read(sym, query_builder=q).data + received.sort_index(inplace=True) + assert_frame_equal(expected, received) + + @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given( @@ -428,28 +451,6 @@ def test_mean_aggregation_float(local_object_version_store): assert_frame_equal(res.data, df) -def test_mean_aggregation_float_nan(lmdb_version_store): - df = DataFrame( - { - "grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], - "to_mean": [1.1, 1.4, 2.5, np.nan, 2.2], - }, - index=np.arange(5), - ) - q = QueryBuilder() - q = q.groupby("grouping_column").agg({"to_mean": "mean"}) - symbol = "test_aggregation" - lmdb_version_store.write(symbol, df) - - res = lmdb_version_store.read(symbol, query_builder=q) - - df = pd.DataFrame({"to_mean": [(1.1 + 1.4 + 2.5) / 3, np.nan]}, index=["group_1", "group_2"]) - df.index.rename("grouping_column", inplace=True) - res.data.sort_index(inplace=True) - - assert_frame_equal(res.data, df) - - def test_max_minus_one(lmdb_version_store): symbol = "minus_one" lib = lmdb_version_store diff --git 
a/python/tests/unit/arcticdb/version_store/test_column_type_changes.py b/python/tests/unit/arcticdb/version_store/test_column_type_changes.py index edf1248a2e..1a9da90a80 100644 --- a/python/tests/unit/arcticdb/version_store/test_column_type_changes.py +++ b/python/tests/unit/arcticdb/version_store/test_column_type_changes.py @@ -21,9 +21,9 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema): lib = version_store_factory(dynamic_schema=dynamic_schema) sym_append = "test_changing_numeric_type_append" sym_update = "test_changing_numeric_type_update" - df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint8)}, index=pd.date_range("2024-01-01", periods=3)) - df_append = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-04", periods=1)) - df_update = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-02", periods=1)) + df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint32)}, index=pd.date_range("2024-01-01", periods=3)) + df_append = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-04", periods=1)) + df_update = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-02", periods=1)) lib.write(sym_append, df_write) lib.write(sym_update, df_write) @@ -41,7 +41,7 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema): received_append = lib.read(sym_append).data assert_frame_equal(expected_append, received_append) - expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.uint16)}, index=pd.date_range("2024-01-01", periods=3)) + expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.int64)}, index=pd.date_range("2024-01-01", periods=3)) received_update = lib.read(sym_update).data assert_frame_equal(expected_update, received_update) diff --git a/python/tests/unit/arcticdb/version_store/test_missing_empty.py b/python/tests/unit/arcticdb/version_store/test_missing_empty.py index 7214edbe10..5b88d17f82 100644 --- a/python/tests/unit/arcticdb/version_store/test_missing_empty.py +++ b/python/tests/unit/arcticdb/version_store/test_missing_empty.py @@ -590,41 +590,41 @@ def run_test(lib, test: TestCase, action, base_test: TestCase = None): @pytest.mark.parametrize("test_case", _ROUND_TRIP_TESTS) -def test_empty_missing_round_trip_lmdb(lmdb_version_store, test_case: TestCase): - run_test(lmdb_version_store, test_case, round_trip) +def test_empty_missing_round_trip_lmdb(lmdb_version_store_empty_types, test_case: TestCase): + run_test(lmdb_version_store_empty_types, test_case, round_trip) @pytest.mark.parametrize("test_case", _ROUND_TRIP_TESTS) -def test_empty_missing_round_trip_lmdb_dynamic_schema(lmdb_version_store_dynamic_schema, test_case: TestCase): - run_test(lmdb_version_store_dynamic_schema, test_case, round_trip) +def test_empty_missing_round_trip_lmdb_dynamic_schema(lmdb_version_store_empty_types_dynamic_schema, test_case: TestCase): + run_test(lmdb_version_store_empty_types_dynamic_schema, test_case, round_trip) @pytest.mark.parametrize("test_case", _APPEND_TESTS) -def test_empty_missing_append_lmdb(lmdb_version_store, test_case: TestCase): +def test_empty_missing_append_lmdb(lmdb_version_store_empty_type, test_case: TestCase): if test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store, test_case, append, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type, test_case, append, 
_TEST_LOOKUP[test_case.base_name]) @pytest.mark.parametrize("test_case", _APPEND_TESTS) -def test_empty_missing_append_lmdb_dynamic_schema(lmdb_version_store_dynamic_schema, test_case: TestCase): +def test_empty_missing_append_lmdb_dynamic_schema(lmdb_version_store_empty_type_dynamic_schema, test_case: TestCase): if test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store_dynamic_schema, test_case, append, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type_dynamic_schema, test_case, append, _TEST_LOOKUP[test_case.base_name]) @pytest.mark.parametrize("test_case", _UPDATE_TESTS) -def test_empty_missing_update_lmdb(lmdb_version_store, test_case: TestCase): +def test_empty_missing_update_lmdb(lmdb_version_store_empty_type, test_case: TestCase): if test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store, test_case, update, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type, test_case, update, _TEST_LOOKUP[test_case.base_name]) @pytest.mark.parametrize("test_case", _UPDATE_TESTS) -def test_empty_missing_update_lmdb_dynamic_schema(lmdb_version_store_dynamic_schema, test_case: TestCase): +def test_empty_missing_update_lmdb_dynamic_schema(lmdb_version_store_empty_type_dynamic_schema, test_case: TestCase): if test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store_dynamic_schema, test_case, update, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type_dynamic_schema, test_case, update, _TEST_LOOKUP[test_case.base_name]) # to run a single test, edit the following 2 lines to contain the test and action you want to test, @@ -634,16 +634,16 @@ def test_empty_missing_update_lmdb_dynamic_schema(lmdb_version_store_dynamic_sch @pytest.mark.parametrize("test_case", _SINGLE_TEST) -def test_empty_missing_single_lmdb(lmdb_version_store, test_case: TestCase): +def test_empty_missing_single_lmdb(lmdb_version_store_empty_type, test_case: TestCase): if test_case: if test_case.base_name and test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store, test_case, _SINGLE_TEST_ACTION, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type, test_case, _SINGLE_TEST_ACTION, _TEST_LOOKUP[test_case.base_name]) @pytest.mark.parametrize("test_case", _SINGLE_TEST) -def test_empty_missing_single_lmdb_dynamic_schema(lmdb_version_store_dynamic_schema, test_case: TestCase): +def test_empty_missing_single_lmdb_dynamic_schema(lmdb_version_store_empty_type_dynamic_schema, test_case: TestCase): if test_case: if test_case.base_name and test_case.base_name not in _TEST_LOOKUP: pytest.fail(f"Base test case {test_case.base_name} not found") - run_test(lmdb_version_store_dynamic_schema, test_case, _SINGLE_TEST_ACTION, _TEST_LOOKUP[test_case.base_name]) + run_test(lmdb_version_store_empty_type_dynamic_schema, test_case, _SINGLE_TEST_ACTION, _TEST_LOOKUP[test_case.base_name]) diff --git a/python/tests/unit/arcticdb/version_store/test_normalization.py b/python/tests/unit/arcticdb/version_store/test_normalization.py index 613960fb3d..f55ffcb8fc 100644 --- a/python/tests/unit/arcticdb/version_store/test_normalization.py +++ b/python/tests/unit/arcticdb/version_store/test_normalization.py @@ -558,3 +558,28 @@ def test_pyarrow_error(lmdb_version_store): 
with pytest.raises(ArcticDbNotYetImplemented, match=error_msg_intro): lmdb_version_store.write("test_pyarrow_error_mixed_df", mixed_df) + + +# See test of same name in test_arctic.py for V2 API equivalent +def test_norm_failure_error_message(lmdb_version_store_v1): + lib = lmdb_version_store_v1 + sym = "test_norm_failure_error_message" + col_name = "My unnormalizable column" + df = pd.DataFrame({col_name: [1, [1, 2]]}) + with pytest.raises(ArcticDbNotYetImplemented) as write_exception: + lib.write(sym, df) + with pytest.raises(ArcticDbNotYetImplemented) as batch_write_exception: + lib.batch_write([sym], [df]) + with pytest.raises(ArcticDbNotYetImplemented) as append_exception: + lib.append(sym, df) + with pytest.raises(ArcticDbNotYetImplemented) as batch_append_exception: + lib.batch_append([sym], [df]) + with pytest.raises(ArcticDbNotYetImplemented) as update_exception: + lib.update(sym, df) + + assert all(col_name in str(e.value) for e in + [write_exception, batch_write_exception, append_exception, batch_append_exception, update_exception]) + assert all("pickle_on_failure" in str(e.value) for e in + [write_exception, batch_write_exception]) + assert all("pickle_on_failure" not in str(e.value) for e in + [append_exception, batch_append_exception, update_exception]) diff --git a/python/tests/unit/arcticdb/version_store/test_parallel.py b/python/tests/unit/arcticdb/version_store/test_parallel.py index 27e85e9b46..676160baf8 100644 --- a/python/tests/unit/arcticdb/version_store/test_parallel.py +++ b/python/tests/unit/arcticdb/version_store/test_parallel.py @@ -349,6 +349,23 @@ def test_parallel_append_overlapping_with_existing(lmdb_version_store): lib.compact_incomplete(sym, True, False) +@pytest.mark.parametrize("sortedness", ("DESCENDING", "UNSORTED")) +def test_parallel_append_existing_data_unsorted(lmdb_version_store, sortedness): + lib = lmdb_version_store + sym = "test_parallel_append_existing_data_unsorted" + last_index_date = "2024-01-01" if sortedness == "DESCENDING" else "2024-01-03" + df_0 = pd.DataFrame( + {"col": [1, 2, 3]}, + index=[pd.Timestamp("2024-01-04"), pd.Timestamp("2024-01-02"), pd.Timestamp(last_index_date)] + ) + lib.write(sym, df_0) + assert lib.get_info(sym)["sorted"] == sortedness + df_1 = pd.DataFrame({"col": [3, 4]}, index=[pd.Timestamp("2024-01-05"), pd.Timestamp("2024-01-06")]) + lib.append(sym, df_1, incomplete=True) + with pytest.raises(SortingException): + lib.compact_incomplete(sym, True, False) + + @pytest.mark.xfail(reason="See https://github.com/man-group/ArcticDB/issues/1250") @pytest.mark.parametrize("append", (True, False)) def test_parallel_dynamic_schema_compatible_types(lmdb_version_store_dynamic_schema, append): diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder.py b/python/tests/unit/arcticdb/version_store/test_query_builder.py index bab3019b06..2e505c71e9 100644 --- a/python/tests/unit/arcticdb/version_store/test_query_builder.py +++ b/python/tests/unit/arcticdb/version_store/test_query_builder.py @@ -11,6 +11,7 @@ from arcticdb.version_store.processing import QueryBuilder from arcticdb.util.test import assert_frame_equal +from arcticdb_ext.exceptions import SchemaException def test_reuse_querybuilder(lmdb_version_store_tiny_segment): @@ -403,3 +404,64 @@ def test_querybuilder_pickling(): import pickle assert pickle.loads(pickle.dumps(q)) == q + + +# Remove the following test and replace with more extensive ones once this issue is fixed: +# https://github.com/man-group/ArcticDB/issues/1404 +def 
test_query_builder_sparse(lmdb_version_store): + lib = lmdb_version_store + sym = "test_filter_sparse" + df = pd.DataFrame( + { + "sparse_col": [0.0, np.nan, 0.0], + "dense_col": [0.0, 1.0, 2.0] + }, + index=pd.date_range("2024-01-01", periods=3) + ) + lib.write(sym, df, sparsify_floats=True) + + # Filters + # These 2 filters exercise different code paths + q = QueryBuilder() + q = q[q["sparse_col"].isnull()] + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + q = QueryBuilder() + q = q[q["sparse_col"] == np.float64(0.0)] + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Projections + q = QueryBuilder().apply("projected_col", q["sparse_col"] + 1) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Groupbys + q = QueryBuilder().groupby("sparse_col").agg({"dense_col": "sum"}) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Aggregations + q = QueryBuilder().groupby("dense_col").agg({"sparse_col": "sum"}) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Date range + q = QueryBuilder().date_range((pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-02"))) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Head + q = QueryBuilder()._head(2) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Tail + q = QueryBuilder()._tail(2) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) + + # Row range + q = QueryBuilder()._row_range((1, 2)) + with pytest.raises(SchemaException): + lib.read(sym, query_builder=q) diff --git a/static/happybdayarcticdb.png b/static/happybdayarcticdb.png deleted file mode 100644 index d9cd18498a..0000000000 Binary files a/static/happybdayarcticdb.png and /dev/null differ
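
The ASV ModificationFunctions benchmark added above restores the original storage directory in teardown with an rmtree followed by copytree, and its TODO notes that a single copytree(dirs_exist_ok=True) call would suffice once Python >= 3.8 is the minimum. A minimal sketch of that simplification follows; restore_arctic_dir is a hypothetical standalone helper, and its two path arguments stand in for ARCTIC_DIR and ARCTIC_DIR_ORIGINAL from the benchmark class.

from shutil import copytree

def restore_arctic_dir(arctic_dir: str, arctic_dir_original: str) -> None:
    # Python >= 3.8: shutil.copytree can overwrite the modified directory in place from
    # the pristine copy, so the explicit rmtree in the benchmark's teardown is not needed.
    # Caveat: files that exist only under arctic_dir are left behind, so this is a
    # drop-in replacement only when such leftovers are harmless.
    copytree(arctic_dir_original, arctic_dir, dirs_exist_ok=True)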
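
The new test_arctic_library_management.py file exercises the EnterpriseLibraryOptions introduced in this change (replication, background deletion) and the new Arctic.modify_library_option entry point. The sketch below strings those calls together as a usage example; it only reuses calls that appear in the tests, and the lmdb:///tmp/arcticdb_example URI is a placeholder for any supported storage backend.

import pandas as pd
from arcticdb import Arctic
from arcticdb.options import (
    EnterpriseLibraryOptions,
    ModifiableEnterpriseLibraryOption,
    ModifiableLibraryOption,
)

ac = Arctic("lmdb:///tmp/arcticdb_example")

# Enterprise options are passed separately from LibraryOptions when creating a library.
lib = ac.create_library(
    "example_lib",
    enterprise_library_options=EnterpriseLibraryOptions(replication=True, background_deletion=True),
)
assert lib.enterprise_options().replication
assert lib.enterprise_options().background_deletion

# With replication enabled, writes also produce LOG keys, as checked by
# test_create_library_replication_option_set_writes_logs.
lib.write("sym", pd.DataFrame({"col1": [1, 2, 3]}))

# Both standard and enterprise options can be modified after creation; the tests verify the
# change is applied to the in-memory library and persisted in the stored library config.
ac.modify_library_option(lib, ModifiableLibraryOption.DEDUP, False)
ac.modify_library_option(lib, ModifiableEnterpriseLibraryOption.REPLICATION, False)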
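
The new test_norm_failure_error_message tests (V1 and V2 API variants) check that a failed normalization names the offending column and, for write/write_batch only, points at the pickling fallback. A small sketch of the workflow that message suggests for the V2 API; it assumes lib is an arcticdb Library and that write_pickle is the fallback the message refers to.

import pandas as pd
from arcticdb.exceptions import ArcticDbNotYetImplemented

def write_or_pickle(lib, sym: str, df: pd.DataFrame) -> None:
    # A column mixing scalars and lists (as in the tests) cannot be normalized, so the
    # plain write raises ArcticDbNotYetImplemented with the column name in the message.
    try:
        lib.write(sym, df)
    except ArcticDbNotYetImplemented:
        # write_pickle stores the object pickled, trading away column slicing, date_range
        # reads and QueryBuilder support for the ability to store arbitrary objects.
        lib.write_pickle(sym, df)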