Skip to content

Commit

Permalink
Fix SYMBOL_LIST Segment index out of bounds bug
Browse files Browse the repository at this point in the history
The check `row < row_count()` in `string_at` was problematic for symbol
list entries because they have the exception that it is possible for a
column to have more rows than segment rows.

That exception is because of the nature of the symbol list refactor in
4.2.0 which needed to remain backwards compatible with old readers.

This change bypasses the check for symbol list entries and directly
calls string_at or scalar_at of the Column.
  • Loading branch information
IvoDD committed Apr 3, 2024
1 parent ae922bd commit d0ceb90
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
1 change: 1 addition & 0 deletions cpp/arcticdb/column_store/memory_segment_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,7 @@ std::optional<std::string_view> SegmentInMemoryImpl::string_at(position_t row, p
const auto& col_ref = column(col);

if(is_fixed_string_type(td.data_type()) && col_ref.is_inflated()) {

auto string_size = col_ref.bytes() / row_count();
auto ptr = col_ref.data().buffer().ptr_cast<char>(row * string_size, string_size);
return std::string_view(ptr, string_size);
Expand Down
30 changes: 24 additions & 6 deletions cpp/arcticdb/version/symbol_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,35 @@ MaybeCompaction last_compaction(const std::vector<AtomKey>& keys) {
}
}

// The below string_at and scalar_at functions should be used for symbol list cache segments instead of the ones
// provided in SegmentInMemory, because the symbol list structure is the only place where columns can have more entries
// than the segment has rows. Hence, we need to bypass the checks inside SegmentInMemory's function and directly call the
// Column's string_at and scalar_at.
std::string string_at(const SegmentInMemory& seg, position_t row, position_t col){
auto offset = seg.column(col).scalar_at<position_t>(row);
util::check(offset.has_value(), "Symbol list trying to call string_at for missing row {}, column {}", row, col);
return std::string(seg.string_pool_ptr()->get_view(offset.value()));
}

template<typename T>
T scalar_at(const SegmentInMemory& seg, position_t row, position_t col){
auto scalar = seg.column(col).scalar_at<T>(row);
util::check(scalar.has_value(), "Symbol list trying to call scalar_at for missing row {}, column {}", row, col);
return scalar.value();
}


StreamId stream_id_from_segment(
DataType data_type,
const SegmentInMemory& seg,
position_t row_id,
position_t column) {
if (data_type == DataType::UINT64) {
auto num_id = seg.scalar_at<uint64_t>(row_id, column).value();
auto num_id = scalar_at<uint64_t>(seg, row_id, column);
ARCTICDB_DEBUG(log::symbol(), "Reading numeric symbol {}", num_id);
return safe_convert_to_numeric_id(num_id);
} else {
auto sym = std::string(seg.string_at(row_id, column).value());
auto sym = string_at(seg, row_id, column);
ARCTICDB_DEBUG(log::symbol(), "Reading string symbol '{}'", sym);
return {std::move(sym)};
}
Expand Down Expand Up @@ -183,8 +201,8 @@ std::vector<SymbolListEntry> read_new_style_list_from_storage(const SegmentInMem

for(auto i = 0L; i < seg.column(0).row_count(); ++i) {
auto stream_id = stream_id_from_segment(data_type, seg, i, 0);
auto reference_id = VersionId{seg.scalar_at<uint64_t>(i, 1).value()};
auto reference_time = timestamp{seg.scalar_at<int64_t>(i, 2).value()};
auto reference_id = VersionId{scalar_at<uint64_t>(seg, i, 1)};
auto reference_time = timestamp{scalar_at<int64_t>(seg, i, 2)};
ARCTICDB_RUNTIME_DEBUG(log::symbol(), "Reading added symbol {}: {}@{}", stream_id, reference_id, reference_time);
output.emplace_back(stream_id, reference_id, reference_time, ActionType::ADD);
}
Expand All @@ -195,8 +213,8 @@ std::vector<SymbolListEntry> read_new_style_list_from_storage(const SegmentInMem

for (auto i = 0L; i < seg.column(3).row_count(); ++i) {
auto stream_id = stream_id_from_segment(data_type, seg, i, 3);
auto reference_id = VersionId{seg.scalar_at<uint64_t>(i, 4).value()};
auto reference_time = timestamp{seg.scalar_at<int64_t>(i, 5).value()};
auto reference_id = VersionId{scalar_at<uint64_t>(seg, i, 4)};
auto reference_time = timestamp{scalar_at<int64_t>(seg, i, 5)};
ARCTICDB_RUNTIME_DEBUG(log::symbol(), "Reading deleted symbol {}: {}@{}", stream_id, reference_id, reference_time);
output.emplace_back(stream_id, reference_id, reference_time, ActionType::DELETE);
}
Expand Down

0 comments on commit d0ceb90

Please sign in to comment.