Skip to content

Commit

Permalink
[Chore](exchange) add LZ4_MAX_INPUT_SIZE check on DataTypeString::get…
Browse files Browse the repository at this point in the history
…_uncompressed_seri… (apache#43360)

…alized_bytes

### What problem does this PR solve?
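Add an LZ4_MAX_INPUT_SIZE check to
DataTypeString::get_uncompressed_serialized_bytes, because LZ4_compressBound()
returns 0 when the input size is larger than LZ4_MAX_INPUT_SIZE (see the
LZ4 documentation quoted below).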
Problem Summary:
/*! LZ4_compressBound() :
Provides the maximum size that LZ4 compression may output in a "worst
case" scenario (input data not compressible)
This function is primarily useful for memory allocation purposes
(destination buffer size).
Macro LZ4_COMPRESSBOUND() is also provided for compilation-time
evaluation (stack memory allocation for example).
Note that LZ4_compress_default() compresses faster when dstCapacity is
>= LZ4_compressBound(srcSize)
        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
        return : maximum output size in a "worst case" scenario
              or 0, if input size is incorrect (too large or negative)
*/

Co-authored-by: BiteTheDDDDt <[email protected]>
  • Loading branch information
BiteTheDDDDt and BiteTheDDDDt authored Nov 7, 2024
1 parent bfb9ba8 commit db0288b
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
12 changes: 10 additions & 2 deletions be/src/vec/data_types/data_type_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <cstring>

#include "agent/be_exec_version_manager.h"
#include "common/exception.h"
#include "common/status.h"
#include "vec/columns/column.h"
#include "vec/columns/column_const.h"
#include "vec/columns/column_string.h"
Expand Down Expand Up @@ -81,7 +83,7 @@ bool DataTypeString::equals(const IDataType& rhs) const {
int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column,
int be_exec_version) const {
if (be_exec_version >= USE_CONST_SERDE) {
auto size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
int64_t size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
bool is_const_column = is_column_const(column);
const IColumn* string_column = &column;
if (is_const_column) {
Expand All @@ -99,9 +101,15 @@ int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column,
upper_int32(offsets_size)));
}
size += sizeof(size_t);
if (auto bytes = data_column.get_chars().size(); bytes <= SERIALIZED_MEM_SIZE_LIMIT) {
if (size_t bytes = data_column.get_chars().size(); bytes <= SERIALIZED_MEM_SIZE_LIMIT) {
size += bytes;
} else {
if (bytes > LZ4_MAX_INPUT_SIZE) {
throw Exception(ErrorCode::BUFFER_OVERFLOW,
"LZ4_compressBound meet invalid input size, input_size={}, "
"LZ4_MAX_INPUT_SIZE={}",
bytes, LZ4_MAX_INPUT_SIZE);
}
size += sizeof(size_t) + std::max(bytes, (size_t)LZ4_compressBound(bytes));
}
return size;
Expand Down
17 changes: 17 additions & 0 deletions be/test/vec/jsonb/serialize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <math.h>
#include <stdint.h>

#include <cassert>
#include <iostream>
#include <memory>
#include <string>
Expand All @@ -30,6 +31,8 @@
#include <utility>
#include <vector>

#include "agent/be_exec_version_manager.h"
#include "common/exception.h"
#include "gen_cpp/descriptors.pb.h"
#include "gtest/gtest_pred_impl.h"
#include "olap/hll.h"
Expand Down Expand Up @@ -263,6 +266,20 @@ TEST(BlockSerializeTest, Map) {
EXPECT_EQ(block.dump_data(), new_block.dump_data());
}

// Checks that DataTypeString::get_uncompressed_serialized_bytes throws instead of
// returning a size when the string column holds close to INT32_MAX bytes of data.
TEST(BlockSerializeTest, Bigstr) {
    DataTypePtr s = std::make_shared<DataTypeString>();
    MutableColumnPtr col = ColumnString::create();
    std::string bigdata;
    // A few bytes below INT32_MAX; intended to be above LZ4_MAX_INPUT_SIZE
    // (0x7E000000 in lz4.h) so that the size check is triggered.
    bigdata.resize(std::numeric_limits<int32_t>::max() - 5);
    col->insert_data(bigdata.data(), bigdata.length());
    try {
        s->get_uncompressed_serialized_bytes(*col, BeExecVersionManager::get_newest_version());
    } catch (const std::exception&) {
        // Catch by const reference so the exception object is not sliced/copied.
        return;
    }
    // Use a gtest failure rather than assert(false): assert is compiled out under
    // NDEBUG (the test would silently pass) and aborts the whole binary otherwise.
    FAIL() << "expected get_uncompressed_serialized_bytes to throw for oversized string data";
}

TEST(BlockSerializeTest, Struct) {
TabletSchema schema;
TabletColumn struct_col;
Expand Down

0 comments on commit db0288b

Please sign in to comment.