Skip to content

Commit

Permalink
Bugfix 1260: dynamic schema type promotion (#1426)
Browse files Browse the repository at this point in the history
Closes #1260 

Allows appending/updating dynamic schema columns with numeric types for
which there exists a type that can hold both, not just when one of the
types is a strict superset of the other.
e.g. `uint8` appended to 'int8' would previously throw, now the column
type will be 'int16'
  • Loading branch information
alexowens90 authored Mar 18, 2024
1 parent 9c91bd6 commit 05ec9d6
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 29 deletions.
34 changes: 31 additions & 3 deletions cpp/arcticdb/entity/type_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,7 @@ namespace arcticdb {
return std::nullopt;
}

return entity::TypeDescriptor{
combine_data_type(slice_value_type(target_type), target_size),
target.dimension()};
return target;
}

std::optional<entity::TypeDescriptor> has_valid_type_promotion(
Expand All @@ -148,6 +146,36 @@ namespace arcticdb {
if (!maybe_common_type) {
maybe_common_type = has_valid_type_promotion(right, left);
}
// has_valid_type_promotion checks if the second type can represent all values of the first type
// We also want to handle cases where there is a third type that can represent both
// In practice, this is only the case when there is one signed and one unsigned integer right now
if (!maybe_common_type &&
left.dimension() == right.dimension() &&
is_integer_type(left.data_type()) &&
is_integer_type(right.data_type())) {
auto dimension = left.dimension();
auto left_type = left.data_type();
auto right_type = right.data_type();
auto left_size = slice_bit_size(left_type);
auto right_size = slice_bit_size(right_type);
// To get here we must have one signed and one unsigned type, with the width of the signed type <= the width of the unsigned type
internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_signed_type(left_type) ^ is_signed_type(right_type),
"Expected one signed and one unsigned int in has_valid_common_type");
if (is_signed_type(left_type)) {
internal::check<ErrorCode::E_ASSERTION_FAILURE>(left_size <= right_size,
"Expected left_size <= right_size in has_valid_common_type");
} else {
// is_signed_type(right_type)
internal::check<ErrorCode::E_ASSERTION_FAILURE>(right_size <= left_size,
"Expected right_size <= left_size in has_valid_common_type");
}
auto target_size = entity::SizeBits(uint8_t(std::max(left_size, right_size)) + 1);
if (target_size < entity::SizeBits::COUNT) {
maybe_common_type = entity::TypeDescriptor{
combine_data_type(entity::ValueType::INT, target_size),
dimension};
}
}
return maybe_common_type;
}

Expand Down
19 changes: 19 additions & 0 deletions cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,22 @@ TEST(HasValidTypePromotion, EverythingToEmpty) {
}
}
}

TEST(HasValidCommonType, UintInt) {
using namespace arcticdb;
using namespace arcticdb::entity;
TypeDescriptor uint32(ValueType::UINT, SizeBits::S32, Dimension::Dim0);
TypeDescriptor uint64(ValueType::UINT, SizeBits::S64, Dimension::Dim0);
TypeDescriptor int16(ValueType::INT, SizeBits::S16, Dimension::Dim0);
TypeDescriptor int32(ValueType::INT, SizeBits::S32, Dimension::Dim0);
TypeDescriptor int64(ValueType::INT, SizeBits::S64, Dimension::Dim0);
ASSERT_EQ(has_valid_common_type(uint32, int16), int64);
ASSERT_EQ(has_valid_common_type(uint32, int32), int64);
ASSERT_EQ(has_valid_common_type(uint32, int64), int64);
EXPECT_FALSE(has_valid_common_type(uint64, int64));
// has_valid_common_type should be symmetric, these assertions are as above with the arguments reversed
ASSERT_EQ(has_valid_common_type(int16, uint32), int64);
ASSERT_EQ(has_valid_common_type(int32, uint32), int64);
ASSERT_EQ(has_valid_common_type(int64, uint32), int64);
EXPECT_FALSE(has_valid_common_type(int64, uint64));
}
59 changes: 37 additions & 22 deletions python/tests/unit/arcticdb/test_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
{
"int_widening": np.arange(0, 2, dtype=np.uint8),
"int_narrowing": np.arange(-1002, -1000, dtype=np.int16),
"unsigned_to_signed_int": np.arange(1000, 1002, dtype=np.uint16),
"signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32),
"unsigned_to_wider_signed_int": np.arange(1000, 1002, dtype=np.uint16),
"wider_signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32),
"unsigned_to_signed_int_same_width": np.arange(1000, 1002, dtype=np.uint16),
"int_to_float": np.arange(1000, 1002, dtype=np.uint16),
"float_to_int": [1.5, 2.5],
},
Expand All @@ -396,8 +397,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
{
"int_widening": np.arange(1000, 1002, dtype=np.uint16),
"int_narrowing": np.arange(-2, 0, dtype=np.int8),
"unsigned_to_signed_int": np.arange(-1002, -1000, dtype=np.int32),
"signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16),
"unsigned_to_wider_signed_int": np.arange(-1002, -1000, dtype=np.int32),
"wider_signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16),
"unsigned_to_signed_int_same_width": np.arange(-1002, -1000, dtype=np.int16),
"int_to_float": [1.5, 2.5],
"float_to_int": np.arange(1000, 1002, dtype=np.uint16),
},
Expand All @@ -420,22 +422,31 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
expected_column_stats["v1.0_MIN(int_narrowing)"] = [df0["int_narrowing"].min(), df1["int_narrowing"].min()]
expected_column_stats["v1.0_MAX(int_narrowing)"] = [df0["int_narrowing"].max(), df1["int_narrowing"].max()]

expected_column_stats["v1.0_MIN(unsigned_to_signed_int)"] = [
df0["unsigned_to_signed_int"].min(),
df1["unsigned_to_signed_int"].min(),
expected_column_stats["v1.0_MIN(unsigned_to_wider_signed_int)"] = [
df0["unsigned_to_wider_signed_int"].min(),
df1["unsigned_to_wider_signed_int"].min(),
]
expected_column_stats["v1.0_MAX(unsigned_to_signed_int)"] = [
df0["unsigned_to_signed_int"].max(),
df1["unsigned_to_signed_int"].max(),
expected_column_stats["v1.0_MAX(unsigned_to_wider_signed_int)"] = [
df0["unsigned_to_wider_signed_int"].max(),
df1["unsigned_to_wider_signed_int"].max(),
]

expected_column_stats["v1.0_MIN(signed_to_unsigned_int)"] = [
df0["signed_to_unsigned_int"].min(),
df1["signed_to_unsigned_int"].min(),
expected_column_stats["v1.0_MIN(wider_signed_to_unsigned_int)"] = [
df0["wider_signed_to_unsigned_int"].min(),
df1["wider_signed_to_unsigned_int"].min(),
]
expected_column_stats["v1.0_MAX(signed_to_unsigned_int)"] = [
df0["signed_to_unsigned_int"].max(),
df1["signed_to_unsigned_int"].max(),
expected_column_stats["v1.0_MAX(wider_signed_to_unsigned_int)"] = [
df0["wider_signed_to_unsigned_int"].max(),
df1["wider_signed_to_unsigned_int"].max(),
]

expected_column_stats["v1.0_MIN(unsigned_to_signed_int_same_width)"] = [
df0["unsigned_to_signed_int_same_width"].min(),
df1["unsigned_to_signed_int_same_width"].min(),
]
expected_column_stats["v1.0_MAX(unsigned_to_signed_int_same_width)"] = [
df0["unsigned_to_signed_int_same_width"].max(),
df1["unsigned_to_signed_int_same_width"].max(),
]

expected_column_stats["v1.0_MIN(int_to_float)"] = [df0["int_to_float"].min(), df1["int_to_float"].min()]
Expand All @@ -446,8 +457,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
column_stats_dict = {
"int_widening": {"MINMAX"},
"int_narrowing": {"MINMAX"},
"unsigned_to_signed_int": {"MINMAX"},
"signed_to_unsigned_int": {"MINMAX"},
"unsigned_to_wider_signed_int": {"MINMAX"},
"wider_signed_to_unsigned_int": {"MINMAX"},
"unsigned_to_signed_int_same_width": {"MINMAX"},
"int_to_float": {"MINMAX"},
"float_to_int": {"MINMAX"},
}
Expand All @@ -462,11 +474,14 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
assert column_stats.dtypes["v1.0_MIN(int_narrowing)"] == np.int16
assert column_stats.dtypes["v1.0_MAX(int_narrowing)"] == np.int16

assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int)"] == np.int32
assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int)"] == np.int32
assert column_stats.dtypes["v1.0_MIN(unsigned_to_wider_signed_int)"] == np.int32
assert column_stats.dtypes["v1.0_MAX(unsigned_to_wider_signed_int)"] == np.int32

assert column_stats.dtypes["v1.0_MIN(wider_signed_to_unsigned_int)"] == np.int32
assert column_stats.dtypes["v1.0_MAX(wider_signed_to_unsigned_int)"] == np.int32

assert column_stats.dtypes["v1.0_MIN(signed_to_unsigned_int)"] == np.int32
assert column_stats.dtypes["v1.0_MAX(signed_to_unsigned_int)"] == np.int32
assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int_same_width)"] == np.int32
assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int_same_width)"] == np.int32

assert column_stats.dtypes["v1.0_MIN(int_to_float)"] == np.float64
assert column_stats.dtypes["v1.0_MAX(int_to_float)"] == np.float64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema):
lib = version_store_factory(dynamic_schema=dynamic_schema)
sym_append = "test_changing_numeric_type_append"
sym_update = "test_changing_numeric_type_update"
df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint8)}, index=pd.date_range("2024-01-01", periods=3))
df_append = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-04", periods=1))
df_update = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-02", periods=1))
df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint32)}, index=pd.date_range("2024-01-01", periods=3))
df_append = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-04", periods=1))
df_update = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-02", periods=1))

lib.write(sym_append, df_write)
lib.write(sym_update, df_write)
Expand All @@ -41,7 +41,7 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema):
received_append = lib.read(sym_append).data
assert_frame_equal(expected_append, received_append)

expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.uint16)}, index=pd.date_range("2024-01-01", periods=3))
expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.int64)}, index=pd.date_range("2024-01-01", periods=3))
received_update = lib.read(sym_update).data
assert_frame_equal(expected_update, received_update)

Expand Down

0 comments on commit 05ec9d6

Please sign in to comment.