Bugfix 1260: dynamic schema type promotion (#1426)

Closes #1260 Allows appending/updating dynamic schema columns with numeric types for which there exists a type that can hold both, not just when one of the types is a strict superset of the other. e.g. `uint8` appended to 'int8' would previously throw, now the column type will be 'int16'
man-group · Mar 18, 2024 · 05ec9d6 · 05ec9d6
1 parent 9c91bd6
commit 05ec9d6
Show file tree

Hide file tree

Showing 4 changed files with 91 additions and 29 deletions.
diff --git a/cpp/arcticdb/entity/type_utils.cpp b/cpp/arcticdb/entity/type_utils.cpp
@@ -128,9 +128,7 @@ namespace arcticdb {
             return std::nullopt;
         }
 
-        return entity::TypeDescriptor{
-            combine_data_type(slice_value_type(target_type), target_size),
-            target.dimension()};
+        return target;
     }
 
     std::optional<entity::TypeDescriptor> has_valid_type_promotion(
@@ -148,6 +146,36 @@ namespace arcticdb {
         if (!maybe_common_type) {
             maybe_common_type = has_valid_type_promotion(right, left);
         }
+        // has_valid_type_promotion checks if the second type can represent all values of the first type
+        // We also want to handle cases where there is a third type that can represent both
+        // In practice, this is only the case when there is one signed and one unsigned integer right now
+        if (!maybe_common_type &&
+            left.dimension() == right.dimension() &&
+            is_integer_type(left.data_type()) &&
+            is_integer_type(right.data_type())) {
+            auto dimension = left.dimension();
+            auto left_type = left.data_type();
+            auto right_type = right.data_type();
+            auto left_size = slice_bit_size(left_type);
+            auto right_size = slice_bit_size(right_type);
+            // To get here we must have one signed and one unsigned type, with the width of the signed type <= the width of the unsigned type
+            internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_signed_type(left_type) ^ is_signed_type(right_type),
+                                                            "Expected one signed and one unsigned int in has_valid_common_type");
+            if (is_signed_type(left_type)) {
+                internal::check<ErrorCode::E_ASSERTION_FAILURE>(left_size <= right_size,
+                                                                "Expected left_size <= right_size in has_valid_common_type");
+            } else {
+                // is_signed_type(right_type)
+                internal::check<ErrorCode::E_ASSERTION_FAILURE>(right_size <= left_size,
+                                                                "Expected right_size <= left_size in has_valid_common_type");
+            }
+            auto target_size = entity::SizeBits(uint8_t(std::max(left_size, right_size)) + 1);
+            if (target_size < entity::SizeBits::COUNT) {
+                maybe_common_type = entity::TypeDescriptor{
+                        combine_data_type(entity::ValueType::INT, target_size),
+                        dimension};
+            }
+        }
         return maybe_common_type;
     }
 

diff --git a/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp b/cpp/arcticdb/processing/test/test_has_valid_type_promotion.cpp
@@ -148,3 +148,22 @@ TEST(HasValidTypePromotion, EverythingToEmpty) {
         }
     }
 }
+
+TEST(HasValidCommonType, UintInt) {
+    using namespace arcticdb;
+    using namespace arcticdb::entity;
+    TypeDescriptor uint32(ValueType::UINT, SizeBits::S32, Dimension::Dim0);
+    TypeDescriptor uint64(ValueType::UINT, SizeBits::S64, Dimension::Dim0);
+    TypeDescriptor int16(ValueType::INT, SizeBits::S16, Dimension::Dim0);
+    TypeDescriptor int32(ValueType::INT, SizeBits::S32, Dimension::Dim0);
+    TypeDescriptor int64(ValueType::INT, SizeBits::S64, Dimension::Dim0);
+    ASSERT_EQ(has_valid_common_type(uint32, int16), int64);
+    ASSERT_EQ(has_valid_common_type(uint32, int32), int64);
+    ASSERT_EQ(has_valid_common_type(uint32, int64), int64);
+    EXPECT_FALSE(has_valid_common_type(uint64, int64));
+    // has_valid_common_type should be symmetric, these assertions are as above with the arguments reversed
+    ASSERT_EQ(has_valid_common_type(int16, uint32), int64);
+    ASSERT_EQ(has_valid_common_type(int32, uint32), int64);
+    ASSERT_EQ(has_valid_common_type(int64, uint32), int64);
+    EXPECT_FALSE(has_valid_common_type(int64, uint64));
+}
diff --git a/python/tests/unit/arcticdb/test_column_stats.py b/python/tests/unit/arcticdb/test_column_stats.py
@@ -384,8 +384,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
         {
             "int_widening": np.arange(0, 2, dtype=np.uint8),
             "int_narrowing": np.arange(-1002, -1000, dtype=np.int16),
-            "unsigned_to_signed_int": np.arange(1000, 1002, dtype=np.uint16),
-            "signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32),
+            "unsigned_to_wider_signed_int": np.arange(1000, 1002, dtype=np.uint16),
+            "wider_signed_to_unsigned_int": np.arange(-1002, -1000, dtype=np.int32),
+            "unsigned_to_signed_int_same_width": np.arange(1000, 1002, dtype=np.uint16),
             "int_to_float": np.arange(1000, 1002, dtype=np.uint16),
             "float_to_int": [1.5, 2.5],
         },
@@ -396,8 +397,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
         {
             "int_widening": np.arange(1000, 1002, dtype=np.uint16),
             "int_narrowing": np.arange(-2, 0, dtype=np.int8),
-            "unsigned_to_signed_int": np.arange(-1002, -1000, dtype=np.int32),
-            "signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16),
+            "unsigned_to_wider_signed_int": np.arange(-1002, -1000, dtype=np.int32),
+            "wider_signed_to_unsigned_int": np.arange(1000, 1002, dtype=np.uint16),
+            "unsigned_to_signed_int_same_width": np.arange(-1002, -1000, dtype=np.int16),
             "int_to_float": [1.5, 2.5],
             "float_to_int": np.arange(1000, 1002, dtype=np.uint16),
         },
@@ -420,22 +422,31 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
     expected_column_stats["v1.0_MIN(int_narrowing)"] = [df0["int_narrowing"].min(), df1["int_narrowing"].min()]
     expected_column_stats["v1.0_MAX(int_narrowing)"] = [df0["int_narrowing"].max(), df1["int_narrowing"].max()]
 
-    expected_column_stats["v1.0_MIN(unsigned_to_signed_int)"] = [
-        df0["unsigned_to_signed_int"].min(),
-        df1["unsigned_to_signed_int"].min(),
+    expected_column_stats["v1.0_MIN(unsigned_to_wider_signed_int)"] = [
+        df0["unsigned_to_wider_signed_int"].min(),
+        df1["unsigned_to_wider_signed_int"].min(),
     ]
-    expected_column_stats["v1.0_MAX(unsigned_to_signed_int)"] = [
-        df0["unsigned_to_signed_int"].max(),
-        df1["unsigned_to_signed_int"].max(),
+    expected_column_stats["v1.0_MAX(unsigned_to_wider_signed_int)"] = [
+        df0["unsigned_to_wider_signed_int"].max(),
+        df1["unsigned_to_wider_signed_int"].max(),
     ]
 
-    expected_column_stats["v1.0_MIN(signed_to_unsigned_int)"] = [
-        df0["signed_to_unsigned_int"].min(),
-        df1["signed_to_unsigned_int"].min(),
+    expected_column_stats["v1.0_MIN(wider_signed_to_unsigned_int)"] = [
+        df0["wider_signed_to_unsigned_int"].min(),
+        df1["wider_signed_to_unsigned_int"].min(),
     ]
-    expected_column_stats["v1.0_MAX(signed_to_unsigned_int)"] = [
-        df0["signed_to_unsigned_int"].max(),
-        df1["signed_to_unsigned_int"].max(),
+    expected_column_stats["v1.0_MAX(wider_signed_to_unsigned_int)"] = [
+        df0["wider_signed_to_unsigned_int"].max(),
+        df1["wider_signed_to_unsigned_int"].max(),
+    ]
+
+    expected_column_stats["v1.0_MIN(unsigned_to_signed_int_same_width)"] = [
+        df0["unsigned_to_signed_int_same_width"].min(),
+        df1["unsigned_to_signed_int_same_width"].min(),
+    ]
+    expected_column_stats["v1.0_MAX(unsigned_to_signed_int_same_width)"] = [
+        df0["unsigned_to_signed_int_same_width"].max(),
+        df1["unsigned_to_signed_int_same_width"].max(),
     ]
 
     expected_column_stats["v1.0_MIN(int_to_float)"] = [df0["int_to_float"].min(), df1["int_to_float"].min()]
@@ -446,8 +457,9 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
     column_stats_dict = {
         "int_widening": {"MINMAX"},
         "int_narrowing": {"MINMAX"},
-        "unsigned_to_signed_int": {"MINMAX"},
-        "signed_to_unsigned_int": {"MINMAX"},
+        "unsigned_to_wider_signed_int": {"MINMAX"},
+        "wider_signed_to_unsigned_int": {"MINMAX"},
+        "unsigned_to_signed_int_same_width": {"MINMAX"},
         "int_to_float": {"MINMAX"},
         "float_to_int": {"MINMAX"},
     }
@@ -462,11 +474,14 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
     assert column_stats.dtypes["v1.0_MIN(int_narrowing)"] == np.int16
     assert column_stats.dtypes["v1.0_MAX(int_narrowing)"] == np.int16
 
-    assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int)"] == np.int32
-    assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int)"] == np.int32
+    assert column_stats.dtypes["v1.0_MIN(unsigned_to_wider_signed_int)"] == np.int32
+    assert column_stats.dtypes["v1.0_MAX(unsigned_to_wider_signed_int)"] == np.int32
+
+    assert column_stats.dtypes["v1.0_MIN(wider_signed_to_unsigned_int)"] == np.int32
+    assert column_stats.dtypes["v1.0_MAX(wider_signed_to_unsigned_int)"] == np.int32
 
-    assert column_stats.dtypes["v1.0_MIN(signed_to_unsigned_int)"] == np.int32
-    assert column_stats.dtypes["v1.0_MAX(signed_to_unsigned_int)"] == np.int32
+    assert column_stats.dtypes["v1.0_MIN(unsigned_to_signed_int_same_width)"] == np.int32
+    assert column_stats.dtypes["v1.0_MAX(unsigned_to_signed_int_same_width)"] == np.int32
 
     assert column_stats.dtypes["v1.0_MIN(int_to_float)"] == np.float64
     assert column_stats.dtypes["v1.0_MAX(int_to_float)"] == np.float64

diff --git a/python/tests/unit/arcticdb/version_store/test_column_type_changes.py b/python/tests/unit/arcticdb/version_store/test_column_type_changes.py
@@ -21,9 +21,9 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema):
     lib = version_store_factory(dynamic_schema=dynamic_schema)
     sym_append = "test_changing_numeric_type_append"
     sym_update = "test_changing_numeric_type_update"
-    df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint8)}, index=pd.date_range("2024-01-01", periods=3))
-    df_append = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-04", periods=1))
-    df_update = pd.DataFrame({"col": np.arange(1, dtype=np.uint16)}, index=pd.date_range("2024-01-02", periods=1))
+    df_write = pd.DataFrame({"col": np.arange(3, dtype=np.uint32)}, index=pd.date_range("2024-01-01", periods=3))
+    df_append = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-04", periods=1))
+    df_update = pd.DataFrame({"col": np.arange(1, dtype=np.int32)}, index=pd.date_range("2024-01-02", periods=1))
 
     lib.write(sym_append, df_write)
     lib.write(sym_update, df_write)
@@ -41,7 +41,7 @@ def test_changing_numeric_type(version_store_factory, dynamic_schema):
         received_append = lib.read(sym_append).data
         assert_frame_equal(expected_append, received_append)
 
-        expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.uint16)}, index=pd.date_range("2024-01-01", periods=3))
+        expected_update = pd.DataFrame({"col": np.array([0, 0, 2], dtype=np.int64)}, index=pd.date_range("2024-01-01", periods=3))
         received_update = lib.read(sym_update).data
         assert_frame_equal(expected_update, received_update)