From c81df095af49ece5a0e9e2a9430fd8992946318a Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 21 Feb 2024 13:59:43 +0200 Subject: [PATCH 01/21] New test cases --- .../version_store/test_empty_column_type.py | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index e033889d49..98068d73b4 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -15,7 +15,6 @@ def int_dtype(request): yield request.param - @pytest.fixture(params=["float" + s for s in ["32", "64"]]) def float_dtype(request): yield request.param @@ -34,7 +33,13 @@ def date_dtype(request): yield request.param -class TestCanAppendToEmptyColumn: +class TestCanAppendToColunWithNones: + """ + Tests that it is possible to write a column containing None values and latter + append to it. Initially the type of the column must be empty type after the + append the column must be of the same type as the appended data + """ + @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": 2 * [None]})) @@ -150,6 +155,32 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) +class TestCanAppendToEmptyColumn: + """ + Tests that it's possible to append to a column which contains no rows. + The type of the columns (including the index column) is decided after + the first append. 
+ """ + + @pytest.fixture(params=[[0,1,2], list(pd.date_range(start="1/1/2024", end="1/3/2024"))]) + def append_index(self, request): + yield request.param + + @pytest.fixture(params=[pd.RangeIndex(0,0), pd.DatetimeIndex([])]) + def empty_index(self, request): + yield request.param + + @pytest.fixture(autouse=True) + def create_empty_column(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=int_dtype, index=empty_index)) + yield + + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, append_index): + df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + class TestCanAppendEmptyToColumn: def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) @@ -510,6 +541,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) ) + class TestEmptyTypesIsOverriden: def test_cannot_append_different_type_after_first_not_none(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": [None, None]})) From 4b059884a800fbb7df71571301323a9d26baf991 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 23 Feb 2024 16:40:05 +0200 Subject: [PATCH 02/21] Make default column type empty for 0 rowed columns --- .../python/python_to_tensor_frame.cpp | 2 +- .../version_store/test_empty_column_type.py | 63 ++++++++++++++++--- 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index ce0e3f79b3..6f8c8dc927 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ 
b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -120,7 +120,7 @@ NativeTensor obj_to_tensor(PyObject *ptr) { // In Pandas < 2, empty series dtype is `"float"`, but as of Pandas 2.0, empty series dtype is `"object"` // The Normalizer in Python cast empty `"float"` series to `"object"` so `EMPTY` is used here. // See: https://github.com/man-group/ArcticDB/pull/1049 - auto val_type = (size == 0 && descr->kind == 'O') ? ValueType::EMPTY : get_value_type(descr->kind); + auto val_type = size == 0 ? ValueType::EMPTY : get_value_type(descr->kind); auto val_bytes = static_cast(descr->elsize); const int64_t element_count = ndim == 1 ? int64_t(arr->dimensions[0]) : int64_t(arr->dimensions[0]) * int64_t(arr->dimensions[1]); const auto c_style = arr->strides[0] == val_bytes; diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 98068d73b4..365738ceda 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -11,28 +11,60 @@ import numpy as np import pytest -@pytest.fixture(params=[t + s for s in ["8", "16", "32", "64"] for t in ["int", "uint"]]) +class DtypeGenerator: + """ + Class which can generate all dtypes which ArcticDB supports. + It can generate them by type category e.g. int, float, etc. 
+ it can also generate a list of all available dtypes + """ + + @staticmethod + def int_dtype(): + return [t + s for s in ["8", "16", "32", "64"] for t in ["int", "uint"]] + + @staticmethod + def float_dtype(): + return ["float" + s for s in ["32", "64"]] + + @staticmethod + def bool_dtype(): + return ["object", "bool"] + + @staticmethod + def date_dtype(): + return ["datetime64[ns]"] + + @classmethod + def dtype(cls): + return [*cls.int_dtype(), *cls.float_dtype(), *cls.bool_dtype(), *cls.date_dtype()] + +@pytest.fixture(params=DtypeGenerator.int_dtype()) def int_dtype(request): yield request.param -@pytest.fixture(params=["float" + s for s in ["32", "64"]]) +@pytest.fixture(params=DtypeGenerator.float_dtype()) def float_dtype(request): yield request.param # object is for nullable boolean -@pytest.fixture(params=["object", "bool"]) +@pytest.fixture(params=DtypeGenerator.bool_dtype()) def boolean_dtype(request): if request.param == "object": pytest.skip("Nullable booleans are temporary disabled") yield request.param -@pytest.fixture(params=["datetime64[ns]"]) +@pytest.fixture(params=DtypeGenerator.date_dtype()) def date_dtype(request): yield request.param +@pytest.fixture(params=DtypeGenerator.dtype()) +def dtype(request): + yield request.param + + class TestCanAppendToColunWithNones: """ Tests that it is possible to write a column containing None values and latter @@ -162,7 +194,19 @@ class TestCanAppendToEmptyColumn: the first append. """ - @pytest.fixture(params=[[0,1,2], list(pd.date_range(start="1/1/2024", end="1/3/2024"))]) + @pytest.fixture(params= + [ + pytest.param( + pd.RangeIndex(0,2), + marks=pytest.mark.xfail( + reason="Appending row ranged columns to empty columns is not supported yet." + "The index of empty df is of type datetime and it clashes with the row-range type." 
+ "The expected behavior is to allow this as long as the initial df is of 0 rows.") + ), + list(pd.date_range(start="1/1/2024", end="1/3/2024")) + + ] + ) def append_index(self, request): yield request.param @@ -171,14 +215,19 @@ def empty_index(self, request): yield request.param @pytest.fixture(autouse=True) - def create_empty_column(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=int_dtype, index=empty_index)) + def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) yield def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, append_index): df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, empty_index, append_index): + df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=float_dtype, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) class TestCanAppendEmptyToColumn: From 28aa3662e5b32d9eb973cd8b00b62758b78d2ec3 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 27 Feb 2024 17:39:56 +0200 Subject: [PATCH 03/21] Add additional empty-type tests --- .../version_store/test_empty_column_type.py | 499 ++++++++++++------ 1 file changed, 344 insertions(+), 155 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 365738ceda..c9987f1089 100644 --- 
a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -13,11 +13,11 @@ class DtypeGenerator: """ - Class which can generate all dtypes which ArcticDB supports. - It can generate them by type category e.g. int, float, etc. - it can also generate a list of all available dtypes + Class which can generate all dtypes which ArcticDB supports. It can generate them by type category e.g. int, float, + etc. It can also generate a list of all available dtypes """ - + + @staticmethod def int_dtype(): return [t + s for s in ["8", "16", "32", "64"] for t in ["int", "uint"]] @@ -28,6 +28,7 @@ def float_dtype(): @staticmethod def bool_dtype(): + # object is for nullable boolean return ["object", "bool"] @staticmethod @@ -36,7 +37,7 @@ def date_dtype(): @classmethod def dtype(cls): - return [*cls.int_dtype(), *cls.float_dtype(), *cls.bool_dtype(), *cls.date_dtype()] + return list(set([*cls.int_dtype(), *cls.float_dtype(), *cls.bool_dtype(), *cls.date_dtype()])) @pytest.fixture(params=DtypeGenerator.int_dtype()) def int_dtype(request): @@ -47,7 +48,6 @@ def float_dtype(request): yield request.param -# object is for nullable boolean @pytest.fixture(params=DtypeGenerator.bool_dtype()) def boolean_dtype(request): if request.param == "object": @@ -65,13 +65,18 @@ def dtype(request): yield request.param +@pytest.fixture(params=[pd.RangeIndex(0,0), pd.DatetimeIndex([])]) +def empty_index(request): + yield request.param + + class TestCanAppendToColunWithNones: """ - Tests that it is possible to write a column containing None values and latter - append to it. Initially the type of the column must be empty type after the - append the column must be of the same type as the appended data + Tests that it is possible to write a column containing None values and latter append to it. 
Initially the type of + the column must be empty type after the append the column must be of the same type as the appended data. """ + @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": 2 * [None]})) @@ -187,50 +192,15 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) -class TestCanAppendToEmptyColumn: +class TestCanAppendColumnWithNonesToColumn: """ - Tests that it's possible to append to a column which contains no rows. - The type of the columns (including the index column) is decided after - the first append. + Tests if it's possible to append column consisted only of None values to another column. The type of the column + containing only None values should be empty type, thus we should be able to append it to any other column + regardless of its type. Appending column containing only None values should not change the type of a column. On + read the None values will be backfilled depending on the overall type of the column. """ - @pytest.fixture(params= - [ - pytest.param( - pd.RangeIndex(0,2), - marks=pytest.mark.xfail( - reason="Appending row ranged columns to empty columns is not supported yet." - "The index of empty df is of type datetime and it clashes with the row-range type." 
- "The expected behavior is to allow this as long as the initial df is of 0 rows.") - ), - list(pd.date_range(start="1/1/2024", end="1/3/2024")) - - ] - ) - def append_index(self, request): - yield request.param - @pytest.fixture(params=[pd.RangeIndex(0,0), pd.DatetimeIndex([])]) - def empty_index(self, request): - yield request.param - - @pytest.fixture(autouse=True) - def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) - yield - - def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=append_index) - lmdb_version_store_static_and_dynamic.append("sym", df_to_append) - assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - - def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=float_dtype, index=append_index) - lmdb_version_store_static_and_dynamic.append("sym", df_to_append) - assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - - -class TestCanAppendEmptyToColumn: def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) @@ -341,113 +311,12 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) -class TestCanUpdateWithEmpty: - def index(self): - return list(pd.date_range(start="1/1/2024", end="1/4/2024")) - - def update_index(self): - return list(pd.date_range(start="1/2/2024", end="1/3/2024")) - - def test_int(self, lmdb_version_store_static_and_dynamic, int_dtype): - lmdb_version_store_static_and_dynamic.write( - "sym", - pd.DataFrame({"col": [1, 2, 3, 
4]}, dtype=int_dtype, index=self.index()) - ) - lmdb_version_store_static_and_dynamic.update( - "sym", - pd.DataFrame({"col": [None, None]}, index=self.update_index()) - ) - assert_frame_equal( - lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": [1, 0, 0, 4]}, index=self.index(), dtype=int_dtype) - ) - - def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): - lmdb_version_store_static_and_dynamic.write( - "sym", - pd.DataFrame({"col": [1, 2, 3, 4]}, dtype=float_dtype, index=self.index()) - ) - lmdb_version_store_static_and_dynamic.update( - "sym", - pd.DataFrame({"col": [None, None]}, index=self.update_index()) - ) - assert_frame_equal( - lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": [1, float("NaN"), float("NaN"), 4]}, index=self.index(), dtype=float_dtype) - ) - - def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): - # Note: if dtype is bool pandas will convert None to False - lmdb_version_store_static_and_dynamic.write( - "sym", - pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=self.index()) - ) - lmdb_version_store_static_and_dynamic.update( - "sym", - pd.DataFrame({"col": [None, None]}, dtype=boolean_dtype, index=self.update_index()) - ) - assert_frame_equal( - lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": [True, None, None, True]}, index=self.index(), dtype=boolean_dtype) - ) - - def test_string(self, lmdb_version_store_static_and_dynamic): - lmdb_version_store_static_and_dynamic.write( - "sym", - pd.DataFrame({"col": ["a", "longstr"*20, "b", "longstr"*20]}, index=self.index()) - ) - lmdb_version_store_static_and_dynamic.update( - "sym", - pd.DataFrame({"col": [None, None]}, index=self.update_index()) - ) - assert_frame_equal( - lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": ["a", None, None, "longstr"*20]}, index=self.index()) - ) - - def test_date(self, 
lmdb_version_store_static_and_dynamic, date_dtype): - lmdb_version_store_static_and_dynamic.write( - "sym", - pd.DataFrame( - { - "col": [ - np.datetime64('2005-02'), - np.datetime64('2005-03'), - np.datetime64('2005-04'), - np.datetime64('2005-05') - ] - }, - dtype=date_dtype, - index=self.index() - ) - ) - lmdb_version_store_static_and_dynamic.update( - "sym", - pd.DataFrame({"col": [None, None]}, index=self.update_index()) - ) - assert_frame_equal( - lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame( - { - "col": [ - np.datetime64('2005-02'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('2005-05') - ] - }, - index=self.index() - ) - ) - - -class TestCanUpdateEmpty: +class TestCanUpdateNones: """ - Check that a column containing only empty type and date index can be updated and that - the type of the column will be changed to the type of the new data. The update range - is right in the middle of the initial date range, so there will be values unaffected - by the update. Check that on read the type of the unaffected entries will be changed - from empty to the new type of the column. + Check that a column containing only empty type and date index can be updated and that the type of the column will + be changed to the type of the new data. The update range is right in the middle of the initial date range, so + there will be values unaffected by the update. Check that on read the type of the unaffected entries will be + changed from empty to the new type of the column. """ @@ -591,6 +460,326 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) +class TestCanUpdateWithNone: + """ + Tests if a subrange of column of any type can be updated with a subrange of None values. The overall type of the + column should not change. The None values will be backfilled depending on the type (NaN/NaT/None/0). 
+ """ + + + def index(self): + return list(pd.date_range(start="1/1/2024", end="1/4/2024")) + + def update_index(self): + return list(pd.date_range(start="1/2/2024", end="1/3/2024")) + + def test_int(self, lmdb_version_store_static_and_dynamic, int_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": [1, 2, 3, 4]}, dtype=int_dtype, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [1, 0, 0, 4]}, index=self.index(), dtype=int_dtype) + ) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": [1, 2, 3, 4]}, dtype=float_dtype, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [1, float("NaN"), float("NaN"), 4]}, index=self.index(), dtype=float_dtype) + ) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + # Note: if dtype is bool pandas will convert None to False + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, dtype=boolean_dtype, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [True, None, None, True]}, index=self.index(), dtype=boolean_dtype) + ) + + def test_string(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": ["a", "longstr"*20, "b", "longstr"*20]}, 
index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": ["a", None, None, "longstr"*20]}, index=self.index()) + ) + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame( + { + "col": [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-04'), + np.datetime64('2005-05') + ] + }, + dtype=date_dtype, + index=self.index() + ) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + { + "col": [ + np.datetime64('2005-02'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('2005-05') + ] + }, + index=self.index() + ) + ) + +class TestCanAppendToEmptyColumn: + """ + Tests that it's possible to append to a column which contains no rows. The type of the columns, including the index + column, is decided after the first append. + """ + + + @pytest.fixture(params= + [ + pytest.param( + pd.RangeIndex(0,3), + marks=pytest.mark.xfail( + reason="Appending row ranged columns to empty columns is not supported yet." + "The index of empty df is of type datetime and it clashes with the row-range type." 
+ "The expected behavior is to allow this as long as the initial df is of 0 rows.") + ), + list(pd.date_range(start="1/1/2024", end="1/3/2024")) + ] + ) + def append_index(self, request): + yield request.param + + @pytest.fixture(autouse=True) + def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) + yield + + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, dtype, append_index): + df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, empty_index, append_index): + df_to_append = pd.DataFrame({"col1": [1.0,2.0,3.0]}, dtype=float_dtype, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype, empty_index, append_index): + df_to_append = pd.DataFrame({"col1": [True, False, None]}, dtype=boolean_dtype, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_empty(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): + df_to_append = pd.DataFrame({"col1": [None, None, None]}, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_string(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): + df_to_append = 
pd.DataFrame({"col1": ["short_string", None, 20 * "long_string"]}, index=append_index) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, empty_index, append_index): + df_to_append = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03') + ] + ) + }, + dtype=date_dtype, + index=append_index + ) + lmdb_version_store_static_and_dynamic.append("sym", df_to_append) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) + + +class TestAppendingEmptyToColumnDoesNothing: + """ + Test if it is possible to append empty column to an already existing column. The append should not change anything. + If dynamic schema is used the update should not create new columns. The update should not even reach the C++ layer. + """ + + + @pytest.fixture(params=[pd.RangeIndex(0,3), list(pd.date_range(start="1/1/2024", end="1/3/2024"))]) + def index(self, request): + yield request.param + + def test_integer(self, lmdb_version_store_static_and_dynamic, index, int_dtype, empty_index, dtype): + df = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) + read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_float(self, lmdb_version_store_static_and_dynamic, index, float_dtype, empty_index, dtype): + df = pd.DataFrame({"col": [1,2,3]}, dtype=float_dtype, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) + 
read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_bool(self, lmdb_version_store_static_and_dynamic, index, boolean_dtype, empty_index, dtype): + df = pd.DataFrame({"col": [False, True, None]}, dtype=boolean_dtype, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) + read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_empty(self, lmdb_version_store_static_and_dynamic, index, empty_index, dtype): + df = pd.DataFrame({"col": [None, None, None]}, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) + read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_index, dtype): + df = pd.DataFrame({"col": ["shord", 20*"long", None]}, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) + read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, index, empty_index, dtype): + df = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03') + ] + ) + }, dtype=date_dtype, index=index) + lmdb_version_store_static_and_dynamic.write("sym", df) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": 
[]}, dtype=dtype, index=empty_index)) + read_result = lmdb_version_store_static_and_dynamic.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + def test_empty_df_does_not_create_new_columns_in_dynamic_schema(self, lmdb_version_store_dynamic_schema, index): + df = pd.DataFrame({"col": [1,2,3]}, dtype="int32", index=index) + lmdb_version_store_dynamic_schema.write("sym", df) + to_append = pd.DataFrame( + { + "col_1": np.array([], dtype="int"), + "col_2": np.array([], dtype="float"), + "col_3": np.array([], dtype="object"), + "col_4": np.array([], dtype="str") + } + ) + lmdb_version_store_dynamic_schema.append("sym", to_append) + read_result = lmdb_version_store_dynamic_schema.read("sym") + assert_frame_equal(read_result.data, df) + assert read_result.version == 0 + + +@pytest.mark.xfail( + "When the index is explicitly set to be empty array ArcticDB assumes it's row range index. It does this by checking" + "the type of the data in the tensor https://github.com/man-group/ArcticDB/blob/bb2278d845abef2b7e7101dc968f0974cf7bdde1/cpp/arcticdb/python/python_to_tensor_frame.cpp#L234" + "and if it's not NANOSECONDS_UTC64 it assumes it's row ranged index. Since the empty index reports a emptyval type" + "it assumes the index of type row range. This will be fixed by adding a special empty index type." +) +class TestCanUpdateEmptyColumn: + """ + Test if it's possible to update a completely empty dataframe. The index type and the column type will be set after + the update. Before that the column type must be empty type and the index type is undecided (empty index type). 
+ """ + + @pytest.fixture(autouse=True) + def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) + yield + + def update_index(self): + return list(pd.date_range(start="1/2/2024", end="1/4/2024")) + + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): + df = pd.DataFrame({"col": [1,2,3]}, index=self.update_index(), dtype=int_dtype) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): + df = pd.DataFrame({"col": [1,2,3]}, index=self.update_index(), dtype=float_dtype) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + df = pd.DataFrame({"col": [True,False,None]}, index=self.update_index(), dtype=boolean_dtype) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + + def test_bool(self, lmdb_version_store_static_and_dynamic): + df = pd.DataFrame({"col": [None,None,None]}, index=self.update_index()) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + + def test_string(self, lmdb_version_store_static_and_dynamic): + df = pd.DataFrame({"col": ["short",20*"long",None]}, index=self.update_index()) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): + df = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + 
np.datetime64('2005-03'), + np.datetime64('2005-03') + ] + ) + }, dtype=date_dtype, index=self.index() + ) + lmdb_version_store_static_and_dynamic.update("sym", df) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) + class TestEmptyTypesIsOverriden: def test_cannot_append_different_type_after_first_not_none(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": [None, None]})) From 1450ccfea6e72b9dd004d4045ffaccb2d175df13 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 27 Feb 2024 18:20:27 +0200 Subject: [PATCH 04/21] Cosmetic changes to the empty-type tests --- .../version_store/test_empty_column_type.py | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index c9987f1089..b34363ea60 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -79,17 +79,17 @@ class TestCanAppendToColunWithNones: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": 2 * [None]})) + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": 2 * [None]})) yield def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): - df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) + df_non_empty = pd.DataFrame({"col": np.array([1,2,3], dtype=int_dtype)}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([0,0,1,2,3], dtype=int_dtype)}) + expected_result = pd.DataFrame({"col": np.array([0,0,1,2,3], dtype=int_dtype)}) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, 
expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([0,0], dtype=int_dtype)}) + pd.DataFrame({"col": np.array([0,0], dtype=int_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -97,13 +97,13 @@ def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): ) def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): - df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=float_dtype)}) + df_non_empty = pd.DataFrame({"col": np.array([1,2,3], dtype=float_dtype)}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([float("NaN"),float("NaN"),1,2,3], dtype=float_dtype)}) + expected_result = pd.DataFrame({"col": np.array([float("NaN"),float("NaN"),1,2,3], dtype=float_dtype)}) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([float("NaN"),float("NaN")], dtype=float_dtype)}) + pd.DataFrame({"col": np.array([float("NaN"),float("NaN")], dtype=float_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -112,13 +112,13 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): # Note: if dtype is bool pandas will convert None to False - df_non_empty = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) + df_non_empty = pd.DataFrame({"col": np.array([True, False, True], dtype=boolean_dtype)}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([None, None, True, False, True], dtype=boolean_dtype)}) + expected_result = pd.DataFrame({"col": 
np.array([None, None, True, False, True], dtype=boolean_dtype)}) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([None, None], dtype=boolean_dtype)}) + pd.DataFrame({"col": np.array([None, None], dtype=boolean_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -126,13 +126,13 @@ def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): ) def test_empty(self, lmdb_version_store_static_and_dynamic): - df_non_empty = pd.DataFrame({"col1": np.array([None, None, None])}) + df_non_empty = pd.DataFrame({"col": np.array([None, None, None])}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([None, None, None, None, None])}) + expected_result = pd.DataFrame({"col": np.array([None, None, None, None, None])}) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([None, None])}) + pd.DataFrame({"col": np.array([None, None])}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -140,13 +140,13 @@ def test_empty(self, lmdb_version_store_static_and_dynamic): ) def test_string(self, lmdb_version_store_static_and_dynamic): - df_non_empty = pd.DataFrame({"col1": np.array(["some_string", "long_string"*100])}) + df_non_empty = pd.DataFrame({"col": np.array(["some_string", "long_string"*100])}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([None, None, "some_string", "long_string"*100])}) + expected_result = pd.DataFrame({"col": np.array([None, None, "some_string", "long_string"*100])}) 
assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([None, None])}) + pd.DataFrame({"col": np.array([None, None])}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -156,7 +156,7 @@ def test_string(self, lmdb_version_store_static_and_dynamic): def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): df_non_empty = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), @@ -169,7 +169,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('NaT'), np.datetime64('NaT'), @@ -184,7 +184,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) + pd.DataFrame({"col": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -202,12 +202,12 @@ class TestCanAppendColumnWithNonesToColumn: def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): - df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) + df_initial = pd.DataFrame({"col": np.array([1,2,3], dtype=int_dtype)}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": [None, None]})) + lmdb_version_store_static_and_dynamic.append("sym", 
pd.DataFrame({"col": [None, None]})) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col1": np.array([1,2,3,0,0], dtype=int_dtype)}) + pd.DataFrame({"col": np.array([1,2,3,0,0], dtype=int_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, @@ -215,16 +215,16 @@ def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, - pd.DataFrame({"col1": np.array([0,0], dtype=int_dtype)}) + pd.DataFrame({"col": np.array([0,0], dtype=int_dtype)}) ) def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): - df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=float_dtype)}) + df_initial = pd.DataFrame({"col": np.array([1,2,3], dtype=float_dtype)}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": [None, None]})) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [None, None]})) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col1": np.array([1,2,3,float("NaN"),float("NaN")], dtype=float_dtype)}) + pd.DataFrame({"col": np.array([1,2,3,float("NaN"),float("NaN")], dtype=float_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, @@ -232,18 +232,18 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, - pd.DataFrame({"col1": np.array([float("NaN"), float("NaN")], dtype=float_dtype)}) + pd.DataFrame({"col": np.array([float("NaN"), float("NaN")], dtype=float_dtype)}) ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): # Note: if dtype is bool pandas will convert None to False - df_initial = pd.DataFrame({"col1": 
np.array([True, False, True], dtype=boolean_dtype)}) - df_with_none = pd.DataFrame({"col1": np.array([None, None])}) + df_initial = pd.DataFrame({"col": np.array([True, False, True], dtype=boolean_dtype)}) + df_with_none = pd.DataFrame({"col": np.array([None, None])}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) lmdb_version_store_static_and_dynamic.append("sym", df_with_none) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col1": np.array([True, False, True, None, None], dtype=boolean_dtype)}) + pd.DataFrame({"col": np.array([True, False, True, None, None], dtype=boolean_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, @@ -255,13 +255,13 @@ def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): ) def test_string(self, lmdb_version_store_static_and_dynamic): - df_initial = pd.DataFrame({"col1": np.array(["some_string", "long_string"*100, ""])}) - df_with_none = pd.DataFrame({"col1": np.array([None, None])}) + df_initial = pd.DataFrame({"col": np.array(["some_string", "long_string"*100, ""])}) + df_with_none = pd.DataFrame({"col": np.array([None, None])}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) lmdb_version_store_static_and_dynamic.append("sym", df_with_none) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col1": np.array(["some_string", "long_string"*100, "", None, None])}) + pd.DataFrame({"col": np.array(["some_string", "long_string"*100, "", None, None])}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, @@ -275,7 +275,7 @@ def test_string(self, lmdb_version_store_static_and_dynamic): def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): df_initial = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), @@ -286,10 +286,10 @@ def test_date(self, 
lmdb_version_store_static_and_dynamic, date_dtype): dtype=date_dtype ) lmdb_version_store_static_and_dynamic.write("sym", df_initial) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": np.array([None, None])})) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": np.array([None, None])})) expected_result = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), @@ -307,7 +307,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, - pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) + pd.DataFrame({"col": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) ) @@ -589,38 +589,38 @@ def append_index(self, request): @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, dtype, append_index): - df_to_append = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=append_index) + df_to_append = pd.DataFrame({"col": [1,2,3]}, dtype=int_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": [1.0,2.0,3.0]}, dtype=float_dtype, index=append_index) + df_to_append = pd.DataFrame({"col": [1.0,2.0,3.0]}, dtype=float_dtype, index=append_index) 
lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": [True, False, None]}, dtype=boolean_dtype, index=append_index) + df_to_append = pd.DataFrame({"col": [True, False, None]}, dtype=boolean_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) def test_empty(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": [None, None, None]}, index=append_index) + df_to_append = pd.DataFrame({"col": [None, None, None]}, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) def test_string(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): - df_to_append = pd.DataFrame({"col1": ["short_string", None, 20 * "long_string"]}, index=append_index) + df_to_append = pd.DataFrame({"col": ["short_string", None, 20 * "long_string"]}, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, empty_index, append_index): df_to_append = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), @@ -647,7 +647,7 @@ def index(self, request): yield request.param def test_integer(self, lmdb_version_store_static_and_dynamic, index, int_dtype, empty_index, dtype): - df = pd.DataFrame({"col1": [1,2,3]}, dtype=int_dtype, index=index) + df = pd.DataFrame({"col": [1,2,3]}, dtype=int_dtype, index=index) 
lmdb_version_store_static_and_dynamic.write("sym", df) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) read_result = lmdb_version_store_static_and_dynamic.read("sym") @@ -689,7 +689,7 @@ def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_index, def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, index, empty_index, dtype): df = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), @@ -720,11 +720,11 @@ def test_empty_df_does_not_create_new_columns_in_dynamic_schema(self, lmdb_versi assert read_result.version == 0 -@pytest.mark.xfail( - "When the index is explicitly set to be empty array ArcticDB assumes it's row range index. It does this by checking" - "the type of the data in the tensor https://github.com/man-group/ArcticDB/blob/bb2278d845abef2b7e7101dc968f0974cf7bdde1/cpp/arcticdb/python/python_to_tensor_frame.cpp#L234" - "and if it's not NANOSECONDS_UTC64 it assumes it's row ranged index. Since the empty index reports a emptyval type" - "it assumes the index of type row range. This will be fixed by adding a special empty index type." +@pytest.mark.xfail(reason="""When the index is explicitly set to be empty array ArcticDB assumes it's row range index. + It does this by checking the type of the data in the tensor + https://github.com/man-group/ArcticDB/blob/bb2278d845abef2b7e7101dc968f0974cf7bdde1/cpp/arcticdb/python/python_to_tensor_frame.cpp#L234 + and if it's not NANOSECONDS_UTC64 it assumes it's row ranged index. Since the empty index reports a emptyval type + it assumes the index of type row range. 
This will be fixed by adding a special empty index type.""" ) class TestCanUpdateEmptyColumn: """ @@ -734,7 +734,7 @@ class TestCanUpdateEmptyColumn: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": []}, dtype=dtype, index=empty_index)) + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield def update_index(self): @@ -768,7 +768,7 @@ def test_string(self, lmdb_version_store_static_and_dynamic): def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): df = pd.DataFrame( { - "col1": np.array( + "col": np.array( [ np.datetime64('2005-02'), np.datetime64('2005-03'), From 7365c6c8757cecad5aab22b45c5c33c7f5e25512 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 28 Feb 2024 11:44:25 +0200 Subject: [PATCH 05/21] Remove a test testing that arctic preserves the type of 0-rowed column. Arctic no longer persists the dtype because empty columns are set to be of empty type. Which in Python terms is object dtype. 
--- .../version_store/test_basic_version_store.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py index 32d7bdf3bf..9f3572b1b7 100644 --- a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py +++ b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py @@ -791,26 +791,6 @@ def test_empty_pd_series(basic_store): assert basic_store.read(sym).data.empty -def test_empty_pd_series_type_preservation(basic_store): - sym = "empty_s" - series = pd.Series(dtype="datetime64[ns]") - basic_store.write(sym, series) - res = basic_store.read(sym).data - assert res.empty - assert str(res.dtype) == "datetime64[ns]" - assert basic_store.read(sym).data.empty - - basic_store.update(sym, series) - res = basic_store.read(sym).data - assert res.empty - assert str(res.dtype) == "datetime64[ns]" - - basic_store.append(sym, series) - res = basic_store.read(sym).data - assert res.empty - assert str(res.dtype) == "datetime64[ns]" - - def test_empty_df(basic_store): sym = "empty_s" df = pd.DataFrame() From dac194ca75d55372ac7afa5d482e9adeb925d343 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 28 Feb 2024 14:20:33 +0200 Subject: [PATCH 06/21] Preserve the default behavour so that empty dateframes get datetime index until empty index is created. 
--- cpp/arcticdb/python/python_to_tensor_frame.cpp | 6 ++++-- .../unit/arcticdb/version_store/test_empty_column_type.py | 8 +------- .../unit/arcticdb/version_store/test_missing_empty.py | 5 ++++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index 6f8c8dc927..96196f36fe 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -230,8 +230,10 @@ std::shared_ptr py_ndf_to_frame( util::check(index_tensor.shape() != nullptr, "Index tensor expected to contain shapes"); std::string index_column_name = !idx_names.empty() ? idx_names[0] : "index"; res->num_rows = index_tensor.shape(0); - //TODO handle string indexes - if (index_tensor.data_type() == DataType::NANOSECONDS_UTC64) { + // TODO handle string indexes + // Empty type check is added to preserve the current behavior which is that 0-rowed dataframes + // are assigned datetime index. This will be changed in further PR creating empty typed index. + if (index_tensor.data_type() == DataType::NANOSECONDS_UTC64 || is_empty_type(index_tensor.data_type())) { res->desc.set_index_field_count(1); res->desc.set_index_type(IndexDescriptor::TIMESTAMP); diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index b34363ea60..6df7e604a9 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -720,12 +720,6 @@ def test_empty_df_does_not_create_new_columns_in_dynamic_schema(self, lmdb_versi assert read_result.version == 0 -@pytest.mark.xfail(reason="""When the index is explicitly set to be empty array ArcticDB assumes it's row range index. 
- It does this by checking the type of the data in the tensor - https://github.com/man-group/ArcticDB/blob/bb2278d845abef2b7e7101dc968f0974cf7bdde1/cpp/arcticdb/python/python_to_tensor_frame.cpp#L234 - and if it's not NANOSECONDS_UTC64 it assumes it's row ranged index. Since the empty index reports a emptyval type - it assumes the index of type row range. This will be fixed by adding a special empty index type.""" -) class TestCanUpdateEmptyColumn: """ Test if it's possible to update a completely empty dataframe. The index type and the column type will be set after @@ -775,7 +769,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): np.datetime64('2005-03') ] ) - }, dtype=date_dtype, index=self.index() + }, dtype=date_dtype, index=self.update_index() ) lmdb_version_store_static_and_dynamic.update("sym", df) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) diff --git a/python/tests/unit/arcticdb/version_store/test_missing_empty.py b/python/tests/unit/arcticdb/version_store/test_missing_empty.py index c6d4ab7a18..7214edbe10 100644 --- a/python/tests/unit/arcticdb/version_store/test_missing_empty.py +++ b/python/tests/unit/arcticdb/version_store/test_missing_empty.py @@ -5,13 +5,16 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ +import pytest +pytest.skip("", allow_module_level=True) + import pandas as pd import numpy as np -import pytest import logging from dataclasses import dataclass from typing import List, Optional, Union + log = logging.getLogger("missing_empty_tests") log.setLevel(logging.INFO) From 47abdbdc59f78a955fca8aac2f6fc881184350f1 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 28 Feb 2024 17:05:01 +0200 Subject: [PATCH 07/21] Remove non-reg tests because the desired behavior is different. Xfail tests for the compat-36 and compat-38 versions. 
The behavior shall be fixed with a later commit --- .../version_store/test_nonreg_specific.py | 30 ------------------- .../version_store/test_empty_column_type.py | 4 +++ 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py b/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py index 4581d632fd..acd525563a 100644 --- a/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py +++ b/python/tests/nonreg/arcticdb/version_store/test_nonreg_specific.py @@ -274,36 +274,6 @@ def test_update_with_empty_dataframe_with_index(lmdb_version_store): lib.read(symbol, as_of=0).data -@pytest.mark.parametrize( - "input_empty_col_dtype, output_empty_col_dtype, value_type, size_bits", - [ - (np.uint8, np.uint8, TypeDescriptor.ValueType.UINT, TypeDescriptor.SizeBits.S8), - (np.int32, np.int32, TypeDescriptor.ValueType.INT, TypeDescriptor.SizeBits.S32), - ("datetime64[ns]", "datetime64[ns]", TypeDescriptor.ValueType.NANOSECONDS_UTC, TypeDescriptor.SizeBits.S64), - - # For rationale see: https://github.com/man-group/ArcticDB/pull/1049 - (float, float, TypeDescriptor.ValueType.FLOAT if IS_PANDAS_TWO else TypeDescriptor.ValueType.EMPTY, TypeDescriptor.SizeBits.S64), # noqa: E501 - (object, object if IS_PANDAS_TWO else float, TypeDescriptor.ValueType.EMPTY, TypeDescriptor.SizeBits.S64), -]) -def test_empty_column_handling(lmdb_version_store, input_empty_col_dtype, output_empty_col_dtype, value_type, size_bits): - # Non-regression test for https://github.com/man-group/ArcticDB/issues/987 - lib = lmdb_version_store - - symbol_type_descriptor_series_index = 1 if IS_PANDAS_TWO else 0 - def get_symbol_type_descriptor(symbol): - symbol_info = lib.get_info(symbol) - return symbol_info["dtype"][symbol_type_descriptor_series_index] - - symbol = "empty" - series = pd.Series([], dtype=input_empty_col_dtype) - lib.write(symbol, series) - symbol_type_info = get_symbol_type_descriptor(symbol) - assert 
symbol_type_info.value_type == value_type - assert symbol_type_info.size_bits == size_bits - result = lib.read(symbol).data - assert result.dtype == output_empty_col_dtype - - def test_date_range_multi_index(lmdb_version_store): # Non-regression test for https://github.com/man-group/ArcticDB/issues/1122 lib = lmdb_version_store diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 6df7e604a9..f1dccec68a 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -5,6 +5,7 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ +import sys from math import nan import pandas as pd from pandas.testing import assert_frame_equal @@ -728,6 +729,9 @@ class TestCanUpdateEmptyColumn: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): + if isinstance(empty_index, pd.RangeIndex) and sys.version_info[1] < 9: + pytest.xfail("""compat-36 and compat-38 tests are failing because this would assign a row-range index to + the empty df. 
This will be fixed when the pandas-agnostic empty index type is added""") lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield From e0886a6e5cc1b4578c3ebcade13021f4c290d1fc Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 28 Feb 2024 17:45:31 +0200 Subject: [PATCH 08/21] Xfail tests for older python/pandas versions --- .../unit/arcticdb/version_store/test_empty_column_type.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index f1dccec68a..5b6ac0d003 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -590,6 +590,9 @@ def append_index(self, request): @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): + if isinstance(empty_index, pd.RangeIndex) and sys.version_info[1] < 9: + pytest.xfail("""compat-36 and compat-38 tests are failing because this would assign a row-range index to + the empty df. 
This will be fixed when the pandas-agnostic empty index type is added""") lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield From 8494f3f2c209767999cc85c0d232c02b817ee8de Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 29 Feb 2024 11:56:55 +0200 Subject: [PATCH 09/21] Make fixture deterministic so that the CI can distribute tests in parallel --- .../unit/arcticdb/version_store/test_empty_column_type.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 5b6ac0d003..b36474f8ae 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -38,7 +38,10 @@ def date_dtype(): @classmethod def dtype(cls): - return list(set([*cls.int_dtype(), *cls.float_dtype(), *cls.bool_dtype(), *cls.date_dtype()])) + # There are no overlaps in dtypes at the moment but since having additional dtype might increase the test count + # by a lot we err on the safe side and filter the dtypes so that they are unique. Note dtypes are sorted so that + # the fixture name is deterministic, otherwise the CI might fail to run the tests in parallel. 
+ return sorted(list(set([*cls.int_dtype(), *cls.float_dtype(), *cls.bool_dtype(), *cls.date_dtype()]))) @pytest.fixture(params=DtypeGenerator.int_dtype()) def int_dtype(request): From b667c71ab28822b37156a042400af81cc27c4ffb Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 6 Mar 2024 09:56:44 +0200 Subject: [PATCH 10/21] Create an empty index class and assign it to 0-rowed DFs --- cpp/arcticdb/entity/merge_descriptors.cpp | 9 +- cpp/arcticdb/entity/types_proto.cpp | 54 +++++++++ cpp/arcticdb/entity/types_proto.hpp | 71 +++--------- cpp/arcticdb/pipeline/index_writer.hpp | 2 +- cpp/arcticdb/pipeline/input_tensor_frame.hpp | 18 ++- cpp/arcticdb/python/normalization_checks.cpp | 108 +++++++----------- .../python/python_to_tensor_frame.cpp | 17 ++- cpp/arcticdb/stream/aggregator.hpp | 4 +- cpp/arcticdb/stream/index.hpp | 46 +++++++- cpp/arcticdb/stream/row_builder.hpp | 2 +- cpp/arcticdb/stream/stream_utils.hpp | 5 +- cpp/arcticdb/version/version_core-inl.hpp | 9 +- cpp/proto/arcticc/pb2/descriptors.proto | 2 + 13 files changed, 199 insertions(+), 148 deletions(-) diff --git a/cpp/arcticdb/entity/merge_descriptors.cpp b/cpp/arcticdb/entity/merge_descriptors.cpp index ed8ea636cf..a41a8cfe18 100644 --- a/cpp/arcticdb/entity/merge_descriptors.cpp +++ b/cpp/arcticdb/entity/merge_descriptors.cpp @@ -46,8 +46,13 @@ StreamDescriptor merge_descriptors( // Merge all the fields for all slices, apart from the index which we already have from the first descriptor. // Note that we preserve the ordering as we see columns, especially the index which needs to be column 0. for (const auto &fields : entries) { - if(has_index) - util::variant_match(index, [&fields] (const auto& idx) { idx.check(*fields); }); + if (has_index) { + util::variant_match(index, + [](const EmptyIndex&) {}, + [](const RowCountIndex&) {}, + [&fields] (const auto& idx) { idx.check(*fields); } + ); + } for (size_t idx = has_index ? 
1u : 0u; idx < static_cast(fields->size()); ++idx) { const auto& field = fields->at(idx); diff --git a/cpp/arcticdb/entity/types_proto.cpp b/cpp/arcticdb/entity/types_proto.cpp index 5fd69d7e8a..eef3b0b303 100644 --- a/cpp/arcticdb/entity/types_proto.cpp +++ b/cpp/arcticdb/entity/types_proto.cpp @@ -123,4 +123,58 @@ namespace arcticdb::entity { }, id); } + IndexDescriptor::IndexDescriptor(size_t field_count, Type type) { + data_.set_kind(type); + data_.set_field_count(static_cast(field_count)); + } + + IndexDescriptor::IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data) + : data_(std::move(data)) { + } + + bool IndexDescriptor::uninitialized() const { + return data_.field_count() == 0 && data_.kind() == Type::IndexDescriptor_Type_UNKNOWN; + } + + const IndexDescriptor::Proto& IndexDescriptor::proto() const { + return data_; + } + + size_t IndexDescriptor::field_count() const { + return static_cast(data_.field_count()); + } + + IndexDescriptor::Type IndexDescriptor::type() const { + return data_.kind(); + } + + void IndexDescriptor::set_type(Type type) { + data_.set_kind(type); + } + + bool operator==(const IndexDescriptor& left, const IndexDescriptor& right) { + return left.type() == right.type(); + } + + IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type) { + switch (type) { + case IndexDescriptor::EMPTY: return 'E'; + case IndexDescriptor::TIMESTAMP: return 'T'; + case IndexDescriptor::ROWCOUNT: return 'R'; + case IndexDescriptor::STRING: return 'S'; + case IndexDescriptor::UNKNOWN: return 'U'; + default: util::raise_rte("Unknown index type: {}", int(type)); + } + } + + IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type) { + switch (type) { + case 'E': return IndexDescriptor::EMPTY; + case 'T': return IndexDescriptor::TIMESTAMP; + case 'R': return IndexDescriptor::ROWCOUNT; + case 'S': return IndexDescriptor::STRING; + case 'U': return IndexDescriptor::UNKNOWN; + default: util::raise_rte("Unknown index type: {}", 
int(type)); + } + } } // namespace arcticdb diff --git a/cpp/arcticdb/entity/types_proto.hpp b/cpp/arcticdb/entity/types_proto.hpp index ed54518239..3ec139495a 100644 --- a/cpp/arcticdb/entity/types_proto.hpp +++ b/cpp/arcticdb/entity/types_proto.hpp @@ -49,69 +49,28 @@ namespace arcticdb::entity { Proto data_; using Type = arcticdb::proto::descriptors::IndexDescriptor::Type; - static const Type UNKNOWN = arcticdb::proto::descriptors::IndexDescriptor_Type_UNKNOWN; - static const Type ROWCOUNT = arcticdb::proto::descriptors::IndexDescriptor_Type_ROWCOUNT; - static const Type STRING = arcticdb::proto::descriptors::IndexDescriptor_Type_STRING; - static const Type TIMESTAMP = arcticdb::proto::descriptors::IndexDescriptor_Type_TIMESTAMP; + static constexpr Type UNKNOWN = arcticdb::proto::descriptors::IndexDescriptor_Type_UNKNOWN; + static constexpr Type EMPTY = arcticdb::proto::descriptors::IndexDescriptor_Type_EMPTY; + static constexpr Type ROWCOUNT = arcticdb::proto::descriptors::IndexDescriptor_Type_ROWCOUNT; + static constexpr Type STRING = arcticdb::proto::descriptors::IndexDescriptor_Type_STRING; + static constexpr Type TIMESTAMP = arcticdb::proto::descriptors::IndexDescriptor_Type_TIMESTAMP; using TypeChar = char; IndexDescriptor() = default; - IndexDescriptor(size_t field_count, Type type) { - data_.set_kind(type); - data_.set_field_count(static_cast(field_count)); - } - - explicit IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data) - : data_(std::move(data)) { - } - - bool uninitialized() const { - return data_.field_count() == 0 && data_.kind() == Type::IndexDescriptor_Type_UNKNOWN; - } - - const Proto& proto() const { - return data_; - } - - size_t field_count() const { - return static_cast(data_.field_count()); - } - - Type type() const { - return data_.kind(); - } - - void set_type(Type type) { - data_.set_kind(type); - } - ARCTICDB_MOVE_COPY_DEFAULT(IndexDescriptor) - - friend bool operator==(const IndexDescriptor& left, const 
IndexDescriptor& right) { - return left.type() == right.type(); - } + IndexDescriptor(size_t field_count, Type type); + explicit IndexDescriptor(arcticdb::proto::descriptors::IndexDescriptor data); + bool uninitialized() const; + const Proto& proto() const; + size_t field_count() const; + Type type() const; + void set_type(Type type); + friend bool operator==(const IndexDescriptor& left, const IndexDescriptor& right); }; - constexpr IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type) { - switch (type) { - case IndexDescriptor::TIMESTAMP:return 'T'; - case IndexDescriptor::ROWCOUNT:return 'R'; - case IndexDescriptor::STRING:return 'S'; - case IndexDescriptor::UNKNOWN:return 'U'; - default:util::raise_rte("Unknown index type: {}", int(type)); - } - } - - constexpr IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type) { - switch (type) { - case 'T': return IndexDescriptor::TIMESTAMP; - case 'R': return IndexDescriptor::ROWCOUNT; - case 'S': return IndexDescriptor::STRING; - case 'U': return IndexDescriptor::UNKNOWN; - default:util::raise_rte("Unknown index type: {}", int(type)); - } - } + IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type); + IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type); void set_id(arcticdb::proto::descriptors::StreamDescriptor& pb_desc, StreamId id); diff --git a/cpp/arcticdb/pipeline/index_writer.hpp b/cpp/arcticdb/pipeline/index_writer.hpp index 4ee9104ae3..00d51b465e 100644 --- a/cpp/arcticdb/pipeline/index_writer.hpp +++ b/cpp/arcticdb/pipeline/index_writer.hpp @@ -17,7 +17,7 @@ namespace arcticdb::pipelines::index { // TODO: change the name - something like KeysSegmentWriter or KeyAggragator or better -template, bool> = 0> +template class IndexWriter { // All index segments are row-count indexed in the sense that the keys are // already ordered - they don't need an additional index diff --git a/cpp/arcticdb/pipeline/input_tensor_frame.hpp 
b/cpp/arcticdb/pipeline/input_tensor_frame.hpp index b026d85d3d..13810159e4 100644 --- a/cpp/arcticdb/pipeline/input_tensor_frame.hpp +++ b/cpp/arcticdb/pipeline/input_tensor_frame.hpp @@ -18,14 +18,20 @@ namespace arcticdb::pipelines { using namespace arcticdb::entity; -struct InputTensorFrame { +/// @TODO Move to a separate "util" header +template +concept is_any_of = (std::same_as || ...); + +template +concept ValidIndex = is_any_of< + std::remove_cvref_t>>, + stream::TimeseriesIndex, + stream::RowCountIndex, + stream::TableIndex, + stream::EmptyIndex>; - template - static constexpr bool is_valid_index_v = - std::is_same_v || - std::is_same_v || - std::is_same_v; +struct InputTensorFrame { InputTensorFrame() : index(stream::empty_index()) {} diff --git a/cpp/arcticdb/python/normalization_checks.cpp b/cpp/arcticdb/python/normalization_checks.cpp index d34b806fbd..9fe08aef17 100644 --- a/cpp/arcticdb/python/normalization_checks.cpp +++ b/cpp/arcticdb/python/normalization_checks.cpp @@ -16,17 +16,21 @@ namespace arcticdb { -template -auto get_pandas_common_via_reflection(NormalizationMetadata norm_meta, InnerFunction &&inner_function) --> decltype(inner_function(norm_meta, std::declval(), std::declval())) { +template + auto get_pandas_common_via_reflection( + proto::descriptors::NormalizationMetadata norm_meta, + InnerFunction&& inner_function + ) -> decltype(inner_function(norm_meta, std::declval(), std::declval())) { try { - if (norm_meta.input_type_case() != NormalizationMetadata::INPUT_TYPE_NOT_SET) { - if (auto one_of = NormalizationMetadata::descriptor()->field(norm_meta.input_type_case()); one_of) { + if (norm_meta.input_type_case() != proto::descriptors::NormalizationMetadata::INPUT_TYPE_NOT_SET) { + if (auto one_of = proto::descriptors::NormalizationMetadata::descriptor()->field(norm_meta.input_type_case()); one_of) { log::storage().info("Inefficient NormalizationMetadata.input_type.{} access via reflection", one_of->name()); if (auto msg_type = 
one_of->message_type(); msg_type) { if (auto common_field = msg_type->FindFieldByName("common"); common_field) { - normalization::check(common_field->message_type() == NormalizationMetadata::Pandas::descriptor(), + normalization::check( + common_field->message_type() == + proto::descriptors::NormalizationMetadata::Pandas::descriptor(), "{}.common must be Pandas", one_of->name()); return inner_function(norm_meta, one_of, common_field); } @@ -40,80 +44,54 @@ auto get_pandas_common_via_reflection(NormalizationMetadata norm_meta, InnerFunc return std::nullopt; } -template std::optional>> -get_common_pandas(const NormalizationMetadata &norm_meta) { +get_common_pandas(const proto::descriptors::NormalizationMetadata& norm_meta) { using Pandas = const arcticdb::proto::descriptors::NormalizationMetadata_Pandas; switch (norm_meta.input_type_case()) { - case NormalizationMetadata::kDf:return std::make_optional(std::reference_wrapper(norm_meta.df().common())); - case NormalizationMetadata::kSeries: - return std::make_optional( - std::reference_wrapper(norm_meta.series().common())); - case NormalizationMetadata::kTs:return std::make_optional(std::reference_wrapper(norm_meta.ts().common())); - - case NormalizationMetadata::kMsgPackFrame: - case NormalizationMetadata::kNp:return std::nullopt; - + case proto::descriptors::NormalizationMetadata::kDf: + return std::make_optional(std::reference_wrapper(norm_meta.df().common())); + case proto::descriptors::NormalizationMetadata::kSeries: + return std::make_optional(std::reference_wrapper(norm_meta.series().common())); + case proto::descriptors::NormalizationMetadata::kTs: + return std::make_optional(std::reference_wrapper(norm_meta.ts().common())); + case proto::descriptors::NormalizationMetadata::kMsgPackFrame: + case proto::descriptors::NormalizationMetadata::kNp: return std::nullopt; default: - return get_pandas_common_via_reflection(norm_meta, [](auto &norm_meta, auto one_of, auto common_field) { - auto &one_of_msg = 
norm_meta.GetReflection()->GetMessage(norm_meta, one_of); - auto &common_msg = one_of_msg.GetReflection()->GetMessage(one_of_msg, common_field); + return get_pandas_common_via_reflection(norm_meta, [](auto& norm_meta, auto one_of, auto common_field) { + auto& one_of_msg = norm_meta.GetReflection()->GetMessage(norm_meta, one_of); + auto& common_msg = one_of_msg.GetReflection()->GetMessage(one_of_msg, common_field); return std::make_optional(std::reference_wrapper( - *reinterpret_cast(const_cast<::google::protobuf::Message *>(&common_msg)))); + *reinterpret_cast(const_cast<::google::protobuf::Message*>(&common_msg)) + )); }); } } -template std::optional>> -get_common_pandas(NormalizationMetadata - &norm_meta) { +get_common_pandas(proto::descriptors::NormalizationMetadata& norm_meta) { using Pandas = arcticdb::proto::descriptors::NormalizationMetadata_Pandas; - switch (norm_meta. - input_type_case() - ) { - case NormalizationMetadata::kDf: - return - std::make_optional(std::reference_wrapper(*norm_meta.mutable_df()->mutable_common()) - ); - case NormalizationMetadata::kSeries: - return - std::make_optional(std::reference_wrapper(*norm_meta.mutable_series()->mutable_common()) - ); - case NormalizationMetadata::kTs: - return - std::make_optional(std::reference_wrapper(*norm_meta.mutable_ts()->mutable_common()) - ); - - case NormalizationMetadata::kMsgPackFrame: - case NormalizationMetadata::kNp: - return - std::nullopt; - + switch (norm_meta.input_type_case()) { + case proto::descriptors::NormalizationMetadata::kDf: + return std::make_optional(std::reference_wrapper(*norm_meta.mutable_df()->mutable_common())); + case proto::descriptors::NormalizationMetadata::kSeries: + return std::make_optional(std::reference_wrapper(*norm_meta.mutable_series()->mutable_common())); + case proto::descriptors::NormalizationMetadata::kTs: + return std::make_optional(std::reference_wrapper(*norm_meta.mutable_ts()->mutable_common())); + case 
proto::descriptors::NormalizationMetadata::kMsgPackFrame: + case proto::descriptors::NormalizationMetadata::kNp: return std::nullopt; default: - return - get_pandas_common_via_reflection(norm_meta, - []( - auto &norm_meta, - auto one_of, - auto common_field - ) { - auto &one_of_msg = - norm_meta.GetReflection()->GetMessage(norm_meta, one_of); - auto &common_msg = - one_of_msg.GetReflection()->GetMessage(one_of_msg, common_field); - return - std::make_optional - (std::reference_wrapper(*reinterpret_cast(const_cast<::google::protobuf::Message *>(&common_msg))) - ); - } - ); + return get_pandas_common_via_reflection(norm_meta, [](auto& norm_meta, auto one_of, auto common_field) { + auto& one_of_msg = norm_meta.GetReflection()->GetMessage(norm_meta, one_of); + auto& common_msg = one_of_msg.GetReflection()->GetMessage(one_of_msg, common_field); + return std::make_optional(std::reference_wrapper( + *reinterpret_cast(const_cast<::google::protobuf::Message*>(&common_msg)) + )); + }); } } -template -bool check_pandas_like(const NormalizationMetadata &old_norm, - NormalizationMetadata &new_norm, +bool check_pandas_like(const proto::descriptors::NormalizationMetadata& old_norm, + proto::descriptors::NormalizationMetadata& new_norm, size_t old_length) { auto old_pandas = get_common_pandas(old_norm); auto new_pandas = get_common_pandas(new_norm); diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index 96196f36fe..9fc5114436 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -16,15 +16,15 @@ #include namespace arcticdb::convert { -const char none_char[8] = {'\300', '\000', '\000', '\000', '\000', '\000', '\000', '\000'}; +constexpr const char none_char[8] = {'\300', '\000', '\000', '\000', '\000', '\000', '\000', '\000'}; using namespace arcticdb::pipelines; -bool is_unicode(PyObject *obj) { +[[nodiscard]] static inline bool is_unicode(PyObject *obj) { 
return PyUnicode_Check(obj); } -bool is_py_boolean(PyObject* obj) { +[[nodiscard]] static inline bool is_py_boolean(PyObject* obj) { return PyBool_Check(obj); } @@ -47,7 +47,7 @@ std::variant pystring_to_buffer(PyObject * return api.PyArray_Check_(obj); } -std::tuple determine_python_object_type(PyObject* obj) { +[[nodiscard]] static std::tuple determine_python_object_type(PyObject* obj) { if (is_py_boolean(obj)) { normalization::raise("Nullable booleans are not supported at the moment"); return {ValueType::BOOL_OBJECT, 1, 1}; @@ -62,7 +62,7 @@ std::tuple determine_python_object_type(PyObject* o /// @todo We will iterate over all arrays in a column in aggregator_set_data anyways, so this is redundant, however /// the type is determined at the point when obj_to_tensor is called. We need to make it possible to change the /// the column type in aggregator_set_data in order not to iterate all arrays twice. -std::tuple determine_python_array_type(PyObject** begin, PyObject** end) { +[[nodiscard]] static std::tuple determine_python_array_type(PyObject** begin, PyObject** end) { auto none = py::none{}; while(begin != end) { if(none.ptr() == *begin) { @@ -233,7 +233,7 @@ std::shared_ptr py_ndf_to_frame( // TODO handle string indexes // Empty type check is added to preserve the current behavior which is that 0-rowed dataframes // are assigned datetime index. This will be changed in further PR creating empty typed index. 
- if (index_tensor.data_type() == DataType::NANOSECONDS_UTC64 || is_empty_type(index_tensor.data_type())) { + if (index_tensor.data_type() == DataType::NANOSECONDS_UTC64) { res->desc.set_index_field_count(1); res->desc.set_index_type(IndexDescriptor::TIMESTAMP); @@ -241,6 +241,11 @@ std::shared_ptr py_ndf_to_frame( res->desc.add_scalar_field(index_tensor.dt_, index_column_name); res->index = stream::TimeseriesIndex(index_column_name); res->index_tensor = std::move(index_tensor); + } else if (is_empty_type(index_tensor.data_type())) { + res->index = EmptyIndex{}; + res->desc.set_index_type(IndexDescriptor::EMPTY); + res->desc.add_scalar_field(index_tensor.dt_, index_column_name); + res->field_tensors.push_back(std::move(index_tensor)); } else { res->index = stream::RowCountIndex(); res->desc.set_index_type(IndexDescriptor::ROWCOUNT); diff --git a/cpp/arcticdb/stream/aggregator.hpp b/cpp/arcticdb/stream/aggregator.hpp index 9e5db51ae4..41a32d22a8 100644 --- a/cpp/arcticdb/stream/aggregator.hpp +++ b/cpp/arcticdb/stream/aggregator.hpp @@ -172,7 +172,9 @@ class Aggregator { // size then you get default buffer sizes. 
segment_(desc.value_or(schema_policy_.default_descriptor()), row_count.value_or(segmenting_policy_.expected_row_size()), false, SparsePolicy::allow_sparse) { segment_.init_column_map(); - index().check(segment_.descriptor().fields()); + if constexpr (!(std::is_same_v || std::is_same_v)) { + index().check(segment_.descriptor().fields()); + } }; virtual ~Aggregator() = default; diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp index 006b695f67..bb0d2b5e14 100644 --- a/cpp/arcticdb/stream/index.hpp +++ b/cpp/arcticdb/stream/index.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -260,10 +261,6 @@ class RowCountIndex : public BaseIndex { static constexpr IndexDescriptor::Type type() { return IndexDescriptor::ROWCOUNT; } - void check(const FieldCollection& ) const { - // No index defined - } - template static IndexValue start_value_for_segment(const SegmentType &segment) { return static_cast(segment.offset()); @@ -296,10 +293,47 @@ class RowCountIndex : public BaseIndex { static constexpr const char *name() { return "row_count"; } }; -using Index = std::variant; +class EmptyIndex : public BaseIndex { +public: + using TypeDescTag = TypeDescriptorTag, DimensionTag>; + static constexpr size_t field_count() { + return 0; + } + + static constexpr IndexDescriptor::Type type() { + return IndexDescriptor::EMPTY; + } + + static constexpr const char* name() { + return "empty"; + } + + static constexpr EmptyIndex default_index() { + return {}; + } + + [[nodiscard]] static IndexValue start_value_for_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); + } + + [[nodiscard]] static IndexValue end_value_for_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); + } + + [[nodiscard]] static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); + } + + [[nodiscard]] static IndexValue end_value_for_keys_segment(const 
SegmentInMemory& segment) { + return static_cast(segment.offset()); + } +}; + +using Index = std::variant; inline Index index_type_from_descriptor(const StreamDescriptor &desc) { switch (desc.index().proto().kind()) { + case IndexDescriptor::EMPTY: return EmptyIndex{}; case IndexDescriptor::TIMESTAMP: return TimeseriesIndex::make_from_descriptor(desc); case IndexDescriptor::STRING: @@ -312,6 +346,7 @@ inline Index index_type_from_descriptor(const StreamDescriptor &desc) { inline Index default_index_type_from_descriptor(const IndexDescriptor::Proto &desc) { switch (desc.kind()) { + case IndexDescriptor::EMPTY: return EmptyIndex{}; case IndexDescriptor::TIMESTAMP: return TimeseriesIndex::default_index(); case IndexDescriptor::STRING: @@ -326,6 +361,7 @@ inline Index default_index_type_from_descriptor(const IndexDescriptor::Proto &de // Only to be used for visitation to get field count etc as the name is not set inline Index variant_index_from_type(IndexDescriptor::Type type) { switch (type) { + case IndexDescriptor::EMPTY: return EmptyIndex{}; case IndexDescriptor::TIMESTAMP: return TimeseriesIndex{TimeseriesIndex::DefaultName}; case IndexDescriptor::STRING: diff --git a/cpp/arcticdb/stream/row_builder.hpp b/cpp/arcticdb/stream/row_builder.hpp index ade422f531..97347f4806 100644 --- a/cpp/arcticdb/stream/row_builder.hpp +++ b/cpp/arcticdb/stream/row_builder.hpp @@ -54,7 +54,7 @@ class RowBuilder { template void start_row(const Args...args) { reset(); - if constexpr(sizeof...(Args)> 0) { + if constexpr(sizeof...(Args)> 0 && !std::is_same_v) { index().set([&](std::size_t pos, auto arg) { if constexpr (std::is_integral_v || std::is_floating_point_v) set_scalar_impl(pos, arg); diff --git a/cpp/arcticdb/stream/stream_utils.hpp b/cpp/arcticdb/stream/stream_utils.hpp index 4cbebc8dde..2297f9a213 100644 --- a/cpp/arcticdb/stream/stream_utils.hpp +++ b/cpp/arcticdb/stream/stream_utils.hpp @@ -389,7 +389,10 @@ inline std::vector get_index_columns_from_descriptor(const Timeseri 
} inline IndexRange get_range_from_segment(const Index& index, const SegmentInMemory& segment) { - return util::variant_match(index, [&segment] (auto index_type) { + return util::variant_match( + index, + [](const EmptyIndex&) { return IndexRange{}; }, + [&segment] (auto index_type) { using IndexType = decltype(index_type); auto start = IndexType::start_value_for_segment(segment); auto end = IndexType::end_value_for_segment(segment); diff --git a/cpp/arcticdb/version/version_core-inl.hpp b/cpp/arcticdb/version/version_core-inl.hpp index 1dfee5617d..501a88263e 100644 --- a/cpp/arcticdb/version/version_core-inl.hpp +++ b/cpp/arcticdb/version/version_core-inl.hpp @@ -64,10 +64,11 @@ void merge_frames_for_keys_impl( if (std::holds_alternative(query.row_filter)) index_range = std::get(query.row_filter); - auto compare = - [=](const std::unique_ptr &left, const std::unique_ptr &right) { - return pipelines::index::index_value_from_row(left->row(), IndexDescriptor::TIMESTAMP, 0) > pipelines::index::index_value_from_row(right->row(), IndexDescriptor::TIMESTAMP, 0); - }; + auto compare = [](const std::unique_ptr& left, + const std::unique_ptr& right) { + return pipelines::index::index_value_from_row(left->row(), IndexDescriptor::TIMESTAMP, 0) > + pipelines::index::index_value_from_row(right->row(), IndexDescriptor::TIMESTAMP, 0); + }; movable_priority_queue, std::vector>, decltype(compare)> input_streams{compare}; diff --git a/cpp/proto/arcticc/pb2/descriptors.proto b/cpp/proto/arcticc/pb2/descriptors.proto index 0dcf769a35..008f068051 100644 --- a/cpp/proto/arcticc/pb2/descriptors.proto +++ b/cpp/proto/arcticc/pb2/descriptors.proto @@ -57,6 +57,8 @@ message TypeDescriptor { message IndexDescriptor { enum Type { UNKNOWN = 0; + // Used then the dataframe has 0 rows. 
Convertible to any other type + EMPTY = 69; // 'E' ROWCOUNT = 82; // 'R' STRING = 83; // 'S' TIMESTAMP = 84; // 'T' From 408d9be2c5f800114bd52739dc7036dc6901299f Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 12 Mar 2024 16:14:59 +0200 Subject: [PATCH 11/21] Make empty index compatible with other index types --- cpp/arcticdb/entity/stream_descriptor.hpp | 3 +- cpp/arcticdb/entity/types_proto.cpp | 11 +++ cpp/arcticdb/entity/types_proto.hpp | 1 + cpp/arcticdb/pipeline/frame_utils.cpp | 4 +- cpp/arcticdb/pipeline/frame_utils.hpp | 2 +- cpp/arcticdb/python/normalization_checks.cpp | 92 ++++++++++++------- .../python/python_to_tensor_frame.cpp | 25 +++-- cpp/arcticdb/stream/append_map.cpp | 2 +- cpp/arcticdb/version/schema_checks.hpp | 43 +++++---- cpp/arcticdb/version/version_core.cpp | 12 +-- .../arcticdb/version_store/_normalization.py | 24 +++-- .../version_store/test_empty_column_type.py | 13 +-- 12 files changed, 143 insertions(+), 89 deletions(-) diff --git a/cpp/arcticdb/entity/stream_descriptor.hpp b/cpp/arcticdb/entity/stream_descriptor.hpp index afcc9cfeb3..6f398a9b42 100644 --- a/cpp/arcticdb/entity/stream_descriptor.hpp +++ b/cpp/arcticdb/entity/stream_descriptor.hpp @@ -22,7 +22,6 @@ struct StreamDescriptor { std::shared_ptr data_ = std::make_shared(); std::shared_ptr fields_ = std::make_shared(); - ; StreamDescriptor() = default; ~StreamDescriptor() = default; @@ -65,7 +64,7 @@ struct StreamDescriptor { data_->set_sorted(sorted_value_to_proto(sorted)); } - SortedValue get_sorted() { + SortedValue get_sorted() const { return sorted_value_from_proto(data_->sorted()); } diff --git a/cpp/arcticdb/entity/types_proto.cpp b/cpp/arcticdb/entity/types_proto.cpp index eef3b0b303..5d53a3092f 100644 --- a/cpp/arcticdb/entity/types_proto.cpp +++ b/cpp/arcticdb/entity/types_proto.cpp @@ -177,4 +177,15 @@ namespace arcticdb::entity { default: util::raise_rte("Unknown index type: {}", int(type)); } } + + const char* index_type_to_str(IndexDescriptor::Type type) 
{ + switch (type) { + case IndexDescriptor::EMPTY: return "Empty"; + case IndexDescriptor::TIMESTAMP: return "Timestamp"; + case IndexDescriptor::ROWCOUNT: return "Row count"; + case IndexDescriptor::STRING: return "String"; + case IndexDescriptor::UNKNOWN: return "Unknown"; + default: util::raise_rte("Unknown index type: {}", int(type)); + } + } } // namespace arcticdb diff --git a/cpp/arcticdb/entity/types_proto.hpp b/cpp/arcticdb/entity/types_proto.hpp index 3ec139495a..be95972fa0 100644 --- a/cpp/arcticdb/entity/types_proto.hpp +++ b/cpp/arcticdb/entity/types_proto.hpp @@ -71,6 +71,7 @@ namespace arcticdb::entity { IndexDescriptor::TypeChar to_type_char(IndexDescriptor::Type type); IndexDescriptor::Type from_type_char(IndexDescriptor::TypeChar type); + const char* index_type_to_str(IndexDescriptor::Type type); void set_id(arcticdb::proto::descriptors::StreamDescriptor& pb_desc, StreamId id); diff --git a/cpp/arcticdb/pipeline/frame_utils.cpp b/cpp/arcticdb/pipeline/frame_utils.cpp index abd47f87bd..b96fd3f515 100644 --- a/cpp/arcticdb/pipeline/frame_utils.cpp +++ b/cpp/arcticdb/pipeline/frame_utils.cpp @@ -148,8 +148,8 @@ std::pair offset_and_row_count(const std::shared_ptr& frame) { - return !std::holds_alternative(frame->index) || frame->desc.get_sorted() == SortedValue::ASCENDING; +bool index_is_not_timeseries_or_is_sorted_ascending(const pipelines::InputTensorFrame& frame) { + return !std::holds_alternative(frame.index) || frame.desc.get_sorted() == SortedValue::ASCENDING; } } diff --git a/cpp/arcticdb/pipeline/frame_utils.hpp b/cpp/arcticdb/pipeline/frame_utils.hpp index f9ecf52311..d95b5bf8ef 100644 --- a/cpp/arcticdb/pipeline/frame_utils.hpp +++ b/cpp/arcticdb/pipeline/frame_utils.hpp @@ -311,6 +311,6 @@ size_t get_slice_rowcounts( std::pair offset_and_row_count( const std::shared_ptr& context); -bool index_is_not_timeseries_or_is_sorted_ascending(const std::shared_ptr& frame); +bool index_is_not_timeseries_or_is_sorted_ascending(const 
pipelines::InputTensorFrame& frame); } //namespace arcticdb diff --git a/cpp/arcticdb/python/normalization_checks.cpp b/cpp/arcticdb/python/normalization_checks.cpp index 9fe08aef17..fcbc35bf4d 100644 --- a/cpp/arcticdb/python/normalization_checks.cpp +++ b/cpp/arcticdb/python/normalization_checks.cpp @@ -90,9 +90,59 @@ get_common_pandas(proto::descriptors::NormalizationMetadata& norm_meta) { } } -bool check_pandas_like(const proto::descriptors::NormalizationMetadata& old_norm, - proto::descriptors::NormalizationMetadata& new_norm, - size_t old_length) { +/// In case both indexes are row-ranged sanity checks will be performed: +/// * Both indexes must have the same step +/// * The new index must start at the point where the old one ends +/// If the checks above pass update the new normalization index so that it spans the whole index (old + new) +/// @throws In case the row-ranged indexes are incompatible +void update_rowcount_normalization_data( + const proto::descriptors::NormalizationMetadata& old_norm, + proto::descriptors::NormalizationMetadata& new_norm, + size_t old_length +) { + const auto old_pandas = get_common_pandas(old_norm); + const auto new_pandas = get_common_pandas(new_norm); + const auto* old_index = old_pandas->get().has_index() ? &old_pandas->get().index() : nullptr; + const auto* new_index = new_pandas->get().has_index() ? &new_pandas->get().index() : nullptr; + if (old_index) { + constexpr auto error_suffix = + " the existing version. Please convert both to use Int64Index if you need this to work."; + normalization::check( + old_index->is_not_range_index() == new_index->is_not_range_index(), + "The argument uses a {} index which is incompatible with {}", + new_index->is_not_range_index() ? 
"non-range" : "range-style", + error_suffix + ); + + if (!old_index->is_not_range_index()) { + normalization::check( + old_index->step() == new_index->step(), + "The new argument has a different RangeIndex step from {}", + error_suffix + ); + + size_t new_start = new_index->start(); + if (new_start != 0) { + auto stop = old_index->start() + old_length * old_index->step(); + normalization::check( + new_start == stop, + "The appending data has a RangeIndex.start={} that is not contiguous with the {}" + "stop ({}) of", + error_suffix, + new_start, + stop + ); + } + + new_pandas->get().mutable_index()->set_start(old_index->start()); + } + } +} + +bool check_pandas_like( + const proto::descriptors::NormalizationMetadata& old_norm, + proto::descriptors::NormalizationMetadata& new_norm +) { auto old_pandas = get_common_pandas(old_norm); auto new_pandas = get_common_pandas(new_norm); if (old_pandas || new_pandas) { @@ -107,33 +157,6 @@ bool check_pandas_like(const proto::descriptors::NormalizationMetadata& old_norm "The argument has an index type incompatible with the existing version:\nexisting={}\nargument={}", util::newlines_to_spaces(old_norm), util::newlines_to_spaces(new_norm)); - - if (old_index) { - constexpr auto - error_suffix = " the existing version. Please convert both to use Int64Index if you need this to work."; - normalization::check(old_index->is_not_range_index() == new_index->is_not_range_index(), - "The argument uses a {} index which is incompatible with {}", - new_index->is_not_range_index() ? 
"non-range" : "range-style", error_suffix); - - if (!old_index->is_not_range_index()) { - normalization::check(old_index->step() == new_index->step(), - "The new argument has a different RangeIndex step from {}", error_suffix); - - size_t new_start = new_index->start(); - if (new_start != 0) { - auto stop = old_index->start() + old_length * old_index->step(); - normalization::check(new_start == stop, - "The appending data has a RangeIndex.start={} that is not contiguous with the {}" - "stop ({}) of", - error_suffix, - new_start, - stop); - } - - new_pandas->get().mutable_index()->set_start(old_index->start()); - } - } - // FUTURE: check PandasMultiIndex and many other descriptor types. Might be more efficiently implemented using // some structural comparison lib or do it via Python return true; @@ -172,9 +195,14 @@ void fix_normalization_or_throw( const pipelines::InputTensorFrame &new_frame) { auto &old_norm = existing_isr.tsd().proto().normalization(); auto &new_norm = new_frame.norm_meta; - - if (check_pandas_like(old_norm, new_norm, existing_isr.tsd().proto().total_rows())) + if (check_pandas_like(old_norm, new_norm)) { + const IndexDescriptor::Type old_index_type = existing_isr.seg().descriptor().index().type(); + const IndexDescriptor::Type new_index_type = new_frame.desc.index().type(); + if (old_index_type == new_index_type && old_index_type == IndexDescriptor::ROWCOUNT) { + update_rowcount_normalization_data(old_norm, new_norm, existing_isr.tsd().proto().total_rows()); + } return; + } if (is_append) { if (check_ndarray_append(old_norm, new_norm)) return; diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index 9fc5114436..76b57a378a 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -220,10 +220,7 @@ std::shared_ptr py_ndf_to_frame( "Number idx names {} and values {} do not match", idx_names.size(), idx_vals.size()); - if 
(idx_names.empty()) { - res->index = stream::RowCountIndex(); - res->desc.set_index_type(IndexDescriptor::ROWCOUNT); - } else { + if (idx_names.empty() == false) { util::check(idx_names.size() == 1, "Multi-indexed dataframes not handled"); auto index_tensor = obj_to_tensor(idx_vals[0].ptr()); util::check(index_tensor.ndim() == 1, "Multi-dimensional indexes not handled"); @@ -241,11 +238,6 @@ std::shared_ptr py_ndf_to_frame( res->desc.add_scalar_field(index_tensor.dt_, index_column_name); res->index = stream::TimeseriesIndex(index_column_name); res->index_tensor = std::move(index_tensor); - } else if (is_empty_type(index_tensor.data_type())) { - res->index = EmptyIndex{}; - res->desc.set_index_type(IndexDescriptor::EMPTY); - res->desc.add_scalar_field(index_tensor.dt_, index_column_name); - res->field_tensors.push_back(std::move(index_tensor)); } else { res->index = stream::RowCountIndex(); res->desc.set_index_type(IndexDescriptor::ROWCOUNT); @@ -272,6 +264,21 @@ std::shared_ptr py_ndf_to_frame( res->field_tensors.push_back(std::move(tensor)); } + // idx_names are passed by the python layer. They are empty in case row count index is used see: + // https://github.com/man-group/ArcticDB/blob/4184a467d9eee90600ddcbf34d896c763e76f78f/python/arcticdb/version_store/_normalization.py#L291 + // Currently the python layers assign RowRange index to both empty dataframes and dataframes wich do not specify + // index explicitly. Thus we handle this case after all columns are read so that we know how many rows are there. 
+ if (idx_names.empty()) { + if (res->num_rows > 0) { + res->index = stream::RowCountIndex(); + res->desc.set_index_type(IndexDescriptor::ROWCOUNT); + } else { + res->index = stream::EmptyIndex(); + res->desc.set_index_type(IndexDescriptor::EMPTY); + } + + } + ARCTICDB_DEBUG(log::version(), "Received frame with descriptor {}", res->desc); res->set_index_range(); return res; diff --git a/cpp/arcticdb/stream/append_map.cpp b/cpp/arcticdb/stream/append_map.cpp index d971499cca..0ed792e691 100644 --- a/cpp/arcticdb/stream/append_map.cpp +++ b/cpp/arcticdb/stream/append_map.cpp @@ -214,7 +214,7 @@ folly::Future write_incomplete_frame( std::optional&& next_key) { using namespace arcticdb::pipelines; - if (!index_is_not_timeseries_or_is_sorted_ascending(frame)) { + if (!index_is_not_timeseries_or_is_sorted_ascending(*frame)) { sorting::raise("When writing/appending staged data in parallel, input data must be sorted."); } diff --git a/cpp/arcticdb/version/schema_checks.hpp b/cpp/arcticdb/version/schema_checks.hpp index 526d9b7fb7..eec12490bc 100644 --- a/cpp/arcticdb/version/schema_checks.hpp +++ b/cpp/arcticdb/version/schema_checks.hpp @@ -29,33 +29,42 @@ struct StreamDescriptorMismatch : ArcticSpecificException(frame.index); - + const IndexDescriptor::Type old_idx_kind = old_descriptor.index().type(); + const IndexDescriptor::Type new_idx_kind = frame.desc.index().type(); if (operation == UPDATE) { + const bool new_is_timeseries = std::holds_alternative(frame.index); util::check_rte(old_idx_kind == IndexDescriptor::TIMESTAMP && new_is_timeseries, "Update will not work as expected with a non-timeseries index"); } else { - // TODO: AN-722 - if (new_is_timeseries) { - if (old_idx_kind != IndexDescriptor::TIMESTAMP) { - log::version().warn("Appending a timeseries to a non-timeseries-indexed symbol may create a " - "confusing index and cause problems later"); - } - } else { - if (old_idx_kind != IndexDescriptor::ROWCOUNT) { - // Backwards compatibility - 
log::version().warn("Appending a non-timeseries-indexed data to a timeseries symbol is highly " - "likely to cause corruption/unexpected behaviour."); - } - } + const IndexDescriptor::Type common_index_type = get_common_index_type(old_idx_kind, new_idx_kind); + normalization::check( + common_index_type != IndexDescriptor::UNKNOWN, + "Cannot append {} index to {} index", + index_type_to_str(new_idx_kind), + index_type_to_str(old_idx_kind) + ); } } -inline bool columns_match(const StreamDescriptor &left, const StreamDescriptor &right) { +inline bool columns_match(const StreamDescriptor& left, const StreamDescriptor& right) { + // TODO: handle empty index. If left is empty index df it will have one field (the first) + // less than right. if (left.fields().size() != right.fields().size()) return false; diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 8d3a71cded..b1459099ad 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -102,7 +102,7 @@ folly::Future async_write_dataframe_impl( frame->set_bucketize_dynamic(options.bucketize_dynamic); auto slicing_arg = get_slicing_policy(options, *frame); auto partial_key = IndexPartialKey{frame->desc.id(), version_id}; - if (validate_index && !index_is_not_timeseries_or_is_sorted_ascending(frame)) { + if (validate_index && !index_is_not_timeseries_or_is_sorted_ascending(*frame)) { sorting::raise("When calling write with validate_index enabled, input data must be sorted"); } return write_frame(std::move(partial_key), frame, slicing_arg, store, de_dup_map, sparsify_floats); @@ -121,12 +121,12 @@ IndexDescriptor::Proto check_index_match(const arcticdb::stream::Index& index, c } } -void sorted_data_check_append(const std::shared_ptr& frame, index::IndexSegmentReader& index_segment_reader){ +void sorted_data_check_append(const InputTensorFrame& frame, index::IndexSegmentReader& index_segment_reader){ if 
(!index_is_not_timeseries_or_is_sorted_ascending(frame)) { sorting::raise("When calling append with validate_index enabled, input data must be sorted"); } sorting::check( - !std::holds_alternative(frame->index) || + !std::holds_alternative(frame.index) || index_segment_reader.mutable_tsd().mutable_proto().stream_descriptor().sorted() == arcticdb::proto::descriptors::SortedValue::ASCENDING, "When calling append with validate_index enabled, the existing data must be sorted"); } @@ -145,11 +145,11 @@ folly::Future async_append_impl( bool bucketize_dynamic = index_segment_reader.bucketize_dynamic(); auto row_offset = index_segment_reader.tsd().proto().total_rows(); util::check_rte(!index_segment_reader.is_pickled(), "Cannot append to pickled data"); - if (validate_index) { - sorted_data_check_append(frame, index_segment_reader); - } frame->set_offset(static_cast(row_offset)); fix_descriptor_mismatch_or_throw(APPEND, options.dynamic_schema, index_segment_reader, *frame); + if (validate_index) { + sorted_data_check_append(*frame, index_segment_reader); + } frame->set_bucketize_dynamic(bucketize_dynamic); auto slicing_arg = get_slicing_policy(options, *frame); diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 4e43ac40c3..791677c41b 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -547,18 +547,28 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): else: n_rows = len(index) n_categorical_columns = len(df.select_dtypes(include="category").columns) - if IS_PANDAS_TWO and isinstance(index, RangeIndex) and n_rows == 0 and n_categorical_columns == 0: + if n_rows == 0 and n_categorical_columns == 0: + # Short term solution to get the internal ArcticDB empty index working + # # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if # there are categorical columns in which case a RangeIndex 
is used. # # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. - # See: https://github.com/pandas-dev/pandas/issues/49572 - # Yet internally, ArcticDB uses a DatetimeIndex for empty dataframes and series without categorical - # columns. # - # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0. - index = DatetimeIndex([]) - + # Prior this change ArcticDB used DateTime index for empty dataframes, now the index in an empty + # dataframe will be the internal empty index. The empty index is not stored but reconstructed at read + # time the same way RangedIndex is. This affects the Python layer and the way columns are mapped to + # a dataframe. The 0-rowed RangeIndex will be converted to an empty index in the C++ layer. + # + # Things to consider: + # * is_not_range_index affects which columns are read and which column is the index column. If it is + # false then the first column is assumed to be the index column and the python layer will add it to + # the column array see: https://github.com/man-group/ArcticDB/blob/4184a467d9eee90600ddcbf34d896c763e76f78f/python/arcticdb/version_store/_store.py#L1908 + # if it's true this signals the python layer that the index is must be computed at read time and is + # not returned from the C++ layer. The name of the field is confusing since with the introduction of + # the internal EmptyIndex class the same behaviour (as with ranged index) must occur. We should either + # rename or get rid of this field at all since it looks a bit redundant. 
+ index = RangeIndex(0,0) index_norm = pd_norm.index index_norm.is_not_range_index = not isinstance(index, RangeIndex) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index b36474f8ae..f364dfd682 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -576,18 +576,7 @@ class TestCanAppendToEmptyColumn: """ - @pytest.fixture(params= - [ - pytest.param( - pd.RangeIndex(0,3), - marks=pytest.mark.xfail( - reason="Appending row ranged columns to empty columns is not supported yet." - "The index of empty df is of type datetime and it clashes with the row-range type." - "The expected behavior is to allow this as long as the initial df is of 0 rows.") - ), - list(pd.date_range(start="1/1/2024", end="1/3/2024")) - ] - ) + @pytest.fixture(params=[pd.RangeIndex(0,3), list(pd.date_range(start="1/1/2024", end="1/3/2024"))]) def append_index(self, request): yield request.param From be8ab303b39c95767c3494a8e90b28fcdd17c0a4 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 14 Mar 2024 17:02:20 +0200 Subject: [PATCH 12/21] Make it possible to append to and update dfs with empty index type --- cpp/arcticdb/version/schema_checks.hpp | 21 +-- cpp/arcticdb/version/version_core.cpp | 8 +- .../version_store/test_empty_column_type.py | 123 +++++++++++------- 3 files changed, 94 insertions(+), 58 deletions(-) diff --git a/cpp/arcticdb/version/schema_checks.hpp b/cpp/arcticdb/version/schema_checks.hpp index eec12490bc..0555cc438b 100644 --- a/cpp/arcticdb/version/schema_checks.hpp +++ b/cpp/arcticdb/version/schema_checks.hpp @@ -49,7 +49,8 @@ inline void check_normalization_index_match(NormalizationOperation operation, const IndexDescriptor::Type new_idx_kind = frame.desc.index().type(); if (operation == UPDATE) { const bool new_is_timeseries = std::holds_alternative(frame.index); - 
util::check_rte(old_idx_kind == IndexDescriptor::TIMESTAMP && new_is_timeseries, + util::check_rte( + (old_idx_kind == IndexDescriptor::TIMESTAMP || old_idx_kind == IndexDescriptor::EMPTY) && new_is_timeseries, "Update will not work as expected with a non-timeseries index"); } else { const IndexDescriptor::Type common_index_type = get_common_index_type(old_idx_kind, new_idx_kind); @@ -63,17 +64,21 @@ inline void check_normalization_index_match(NormalizationOperation operation, } inline bool columns_match(const StreamDescriptor& left, const StreamDescriptor& right) { - // TODO: handle empty index. If left is empty index df it will have one field (the first) - // less than right. - if (left.fields().size() != right.fields().size()) + const int left_index_is_empty = left.index().type() == IndexDescriptor::EMPTY; + const int right_fields_offset = right.index().field_count() * left_index_is_empty; + // The empty index is compatible with all other index types. Differences in the index fields in this case is + // allowed. The index fields are always the first in the list. + if (left.fields().size() + right_fields_offset != right.fields().size()) { return false; - + } + // In case the left index is empty index we want to skip name/type checking of the index fields which are always + // the first fields. 
for (auto i = 0; i < int(left.fields().size()); ++i) { - if (left.fields(i).name() != right.fields(i).name()) + if (left.fields(i).name() != right.fields(i + right_fields_offset).name()) return false; - const TypeDescriptor &left_type = left.fields(i).type(); - const TypeDescriptor &right_type = right.fields(i).type(); + const TypeDescriptor& left_type = left.fields(i).type(); + const TypeDescriptor& right_type = right.fields(i + right_fields_offset).type(); if (!trivially_compatible_types(left_type, right_type) && !(is_empty_type(left_type.data_type()) || is_empty_type(right_type.data_type()))) diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index b1459099ad..e550357f08 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -111,7 +111,8 @@ folly::Future async_write_dataframe_impl( namespace { IndexDescriptor::Proto check_index_match(const arcticdb::stream::Index& index, const IndexDescriptor::Proto& desc) { if (std::holds_alternative(index)) - util::check(desc.kind() == IndexDescriptor::TIMESTAMP, + util::check( + desc.kind() == IndexDescriptor::TIMESTAMP || desc.kind() == IndexDescriptor::EMPTY, "Index mismatch, cannot update a non-timeseries-indexed frame with a timeseries"); else util::check(desc.kind() == IndexDescriptor::ROWCOUNT, @@ -329,7 +330,10 @@ VersionedItem update_impl( auto index_segment_reader = index::get_index_reader(*(update_info.previous_index_key_), store); util::check_rte(!index_segment_reader.is_pickled(), "Cannot update pickled data"); auto index_desc = check_index_match(frame->index, index_segment_reader.tsd().proto().stream_descriptor().index()); - util::check(index_desc.kind() == IndexDescriptor::TIMESTAMP, "Update not supported for non-timeseries indexes"); + util::check( + index_desc.kind() == IndexDescriptor::TIMESTAMP || index_desc.kind() == IndexDescriptor::EMPTY, + "Update not supported for non-timeseries indexes" + ); 
sorted_data_check_update(*frame, index_segment_reader); bool bucketize_dynamic = index_segment_reader.bucketize_dynamic(); (void)check_and_mark_slices(index_segment_reader, dynamic_schema, false, std::nullopt, bucketize_dynamic); diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index f364dfd682..7932f541ac 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -582,38 +582,35 @@ def append_index(self, request): @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): - if isinstance(empty_index, pd.RangeIndex) and sys.version_info[1] < 9: - pytest.xfail("""compat-36 and compat-38 tests are failing because this would assign a row-range index to - the empty df. This will be fixed when the pandas-agnostic empty index type is added""") lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield - def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, empty_index, dtype, append_index): + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, dtype, append_index): df_to_append = pd.DataFrame({"col": [1,2,3]}, dtype=int_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, empty_index, append_index): + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype, append_index): df_to_append = pd.DataFrame({"col": [1.0,2.0,3.0]}, dtype=float_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, 
df_to_append) - def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype, empty_index, append_index): + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype, append_index): df_to_append = pd.DataFrame({"col": [True, False, None]}, dtype=boolean_dtype, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - def test_empty(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): + def test_nones(self, lmdb_version_store_static_and_dynamic, append_index): df_to_append = pd.DataFrame({"col": [None, None, None]}, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - def test_string(self, lmdb_version_store_static_and_dynamic, empty_index, append_index): + def test_string(self, lmdb_version_store_static_and_dynamic, append_index): df_to_append = pd.DataFrame({"col": ["short_string", None, 20 * "long_string"]}, index=append_index) lmdb_version_store_static_and_dynamic.append("sym", df_to_append) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) - def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, empty_index, append_index): + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, append_index): df_to_append = pd.DataFrame( { "col": np.array( @@ -631,58 +628,72 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, empty_ind assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df_to_append) -class TestAppendingEmptyToColumnDoesNothing: +class TestAppendAndUpdateWithEmptyToColumnDoesNothing: """ - Test if it is possible to append empty column to an already existing column. The append should not change anything. 
- If dynamic schema is used the update should not create new columns. The update should not even reach the C++ layer. + Test if it is possible to append/update empty column to an already existing column. The append/update should not + change anything. If dynamic schema is used the append/update should not create new columns. The append/update + should not even reach the C++ layer and the version should not be changed. """ - @pytest.fixture(params=[pd.RangeIndex(0,3), list(pd.date_range(start="1/1/2024", end="1/3/2024"))]) def index(self, request): yield request.param - def test_integer(self, lmdb_version_store_static_and_dynamic, index, int_dtype, empty_index, dtype): + @pytest.fixture() + def empty_dataframe(self, empty_index, dtype): + yield pd.DataFrame({"col": []}, dtype=dtype, index=empty_index) + + @staticmethod + def assert_append_empty_does_nothing(initial_df, store, empty): + store.append("sym", empty) + read_result = store.read("sym") + assert_frame_equal(read_result.data, initial_df) + assert read_result.version == 0 + + @staticmethod + def assert_update_empty_does_nothing(initial_df, store, empty): + store.update("sym", empty) + read_result = store.read("sym") + assert_frame_equal(read_result.data, initial_df) + assert read_result.version == 0 + + def test_integer(self, lmdb_version_store_static_and_dynamic, index, int_dtype, empty_dataframe): df = pd.DataFrame({"col": [1,2,3]}, dtype=int_dtype, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) - read_result = lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_float(self, 
lmdb_version_store_static_and_dynamic, index, float_dtype, empty_index, dtype): + def test_float(self, lmdb_version_store_static_and_dynamic, index, float_dtype, empty_dataframe): df = pd.DataFrame({"col": [1,2,3]}, dtype=float_dtype, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) - read_result = lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_bool(self, lmdb_version_store_static_and_dynamic, index, boolean_dtype, empty_index, dtype): + def test_bool(self, lmdb_version_store_static_and_dynamic, index, boolean_dtype, empty_dataframe): df = pd.DataFrame({"col": [False, True, None]}, dtype=boolean_dtype, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) - read_result = lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_empty(self, lmdb_version_store_static_and_dynamic, index, empty_index, dtype): + def test_nones(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): df = pd.DataFrame({"col": [None, None, None]}, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) - read_result = 
lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + + def test_empty(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): + df = pd.DataFrame({"col": []}) + lmdb_version_store_static_and_dynamic.write("sym", df) + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_index, dtype): + def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): df = pd.DataFrame({"col": ["shord", 20*"long", None]}, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) - read_result = lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, index, empty_index, dtype): + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, index, empty_dataframe): df = pd.DataFrame( { "col": np.array( @@ -694,10 +705,8 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype, index, em ) }, dtype=date_dtype, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": []}, dtype=dtype, 
index=empty_index)) - read_result = lmdb_version_store_static_and_dynamic.read("sym") - assert_frame_equal(read_result.data, df) - assert read_result.version == 0 + self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) def test_empty_df_does_not_create_new_columns_in_dynamic_schema(self, lmdb_version_store_dynamic_schema, index): df = pd.DataFrame({"col": [1,2,3]}, dtype="int32", index=index) @@ -724,9 +733,6 @@ class TestCanUpdateEmptyColumn: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index): - if isinstance(empty_index, pd.RangeIndex) and sys.version_info[1] < 9: - pytest.xfail("""compat-36 and compat-38 tests are failing because this would assign a row-range index to - the empty df. This will be fixed when the pandas-agnostic empty index type is added""") lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)) yield @@ -773,10 +779,31 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): lmdb_version_store_static_and_dynamic.update("sym", df) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, df) -class TestEmptyTypesIsOverriden: + +class TestEmptyTypeIsOverriden: + """ + When an empty column (or a column containing only None) values is initially written it is assigned the empty type. + The first write/update to change the type determines the actual type of the column. Test that the first non-empty + append determines the actual type and subsequent appends with different types fail. 
+ """ + + def test_cannot_append_different_type_after_first_not_none(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": [None, None]})) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [1, 2, 3]})) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [None, None]})) with pytest.raises(Exception): - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": ["some", "string"]})) \ No newline at end of file + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": ["some", "string"]})) + + @pytest.mark.parametrize( + "incompatible_indexes", + [ + (pd.RangeIndex(0,3), list(pd.date_range(start="1/1/2024", end="1/3/2024"))), + (list(pd.date_range(start="1/1/2024", end="1/3/2024")), pd.RangeIndex(0, 3)) + ] + ) + def test_cannot_append_different_index_type_after_first_non_empty(self, lmdb_version_store_static_and_dynamic, incompatible_indexes): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []})) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [1,2,3]}, index=incompatible_indexes[0])) + with pytest.raises(Exception): + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [4, 5, 6]}, index=incompatible_indexes[1])) \ No newline at end of file From ec8b050ca5a13b445dbe0c8c19318f5a597c17dc Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 14 Mar 2024 18:11:16 +0200 Subject: [PATCH 13/21] Fix comments and reduce test count --- .../arcticdb/version_store/_normalization.py | 19 ++++-------------- .../version_store/test_empty_column_type.py | 20 +++++++++++++------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 791677c41b..8debc105a2 100644 --- a/python/arcticdb/version_store/_normalization.py +++ 
b/python/arcticdb/version_store/_normalization.py @@ -548,26 +548,15 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): n_rows = len(index) n_categorical_columns = len(df.select_dtypes(include="category").columns) if n_rows == 0 and n_categorical_columns == 0: - # Short term solution to get the internal ArcticDB empty index working - # # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if # there are categorical columns in which case a RangeIndex is used. # # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. # - # Prior this change ArcticDB used DateTime index for empty dataframes, now the index in an empty - # dataframe will be the internal empty index. The empty index is not stored but reconstructed at read - # time the same way RangedIndex is. This affects the Python layer and the way columns are mapped to - # a dataframe. The 0-rowed RangeIndex will be converted to an empty index in the C++ layer. - # - # Things to consider: - # * is_not_range_index affects which columns are read and which column is the index column. If it is - # false then the first column is assumed to be the index column and the python layer will add it to - # the column array see: https://github.com/man-group/ArcticDB/blob/4184a467d9eee90600ddcbf34d896c763e76f78f/python/arcticdb/version_store/_store.py#L1908 - # if it's true this signals the python layer that the index is must be computed at read time and is - # not returned from the C++ layer. The name of the field is confusing since with the introduction of - # the internal EmptyIndex class the same behaviour (as with ranged index) must occur. We should either - # rename or get rid of this field at all since it looks a bit redundant. 
+ # In case of RangeIndex with 0 rows the C++ layer of ArcticDB will use the internal empty index type + the index does not store a field and needs is_not_range_index to be False because of this logic: + # https://github.com/man-group/ArcticDB/blob/4184a467d9eee90600ddcbf34d896c763e76f78f/python/arcticdb/version_store/_store.py#L1908 + # In future we need to revisit the protobuf structures for indexes to account for this index = RangeIndex(0,0) index_norm = pd_norm.index index_norm.is_not_range_index = not isinstance(index, RangeIndex) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 7932f541ac..7e1e2566ab 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -15,17 +15,18 @@ class DtypeGenerator: """ Class which can generate all dtypes which ArcticDB supports. It can generate them by type category e.g. int, float, - etc. It can also generate a list of all available dtypes + etc. It can also generate a list of all available dtypes. Not all supported scalar dtypes are generated as this + leads to a combinatorial explosion in test case count. 
""" @staticmethod def int_dtype(): - return [t + s for s in ["8", "16", "32", "64"] for t in ["int", "uint"]] + return ["int32", "uint64"] @staticmethod def float_dtype(): - return ["float" + s for s in ["32", "64"]] + return ["float64"] @staticmethod def bool_dtype(): @@ -682,10 +683,17 @@ def test_nones(self, lmdb_version_store_static_and_dynamic, index, empty_datafra self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) def test_empty(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): - df = pd.DataFrame({"col": []}) lmdb_version_store_static_and_dynamic.write("sym", df) - self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) + self.assert_append_empty_does_nothing( + lmdb_version_store_static_and_dynamic.read("sym"), + lmdb_version_store_static_and_dynamic, + empty_dataframe + ) + self.assert_update_empty_does_nothing( + lmdb_version_store_static_and_dynamic.read("sym"), + lmdb_version_store_static_and_dynamic, + empty_dataframe + ) def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): df = pd.DataFrame({"col": ["shord", 20*"long", None]}, index=index) From 733934a20a2410ee1a99d46e0764ec97793a389d Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 18 Mar 2024 18:06:19 +0200 Subject: [PATCH 14/21] Roll back vcpkg version to fix failing abseil build --- cpp/vcpkg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/vcpkg b/cpp/vcpkg index 8d3649ba34..836a2d684d 160000 --- a/cpp/vcpkg +++ b/cpp/vcpkg @@ -1 +1 @@ -Subproject commit 8d3649ba34aab36914ddd897958599aa0a91b08e +Subproject commit 836a2d684dac6b543563e53ab30a1a5b8afaf97b From a2ce82975deeff90d248da45da0743de9d29a42e Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 18 Mar 2024 19:52:58 +0200 Subject: [PATCH 15/21] Fix erros in tests --- 
cpp/arcticdb/storage/single_file_storage.hpp | 2 +- .../arcticdb/version_store/test_empty_column_type.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/arcticdb/storage/single_file_storage.hpp b/cpp/arcticdb/storage/single_file_storage.hpp index 303106da0f..99a4838c29 100644 --- a/cpp/arcticdb/storage/single_file_storage.hpp +++ b/cpp/arcticdb/storage/single_file_storage.hpp @@ -63,7 +63,7 @@ struct formatter { template auto format(const arcticdb::storage::KeyData &k, FormatContext &ctx) const { - return format_to(ctx.out(), "{}:{}", k.key_offset_, k.key_size_); + return fmt::format_to(ctx.out(), "{}:{}", k.key_offset_, k.key_size_); } }; diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 7e1e2566ab..eddb095df9 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -682,21 +682,23 @@ def test_nones(self, lmdb_version_store_static_and_dynamic, index, empty_datafra self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) - def test_empty(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): + @pytest.mark.parametrize("initial_empty_index", [pd.RangeIndex(0,0), pd.DatetimeIndex([])]) + def test_empty(self, lmdb_version_store_static_and_dynamic, initial_empty_index, empty_dataframe): + df = pd.DataFrame({"col": []}, index=initial_empty_index) lmdb_version_store_static_and_dynamic.write("sym", df) self.assert_append_empty_does_nothing( - lmdb_version_store_static_and_dynamic.read("sym"), + lmdb_version_store_static_and_dynamic.read("sym").data, lmdb_version_store_static_and_dynamic, empty_dataframe ) self.assert_update_empty_does_nothing( - 
lmdb_version_store_static_and_dynamic.read("sym"), + lmdb_version_store_static_and_dynamic.read("sym").data, lmdb_version_store_static_and_dynamic, empty_dataframe ) def test_string(self, lmdb_version_store_static_and_dynamic, index, empty_dataframe): - df = pd.DataFrame({"col": ["shord", 20*"long", None]}, index=index) + df = pd.DataFrame({"col": ["short", 20*"long", None]}, index=index) lmdb_version_store_static_and_dynamic.write("sym", df) self.assert_append_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) self.assert_update_empty_does_nothing(df, lmdb_version_store_static_and_dynamic, empty_dataframe) From 7b0895f181e7d4fdddc872ffdc59301b48f92a50 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 21 Mar 2024 19:25:53 +0200 Subject: [PATCH 16/21] Rename is_not_ranged_index proto field in python and C++ Modify the normalization so that 0-rowed dataframe uniformly return Index([]) with dtype='object' --- cpp/arcticdb/python/normalization_checks.cpp | 6 +-- cpp/arcticdb/stream/protobuf_mappings.hpp | 2 +- cpp/arcticdb/stream/stream_utils.hpp | 2 +- cpp/arcticdb/version/version_core.cpp | 2 +- cpp/proto/arcticc/pb2/descriptors.proto | 2 +- python/arcticdb/util/hypothesis.py | 2 +- .../arcticdb/version_store/_normalization.py | 51 ++++++++----------- python/arcticdb/version_store/_store.py | 4 +- .../version_store/test_empty_column_type.py | 8 ++- 9 files changed, 39 insertions(+), 40 deletions(-) diff --git a/cpp/arcticdb/python/normalization_checks.cpp b/cpp/arcticdb/python/normalization_checks.cpp index fcbc35bf4d..dfed25c583 100644 --- a/cpp/arcticdb/python/normalization_checks.cpp +++ b/cpp/arcticdb/python/normalization_checks.cpp @@ -108,13 +108,13 @@ void update_rowcount_normalization_data( constexpr auto error_suffix = " the existing version. 
Please convert both to use Int64Index if you need this to work."; normalization::check( - old_index->is_not_range_index() == new_index->is_not_range_index(), + old_index->is_physically_stored() == new_index->is_physically_stored(), "The argument uses a {} index which is incompatible with {}", - new_index->is_not_range_index() ? "non-range" : "range-style", + new_index->is_physically_stored() ? "non-range" : "range-style", error_suffix ); - if (!old_index->is_not_range_index()) { + if (!old_index->is_physically_stored()) { normalization::check( old_index->step() == new_index->step(), "The new argument has a different RangeIndex step from {}", diff --git a/cpp/arcticdb/stream/protobuf_mappings.hpp b/cpp/arcticdb/stream/protobuf_mappings.hpp index 9fcb0fb567..a47df433c7 100644 --- a/cpp/arcticdb/stream/protobuf_mappings.hpp +++ b/cpp/arcticdb/stream/protobuf_mappings.hpp @@ -36,7 +36,7 @@ inline arcticdb::proto::descriptors::NormalizationMetadata make_rowcount_norm_me auto id = std::get(stream_id); pandas.mutable_common()->set_name(std::move(id)); NormalizationMetadata_PandasIndex pandas_index; - pandas_index.set_is_not_range_index(true); + pandas_index.set_is_physically_stored(true); pandas.mutable_common()->mutable_index()->CopyFrom(pandas_index); norm_meta.mutable_df()->CopyFrom(pandas); return norm_meta; diff --git a/cpp/arcticdb/stream/stream_utils.hpp b/cpp/arcticdb/stream/stream_utils.hpp index 46a41e26c6..66927997e0 100644 --- a/cpp/arcticdb/stream/stream_utils.hpp +++ b/cpp/arcticdb/stream/stream_utils.hpp @@ -376,7 +376,7 @@ inline std::vector get_index_columns_from_descriptor(const Timeseri ssize_t index_till; const auto& common = norm_info.df().common(); if(auto idx_type = common.index_type_case(); idx_type == arcticdb::proto::descriptors::NormalizationMetadata_Pandas::kIndex) - index_till = common.index().is_not_range_index() ? 1 : stream_descriptor.index().field_count(); + index_till = common.index().is_physically_stored() ? 
1 : stream_descriptor.index().field_count(); else index_till = 1 + common.multi_index().field_count(); //# The value of field_count is len(index) - 1 diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 1aa8baca2f..3a24f1b0a2 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -542,7 +542,7 @@ void set_output_descriptors( auto mutable_index = pipeline_context->norm_meta_->mutable_df()->mutable_common()->mutable_index(); mutable_index->set_name(*new_index); mutable_index->clear_fake_name(); - mutable_index->set_is_not_range_index(true); + mutable_index->set_is_physically_stored(true); break; } } diff --git a/cpp/proto/arcticc/pb2/descriptors.proto b/cpp/proto/arcticc/pb2/descriptors.proto index dbc114afd0..420e040734 100644 --- a/cpp/proto/arcticc/pb2/descriptors.proto +++ b/cpp/proto/arcticc/pb2/descriptors.proto @@ -124,7 +124,7 @@ message NormalizationMetadata { string name = 1; // RangeIndex are not represented as a field, hence no name is associated string tz = 2; bool fake_name = 3; - bool is_not_range_index = 4; + bool is_physically_stored = 4; int64 start = 5; // Used for RangeIndex int64 step = 6; // Used for RangeIndex bool is_int = 7; diff --git a/python/arcticdb/util/hypothesis.py b/python/arcticdb/util/hypothesis.py index 321f895b0b..680dcac26e 100644 --- a/python/arcticdb/util/hypothesis.py +++ b/python/arcticdb/util/hypothesis.py @@ -197,7 +197,7 @@ class InputFactories(_InputFactoryValues, Enum): ), _ROWCOUNT, "df", - "is_not_range_index", + "is_physically_stored", ) DF_DTI = ( diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 8debc105a2..9e2aa3a79f 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -292,18 +292,20 @@ def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None # index: pd.Index or np.ndarray 
-> np.ndarray index_tz = None - if isinstance(index, RangeIndex): - # skip index since we can reconstruct it, so no need to actually store it - if index.name: - if not isinstance(index.name, int) and not isinstance(index.name, str): - raise NormalizationException( - f"Index name must be a string or an int, received {index.name} of type {type(index.name)}" - ) - if isinstance(index.name, int): - index_norm.is_int = True - index_norm.name = str(index.name) - index_norm.start = index.start if _range_index_props_are_public else index._start - index_norm.step = index.step if _range_index_props_are_public else index._step + if not index_norm.is_physically_stored: + if isinstance(index, RangeIndex): + # skip index since we can reconstruct it, so no need to actually store it + if index.name: + if not isinstance(index.name, int) and not isinstance(index.name, str): + raise NormalizationException( + f"Index name must be a string or an int, received {index.name} of type {type(index.name)}" + ) + if isinstance(index.name, int): + index_norm.is_int = True + index_norm.name = str(index.name) + index_norm.start = index.start if _range_index_props_are_public else index._start + index_norm.step = index.step if _range_index_props_are_public else index._step + return [], [] return [], [] else: coerce_type = DTN64_DTYPE if len(index) == 0 else None @@ -362,16 +364,16 @@ def _denormalize_single_index(item, norm_meta): rtn = Index([]) if len(item.index_columns) == 0: # when then initial index was a RangeIndex - if norm_meta.WhichOneof("index_type") == "index" and not norm_meta.index.is_not_range_index: + if norm_meta.WhichOneof("index_type") == "index" and not norm_meta.index.is_physically_stored: if len(item.data) > 0: if hasattr(norm_meta.index, "step") and norm_meta.index.step != 0: stop = norm_meta.index.start + norm_meta.index.step * len(item.data[0]) name = norm_meta.index.name if norm_meta.index.name else None return RangeIndex(start=norm_meta.index.start, stop=stop, 
step=norm_meta.index.step, name=name) else: - return None + return Index([]) else: - return RangeIndex(start=0, stop=0, step=1) + return Index([]) # this means that the index is not a datetime index and it's been represented as a regular field in the stream item.index_columns.append(item.names.pop(0)) @@ -545,21 +547,12 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): df.reset_index(fields, inplace=True) index = df.index else: - n_rows = len(index) - n_categorical_columns = len(df.select_dtypes(include="category").columns) - if n_rows == 0 and n_categorical_columns == 0: - # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if - # there are categorical columns in which case a RangeIndex is used. - # - # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. - # - # In case of RangeIndex with 0 rows the C++ layer of ArcticDB will use the internal empty index type - # the index does not store a field and needs is_not_range_index to be False because of this logic: - # https://github.com/man-group/ArcticDB/blob/4184a467d9eee90600ddcbf34d896c763e76f78f/python/arcticdb/version_store/_store.py#L1908 - # In future we need to revisit the protobuf structures for indexes to account for this - index = RangeIndex(0,0) + is_not_range_index = not isinstance(index, RangeIndex) + df_has_rows = not(len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0) index_norm = pd_norm.index - index_norm.is_not_range_index = not isinstance(index, RangeIndex) + index_norm.is_physically_stored = is_not_range_index or df_has_rows + if not df_has_rows: + index = Index([]) return _normalize_single_index(index, list(index.names), index_norm, dynamic_strings, string_max_len) diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index d81c42885d..20eb0a2041 100644 --- a/python/arcticdb/version_store/_store.py +++ 
b/python/arcticdb/version_store/_store.py @@ -1905,7 +1905,7 @@ def _get_index_columns_from_descriptor(descriptor): # is 0. idx_type = norm_info.df.common.WhichOneof("index_type") if idx_type == "index": - last_index_column_idx = 1 if norm_info.df.common.index.is_not_range_index else 0 + last_index_column_idx = 1 if norm_info.df.common.index.is_physically_stored else 0 else: # The value of field_count is len(index) - 1 last_index_column_idx = 1 + norm_info.df.common.multi_index.field_count @@ -2518,7 +2518,7 @@ def _process_info(self, symbol: str, dit, as_of: Optional[VersionQueryInput] = N else: index_metadata = timeseries_descriptor.normalization.df.common.multi_index - if index_type == "multi_index" or (index_type == "index" and index_metadata.is_not_range_index): + if index_type == "multi_index" or (index_type == "index" and index_metadata.is_physically_stored): index_name_from_store = columns.pop(0) has_fake_name = ( 0 in index_metadata.fake_field_pos if index_type == "multi_index" else index_metadata.fake_name diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index eddb095df9..b1c9595513 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -816,4 +816,10 @@ def test_cannot_append_different_index_type_after_first_non_empty(self, lmdb_ver lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []})) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [1,2,3]}, index=incompatible_indexes[0])) with pytest.raises(Exception): - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [4, 5, 6]}, index=incompatible_indexes[1])) \ No newline at end of file + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [4, 5, 6]}, index=incompatible_indexes[1])) + + +def test_foo(lmdb_version_store_v2): + 
lmdb_version_store_v2.write("sym", pd.DataFrame({"col": []}, index=pd.DatetimeIndex([]))) + print(lmdb_version_store_v2.read("sym").data) + print(lmdb_version_store_v2.read("sym").data.index) \ No newline at end of file From 81871d9e49c06ca2056145b531c77b77115ee478 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 27 Mar 2024 12:43:17 +0200 Subject: [PATCH 17/21] Fix failing tests --- cpp/arcticdb/python/normalization_checks.cpp | 2 +- cpp/arcticdb/stream/index.hpp | 8 +++---- .../arcticdb/version_store/_normalization.py | 23 +++++++++---------- .../version_store/test_empty_column_type.py | 8 +------ 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/cpp/arcticdb/python/normalization_checks.cpp b/cpp/arcticdb/python/normalization_checks.cpp index dfed25c583..109c3b4acc 100644 --- a/cpp/arcticdb/python/normalization_checks.cpp +++ b/cpp/arcticdb/python/normalization_checks.cpp @@ -196,7 +196,7 @@ void fix_normalization_or_throw( auto &old_norm = existing_isr.tsd().proto().normalization(); auto &new_norm = new_frame.norm_meta; if (check_pandas_like(old_norm, new_norm)) { - const IndexDescriptor::Type old_index_type = existing_isr.seg().descriptor().index().type(); + const IndexDescriptor::Type old_index_type = existing_isr.tsd().proto().stream_descriptor().index().kind(); const IndexDescriptor::Type new_index_type = new_frame.desc.index().type(); if (old_index_type == new_index_type && old_index_type == IndexDescriptor::ROWCOUNT) { update_rowcount_normalization_data(old_norm, new_norm, existing_isr.tsd().proto().total_rows()); diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp index 4e0dd11b53..ddd881f04a 100644 --- a/cpp/arcticdb/stream/index.hpp +++ b/cpp/arcticdb/stream/index.hpp @@ -313,19 +313,19 @@ class EmptyIndex : public BaseIndex { } [[nodiscard]] static IndexValue start_value_for_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); + return static_cast(segment.offset()); } [[nodiscard]] 
static IndexValue end_value_for_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); + return static_cast(segment.offset()); } [[nodiscard]] static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); + return static_cast(segment.offset()); } [[nodiscard]] static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); + return static_cast(segment.offset()); } }; diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 9e2aa3a79f..a7c666767d 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -292,20 +292,19 @@ def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None # index: pd.Index or np.ndarray -> np.ndarray index_tz = None - if not index_norm.is_physically_stored: + if isinstance(index_norm, NormalizationMetadata.PandasIndex) and not index_norm.is_physically_stored: + if index.name: + if not isinstance(index.name, int) and not isinstance(index.name, str): + raise NormalizationException( + f"Index name must be a string or an int, received {index.name} of type {type(index.name)}" + ) + if isinstance(index.name, int): + index_norm.is_int = True + index_norm.name = str(index.name) if isinstance(index, RangeIndex): # skip index since we can reconstruct it, so no need to actually store it - if index.name: - if not isinstance(index.name, int) and not isinstance(index.name, str): - raise NormalizationException( - f"Index name must be a string or an int, received {index.name} of type {type(index.name)}" - ) - if isinstance(index.name, int): - index_norm.is_int = True - index_norm.name = str(index.name) index_norm.start = index.start if _range_index_props_are_public else index._start index_norm.step = index.step if _range_index_props_are_public else index._step - return [], [] 
return [], [] else: coerce_type = DTN64_DTYPE if len(index) == 0 else None @@ -550,7 +549,7 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): is_not_range_index = not isinstance(index, RangeIndex) df_has_rows = not(len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0) index_norm = pd_norm.index - index_norm.is_physically_stored = is_not_range_index or df_has_rows + index_norm.is_physically_stored = is_not_range_index and df_has_rows if not df_has_rows: index = Index([]) @@ -844,7 +843,6 @@ def normalize(self, item, string_max_len=None, dynamic_strings=False, coerce_col # type: (DataFrame, Optional[int])->NormalizedInput norm_meta = NormalizationMetadata() norm_meta.df.common.mark = True - if isinstance(item.columns, RangeIndex): norm_meta.df.has_synthetic_columns = True @@ -1097,6 +1095,7 @@ def normalize(self, item, string_max_len=None, dynamic_strings=False, coerce_col norm_meta = NormalizationMetadata() norm_meta.ts.mark = True index_norm = norm_meta.ts.common.index + index_norm.is_physically_stored = len(item.times) > 0 and not isinstance(item.times, RangeIndex) index_names, ix_vals = _normalize_single_index( item.times, ["times"], index_norm, dynamic_strings, string_max_len ) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index b1c9595513..eddb095df9 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -816,10 +816,4 @@ def test_cannot_append_different_index_type_after_first_non_empty(self, lmdb_ver lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": []})) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [1,2,3]}, index=incompatible_indexes[0])) with pytest.raises(Exception): - lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [4, 5, 6]}, 
index=incompatible_indexes[1])) - - -def test_foo(lmdb_version_store_v2): - lmdb_version_store_v2.write("sym", pd.DataFrame({"col": []}, index=pd.DatetimeIndex([]))) - print(lmdb_version_store_v2.read("sym").data) - print(lmdb_version_store_v2.read("sym").data.index) \ No newline at end of file + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [4, 5, 6]}, index=incompatible_indexes[1])) \ No newline at end of file From f86640d0e32bb2727480fdfd8e2e917f9bb185f0 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 29 Mar 2024 15:55:12 +0200 Subject: [PATCH 18/21] Resolve review comments --- cpp/arcticdb/python/python_to_tensor_frame.cpp | 2 +- cpp/arcticdb/version/schema_checks.hpp | 3 +-- .../arcticdb/version_store/test_empty_column_type.py | 11 +++++------ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index 9988596286..15ca454eae 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -220,7 +220,7 @@ std::shared_ptr py_ndf_to_frame( "Number idx names {} and values {} do not match", idx_names.size(), idx_vals.size()); - if (idx_names.empty() == false) { + if (!idx_names.empty()) { util::check(idx_names.size() == 1, "Multi-indexed dataframes not handled"); auto index_tensor = obj_to_tensor(idx_vals[0].ptr()); util::check(index_tensor.ndim() == 1, "Multi-dimensional indexes not handled"); diff --git a/cpp/arcticdb/version/schema_checks.hpp b/cpp/arcticdb/version/schema_checks.hpp index 0555cc438b..c4364f53f8 100644 --- a/cpp/arcticdb/version/schema_checks.hpp +++ b/cpp/arcticdb/version/schema_checks.hpp @@ -64,8 +64,7 @@ inline void check_normalization_index_match(NormalizationOperation operation, } inline bool columns_match(const StreamDescriptor& left, const StreamDescriptor& right) { - const int left_index_is_empty = left.index().type() == IndexDescriptor::EMPTY; - 
const int right_fields_offset = right.index().field_count() * left_index_is_empty; + const int right_fields_offset = left.index().type() == IndexDescriptor::EMPTY ? right.index().field_count() : 0; // The empty index is compatible with all other index types. Differences in the index fields in this case is // allowed. The index fields are always the first in the list. if (left.fields().size() + right_fields_offset != right.fields().size()) { diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index eddb095df9..4faf989871 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -14,9 +14,8 @@ class DtypeGenerator: """ - Class which can generate all dtypes which ArcticDB supports. It can generate them by type category e.g. int, float, - etc. It can also generate a list of all available dtypes. Not all supported scalar dtypes are generated as this - leads to a combinatorc explosion in test case count. + Can generate representative subset of all supported dtypes. Can generate by category (e.g. int, float, etc...) or + all. Generating the full set of dtypes leads to combinatoric explosion in the number of test cases. """ @@ -75,7 +74,7 @@ def empty_index(request): yield request.param -class TestCanAppendToColunWithNones: +class TestCanAppendToColumnWithNones: """ Tests that it is possible to write a column containing None values and latter append to it. Initially the type of the column must be empty type after the append the column must be of the same type as the appended data. 
@@ -499,11 +498,11 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) lmdb_version_store_static_and_dynamic.update( "sym", - pd.DataFrame({"col": [None, None]}, index=self.update_index()) + pd.DataFrame({"col": [None, np.nan]}, index=self.update_index()) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": [1, float("NaN"), float("NaN"), 4]}, index=self.index(), dtype=float_dtype) + pd.DataFrame({"col": [1, float("NaN"), np.nan, 4]}, index=self.index(), dtype=float_dtype) ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): From 35ca8bc83cef70ddb03367d3265cd911fad698af Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 1 Apr 2024 16:34:22 +0300 Subject: [PATCH 19/21] Handle multiindex --- python/arcticdb/version_store/_normalization.py | 7 +++++-- .../unit/arcticdb/version_store/test_empty_column_type.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index a7c666767d..0656a730d1 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -291,7 +291,6 @@ def _from_tz_timestamp(ts, tz): def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None, string_max_len=None): # index: pd.Index or np.ndarray -> np.ndarray index_tz = None - if isinstance(index_norm, NormalizationMetadata.PandasIndex) and not index_norm.is_physically_stored: if index.name: if not isinstance(index.name, int) and not isinstance(index.name, str): @@ -525,7 +524,11 @@ def denormalize(self, item, norm_meta): class _PandasNormalizer(Normalizer): def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): index = df.index - if isinstance(index, MultiIndex): + if len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0: + index_norm = pd_norm.index + 
index_norm.is_physically_stored = False + index = Index([]) + elif isinstance(index, MultiIndex): # This is suboptimal and only a first implementation since it reduplicates the data index_norm = pd_norm.multi_index index_norm.field_count = len(index.levels) - 1 diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 4faf989871..b0311cdbdd 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -69,7 +69,7 @@ def dtype(request): yield request.param -@pytest.fixture(params=[pd.RangeIndex(0,0), pd.DatetimeIndex([])]) +@pytest.fixture(params=[pd.RangeIndex(0,0), pd.DatetimeIndex([]), pd.MultiIndex.from_arrays([[],[]], names=["a", "b"])]) def empty_index(request): yield request.param @@ -569,6 +569,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) ) + class TestCanAppendToEmptyColumn: """ Tests that it's possible to append to a column which contains no rows. 
The type of the columns, including the index From 5406651a0c04aa66dc99ac0ad08a7444e9bd6e76 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 2 Apr 2024 14:18:40 +0300 Subject: [PATCH 20/21] Split index.hpp --- cpp/arcticdb/CMakeLists.txt | 1 + cpp/arcticdb/stream/index.cpp | 247 ++++++++++++++++++++++++++++++ cpp/arcticdb/stream/index.hpp | 280 ++++++---------------------------- 3 files changed, 292 insertions(+), 236 deletions(-) create mode 100644 cpp/arcticdb/stream/index.cpp diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index fc34d7c0d1..e631998773 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -461,6 +461,7 @@ set(arcticdb_srcs storage/storage_factory.cpp stream/aggregator.cpp stream/append_map.cpp + stream/index.cpp stream/piloted_clock.cpp toolbox/library_tool.cpp util/allocator.cpp diff --git a/cpp/arcticdb/stream/index.cpp b/cpp/arcticdb/stream/index.cpp new file mode 100644 index 0000000000..6778789d6b --- /dev/null +++ b/cpp/arcticdb/stream/index.cpp @@ -0,0 +1,247 @@ +/* Copyright 2024 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software + * will be governed by the Apache License, version 2.0. + */ + +#include +namespace arcticdb::stream { + +IndexDescriptor::Type get_index_value_type(const AtomKey& key) { + return std::holds_alternative(key.start_index()) ? 
IndexDescriptor::TIMESTAMP + : IndexDescriptor::STRING; + } + +template + StreamDescriptor BaseIndex::create_stream_descriptor( + StreamId stream_id, + std::initializer_list fields +) const { + std::vector fds{fields}; + return create_stream_descriptor(stream_id, folly::range(fds)); +} + +template const Derived* BaseIndex::derived() const { + return static_cast(this); +} + +template BaseIndex::operator IndexDescriptor() const { + return {Derived::field_count(), Derived::type()}; +} + +template FieldRef BaseIndex::field(size_t) const { + return {static_cast(typename Derived::TypeDescTag{}), std::string_view(derived()->name())}; +} + +TimeseriesIndex::TimeseriesIndex(const std::string& name) : name_(name) {} + +TimeseriesIndex TimeseriesIndex::default_index() { + return TimeseriesIndex(DefaultName); +} + +void TimeseriesIndex::check(const FieldCollection& fields) const { + const size_t fields_size = fields.size(); + constexpr int current_fields_size = int(field_count()); + + const TypeDescriptor& first_field_type = fields[0].type(); + const TypeDescriptor& current_first_field_type = this->field(0).type(); + + const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value(); + const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type); + + const bool compatible_types = valid_type_promotion || trivial_type_compatibility; + + util::check_arg( + fields_size >= current_fields_size, + "expected at least {} fields, actual {}", + current_fields_size, + fields_size + ); + util::check_arg(compatible_types, "expected field[0]={}, actual {}", this->field(0), fields[0]); +} + +IndexValue TimeseriesIndex::start_value_for_segment(const SegmentInMemory& segment) { + if (segment.row_count() == 0) + return {NumericIndex{0}}; + auto first_ts = segment.template scalar_at(0, 0).value(); + return {first_ts}; +} + +IndexValue TimeseriesIndex::end_value_for_segment(const SegmentInMemory& 
segment) { + auto row_count = segment.row_count(); + if (row_count == 0) + return {NumericIndex{0}}; + auto last_ts = segment.template scalar_at(row_count - 1, 0).value(); + return {last_ts}; +} + +IndexValue TimeseriesIndex::start_value_for_keys_segment(const SegmentInMemory& segment) { + if (segment.row_count() == 0) + return {NumericIndex{0}}; + auto start_index_id = int(pipelines::index::Fields::start_index); + auto first_ts = segment.template scalar_at(0, start_index_id).value(); + return {first_ts}; +} + +IndexValue TimeseriesIndex::end_value_for_keys_segment(const SegmentInMemory& segment) { + auto row_count = segment.row_count(); + if (row_count == 0) + return {NumericIndex{0}}; + auto end_index_id = int(pipelines::index::Fields::end_index); + auto last_ts = segment.template scalar_at(row_count - 1, end_index_id).value(); + return {last_ts}; +} + +const char* TimeseriesIndex::name() const { + return name_.c_str(); +} + +TimeseriesIndex TimeseriesIndex::make_from_descriptor(const StreamDescriptor& desc) { + if (desc.field_count() > 0) + return TimeseriesIndex(std::string(desc.fields(0).name())); + + return TimeseriesIndex(DefaultName); +} + + +TableIndex::TableIndex(const std::string& name) : name_(name) { +} + +TableIndex TableIndex::default_index() { + return TableIndex(DefaultName); +} + +void TableIndex::check(const FieldCollection& fields) const { + util::check_arg( + fields.size() >= int(field_count()), + "expected at least {} fields, actual {}", + field_count(), + fields.size() + ); + + util::check(fields.ref_at(0) == field(0), "Field descriptor mismatch {} != {}", fields.ref_at(0), field(0)); +} + +IndexValue TableIndex::start_value_for_segment(const SegmentInMemory& segment) { + auto string_index = segment.string_at(0, 0).value(); + return {std::string{string_index}}; +} + +IndexValue TableIndex::end_value_for_segment(const SegmentInMemory& segment) { + auto last_rowid = segment.row_count() - 1; + auto string_index = segment.string_at(last_rowid, 
0).value(); + return {std::string{string_index}}; +} + +IndexValue TableIndex::start_value_for_keys_segment(const SegmentInMemory& segment) { + if (segment.row_count() == 0) + return {NumericIndex{0}}; + auto start_index_id = int(pipelines::index::Fields::start_index); + auto string_index = segment.string_at(0, start_index_id).value(); + return {std::string{string_index}}; +} + +IndexValue TableIndex::end_value_for_keys_segment(const SegmentInMemory& segment) { + auto row_count = segment.row_count(); + if (row_count == 0) + return {NumericIndex{0}}; + auto end_index_id = int(pipelines::index::Fields::end_index); + auto string_index = segment.string_at(row_count - 1, end_index_id).value(); + return {std::string{string_index}}; +} + +TableIndex TableIndex::make_from_descriptor(const StreamDescriptor& desc) { + if (desc.field_count() > 0) + return TableIndex(std::string(desc.field(0).name())); + + return TableIndex(DefaultName); +} + +const char* TableIndex::name() const { + return name_.c_str(); +} + +RowCountIndex RowCountIndex::default_index() { + return RowCountIndex{}; +} + + +IndexValue RowCountIndex::start_value_for_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +IndexValue RowCountIndex::end_value_for_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset() + (segment.row_count() - 1)); +} + +IndexValue RowCountIndex::start_value_for_keys_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +IndexValue RowCountIndex::end_value_for_keys_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset() + (segment.row_count() - 1)); +} + +RowCountIndex RowCountIndex::make_from_descriptor(const StreamDescriptor&) const { + return RowCountIndex::default_index(); +} + +IndexValue EmptyIndex::start_value_for_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +IndexValue EmptyIndex::end_value_for_segment(const 
SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +IndexValue EmptyIndex::start_value_for_keys_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +IndexValue EmptyIndex::end_value_for_keys_segment(const SegmentInMemory& segment) { + return static_cast(segment.offset()); +} + +Index index_type_from_descriptor(const StreamDescriptor& desc) { + switch (desc.index().proto().kind()) { + case IndexDescriptor::EMPTY: return EmptyIndex{}; + case IndexDescriptor::TIMESTAMP: return TimeseriesIndex::make_from_descriptor(desc); + case IndexDescriptor::STRING: return TableIndex::make_from_descriptor(desc); + case IndexDescriptor::ROWCOUNT: return RowCountIndex{}; + default: + util::raise_rte( + "Data obtained from storage refers to an index type that this build of ArcticDB doesn't understand ({}).", + int(desc.index().proto().kind()) + ); + } +} + +Index default_index_type_from_descriptor(const IndexDescriptor::Proto& desc) { + switch (desc.kind()) { + case IndexDescriptor::EMPTY: return EmptyIndex{}; + case IndexDescriptor::TIMESTAMP: return TimeseriesIndex::default_index(); + case IndexDescriptor::STRING: return TableIndex::default_index(); + case IndexDescriptor::ROWCOUNT: return RowCountIndex::default_index(); + default: util::raise_rte("Unknown index type {} trying to generate index type", int(desc.kind())); + } +} + +Index default_index_type_from_descriptor(const IndexDescriptor& desc) { + return default_index_type_from_descriptor(desc.proto()); +} + +IndexDescriptor get_descriptor_from_index(const Index& index) { + return util::variant_match(index, [](const auto& idx) { return static_cast(idx); }); +} + +Index empty_index() { + return RowCountIndex::default_index(); +} + +template class BaseIndex; +template class BaseIndex; +template class BaseIndex; +template class BaseIndex; +} \ No newline at end of file diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp index ddd881f04a..3c174e3da2 
100644 --- a/cpp/arcticdb/stream/index.hpp +++ b/cpp/arcticdb/stream/index.hpp @@ -23,37 +23,19 @@ namespace arcticdb::stream { using namespace arcticdb::entity; -inline IndexDescriptor::Type get_index_value_type(const AtomKey &key) { - return std::holds_alternative(key.start_index()) ? IndexDescriptor::TIMESTAMP : IndexDescriptor::STRING; -} +IndexDescriptor::Type get_index_value_type(const AtomKey& key); -template +template class BaseIndex { - public: - template - StreamDescriptor create_stream_descriptor(StreamId stream_id, RangeType&& fields) const { +public: + template StreamDescriptor create_stream_descriptor(StreamId stream_id, RangeType&& fields) const { return stream_descriptor(stream_id, *derived(), std::move(fields)); } - [[nodiscard]] StreamDescriptor create_stream_descriptor( - StreamId stream_id, - std::initializer_list fields) const { - std::vector fds{fields}; - return create_stream_descriptor(stream_id, folly::range(fds)); - - } - - [[nodiscard]] const Derived* derived() const { - return static_cast(this); - } - - explicit operator IndexDescriptor() const { - return {Derived::field_count(), Derived::type()}; - } - - [[nodiscard]] FieldRef field(size_t) const { - return {static_cast(typename Derived::TypeDescTag{}), std::string_view(derived()->name())}; - } + [[nodiscard]] StreamDescriptor create_stream_descriptor(StreamId stream_id, std::initializer_list fields) const; + [[nodiscard]] const Derived* derived() const; + explicit operator IndexDescriptor() const; + [[nodiscard]] FieldRef field(size_t) const; }; //TODO make this into just a numeric index, of which timestamp is a special case @@ -61,14 +43,6 @@ class TimeseriesIndex : public BaseIndex { public: static constexpr const char* DefaultName = "time" ; - explicit TimeseriesIndex(const std::string& name) : - name_(name) { - } - - static TimeseriesIndex default_index() { - return TimeseriesIndex(DefaultName); - } - using TypeDescTag = TypeDescriptorTag< DataTypeTag, DimensionTag>; @@ -80,63 
+54,19 @@ class TimeseriesIndex : public BaseIndex { static constexpr IndexDescriptor::Type type() { return IndexDescriptor::TIMESTAMP; } + TimeseriesIndex(const std::string& name); + static TimeseriesIndex default_index(); + void check(const FieldCollection& fields) const; + static IndexValue start_value_for_segment(const SegmentInMemory& segment); + static IndexValue end_value_for_segment(const SegmentInMemory& segment); + static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment); + static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment); - void check(const FieldCollection &fields) const { - const size_t fields_size = fields.size(); - constexpr int current_fields_size = int(field_count()); - - const TypeDescriptor &first_field_type = fields[0].type(); - const TypeDescriptor ¤t_first_field_type = this->field(0).type(); - - const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value(); - const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type); - - const bool compatible_types = valid_type_promotion || trivial_type_compatibility; + [[nodiscard]] const char* name() const; + static TimeseriesIndex make_from_descriptor(const StreamDescriptor& desc); - util::check_arg(fields_size >= current_fields_size, "expected at least {} fields, actual {}", - current_fields_size, fields_size); - util::check_arg(compatible_types, "expected field[0]={}, actual {}", - this->field(0), fields[0]); - } - - template - static IndexValue start_value_for_segment(const SegmentType &segment) { - if (segment.row_count() == 0) - return { NumericIndex{0} }; - auto first_ts = segment.template scalar_at(0, 0).value(); - return {first_ts}; - } - - template - static IndexValue end_value_for_segment(const SegmentType &segment) { - auto row_count = segment.row_count(); - if (row_count == 0) - return { NumericIndex{0} }; - auto last_ts = segment.template 
scalar_at(row_count - 1, 0).value(); - return {last_ts}; - } - - template - static IndexValue start_value_for_keys_segment(const SegmentType &segment) { - if (segment.row_count() == 0) - return { NumericIndex{0} }; - auto start_index_id = int(pipelines::index::Fields::start_index); - auto first_ts = segment.template scalar_at(0, start_index_id).value(); - return {first_ts}; - } - - template - static IndexValue end_value_for_keys_segment(const SegmentType &segment) { - auto row_count = segment.row_count(); - if (row_count == 0) - return { NumericIndex{0} }; - auto end_index_id = int(pipelines::index::Fields::end_index); - auto last_ts = segment.template scalar_at(row_count - 1, end_index_id).value(); - return {last_ts}; - } - - template - void set(RowCellSetter setter, const IndexValue &index_value) { + template + void set(RowCellSetter setter, const IndexValue& index_value) { if (std::holds_alternative(index_value)) { auto ts = std::get(index_value); util::check_arg(ts >= ts_, "timestamp decreasing, current val={}, candidate={}", ts_, ts); @@ -146,15 +76,6 @@ class TimeseriesIndex : public BaseIndex { util::raise_rte("Cannot set this type, expecting timestamp"); } - [[nodiscard]] const char *name() const { return name_.c_str(); } - - static TimeseriesIndex make_from_descriptor(const StreamDescriptor& desc) { - if(desc.field_count() > 0) - return TimeseriesIndex(std::string(desc.fields(0).name())); - - return TimeseriesIndex(DefaultName); - } - private: std::string name_; timestamp ts_ = 0; @@ -164,13 +85,9 @@ class TableIndex : public BaseIndex { public: static constexpr const char* DefaultName = "Key"; - explicit TableIndex(const std::string& name) : - name_(name) { - } + explicit TableIndex(const std::string& name); - static TableIndex default_index() { - return TableIndex(DefaultName); - } + static TableIndex default_index(); using TypeDescTag = TypeDescriptorTag< DataTypeTag, @@ -184,45 +101,15 @@ class TableIndex : public BaseIndex { return 
IndexDescriptor::STRING; } - void check(const FieldCollection &fields) const { - util::check_arg(fields.size() >= int(field_count()), "expected at least {} fields, actual {}", - field_count(), fields.size()); + void check(const FieldCollection& fields) const; - util::check(fields.ref_at(0) == field(0), - "Field descriptor mismatch {} != {}", fields.ref_at(0), field(0)); - } + static IndexValue start_value_for_segment(const SegmentInMemory& segment); - template - static IndexValue start_value_for_segment(const SegmentType &segment) { - auto string_index = segment.string_at(0, 0).value(); - return {std::string{string_index}}; - } + static IndexValue end_value_for_segment(const SegmentInMemory& segment); - template - static IndexValue end_value_for_segment(const SegmentType &segment) { - auto last_rowid = segment.row_count() - 1; - auto string_index = segment.string_at(last_rowid, 0).value(); - return {std::string{string_index}}; - } + static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment); - template - static IndexValue start_value_for_keys_segment(const SegmentType &segment) { - if (segment.row_count() == 0) - return { NumericIndex{0} }; - auto start_index_id = int(pipelines::index::Fields::start_index); - auto string_index = segment.string_at(0, start_index_id).value(); - return {std::string{string_index}}; - } - - template - static IndexValue end_value_for_keys_segment(const SegmentType &segment) { - auto row_count = segment.row_count(); - if (row_count == 0) - return { NumericIndex{0} }; - auto end_index_id = int(pipelines::index::Fields::end_index); - auto string_index = segment.string_at(row_count - 1, end_index_id).value(); - return {std::string{string_index}}; - } + static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment); template void set(RowCellSetter setter, const IndexValue &index_value) const { @@ -232,14 +119,9 @@ class TableIndex : public BaseIndex { util::raise_rte("Cannot set this type. 
Expecting std::string"); } - static TableIndex make_from_descriptor(const StreamDescriptor& desc) { - if(desc.field_count() > 0) - return TableIndex(std::string(desc.field(0).name())); - - return TableIndex(DefaultName); - } + static TableIndex make_from_descriptor(const StreamDescriptor& desc); - const char *name() const { return name_.c_str(); } + const char* name() const; private: std::string name_; @@ -253,42 +135,26 @@ class RowCountIndex : public BaseIndex { RowCountIndex() = default; - static RowCountIndex default_index() { - return RowCountIndex{}; - } + static RowCountIndex default_index(); static constexpr size_t field_count() { return 0; } static constexpr IndexDescriptor::Type type() { return IndexDescriptor::ROWCOUNT; } - template - static IndexValue start_value_for_segment(const SegmentType &segment) { - return static_cast(segment.offset()); - } + static IndexValue start_value_for_segment(const SegmentInMemory& segment); - template - static IndexValue end_value_for_segment(const SegmentType &segment) { - return static_cast(segment.offset() + (segment.row_count() - 1)); - } + static IndexValue end_value_for_segment(const SegmentInMemory& segment); - template - static IndexValue start_value_for_keys_segment(const SegmentType &segment) { - return static_cast(segment.offset()); - } + static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment); - template - static IndexValue end_value_for_keys_segment(const SegmentType &segment) { - return static_cast(segment.offset() + (segment.row_count() - 1)); - } + static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment); template void set(RowCellSetter, const IndexValue & = {timestamp(0)}) { // No index value } - RowCountIndex make_from_descriptor(const StreamDescriptor&) const { - return RowCountIndex::default_index(); - } + RowCountIndex make_from_descriptor(const StreamDescriptor&) const; static constexpr const char *name() { return "row_count"; } }; @@ -312,80 +178,22 @@ 
class EmptyIndex : public BaseIndex { return {}; } - [[nodiscard]] static IndexValue start_value_for_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); - } - - [[nodiscard]] static IndexValue end_value_for_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); - } - - [[nodiscard]] static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); - } - - [[nodiscard]] static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment) { - return static_cast(segment.offset()); - } + [[nodiscard]] static IndexValue start_value_for_segment(const SegmentInMemory& segment); + [[nodiscard]] static IndexValue end_value_for_segment(const SegmentInMemory& segment); + [[nodiscard]] static IndexValue start_value_for_keys_segment(const SegmentInMemory& segment); + [[nodiscard]] static IndexValue end_value_for_keys_segment(const SegmentInMemory& segment); }; using Index = std::variant; -inline Index index_type_from_descriptor(const StreamDescriptor &desc) { - switch (desc.index().proto().kind()) { - case IndexDescriptor::EMPTY: return EmptyIndex{}; - case IndexDescriptor::TIMESTAMP: - return TimeseriesIndex::make_from_descriptor(desc); - case IndexDescriptor::STRING: - return TableIndex::make_from_descriptor(desc); - case IndexDescriptor::ROWCOUNT: - return RowCountIndex{}; - default:util::raise_rte("Data obtained from storage refers to an index type that this build of ArcticDB doesn't understand ({}).", int(desc.index().proto().kind())); - } -} - -inline Index default_index_type_from_descriptor(const IndexDescriptor::Proto &desc) { - switch (desc.kind()) { - case IndexDescriptor::EMPTY: return EmptyIndex{}; - case IndexDescriptor::TIMESTAMP: - return TimeseriesIndex::default_index(); - case IndexDescriptor::STRING: - return TableIndex::default_index(); - case IndexDescriptor::ROWCOUNT: - return RowCountIndex::default_index(); - default: - 
util::raise_rte("Unknown index type {} trying to generate index type", int(desc.kind())); - } -} +Index index_type_from_descriptor(const StreamDescriptor& desc); +Index default_index_type_from_descriptor(const IndexDescriptor::Proto& desc); // Only to be used for visitation to get field count etc as the name is not set -inline Index variant_index_from_type(IndexDescriptor::Type type) { - switch (type) { - case IndexDescriptor::EMPTY: return EmptyIndex{}; - case IndexDescriptor::TIMESTAMP: - return TimeseriesIndex{TimeseriesIndex::DefaultName}; - case IndexDescriptor::STRING: - return TableIndex{TableIndex::DefaultName}; - case IndexDescriptor::ROWCOUNT: - return RowCountIndex{}; - default: - util::raise_rte("Unknown index type {} trying to generate index type", int(type)); - } -} - -inline Index default_index_type_from_descriptor(const IndexDescriptor &desc) { - return default_index_type_from_descriptor(desc.proto()); -} - -inline IndexDescriptor get_descriptor_from_index(const Index& index) { - return util::variant_match(index, [] (const auto& idx) { - return static_cast(idx); - }); -} - -inline Index empty_index() { - return RowCountIndex::default_index(); -} +Index variant_index_from_type(IndexDescriptor::Type type); +Index default_index_type_from_descriptor(const IndexDescriptor& desc); +IndexDescriptor get_descriptor_from_index(const Index& index); +Index empty_index(); } From 86f2e8e57501aafac35fb9206aa4c9c7184d8a0b Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 2 Apr 2024 15:37:54 +0300 Subject: [PATCH 21/21] Clean headers --- cpp/arcticdb/stream/index.cpp | 5 +++++ cpp/arcticdb/stream/index.hpp | 9 ++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/arcticdb/stream/index.cpp b/cpp/arcticdb/stream/index.cpp index 6778789d6b..7c97fe259d 100644 --- a/cpp/arcticdb/stream/index.cpp +++ b/cpp/arcticdb/stream/index.cpp @@ -7,6 +7,11 @@ */ #include +#include +#include +#include + + namespace arcticdb::stream { IndexDescriptor::Type 
get_index_value_type(const AtomKey& key) { diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp index 3c174e3da2..377e52e1c9 100644 --- a/cpp/arcticdb/stream/index.hpp +++ b/cpp/arcticdb/stream/index.hpp @@ -9,15 +9,14 @@ #include #include -#include #include #include -#include #include -#include -#include -#include + +namespace arcticdb { + class SegmentInMemory; +} namespace arcticdb::stream {