From 1e2deb8bfb7022948481f348c16fe7be374d0a39 Mon Sep 17 00:00:00 2001 From: Georgi Petrov Date: Fri, 8 Mar 2024 16:20:11 +0200 Subject: [PATCH] Fix crash during normalization of time zones Add another check for the tests Address PR comments and add more test cases Fix edge case on Windows Fix broken test on Windows Recreate the timestamp with timezone Coerce all timezones to datetime.timezone Add metadata to compat tests Add meta to compat tests Add a check for the buffer length when denormalizing timestamps Add a note in the docs about using the time zone info to create other time zone types Add comment about the old PD_TIMESTAMP normalization Handle case when we have a valid tz from a past version Fix tz in meta in backwards compat test Fix install of old protobuf version Remove outdated assert Add 0 offset case Add write of metadata Fix interop test Add metadata to append Rebase Fix --- docs/mkdocs/docs/faq.md | 13 + .../arcticdb/version_store/_normalization.py | 44 ++- .../version_store/test_normalization.py | 296 ++++++++++++++---- python/tests/util/storage_test.py | 49 ++- 4 files changed, 324 insertions(+), 78 deletions(-) diff --git a/docs/mkdocs/docs/faq.md b/docs/mkdocs/docs/faq.md index 83c9d6b4ad..05b1295c53 100644 --- a/docs/mkdocs/docs/faq.md +++ b/docs/mkdocs/docs/faq.md @@ -183,3 +183,16 @@ The handling of `NaN` in ArcticDB depends on the type of the column under consid * For string columns, `NaN`, as well as Python `None`, are fully supported. * For floating-point numeric columns, `NaN` is also fully supported. * For integer numeric columns `NaN` is not supported. A column that otherwise contains only integers will be treated as a floating point column if a `NaN` is encountered by ArcticDB, at which point [the usual rules](api/arctic.md#LibraryOptions) around type promotion for libraries configured with or without dynamic schema all apply as usual. + +### How does ArcticDB handle `time zone` information? + +ArcticDB takes a stricter approach to time zone handling than Pandas. While Pandas supports multiple types for storing time zone information, ArcticDB coerces all time zone information to `datetime.timezone`. This ensures that time zone information is stored in a consistent manner and preserved when data is written to and read from ArcticDB. + +When writing data with time zone information to ArcticDB, we preserve the time zone offset and name. When reading data with time zone information from ArcticDB, this data is used to create a `datetime.timezone` object. + +!!! note + + This means that regardless of the time zone types written to ArcticDB, all time zone types are normalised to `datetime.timezone`. + If you would like another time-zone type back, the stored name (`tzname`) can be used to recreate it: - For `pytz` you can use: `pytz.timezone(dt.tzname())` - For `ZoneInfo` (Python 3.9+) you can use: `ZoneInfo(dt.tzname())`
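To make the FAQ entry above concrete, here is a minimal sketch of recreating a richer time zone type from the fixed-offset `datetime.timezone` that ArcticDB hands back. The timestamp is constructed by hand rather than read from a library, so its value is illustrative only; `zoneinfo` requires Python 3.9+ (and the `tzdata` package on Windows).

```python
import datetime
import pandas as pd
import pytz
from zoneinfo import ZoneInfo  # Python 3.9+; needs the tzdata package on Windows

# ArcticDB returns tz-aware values with a datetime.timezone tzinfo (fixed offset plus name).
# Build such a timestamp locally instead of reading one back from a library.
read_back = pd.Timestamp(
    "2017-01-31",
    tz=datetime.timezone(datetime.timedelta(hours=-5), "America/New_York"),
)

# The preserved name is enough to rebuild a richer time zone object:
pytz_zone = pytz.timezone(read_back.tzname())  # pytz equivalent
zi_zone = ZoneInfo(read_back.tzname())         # zoneinfo equivalent

print(read_back.tzname(), read_back.utcoffset())  # America/New_York -1 days +19:00:00
print(pytz_zone, zi_zone)
```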
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 807442e973..890ac81f79 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -7,9 +7,9 @@ """ import copy import datetime -from datetime import timedelta +from datetime import timedelta, timezone import math - +import pytz import numpy as np import os import sys @@ -21,7 +21,12 @@ from arcticc.pb2.descriptors_pb2 import UserDefinedMetadata, NormalizationMetadata, MsgPackSerialization from arcticc.pb2.storage_pb2 import VersionStoreConfig from collections import Counter -from arcticdb.exceptions import ArcticNativeException, ArcticDbNotYetImplemented, NormalizationException, SortingException +from arcticdb.exceptions import ( + ArcticNativeException, + ArcticDbNotYetImplemented, + NormalizationException, + SortingException, +) from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types from arcticdb.util._versions import IS_PANDAS_TWO, IS_PANDAS_ZERO from arcticdb.version_store.read_result import ReadResult @@ -911,8 +916,15 @@ def denormalize(self, obj, meta): def _custom_pack(self, obj): if isinstance(obj, pd.Timestamp): - tz = obj.tz.zone if obj.tz is not None else None - return ExtType(MsgPackSerialization.PD_TIMESTAMP, packb([obj.value, tz])) + offset = obj.tzinfo.utcoffset(obj).total_seconds() if obj.tzinfo else 0 + if obj.tz and hasattr(obj.tz, "zone"): + tz = obj.tz.zone + else: + tz = obj.tzname() + + return ExtType( + MsgPackSerialization.PD_TIMESTAMP, packb([obj.value, tz, offset]) + ) if isinstance(obj, datetime.datetime): return ExtType(MsgPackSerialization.PY_DATETIME, packb(_to_tz_timestamp(obj))) @@ -928,7 +940,24 @@ def _custom_pack(self, obj): def _ext_hook(self, code, data): if code == MsgPackSerialization.PD_TIMESTAMP: data = unpackb(data, raw=False) - return pd.Timestamp(data[0], tz=data[1]) if data[1] is not None else pd.Timestamp(data[0]) + if len(data) == 2: + # We used to store only the value and the timezone as a string. + # This covers that legacy case, where the offset is not available. + val, tz = data + offset = 0 + if tz: + # Convert the string tz to an offset so we can denormalize to a datetime.timezone + obj = pd.Timestamp(val, tz=tz) + offset = obj.tzinfo.utcoffset(obj).total_seconds() + else: + # This covers the current case, where we store the value, the timezone and the offset. + # Keep this in sync if the _custom_pack method is changed. + val, tz, offset = data[:3] + + if tz is None: + return pd.Timestamp(val) + + return pd.Timestamp(val, tz=timezone(timedelta(seconds=offset), tz)) if code == MsgPackSerialization.PY_DATETIME: data = unpackb(data, raw=False) @@ -1248,8 +1277,7 @@ def _strip_tz(s, e): if not getattr(data, "timezone", None): start, end = _strip_tz(start, end) data = data[ - start.to_pydatetime(warn=False) - - timedelta(microseconds=1) : end.to_pydatetime(warn=False) + start.to_pydatetime(warn=False) - timedelta(microseconds=1) : end.to_pydatetime(warn=False) + timedelta(microseconds=1) ] return data
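For reference, the pack/unpack logic in the `_normalization.py` hunks above can be pictured in isolation. The sketch below mirrors the payload layout written by `_custom_pack` and read back by `_ext_hook` — `[epoch nanoseconds, tz name, UTC offset in seconds]`, with the two-element legacy form still accepted — but leaves out the msgpack `ExtType` plumbing, so treat it as an illustration of the logic rather than ArcticDB's actual serialization code.

```python
from datetime import timedelta, timezone
import pandas as pd


def pack_ts(ts: pd.Timestamp) -> list:
    # Current payload layout: [epoch nanoseconds, tz name, UTC offset in seconds].
    offset = ts.tzinfo.utcoffset(ts).total_seconds() if ts.tzinfo else 0
    name = ts.tz.zone if ts.tz is not None and hasattr(ts.tz, "zone") else ts.tzname()
    return [ts.value, name, offset]


def unpack_ts(payload: list) -> pd.Timestamp:
    if len(payload) == 2:
        # Legacy payload from older versions: only value and tz name, no offset.
        val, name = payload
        offset = 0
        if name:
            legacy = pd.Timestamp(val, tz=name)
            offset = legacy.tzinfo.utcoffset(legacy).total_seconds()
    else:
        val, name, offset = payload[:3]
    if name is None:
        return pd.Timestamp(val)
    # Every time zone is denormalized to a fixed-offset datetime.timezone carrying the name.
    return pd.Timestamp(val, tz=timezone(timedelta(seconds=offset), name))


ts = pd.Timestamp("2017-01-31", tz="America/New_York")
assert unpack_ts(pack_ts(ts)) == ts                     # same instant in time
assert unpack_ts([ts.value, "America/New_York"]) == ts  # legacy two-element payload
```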
diff --git a/python/tests/unit/arcticdb/version_store/test_normalization.py b/python/tests/unit/arcticdb/version_store/test_normalization.py index a4e0080b60..03d0effe39 100644 --- a/python/tests/unit/arcticdb/version_store/test_normalization.py +++ b/python/tests/unit/arcticdb/version_store/test_normalization.py @@ -5,8 +5,10 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ + import datetime import sys +from datetime import timezone, timedelta from collections import namedtuple from unittest.mock import patch import numpy as np @@ -50,7 +52,10 @@ params = { "simple_dict": {"a": "1", "b": 2, "c": 3.0, "d": True}, - "pd_ts": {"a": pd.Timestamp("2018-01-12 09:15"), "b": pd.Timestamp("2017-01-31", tz="America/New_York")}, + "pd_ts": { + "a": pd.Timestamp("2018-01-12 09:15"), + "b": pd.Timestamp("2017-01-31", tz="America/New_York"), + }, } test_msgpack_normalizer = MsgPackNormalizer() @@ -72,9 +77,12 @@ def test_msg_pack_legacy_1(): # serialised data created with Python 3.6, msgpack 0.6.2, pandas 0.25.3 # this was before string and bytes types were seperated in msgpack norm = test_msgpack_normalizer - packed = b'\x82\xa1a\xc7\x0b \x92\xcf\x15\t\x05:\xdfT\xc8\x00\xc0\xa1b\xc7\x1b \x92\xcf\x14\x9e\xc2\x84+~ \x00\xb0America/New_York' + packed = b"\x82\xa1a\xc7\x0b \x92\xcf\x15\t\x05:\xdfT\xc8\x00\xc0\xa1b\xc7\x1b \x92\xcf\x14\x9e\xc2\x84+~ \x00\xb0America/New_York" data = norm._msgpack_unpackb(packed) - assert data == {'a': pd.Timestamp('2018-01-12 09:15:00'), 'b': pd.Timestamp('2017-01-31 00:00:00-0500', tz='America/New_York')} + assert data == { + "a": pd.Timestamp("2018-01-12 09:15:00"), + "b": pd.Timestamp("2017-01-31 00:00:00-0500", tz="America/New_York"), + } def test_msg_pack_legacy_2(): @@ -82,7 +90,7 @@ def test_msg_pack_legacy_2(): # serialised data created with Python 3.6, msgpack 0.6.2, pandas 0.25.3 # this was before string and bytes types were seperated in msgpack norm = test_msgpack_normalizer - packed = b'\xc7\x1b!\x92\xcf\x15\x93w\xb1\xd2\xa6\x8f\xe8\xb0America/New_York' + packed = b"\xc7\x1b!\x92\xcf\x15\x93w\xb1\xd2\xa6\x8f\xe8\xb0America/New_York" dt = datetime.datetime(2019, 4, 8, 10, 5, 2, 1) nytz = pytz.timezone("America/New_York") loc_dt = nytz.localize(dt) @@ -114,9 +122,19 @@ def test_fails_humongous_meta(): def test_empty_df(): if IS_PANDAS_ZERO: - d = pd.DataFrame(data={"C": []}, index=pd.MultiIndex(levels=[["A"], ["B"]], labels=[[], []], names=["X", "Y"])) + d = pd.DataFrame( + data={"C": []}, + index=pd.MultiIndex( + levels=[["A"], ["B"]], labels=[[], []], names=["X", "Y"] + ), + ) else: - d = pd.DataFrame(data={"C": []}, index=pd.MultiIndex(levels=[["A"], ["B"]], codes=[[], []], names=["X", "Y"])) + d = pd.DataFrame( + data={"C": []}, + index=pd.MultiIndex( + levels=[["A"], ["B"]], codes=[[], []], names=["X", "Y"] + ), + ) norm = CompositeNormalizer() df, norm_meta = norm.normalize(d) @@ -130,11 +148,18 @@ def test_empty_df(): # See test_get_description_date_range_tz in test_arctic.py for the V2 API equivalent @pytest.mark.parametrize( - "tz", ["UTC", "Europe/Amsterdam", pytz.UTC, pytz.timezone("Europe/Amsterdam"), du.tz.gettz("UTC")] + "tz", + [ + "UTC", + "Europe/Amsterdam", + pytz.UTC, + pytz.timezone("Europe/Amsterdam"), + du.tz.gettz("UTC"), + ], ) def test_write_tz(lmdb_version_store, sym, tz): assert tz is not None - index = index=pd.date_range(pd.Timestamp(0), periods=10, tz=tz) + index = pd.date_range(pd.Timestamp(0), periods=10, tz=tz) df = pd.DataFrame(data={"col1": np.arange(10)}, index=index) lmdb_version_store.write(sym, df) result = lmdb_version_store.read(sym).data @@ -142,7 +167,9 @@ def
test_write_tz(lmdb_version_store, sym, tz): df_tz = df.index.tzinfo assert str(df_tz) == str(tz) if tz == du.tz.gettz("UTC") and sys.version_info < (3, 7): - pytest.skip("Timezone files don't seem to have ever worked properly on Python 3.6") + pytest.skip( + "Timezone files don't seem to have ever worked properly on Python 3.6" + ) start_ts, end_ts = lmdb_version_store.get_timerange_for_symbol(sym) assert isinstance(start_ts, datetime.datetime) assert isinstance(end_ts, datetime.datetime) @@ -150,6 +177,36 @@ def test_write_tz(lmdb_version_store, sym, tz): assert end_ts == index[-1] +@pytest.mark.parametrize( + "tz", + [ + None, + "UTC", + "Europe/Amsterdam", + "America/New_York", + timezone(timedelta(seconds=0), "UTC"), + timezone(timedelta(seconds=0), "Europe/Amsterdam"), + timezone(timedelta(seconds=0), "America/New_York"), + pytz.UTC, + pytz.timezone("Europe/Amsterdam"), + pytz.timezone("America/New_York"), + du.tz.gettz("UTC"), + du.tz.gettz("Europe/Amsterdam"), + du.tz.gettz("America/New_York"), + ], +) +def test_write_metadata_tz(lmdb_version_store, sym, tz): + df = pd.DataFrame( + data={"col1": np.arange(10)}, index=pd.date_range(pd.Timestamp(0), periods=10) + ) + meta = pd.Timestamp("2024-03-04", tz=tz) + lmdb_version_store.write(sym, df, metadata={"index_start": meta}) + df_tz = lmdb_version_store.read(sym).metadata["index_start"] + assert str(df_tz.tzinfo) == str(meta.tzinfo) or df_tz.tzname() == meta.tzname() + if meta.tzinfo is not None: + assert df_tz.tzinfo.utcoffset(df_tz) == meta.tzinfo.utcoffset(meta) + + def get_multiindex_df_with_tz(tz): dt1 = datetime.datetime(2019, 4, 8, 10, 5, 2, 1) dt2 = datetime.datetime(2019, 4, 9, 10, 5, 2, 1) @@ -160,14 +217,19 @@ def get_multiindex_df_with_tz(tz): arr3 = [0, 1, 0, 1] return pd.DataFrame( data={"X": [10, 11, 12, 13]}, - index=pd.MultiIndex.from_arrays([arr1, arr2, arr3], names=["datetime", "level_1", "level_2"]), + index=pd.MultiIndex.from_arrays( + [arr1, arr2, arr3], names=["datetime", "level_1", "level_2"] + ), ) @pytest.mark.parametrize("tz", [pytz.timezone("America/New_York"), pytz.UTC]) def test_multiindex_with_tz(tz): d = get_multiindex_df_with_tz(tz) - norm = CompositeNormalizer(use_norm_failure_handler_known_types=True, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=True, + fallback_normalizer=test_msgpack_normalizer, + ) df, norm_meta = norm.normalize(d) fd = FrameData.from_npd_df(df) denorm = norm.denormalize(fd, norm_meta) @@ -182,7 +244,10 @@ def test_multiindex_with_tz(tz): @pytest.mark.parametrize("tz", [pytz.timezone("America/New_York"), pytz.UTC]) def test_empty_df_with_multiindex_with_tz(tz): orig_df = get_multiindex_df_with_tz(tz) - norm = CompositeNormalizer(use_norm_failure_handler_known_types=True, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=True, + fallback_normalizer=test_msgpack_normalizer, + ) norm_df, norm_meta = norm.normalize(orig_df) # Slice the normalized df to an empty df (this can happen after date range slicing) @@ -198,12 +263,17 @@ def test_empty_df_with_multiindex_with_tz(tz): sliced_denorm_df = norm.denormalize(fd, norm_meta) for index_level_num in [0, 1, 2]: - assert isinstance(sliced_denorm_df.index.levels[index_level_num], type(orig_df.index.levels[index_level_num])) + assert isinstance( + sliced_denorm_df.index.levels[index_level_num], + type(orig_df.index.levels[index_level_num]), + ) assert sliced_denorm_df.index.names == orig_df.index.names for 
index_level_num in [0, 1]: - sliced_denorm_df_index_level_num = sliced_denorm_df.index.levels[index_level_num] + sliced_denorm_df_index_level_num = sliced_denorm_df.index.levels[ + index_level_num + ] orig_df_index_level_num = orig_df.index.levels[index_level_num] if IS_PANDAS_TWO and tz is pytz.UTC: # Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`. @@ -211,11 +281,18 @@ def test_empty_df_with_multiindex_with_tz(tz): # TODO: is there a better way to handle this edge case? assert sliced_denorm_df_index_level_num.tz == datetime.timezone.utc assert isinstance(orig_df_index_level_num.tz, pytz.BaseTzInfo) - assert sliced_denorm_df_index_level_num.dtype == orig_df_index_level_num.dtype == "datetime64[ns, UTC]" + assert ( + sliced_denorm_df_index_level_num.dtype + == orig_df_index_level_num.dtype + == "datetime64[ns, UTC]" + ) else: assert isinstance(sliced_denorm_df_index_level_num.tz, pytz.BaseTzInfo) assert isinstance(orig_df_index_level_num.tz, pytz.BaseTzInfo) - assert sliced_denorm_df_index_level_num.tz.zone == orig_df_index_level_num.tz.zone + assert ( + sliced_denorm_df_index_level_num.tz.zone + == orig_df_index_level_num.tz.zone + ) def test_timestamp_without_tz(): @@ -281,7 +358,10 @@ def test_msg_pack_normalizer_strict_mode(): def test_namedtuple_inside_df(): d = pd.DataFrame({"A": [NT(1, "b"), NT(2, "a")]}) - norm = CompositeNormalizer(use_norm_failure_handler_known_types=True, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=True, + fallback_normalizer=test_msgpack_normalizer, + ) df, norm_meta = norm.normalize(d) fd = FrameData.from_npd_df(df) denorm = norm.denormalize(fd, norm_meta) @@ -321,9 +401,17 @@ def test_serialize_custom_normalizer(): cloned_normalizer = CompositeCustomNormalizer([], False) cloned_normalizer.__setstate__(state) assert_equal(len(normalizer._normalizers), len(cloned_normalizer._normalizers)) - assert_equal(len(normalizer._normalizer_by_typename), len(cloned_normalizer._normalizer_by_typename)) - assert_equal(normalizer._normalizers[0].__class__, cloned_normalizer._normalizers[0].__class__) - assert_equal(normalizer._fail_on_missing_type, cloned_normalizer._fail_on_missing_type) + assert_equal( + len(normalizer._normalizer_by_typename), + len(cloned_normalizer._normalizer_by_typename), + ) + assert_equal( + normalizer._normalizers[0].__class__, + cloned_normalizer._normalizers[0].__class__, + ) + assert_equal( + normalizer._fail_on_missing_type, cloned_normalizer._fail_on_missing_type + ) df, norm_meta = cloned_normalizer.normalize(dt) denormed = cloned_normalizer.denormalize(df, norm_meta) assert_array_equal(dt.custom_values, denormed.custom_values) @@ -333,7 +421,10 @@ def test_force_pickle_on_norm_failure(): normal_df = pd.DataFrame({"a": [1, 2, 3]}) mixed_type_df = pd.DataFrame({"a": [1, 2, "a"]}) # Turn off the global flag for the normalizer - norm = CompositeNormalizer(use_norm_failure_handler_known_types=False, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=False, + fallback_normalizer=test_msgpack_normalizer, + ) # This should work as before _d, _meta = norm.normalize(normal_df) @@ -344,14 +435,20 @@ def test_force_pickle_on_norm_failure(): # Explicitly passing in pickle settings without global pickle flag being set should work _d, _meta = norm.normalize(mixed_type_df, pickle_on_failure=True) - norm = CompositeNormalizer(use_norm_failure_handler_known_types=True, 
fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=True, + fallback_normalizer=test_msgpack_normalizer, + ) # Forcing it to pickle should work with the bool set. _d, _meta = norm.normalize(mixed_type_df) def test_numpy_array_normalization_composite(): - norm = CompositeNormalizer(use_norm_failure_handler_known_types=False, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=False, + fallback_normalizer=test_msgpack_normalizer, + ) arr = np.random.rand(10, 10, 10) df, norm_meta = norm.normalize(arr) fd = FrameData.from_npd_df(df) @@ -381,7 +478,10 @@ def test_ndarray_arbitrary_shape(): def test_dict_with_tuples(): # This has to be pickled because msgpack doesn't differentiate between tuples and lists data = {(1, 2): [1, 24, 55]} - norm = CompositeNormalizer(use_norm_failure_handler_known_types=False, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=False, + fallback_normalizer=test_msgpack_normalizer, + ) df, norm_meta = norm.normalize(data) fd = FrameData.from_npd_df(df) denormalized_data = norm.denormalize(fd, norm_meta) @@ -392,10 +492,14 @@ def test_dict_with_tuples(): def test_will_item_be_pickled(lmdb_version_store, sym): - df = pd.DataFrame(data=np.arange(10), index=pd.date_range(pd.Timestamp(0), periods=10)) + df = pd.DataFrame( + data=np.arange(10), index=pd.date_range(pd.Timestamp(0), periods=10) + ) pickle = {"hello": "world"} ndarr = np.arange(100) - not_so_bad_df = pd.DataFrame({"": np.arange(10)}, index=pd.date_range(pd.Timestamp(0), periods=10)) + not_so_bad_df = pd.DataFrame( + {"": np.arange(10)}, index=pd.date_range(pd.Timestamp(0), periods=10) + ) assert not lmdb_version_store.will_item_be_pickled(df) assert lmdb_version_store.will_item_be_pickled(pickle) @@ -409,7 +513,10 @@ def test_will_item_be_pickled(lmdb_version_store, sym): def test_numpy_ts_col_with_none(lmdb_version_store): df = pd.DataFrame(data={"a": [None, None]}) df.loc[0, "a"] = pd.Timestamp(0) - norm = CompositeNormalizer(use_norm_failure_handler_known_types=False, fallback_normalizer=test_msgpack_normalizer) + norm = CompositeNormalizer( + use_norm_failure_handler_known_types=False, + fallback_normalizer=test_msgpack_normalizer, + ) df, norm_meta = norm.normalize(df) fd = FrameData.from_npd_df(df) df_denormed = norm.denormalize(fd, norm_meta) @@ -419,10 +526,16 @@ def test_numpy_ts_col_with_none(lmdb_version_store): def test_none_in_columns_names(lmdb_version_store, sym): - df = pd.DataFrame(data={None: [1.2, 2.2], "None": [2.3, 3.5]}, index=[pd.Timestamp(0), pd.Timestamp(1)]) + df = pd.DataFrame( + data={None: [1.2, 2.2], "None": [2.3, 3.5]}, + index=[pd.Timestamp(0), pd.Timestamp(1)], + ) lmdb_version_store.write(sym, df) assert_frame_equal(lmdb_version_store.read(sym).data, df) - df2 = pd.DataFrame(data={None: [5.2, 6.2], "None": [1.3, 5.5]}, index=[pd.Timestamp(2), pd.Timestamp(3)]) + df2 = pd.DataFrame( + data={None: [5.2, 6.2], "None": [1.3, 5.5]}, + index=[pd.Timestamp(2), pd.Timestamp(3)], + ) lmdb_version_store.append(sym, df2) vit = lmdb_version_store.read(sym) assert_frame_equal(vit.data, pd.concat((df, df2))) @@ -430,14 +543,24 @@ def test_none_in_columns_names(lmdb_version_store, sym): def test_same_columns_names(lmdb_version_store, sym): df = pd.DataFrame( - data={"test": [1.2, 2.2], "test2": [2.3, 3.5], "test3": [2.5, 8.5], "test4": [9.3, 1.5]}, + data={ + "test": [1.2, 2.2], + "test2": [2.3, 3.5], + 
"test3": [2.5, 8.5], + "test4": [9.3, 1.5], + }, index=[pd.Timestamp(0), pd.Timestamp(1)], ) df.columns = ["test", None, "test", None] lmdb_version_store.write(sym, df) assert_frame_equal(lmdb_version_store.read(sym).data, df) df2 = pd.DataFrame( - data={"test": [2.2, 5.2], "test2": [1.3, 8.5], "test3": [2.5, 11.5], "test4": [12.3, 51.5]}, + data={ + "test": [2.2, 5.2], + "test2": [1.3, 8.5], + "test3": [2.5, 11.5], + "test4": [12.3, 51.5], + }, index=[pd.Timestamp(2), pd.Timestamp(3)], ) df2.columns = ["test", None, "test", None] @@ -452,16 +575,25 @@ def test_same_columns_names(lmdb_version_store, sym): def test_columns_names_dynamic_schema(lmdb_version_store_dynamic_schema, sym): lmdb_version_store = lmdb_version_store_dynamic_schema - df = pd.DataFrame(data={None: [1.2, 2.2], "None": [2.3, 3.5]}, index=[pd.Timestamp(0), pd.Timestamp(1)]) + df = pd.DataFrame( + data={None: [1.2, 2.2], "None": [2.3, 3.5]}, + index=[pd.Timestamp(0), pd.Timestamp(1)], + ) lmdb_version_store.write(sym, df) assert_frame_equal(lmdb_version_store.read(sym).data, df) - df = pd.DataFrame(data={None: [1.2, 2.2], "none": [2.3, 3.5]}, index=[pd.Timestamp(0), pd.Timestamp(1)]) + df = pd.DataFrame( + data={None: [1.2, 2.2], "none": [2.3, 3.5]}, + index=[pd.Timestamp(0), pd.Timestamp(1)], + ) df.index.name = "None" lmdb_version_store.write(sym, df) assert_frame_equal(lmdb_version_store.read(sym).data, df) - df2 = pd.DataFrame(data={"none": [22.4, 21.2], None: [25.3, 31.5]}, index=[pd.Timestamp(2), pd.Timestamp(3)]) + df2 = pd.DataFrame( + data={"none": [22.4, 21.2], None: [25.3, 31.5]}, + index=[pd.Timestamp(2), pd.Timestamp(3)], + ) df2.index.name = "None" lmdb_version_store.append(sym, df2) df3 = pd.concat((df, df2)) @@ -469,7 +601,14 @@ def test_columns_names_dynamic_schema(lmdb_version_store_dynamic_schema, sym): df4 = df4[df3.columns.tolist()] assert_frame_equal(df4, df3) - df = pd.DataFrame(data={"test": [1.2, 2.2], "test2": [2.3, 3.5], "test3": [2.5, 8.5], "test4": [9.3, 1.5]}) + df = pd.DataFrame( + data={ + "test": [1.2, 2.2], + "test2": [2.3, 3.5], + "test3": [2.5, 8.5], + "test4": [9.3, 1.5], + } + ) df.columns = ["test", None, "test", None] with pytest.raises(ArcticNativeException) as e_info: lmdb_version_store.write(sym, df) @@ -478,7 +617,11 @@ def test_columns_names_dynamic_schema(lmdb_version_store_dynamic_schema, sym): def test_columns_names_timeframe(lmdb_version_store, sym): tz = "America/New_York" dtidx = pd.date_range("2019-02-06 11:43", periods=6).tz_localize(tz) - tf = TimeFrame(dtidx.values, columns_names=[None], columns_values=[np.asarray([1, 3, 42, 54, 23, 4])]) + tf = TimeFrame( + dtidx.values, + columns_names=[None], + columns_values=[np.asarray([1, 3, 42, 54, 23, 4])], + ) lmdb_version_store.write(sym, tf) vit = lmdb_version_store.read(sym) @@ -498,17 +641,26 @@ def test_columns_names_series_dynamic(lmdb_version_store_dynamic_schema, sym): date_series = pd.Series(dr, index=dr) lmdb_version_store_dynamic_schema.write(sym + "dynamic_schema", date_series) - assert_series_equal(lmdb_version_store_dynamic_schema.read(sym + "dynamic_schema").data, date_series) + assert_series_equal( + lmdb_version_store_dynamic_schema.read(sym + "dynamic_schema").data, date_series + ) @pytest.mark.skipif(not IS_PANDAS_TWO, reason="pandas 2.0-specific test") -@pytest.mark.parametrize("datetime64_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"]) +@pytest.mark.parametrize( + "datetime64_dtype", + ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], +) 
@pytest.mark.parametrize("PandasContainer", [pd.DataFrame, pd.Series]) -def test_no_inplace_index_array_modification(lmdb_version_store, sym, datetime64_dtype, PandasContainer): +def test_no_inplace_index_array_modification( + lmdb_version_store, sym, datetime64_dtype, PandasContainer +): # Normalization must not modify Series' or DataFrames' index array in-place. pandas_container = PandasContainer( data={"a": [1, 2, 3]}, - index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"], dtype=datetime64_dtype), + index=pd.DatetimeIndex( + ["2020-01-01", "2020-01-02", "2020-01-03"], dtype=datetime64_dtype + ), ) original_index_array = pandas_container.index lmdb_version_store.write(sym, pandas_container) @@ -517,7 +669,10 @@ def test_no_inplace_index_array_modification(lmdb_version_store, sym, datetime64 def test_index_names_datetime_support(lmdb_version_store, sym): - df = pd.DataFrame(data={"a": [1, 2, 3]}, index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])) + df = pd.DataFrame( + data={"a": [1, 2, 3]}, + index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"]), + ) new_index = pd.Timestamp("2020-01-01") df.index.rename(new_index, inplace=True) with pytest.raises(ArcticException): @@ -525,7 +680,10 @@ def test_index_names_datetime_support(lmdb_version_store, sym): def test_index_names_tuple_support(lmdb_version_store, sym): - df = pd.DataFrame(data={"a": [1, 2, 3]}, index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])) + df = pd.DataFrame( + data={"a": [1, 2, 3]}, + index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"]), + ) new_index = tuple([1, 2, 3]) df.index.rename(new_index, inplace=True) with pytest.raises(ArcticException): @@ -545,7 +703,10 @@ def test_index_names_roundtrip_csv(lmdb_version_store, sym): def test_column_names_datetime_support(lmdb_version_store, sym): - df = pd.DataFrame(data={"a": [1, 2, 3]}, index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])) + df = pd.DataFrame( + data={"a": [1, 2, 3]}, + index=pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"]), + ) new_index = pd.Timestamp("2020-01-01") df.rename(columns={"a": new_index}, inplace=True) with pytest.raises(ArcticException): @@ -557,7 +718,15 @@ def test_column_names_mixed_types(lmdb_version_store, sym): buf = io.StringIO("2023-11-27 00:00:00,0.73260,0.73260,0.73260,0.73260,7") df = pd.read_csv(buf, parse_dates=[0], index_col=0, header=None) - df_to_write = df.rename(columns={1: 1, 2: pd.Timestamp("2020-01-01"), 3: tuple([1, 2, 3]), 4: "test", 5: 5.5}) + df_to_write = df.rename( + columns={ + 1: 1, + 2: pd.Timestamp("2020-01-01"), + 3: tuple([1, 2, 3]), + 4: "test", + 5: 5.5, + } + ) with pytest.raises(ArcticException): lmdb_version_store.write(sym, df_to_write) @@ -572,7 +741,8 @@ def test_column_names_roundtrip_csv(lmdb_version_store, sym): @pytest.mark.skipif( - not IS_PANDAS_TWO, reason="The full-support of pyarrow-backed pandas objects is pandas 2.0-specific." + not IS_PANDAS_TWO, + reason="The full-support of pyarrow-backed pandas objects is pandas 2.0-specific.", ) def test_pyarrow_error(lmdb_version_store): error_msg_intro = "PyArrow-backed pandas DataFrame and Series are not currently supported by ArcticDB." 
@@ -614,12 +784,25 @@ def test_norm_failure_error_message(lmdb_version_store_v1): with pytest.raises(ArcticDbNotYetImplemented) as update_exception: lib.update(sym, df) - assert all(col_name in str(e.value) for e in - [write_exception, batch_write_exception, append_exception, batch_append_exception, update_exception]) - assert all("pickle_on_failure" in str(e.value) for e in - [write_exception, batch_write_exception]) - assert all("pickle_on_failure" not in str(e.value) for e in - [append_exception, batch_append_exception, update_exception]) + assert all( + col_name in str(e.value) + for e in [ + write_exception, + batch_write_exception, + append_exception, + batch_append_exception, + update_exception, + ] + ) + assert all( + "pickle_on_failure" in str(e.value) + for e in [write_exception, batch_write_exception] + ) + assert all( + "pickle_on_failure" not in str(e.value) + for e in [append_exception, batch_append_exception, update_exception] + ) + def test_bools_are_pickled(lmdb_version_store_allows_pickling): lib = lmdb_version_store_allows_pickling @@ -627,14 +810,15 @@ def test_bools_are_pickled(lmdb_version_store_allows_pickling): df = pd.DataFrame({"a": [True, False]}) lib.write(sym, df) - lib.get_info(sym)['type'] == 'pickled' + lib.get_info(sym)["type"] == "pickled" assert_frame_equal(df, lib.read(sym).data) df = pd.DataFrame({"a": [True, False, np.nan]}) lib.write(sym, df) - lib.get_info(sym)['type'] == 'pickled' + lib.get_info(sym)["type"] == "pickled" assert_frame_equal(df, lib.read(sym).data) + def test_bools_with_nan_throw_without_pickling(lmdb_version_store_v1): lib = lmdb_version_store_v1 sym = "test_bools_throw_without_pickling" @@ -643,20 +827,22 @@ def test_bools_with_nan_throw_without_pickling(lmdb_version_store_v1): with pytest.raises(Exception): lib.write(sym, df) + def test_arrays_are_pickled(lmdb_version_store_allows_pickling): lib = lmdb_version_store_allows_pickling sym = "test_arrays_are_pickled" df = pd.DataFrame({"a": [np.array([1, 2])]}) lib.write(sym, df) - lib.get_info(sym)['type'] == 'pickled' + lib.get_info(sym)["type"] == "pickled" assert_frame_equal(df, lib.read(sym).data) df = pd.DataFrame({"a": [[1, 2]]}) lib.write(sym, df) - lib.get_info(sym)['type'] == 'pickled' + lib.get_info(sym)["type"] == "pickled" assert_frame_equal(df, lib.read(sym).data) + def test_arrays_throw_without_pickling(lmdb_version_store_v1): lib = lmdb_version_store_v1 sym = "test_arrays_throw_without_pickling" diff --git a/python/tests/util/storage_test.py b/python/tests/util/storage_test.py index b50a122a1f..ab0ff5bc51 100644 --- a/python/tests/util/storage_test.py +++ b/python/tests/util/storage_test.py @@ -5,6 +5,7 @@ import numpy as np import argparse import re +import pytz from datetime import datetime from arcticdb import Arctic @@ -24,10 +25,11 @@ def create_df(start=0, columns=1) -> pd.DataFrame: def get_basic_dfs(): + meta = {"meta": pd.Timestamp("2024-03-04", tz=pytz.UTC)} df1 = create_df(0, 3) df2 = create_df(1, 3) df3 = create_df(2, 3) - return (df1, "one"), (df2, "two"), (df3, "three") + return (df1, "one", meta), (df2, "two", meta), (df3, "three", meta) def get_empty_series(): @@ -37,6 +39,7 @@ def get_empty_series(): def get_csv_df(): + meta = {"meta": pd.Timestamp("2024-03-04", tz="America/New_York")} buf = io.StringIO("2023-11-27 00:00:00,0.73260,0.73260,0.73260,0.73260,7") df = pd.read_csv(buf, parse_dates=[0], index_col=0, header=None) @@ -46,7 +49,7 @@ def get_csv_df(): sym = "csv_df" - return df, sym + return df, sym, meta def real_s3_credentials(shared_path: bool 
= True): @@ -78,9 +81,7 @@ def get_real_s3_uri(shared_path: bool = True): path_prefix, _, ) = real_s3_credentials(shared_path) - aws_uri = ( - f"s3s://{endpoint}:{bucket}?access={access_key}&secret={secret_key}®ion={region}&path_prefix={path_prefix}" - ) + aws_uri = f"s3s://{endpoint}:{bucket}?access={access_key}&secret={secret_key}®ion={region}&path_prefix={path_prefix}" return aws_uri @@ -111,18 +112,32 @@ def get_test_libraries(ac=None): return [lib for lib in ac.list_libraries() if lib.startswith("test_")] +def assert_tz_equal(meta_expected, meta_actual): + assert meta_actual.tzname() == meta_expected.tzname() or str( + meta_actual.tzinfo + ) == str(meta_expected.tzinfo) + if meta_expected is not None: + assert meta_actual.tzinfo.utcoffset( + meta_actual + ) == meta_expected.tzinfo.utcoffset(meta_expected) + + def read_persistent_library(lib): - for df, sym in get_basic_dfs(): + for df, sym, meta in get_basic_dfs(): res_df = lib.read(sym).data assert_frame_equal(res_df, pd.concat([df, df])) + res_meta = lib.read(sym).metadata["meta"] + assert_tz_equal(meta["meta"], res_meta) + df, sym = get_empty_series() res_df = lib.read(sym).data assert res_df.empty - assert str(res_df.dtype) == "datetime64[ns]" - df, sym = get_csv_df() + df, sym, meta = get_csv_df() res_df = lib.read(sym).data + res_meta = lib.read(sym).metadata["meta"] + assert_tz_equal(meta["meta"], res_meta) # 'cast' to str to accomodate previous versions of ArcticDB df.index.rename(str(df.index.name), inplace=True) res_df.index.rename(str(res_df.index.name), inplace=True) @@ -143,18 +158,18 @@ def is_strategy_branch_valid_format(input_string): def write_persistent_library(lib, latest: bool = False): - for df, sym in get_basic_dfs(): - lib.write(sym, df) - lib.append(sym, df) + for df, sym, meta in get_basic_dfs(): + lib.write(sym, df, metadata=meta) + lib.append(sym, df, metadata=meta) series, sym = get_empty_series() lib.write(sym, series) - df, sym = get_csv_df() + df, sym, meta = get_csv_df() if not latest: # 'cast' to str because we only support string index names in past versions df.index.rename(str(df.index.name), inplace=True) - lib.write(sym, df) + lib.write(sym, df, metadata=meta) res = lib.read(sym).data assert_frame_equal(res, df) @@ -164,7 +179,9 @@ def seed_library(ac, version: str = ""): strategy_branch = os.getenv("ARCTICDB_PERSISTENT_STORAGE_STRATEGY_BRANCH") if not is_strategy_branch_valid_format(strategy_branch): - raise ValueError(f"The strategy_branch: {strategy_branch} is not formatted correctly") + raise ValueError( + f"The strategy_branch: {strategy_branch} is not formatted correctly" + ) lib_name = f"seed_{version}{strategy_branch}" lib_name = normalize_lib_name(lib_name) @@ -229,7 +246,9 @@ def generate_ascending_dataframe(n, freq="S", end_timestamp="1/1/2023"): # Generate timestamps fake_tickers = [gen_fake_ticker(val) for val in values] # Create dataframe - df = pd.DataFrame({"timestamp": timestamps, "fake_ticker": fake_tickers, "value": values}) + df = pd.DataFrame( + {"timestamp": timestamps, "fake_ticker": fake_tickers, "value": values} + ) return df
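One closing illustration: the compatibility tests above compare time zone metadata by name and offset rather than by `tzinfo` object identity, because a value written with, say, a `pytz` zone comes back with a fixed-offset `datetime.timezone`. The helper below is a standalone sketch of that comparison in the spirit of `assert_tz_equal`; the example timestamps and the -5 hour offset are illustrative assumptions, not values taken from the test suite.

```python
from datetime import timedelta, timezone
import pandas as pd
import pytz


def tz_equivalent(expected: pd.Timestamp, actual: pd.Timestamp) -> bool:
    # Compare by zone name / string form and by UTC offset. After a round trip the
    # tzinfo types differ (e.g. pytz vs datetime.timezone), so a direct tzinfo
    # equality check would fail even when the values agree.
    if expected.tzinfo is None or actual.tzinfo is None:
        return expected.tzinfo is None and actual.tzinfo is None
    same_name = (
        actual.tzname() == expected.tzname()
        or str(actual.tzinfo) == str(expected.tzinfo)
    )
    same_offset = actual.utcoffset() == expected.utcoffset()
    return same_name and same_offset


written = pd.Timestamp("2024-03-04", tz=pytz.timezone("America/New_York"))
# What a round trip is expected to hand back: the same instant with a fixed-offset timezone.
read_back = pd.Timestamp("2024-03-04", tz=timezone(timedelta(hours=-5), "America/New_York"))
assert tz_equivalent(written, read_back)
```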