Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consistently handle timezones in QueryBuilder filtering #1483

Merged
merged 2 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions python/arcticdb/version_store/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,18 @@ def is_supported_sequence(obj):
return isinstance(obj, (list, set, frozenset, tuple, np.ndarray))


def nanoseconds_from_utc(time):
# We convert every time type to a pandas Timestamp because it:
# - is consistent with how we normalize time types when normalizing
# - has nanosecond precision
# - if tzinfo is set will give nanoseconds since UTC
return int(pd.Timestamp(time).value)


def nanoseconds_timedelta(timedelta):
return int(pd.Timedelta(timedelta).value)


def value_list_from_args(*args):
if len(args) == 1 and is_supported_sequence(args[0]):
collection = args[0]
Expand All @@ -255,7 +267,7 @@ def value_list_from_args(*args):
if value not in value_set:
value_set.add(value)
if isinstance(value, supported_time_types):
value = int(value.timestamp() * 1_000_000_000)
value = nanoseconds_from_utc(value)
elem = np.array([value]) if isinstance(value, (int, np.integer)) else np.full(1, value, dtype=None)
array_list.append(elem)
contains_integer = contains_integer or isinstance(value, (int, np.integer))
Expand Down Expand Up @@ -708,15 +720,11 @@ def create_value(value):
elif isinstance(value, np.integer):
min_scalar_type = np.min_scalar_type(value)
f = CONSTRUCTOR_MAP.get(min_scalar_type.kind).get(min_scalar_type.itemsize)
elif isinstance(value, (pd.Timestamp, pd.Timedelta)):
# pd.Timestamp is in supported_time_types, but its timestamp() method can't provide ns precision
value = value.value
f = ValueInt64
elif isinstance(value, supported_time_types):
value = int(value.timestamp() * 1_000_000_000)
value = nanoseconds_from_utc(value)
f = ValueInt64
elif isinstance(value, datetime.timedelta):
value = int(value.total_seconds() * 1_000_000_000)
elif isinstance(value, (datetime.timedelta, pd.Timedelta)):
value = nanoseconds_timedelta(value)
f = ValueInt64
elif isinstance(value, bool):
f = ValueBool
Expand Down
35 changes: 35 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pandas as pd
import pytest
import datetime
import dateutil

from arcticdb.version_store.processing import QueryBuilder
from arcticdb.util.test import assert_frame_equal
Expand Down Expand Up @@ -60,6 +62,39 @@ def test_querybuilder_date_range_then_filter(lmdb_version_store_tiny_segment, us
assert_frame_equal(expected, received)


def test_querybuilder_filter_datetime_with_timezone(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
symbol = "symbol"
def can_read_back(write_with_time, filter_with_time):
df = pd.DataFrame({"col": [write_with_time]})
lib.delete(symbol)
lib.write(symbol, df)

q = QueryBuilder()
q = q[q["col"] == filter_with_time]
read_df = lib.read(symbol, query_builder=q).data

return len(read_df) == 1

notz_winter_time = datetime.datetime(2024, 1, 1)
notz_summer_time = datetime.datetime(2024, 6, 1)
utc_time = datetime.datetime(2024, 6, 1, tzinfo=dateutil.tz.tzutc())
us_time = datetime.datetime(2024, 6, 1, tzinfo=dateutil.tz.gettz('America/New_York'))

# Reading back the same time should always succeed
assert can_read_back(notz_winter_time, notz_winter_time)
assert can_read_back(notz_summer_time, notz_summer_time)
assert can_read_back(utc_time, utc_time)
assert can_read_back(us_time, us_time)

# If tzinfo is not specified we assume UTC
assert can_read_back(notz_summer_time, utc_time)
assert can_read_back(utc_time, notz_summer_time)
assert not can_read_back(notz_summer_time, us_time)
assert not can_read_back(us_time, notz_summer_time)



@pytest.mark.parametrize("use_date_range_clause", [True, False])
def test_querybuilder_date_range_then_project(lmdb_version_store_tiny_segment, use_date_range_clause):
lib = lmdb_version_store_tiny_segment
Expand Down
Loading