From a08fa1aa9faa120154fd2b5c474ef37b8f14401a Mon Sep 17 00:00:00 2001 From: Hassan Kibirige Date: Tue, 22 Oct 2024 00:18:22 +0300 Subject: [PATCH] Add trans.diff_type_to_num & remove np.timedelta64 --- doc/changelog.rst | 16 ++++++++ mizani/_core/dates.py | 45 ++++++++++++++++++++++ mizani/bounds.py | 3 -- mizani/breaks.py | 31 +++++++-------- mizani/labels.py | 4 +- mizani/transforms.py | 81 ++++++++++++++++++++++++++-------------- mizani/typing.py | 17 +++++---- tests/test_bounds.py | 30 --------------- tests/test_breaks.py | 4 +- tests/test_dates.py | 6 +++ tests/test_transforms.py | 25 ++++++++++++- 11 files changed, 171 insertions(+), 91 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 440b9d5..ea9d472 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -1,6 +1,22 @@ Changelog ========= +v0.12.3 +------- +*not-yet-released* + +API Changes +*********** + +- Support for numpy `timedelta64` has been removed. It was not well supported + in the first place, so removing it should be of no consequence. + +New +*** + +- :class:`~mizani.transforms.trans` gained a new method `diff_type_to_num` that + should be helpful with some arithmetic operations for non-numeric domains. 
+ v0.12.2 ------- *2024-09-04* diff --git a/mizani/_core/dates.py b/mizani/_core/dates.py index 75060f6..34ca910 100644 --- a/mizani/_core/dates.py +++ b/mizani/_core/dates.py @@ -1,11 +1,13 @@ from __future__ import annotations import math +from collections.abc import Sized from datetime import datetime, timedelta, tzinfo from typing import TYPE_CHECKING, overload from zoneinfo import ZoneInfo import numpy as np +import pandas as pd from dateutil.rrule import rrule from ..utils import get_timezone, isclose_abs @@ -22,6 +24,8 @@ NDArrayDatetime, NDArrayFloat, SeqDatetime, + Timedelta, + TimedeltaArrayLike, TzInfo, ) @@ -151,6 +155,47 @@ def num_to_datetime( return _from_ordinalf_np_vectorized(x, tz) +# NOTE: We only deal with timedelta and pd.Timedelta + + +@overload +def timedelta_to_num(x: TimedeltaArrayLike) -> NDArrayFloat: ... + + +@overload +def timedelta_to_num(x: Timedelta) -> float: ... + + +def timedelta_to_num( + x: TimedeltaArrayLike | Timedelta, +) -> NDArrayFloat | float: + """ + Convert any timedelta to days + + This function gives us a numeric representation of a timedelta that + we can add/subtract from the numeric representation of datetimes. + """ + _x = x if (sized := isinstance(x, Sized)) else pd.Series([x]) + + if not len(_x): + return np.array([], dtype=float) + + res: NDArrayFloat = np.array( + [td.total_seconds() / SECONDS_PER_DAY for td in _x] + ) + return res if sized else res[0] + + +def num_to_timedelta(x: FloatArrayLike) -> Sequence[pd.Timedelta]: + """ + Convert any float array to a sequence of pd.Timedelta + + Returns pd.Timedelta because they have a larger range than + datetime.timedelta. 
+ """ + return tuple(pd.Timedelta(days=val) for val in x) + + WIDTHS: dict[DateFrequency, Sequence[int]] = { DF.YEARLY: (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000), DF.MONTHLY: (1, 2, 3, 4, 6), diff --git a/mizani/bounds.py b/mizani/bounds.py index f1ab50d..7d809b3 100644 --- a/mizani/bounds.py +++ b/mizani/bounds.py @@ -413,9 +413,6 @@ def zero_range(x: tuple[Any, Any], tol: float = EPSILON * 100) -> bool: # timedelta - pandas, cpython elif isinstance(x[0], (pd.Timedelta, datetime.timedelta)): return x[0].total_seconds() == x[1].total_seconds() - # timedelta - numpy - elif isinstance(x[0], np.timedelta64): - return x[0] == x[1] elif not isinstance(x[0], (float, int, np.number)): raise TypeError( "zero_range objects cannot work with objects " diff --git a/mizani/breaks.py b/mizani/breaks.py index 64d57b1..53fc18f 100644 --- a/mizani/breaks.py +++ b/mizani/breaks.py @@ -36,8 +36,8 @@ DurationUnit, FloatArrayLike, NDArrayFloat, - NDArrayTimedelta, Timedelta, + TimedeltaArrayLike, Trans, TupleFloat2, TupleFloat5, @@ -510,7 +510,7 @@ def __init__(self, n: int = 5, Q: Sequence[float] = (1, 2, 5, 10)): def __call__( self, limits: tuple[Timedelta, Timedelta] - ) -> NDArrayTimedelta: + ) -> TimedeltaArrayLike: """ Compute breaks @@ -525,7 +525,7 @@ def __call__( Sequence of break points. 
""" if any(pd.isna(x) for x in limits): - return np.array([]) + return [] helper = timedelta_helper(limits) scaled_limits = helper.scaled_limits() @@ -561,7 +561,7 @@ class timedelta_helper: See, :class:`~mizani.labels.label_timedelta` """ - x: NDArrayTimedelta | Sequence[Timedelta] + x: TimedeltaArrayLike units: DurationUnit limits: TupleFloat2 package: Literal["pandas", "cpython"] @@ -569,7 +569,7 @@ class timedelta_helper: def __init__( self, - x: NDArrayTimedelta | Sequence[Timedelta], + x: TimedeltaArrayLike, units: Optional[DurationUnit] = None, ): self.x = x @@ -592,14 +592,12 @@ def determine_package(cls, td: Timedelta) -> Literal["pandas", "cpython"]: @classmethod def format_info( - cls, x: NDArrayTimedelta, units: Optional[DurationUnit] = None + cls, x: TimedeltaArrayLike, units: Optional[DurationUnit] = None ) -> tuple[NDArrayFloat, DurationUnit]: helper = cls(x, units) return helper.timedelta_to_numeric(x), helper.units - def best_units( - self, x: NDArrayTimedelta | Sequence[Timedelta] - ) -> DurationUnit: + def best_units(self, x: TimedeltaArrayLike) -> DurationUnit: """ Determine good units for representing a sequence of timedeltas """ @@ -662,25 +660,24 @@ def scaled_limits(self) -> TupleFloat2: return _min, _max def timedelta_to_numeric( - self, timedeltas: NDArrayTimedelta + self, timedeltas: TimedeltaArrayLike ) -> NDArrayFloat: """ Convert sequence of timedelta to numerics """ return np.array([self.to_numeric(td) for td in timedeltas]) - def numeric_to_timedelta(self, values: NDArrayFloat) -> NDArrayTimedelta: + def numeric_to_timedelta(self, values: NDArrayFloat) -> TimedeltaArrayLike: """ Convert sequence of numerical values to timedelta """ if self.package == "pandas": - return np.array( - [pd.Timedelta(int(x * self.factor), unit="ns") for x in values] - ) + return [ + pd.Timedelta(int(x * self.factor), unit="ns") for x in values + ] + else: - return np.array( - [timedelta(seconds=x * self.factor) for x in values] - ) + return 
[timedelta(seconds=x * self.factor) for x in values] def get_scaling_factor(self, units): if self.package == "pandas": diff --git a/mizani/labels.py b/mizani/labels.py index d78b4d6..5af8c60 100644 --- a/mizani/labels.py +++ b/mizani/labels.py @@ -36,7 +36,7 @@ BytesSymbol, DurationUnit, FloatArrayLike, - NDArrayTimedelta, + TimedeltaArrayLike, TupleInt2, ) @@ -632,7 +632,7 @@ class label_timedelta: space: bool = True use_plurals: bool = True - def __call__(self, x: NDArrayTimedelta) -> Sequence[str]: + def __call__(self, x: TimedeltaArrayLike) -> Sequence[str]: if len(x) == 0: return [] diff --git a/mizani/transforms.py b/mizani/transforms.py index fba24cc..14bfb72 100644 --- a/mizani/transforms.py +++ b/mizani/transforms.py @@ -31,7 +31,12 @@ import numpy as np import pandas as pd -from ._core.dates import datetime_to_num, num_to_datetime +from ._core.dates import ( + datetime_to_num, + num_to_datetime, + num_to_timedelta, + timedelta_to_num, +) from .breaks import ( breaks_date, breaks_extended, @@ -62,9 +67,8 @@ MinorBreaksFunction, NDArrayDatetime, NDArrayFloat, - NDArrayTimedelta, TFloatArrayLike, - TimedeltaSeries, + TimedeltaArrayLike, TransformFunction, TupleFloat2, ) @@ -231,6 +235,26 @@ def breaks(self, limits: DomainType) -> NDArrayFloat: ) return breaks + def diff_type_to_num(self, x: Any) -> FloatArrayLike: + """ + Convert the difference between two points in the domain to a numeric + + This function is necessary for some arithmetic operations in the + transform space of a domain when the difference between any two + points in that domain is not numeric. + + For example for a domain of datetime value types, the difference on + the domain is of type timedelta. In this case this function should + expect timedeltas and convert them to float values that are compatible + (same units) with the transform value of datetimes. 
+ + Parameters + ---------- + x : + Differences + """ + return x + def trans_new( name: str, @@ -733,13 +757,14 @@ def __init__(self, tz=None, **kwargs): def transform(self, x: DatetimeArrayLike) -> NDArrayFloat: # pyright: ignore[reportIncompatibleMethodOverride] """ Transform from date to a numerical format + + The transform values have a unit of [days]. """ if not len(x): return np.array([]) - x0 = next(iter(x)) try: - tz = x0.tzinfo + tz = next(iter(x)).tzinfo except AttributeError: tz = None @@ -761,6 +786,14 @@ def tzinfo(self): """ return self.tz + def diff_type_to_num(self, x: TimedeltaArrayLike) -> FloatArrayLike: + """ + Convert timedelta to numerical format + + The timedeltas are converted to a unit of [days]. + """ + return timedelta_to_num(x) + class timedelta_trans(trans): """ @@ -772,44 +805,36 @@ class timedelta_trans(trans): format = staticmethod(label_timedelta()) @staticmethod - def transform(x: NDArrayTimedelta | Sequence[timedelta]) -> NDArrayFloat: # pyright: ignore[reportIncompatibleMethodOverride] + def transform(x: TimedeltaArrayLike) -> NDArrayFloat: # pyright: ignore[reportIncompatibleMethodOverride] """ Transform from Timeddelta to numerical format + + The transform values have a unit of [days] """ - # microseconds - return np.array([_x.total_seconds() * 10**6 for _x in x]) + return timedelta_to_num(x) @staticmethod - def inverse(x: FloatArrayLike) -> NDArrayTimedelta: + def inverse(x: FloatArrayLike) -> Sequence[pd.Timedelta]: # pyright: ignore[reportIncompatibleMethodOverride] """ Transform to Timedelta from numerical format """ - return np.array([timedelta(microseconds=i) for i in x]) + return num_to_timedelta(x) + def diff_type_to_num(self, x: TimedeltaArrayLike) -> FloatArrayLike: + """ + Convert timedelta to numerical format -class pd_timedelta_trans(trans): + The timedeltas are converted to a unit of [days]. 
+ """ + return timedelta_to_num(x) + + +class pd_timedelta_trans(timedelta_trans): """ Pandas timedelta Transformation """ domain = (pd.Timedelta.min, pd.Timedelta.max) - breaks_ = staticmethod(breaks_timedelta()) - format = staticmethod(label_timedelta()) - - @staticmethod - def transform(x: TimedeltaSeries) -> NDArrayFloat: # pyright: ignore[reportIncompatibleMethodOverride] - """ - Transform from Timeddelta to numerical format - """ - # nanoseconds - return np.array([_x.value for _x in x]) - - @staticmethod - def inverse(x: FloatArrayLike) -> NDArrayTimedelta: - """ - Transform to Timedelta from numerical format - """ - return np.array([pd.Timedelta(int(i)) for i in x]) class reciprocal_trans(trans): diff --git a/mizani/typing.py b/mizani/typing.py index 9cb990e..2b1c64a 100644 --- a/mizani/typing.py +++ b/mizani/typing.py @@ -48,7 +48,6 @@ NDArrayInt: TypeAlias = NDArray[np.int64] NDArrayStr: TypeAlias = NDArray[np.str_] NDArrayDatetime: TypeAlias = NDArray[Any] - NDArrayTimedelta: TypeAlias = NDArray[Any] # Series AnySeries: TypeAlias = pd.Series[Any] @@ -56,9 +55,7 @@ IntSeries: TypeAlias = pd.Series[int] FloatSeries: TypeAlias = pd.Series[float] DatetimeSeries: TypeAlias = pd.Series[datetime] - - # Use Any as cannot define pd.Series[timedelta] - TimedeltaSeries: TypeAlias = pd.Series[Any] + TimedeltaSeries: TypeAlias = pd.Series[pd.Timedelta] # ArrayLikes AnyArrayLike: TypeAlias = NDArrayAny | pd.Series[Any] | Sequence[Any] @@ -68,14 +65,17 @@ DatetimeArrayLike: TypeAlias = ( NDArrayDatetime | DatetimeSeries | Sequence[datetime] ) - TimedeltArrayLike: TypeAlias = ( - NDArrayTimedelta | TimedeltaSeries | Sequence[timedelta] + TimedeltaArrayLike: TypeAlias = ( + Sequence[timedelta] | Sequence[pd.Timedelta] | TimedeltaSeries ) # Type variable - TFloatLike = TypeVar("TFloatLike", bound=NDArrayFloat | float) + TAnyArrayLike = TypeVar( + "TAnyArrayLike", NDArrayAny, pd.Series[Any], Sequence[Any] + ) + TFloatLike = TypeVar("TFloatLike", NDArrayFloat, float) 
TFloatArrayLike = TypeVar("TFloatArrayLike", bound=FloatArrayLike) - TFloatVector = TypeVar("TFloatVector", bound=NDArrayFloat | FloatSeries) + TFloatVector = TypeVar("TFloatVector", NDArrayFloat, FloatSeries) TConstrained = TypeVar( "TConstrained", int, float, bool, str, complex, datetime, timedelta ) @@ -155,6 +155,7 @@ class SegmentFunctionColorMapData(TypedDict): ) SeqDatetime64: TypeAlias = Sequence[np.datetime64] TzInfo: TypeAlias = tzinfo + SeqTimedelta: TypeAlias = Sequence[timedelta] | Sequence[pd.Timedelta] # dateutil.rrule.YEARLY, ..., but not including 2 weekly # adding 7 for our own MICROSECONDLY diff --git a/tests/test_bounds.py b/tests/test_bounds.py index 37dfbb6..94b88cd 100644 --- a/tests/test_bounds.py +++ b/tests/test_bounds.py @@ -98,15 +98,6 @@ def test_censor(): assert all(val is None for val in result[:2]) assert all(val is None for val in result[-2:]) - # np.timedelta64 - limits = np.timedelta64(200, "D"), np.timedelta64(205, "D") - x = [np.timedelta64(i, "D") for i in range(198, 208)] - x5 = np.array(x) - result = censor(x5, limits) - npt.assert_array_equal(result[2:-2], x5[2:-2]) - assert all(isinstance(val, np.timedelta64) for val in result[:2]) - assert all(isinstance(val, np.timedelta64) for val in result[-2:]) - # branches # x = np.array([1, 2, np.inf, 3, 4, 11]) result = censor(x, (0, 10), only_finite=False) @@ -192,19 +183,6 @@ def diff(x): result = expand_range(limits, add=one_day, zero_width=30 * one_day) diff(result) == diff(limits) + 30 * one_day - # timedelta64 - one_day = np.timedelta64(1, "D") - limits = np.timedelta64(1, "D"), np.timedelta64(10, "D") - result = expand_range(limits, add=one_day, zero_width=30 * one_day) - diff(result) == diff(limits) + 2 * one_day - - result = expand_range(limits, mul=0.5, add=one_day) - diff(result) == 2 * diff(limits) + 2 * one_day - - limits = np.timedelta64(1, "D"), np.timedelta64(1, "D") - result = expand_range(limits, add=one_day, zero_width=30 * one_day) - diff(result) == diff(limits) 
+ 30 * one_day - def test_expand_range_distinct(): assert expand_range_distinct((0, 1)) == (0, 1) @@ -368,14 +346,6 @@ def test_zero_range(): assert not zero_range(x2) assert not zero_range(x3) - # timedelta - numpy - x = np.timedelta64(7, "D"), np.timedelta64(7, "D") - x2 = np.timedelta64(7, "D"), np.timedelta64(1, "W") - x3 = np.timedelta64(7, "D"), np.timedelta64(2, "D") - assert zero_range(x) - assert zero_range(x2) - assert not zero_range(x3) - # branches # assert zero_range([4, float("nan")]) assert not zero_range([4, float("inf")]) diff --git a/tests/test_breaks.py b/tests/test_breaks.py index f9d121b..bef9d5d 100644 --- a/tests/test_breaks.py +++ b/tests/test_breaks.py @@ -370,9 +370,9 @@ def test_breaks_timedelta(): minutes = [val.total_seconds() / 60 for val in major] npt.assert_allclose(minutes, [0, 2, 4, 6, 8]) - # numpy + # numpy timedelta64 is not supported x = [np.timedelta64(i * 10, "D") for i in range(1, 10)] - limits = min(x), max(x) + limits = x[0], x[-1] with pytest.raises(ValueError): breaks(limits) diff --git a/tests/test_dates.py b/tests/test_dates.py index 3fce6aa..6e3c2d6 100644 --- a/tests/test_dates.py +++ b/tests/test_dates.py @@ -16,6 +16,7 @@ datetime_to_num, get_tzinfo, num_to_datetime, + timedelta_to_num, ) @@ -81,6 +82,11 @@ def test_datetime_to_num(): assert len(res) == 0 +def test_timedelta_to_num(): + res = timedelta_to_num([]) + assert len(res) == 0 + + # Just for test coverage # TODO: Find a better test def test_align_limits(): diff --git a/tests/test_transforms.py b/tests/test_transforms.py index a62eaba..b0bc766 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -107,6 +107,10 @@ def _test_trans(trans, x, *args, **kwargs): assert all(minor >= t.domain[0]) assert all(minor <= t.domain[1]) + # We can convert the diff types to numerics + xdiff_num = t.diff_type_to_num(np.diff(x)) + assert all(isinstance(val, (float, int, np.number)) for val in xdiff_num) + def test_asn_trans(): _test_trans(asn_trans, arr * 
0.01) @@ -254,6 +258,9 @@ def test_datetime_trans(): s2 = t.inverse(st) assert all(s == s2) + sdiff_num = t.diff_type_to_num(s.diff()) + assert all(isinstance(val, (float, int, np.number)) for val in sdiff_num) + def test_datetime_trans_tz(): EST = ZoneInfo("EST") @@ -285,10 +292,26 @@ def test_timedelta_trans(): x2 = t.inverse(xt) assert all(a == b for a, b in zip(x, x2)) + s = pd.Series(x) + st = t.transform(s) + s2 = t.inverse(st) + assert all(a == b for a, b in zip(s, s2)) + + sdiff_num = t.diff_type_to_num(s.diff()) + assert all(isinstance(val, (float, int, np.number)) for val in sdiff_num) + def test_pd_timedelta_trans(): - x = [pd.Timedelta(days=i) for i in range(1, 11)] + x = [timedelta(days=i) for i in range(1, 11)] t = pd_timedelta_trans() xt = t.transform(x) x2 = t.inverse(xt) assert all(a == b for a, b in zip(x, x2)) + + s = pd.Series(x) + st = t.transform(s) + s2 = t.inverse(st) + assert all(a == b for a, b in zip(s, s2)) + + sdiff_num = t.diff_type_to_num(s.diff()) + assert all(isinstance(val, (float, int, np.number)) for val in sdiff_num)