Skip to content

Commit

Permalink
Backport PR #55077: Revert "BUG: Timestamp origin takes no effect in …
Browse files Browse the repository at this point in the history
…resample for 'MS' frequency (#53938)"
  • Loading branch information
MarcoGorelli authored and meeseeksmachine committed Oct 9, 2023
1 parent 81a192b commit 34df28b
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 93 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside the range of the data being resampled (:issue:`55064`)

.. ---------------------------------------------------------------------------
.. _whatsnew_212.bug_fixes:
Expand Down
16 changes: 10 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9097,6 +9097,10 @@ def resample(
.. versionadded:: 1.3.0
.. note::
Only takes effect for Tick-frequencies (i.e. fixed frequencies like
days, hours, and minutes, rather than months or quarters).
offset : Timedelta or str, default is None
An offset timedelta added to the origin.
Expand Down Expand Up @@ -9367,12 +9371,12 @@ def resample(
2000-10-02 00:26:00 24
Freq: 17T, dtype: int64
>>> ts.resample('17W', origin='2000-01-01').sum()
2000-01-02 0
2000-04-30 0
2000-08-27 0
2000-12-24 108
Freq: 17W-SUN, dtype: int64
>>> ts.resample('17min', origin='2000-01-01').sum()
2000-10-01 23:24:00 3
2000-10-01 23:41:00 15
2000-10-01 23:58:00 45
2000-10-02 00:15:00 45
Freq: 17min, dtype: int64
If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,12 @@ class Grouper:
2000-10-02 00:26:00 24
Freq: 17T, dtype: int64
>>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
2000-01-02 0
2000-04-30 0
2000-08-27 0
2000-12-24 108
Freq: 17W-SUN, dtype: int64
>>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
2000-10-01 23:24:00 3
2000-10-01 23:41:00 15
2000-10-01 23:58:00 45
2000-10-02 00:15:00 45
Freq: 17min, dtype: int64
If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
Expand Down
13 changes: 1 addition & 12 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -2463,16 +2463,8 @@ def _get_timestamp_range_edges(
"""
if isinstance(freq, Tick):
index_tz = first.tz

if isinstance(origin, Timestamp) and origin.tz != index_tz:
if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
raise ValueError("The origin must have the same timezone as the index.")

elif isinstance(origin, Timestamp):
if origin <= first:
first = origin
elif origin >= last:
last = origin

if origin == "epoch":
# set the epoch based on the timezone to have similar bins results when
# resampling on the same kind of indexes on different timezones
Expand All @@ -2494,9 +2486,6 @@ def _get_timestamp_range_edges(
first = first.tz_localize(index_tz)
last = last.tz_localize(index_tz)
else:
if isinstance(origin, Timestamp):
first = origin

first = first.normalize()
last = last.normalize()

Expand Down
67 changes: 17 additions & 50 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,34 +796,24 @@ def test_resample_offset(unit):


@pytest.mark.parametrize(
"kwargs, expected",
"kwargs",
[
(
{"origin": "1999-12-31 23:57:00"},
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
),
(
{"origin": Timestamp("1970-01-01 00:02:00")},
["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
),
(
{"origin": "epoch", "offset": "2m"},
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
),
{"origin": "1999-12-31 23:57:00"},
{"origin": Timestamp("1970-01-01 00:02:00")},
{"origin": "epoch", "offset": "2m"},
# origin of '1999-12-31 12:02:00' should be equivalent for this case
(
{"origin": "1999-12-31 12:02:00"},
["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
),
({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
{"origin": "1999-12-31 12:02:00"},
{"offset": "-3m"},
],
)
def test_resample_origin(kwargs, unit, expected):
def test_resample_origin(kwargs, unit):
# GH 31809
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)

exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
exp_rng = date_range(
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
).as_unit(unit)

resampled = ts.resample("5min", **kwargs).mean()
tm.assert_index_equal(resampled.index, exp_rng)
Expand Down Expand Up @@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit):
ts.resample("5min", offset=offset)


def test_resample_monthstart_origin():
# GH 53662
# Checks the bin labels that resample() produces when an explicit `origin`
# is combined with the "2MS" and "3YS" frequencies.
# Only the resulting index is compared in each case (assert_index_equal);
# the summed values themselves are never asserted.
#
# Case 1: a single row stamped 1999-12-31, "2MS" bins, origin passed as a
# string. The only expected label is the origin itself, 1999-11-01.
df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
# NOTE(review): `excepted` looks like a typo for `expected` (the second case
# below uses `expected`); it is harmless because the name is used consistently.
excepted = Series(
[10.0],
index=DatetimeIndex(
["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
),
)
tm.assert_index_equal(result.index, excepted.index)

# Case 2: a single row stamped 1999-12-31 20:00, "3YS" bins that are
# left-closed and left-labelled, origin passed as a datetime object.
# The expected labels are 1995-01-01 (the origin) and 1998-01-01.
df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
result = df.resample(
"3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
)["values"].sum()
expected = Series(
[0, 10.0],
index=DatetimeIndex(
["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
),
)
tm.assert_index_equal(result.index, expected.index)


def test_resample_origin_prime_freq(unit):
# GH 31809
start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
Expand Down Expand Up @@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit):
tm.assert_index_equal(resampled.index, exp_rng)

exp_rng = date_range(
"2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
"2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
).as_unit(unit)
resampled = ts.resample("17min", origin="2000-01-01").mean()
tm.assert_index_equal(resampled.index, exp_rng)
Expand All @@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit):
exp_rng = date_range(
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
).as_unit(unit)
resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
tm.assert_index_equal(resampled.index, exp_rng)

resampled = ts.resample(
"5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
).mean()
# origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case
resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
tm.assert_index_equal(resampled.index, exp_rng)

resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
tm.assert_index_equal(resampled.index, exp_rng)

with pytest.raises(ValueError, match=msg):
Expand Down
32 changes: 13 additions & 19 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,6 @@ def test_groupby_with_origin():
start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
middle = "1/15/2000 00:00:00"

# test origin on 1970-01-01 00:00:00
rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
middle_ts = rng[len(rng) // 2]
ts2 = ts[middle_ts:end]

origin = Timestamp(0)
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
adjusted_count_ts = adjusted_count_ts[middle_ts:end]
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])

rng = date_range(start, end, freq="1231min") # prime number
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
ts2 = ts[middle:end]
Expand All @@ -167,19 +154,26 @@ def test_groupby_with_origin():
with pytest.raises(AssertionError, match="Index are different"):
tm.assert_index_equal(count_ts.index, count_ts2.index)

# test origin on 2049-10-18 20:00:00
# test origin on 1970-01-01 00:00:00
origin = Timestamp(0)
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
adjusted_count_ts = adjusted_count_ts[middle:end]
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)

rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
middle_ts = rng[len(rng) // 2]
ts2 = ts[middle_ts:end]
# test origin on 2049-10-18 20:00:00
origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
adjusted2_count_ts = adjusted2_count_ts[middle:end]
adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)

# both groupers use an adjusted timestamp that is a multiple of 1399 min
# they should be equal even if the adjusted_timestamp is in the future
tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)


def test_nearest():
# GH 17496
Expand Down

0 comments on commit 34df28b

Please sign in to comment.