
Backport PR #55077 on branch 2.1.x (Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)") #55459

Merged
16 changes: 10 additions & 6 deletions pandas/core/generic.py
@@ -9097,6 +9097,10 @@ def resample(
 
             .. versionadded:: 1.3.0
 
+            .. note::
+
+                Only takes effect for Tick-frequencies (i.e. fixed frequencies like
+                days, hours, and minutes, rather than months or quarters).
         offset : Timedelta or str, default is None
             An offset timedelta added to the origin.
 
@@ -9367,12 +9371,12 @@ def resample(
         2000-10-02 00:26:00    24
         Freq: 17T, dtype: int64
 
-        >>> ts.resample('17W', origin='2000-01-01').sum()
-        2000-01-02      0
-        2000-04-30      0
-        2000-08-27      0
-        2000-12-24    108
-        Freq: 17W-SUN, dtype: int64
+        >>> ts.resample('17min', origin='2000-01-01').sum()
+        2000-10-01 23:24:00     3
+        2000-10-01 23:41:00    15
+        2000-10-01 23:58:00    45
+        2000-10-02 00:15:00    45
+        Freq: 17T, dtype: int64
 
         If you want to adjust the start of the bins with an `offset` Timedelta, the two
         following lines are equivalent:
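The restored note above says `origin` only takes effect for Tick frequencies. A minimal sketch of that behavior, assuming the same series the docstring example builds (7-minute data between 2000-10-01 23:30 and 2000-10-02 00:30, values 0, 3, 6, ...); the setup and variable names here are illustrative, not part of the PR:

    import numpy as np
    import pandas as pd

    # Assumed setup mirroring the docstring example in the hunk above.
    rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    # Tick frequency: origin anchors the 17-minute bin edges, so the first
    # label is 2000-10-01 23:24:00, matching the restored docstring output.
    print(ts.resample("17min", origin="2000-01-01").sum())

    # Non-Tick frequency: after this revert, origin is silently ignored, so
    # both calls below produce the same single month-start bin.
    print(ts.resample("MS").sum())
    print(ts.resample("MS", origin="1999-12-01").sum())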
12 changes: 6 additions & 6 deletions pandas/core/groupby/grouper.py
@@ -207,12 +207,12 @@ class Grouper:
     2000-10-02 00:26:00    24
     Freq: 17T, dtype: int64
 
-    >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
-    2000-01-02      0
-    2000-04-30      0
-    2000-08-27      0
-    2000-12-24    108
-    Freq: 17W-SUN, dtype: int64
+    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
+    2000-10-01 23:24:00     3
+    2000-10-01 23:41:00    15
+    2000-10-01 23:58:00    45
+    2000-10-02 00:15:00    45
+    Freq: 17T, dtype: int64
 
     If you want to adjust the start of the bins with an `offset` Timedelta, the two
     following lines are equivalent:
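The Grouper docstring above shows the same output as the resample docstring, so the two spellings are interchangeable. A quick sketch of that equivalence, under the same assumed series as before:

    import numpy as np
    import pandas as pd

    rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    # resample(...) and groupby(pd.Grouper(...)) accept the same origin
    # argument and, per the docstrings above, lay down identical 17-minute bins.
    via_resample = ts.resample("17min", origin="2000-01-01").sum()
    via_grouper = ts.groupby(pd.Grouper(freq="17min", origin="2000-01-01")).sum()
    pd.testing.assert_series_equal(via_resample, via_grouper)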
13 changes: 1 addition & 12 deletions pandas/core/resample.py
@@ -2463,16 +2463,8 @@ def _get_timestamp_range_edges(
     """
     if isinstance(freq, Tick):
         index_tz = first.tz
-
-        if isinstance(origin, Timestamp) and origin.tz != index_tz:
+        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
             raise ValueError("The origin must have the same timezone as the index.")
-
-        elif isinstance(origin, Timestamp):
-            if origin <= first:
-                first = origin
-            elif origin >= last:
-                last = origin
-
         if origin == "epoch":
             # set the epoch based on the timezone to have similar bins results when
             # resampling on the same kind of indexes on different timezones
@@ -2494,9 +2486,6 @@
             first = first.tz_localize(index_tz)
             last = last.tz_localize(index_tz)
     else:
-        if isinstance(origin, Timestamp):
-            first = origin
-
         first = first.normalize()
         last = last.normalize()
 
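The one line restored in _get_timestamp_range_edges only compares timezone awareness, while the deleted branches (from #53938) stop snapping first/last to a Timestamp origin. A small sketch of the restored check, with an assumed tz-aware index (the data and names are illustrative only):

    import numpy as np
    import pandas as pd

    rng = pd.date_range("2000-01-01", periods=12, freq="min", tz="Europe/Paris")
    ts = pd.Series(np.arange(12), index=rng)

    # A tz-aware origin in a different zone is fine: only awareness is compared.
    ts.resample("5min", origin=pd.Timestamp("2000-01-01", tz="UTC")).sum()

    # A naive origin against a tz-aware index raises the error from the hunk above.
    try:
        ts.resample("5min", origin=pd.Timestamp("2000-01-01")).sum()
    except ValueError as err:
        print(err)  # The origin must have the same timezone as the index.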
67 changes: 17 additions & 50 deletions pandas/tests/resample/test_datetime_index.py
@@ -796,34 +796,24 @@ def test_resample_offset(unit):
 
 
 @pytest.mark.parametrize(
-    "kwargs, expected",
+    "kwargs",
     [
-        (
-            {"origin": "1999-12-31 23:57:00"},
-            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
-        ),
-        (
-            {"origin": Timestamp("1970-01-01 00:02:00")},
-            ["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
-        ),
-        (
-            {"origin": "epoch", "offset": "2m"},
-            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
-        ),
+        {"origin": "1999-12-31 23:57:00"},
+        {"origin": Timestamp("1970-01-01 00:02:00")},
+        {"origin": "epoch", "offset": "2m"},
         # origin of '1999-31-12 12:02:00' should be equivalent for this case
-        (
-            {"origin": "1999-12-31 12:02:00"},
-            ["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
-        ),
-        ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
+        {"origin": "1999-12-31 12:02:00"},
+        {"offset": "-3m"},
     ],
 )
-def test_resample_origin(kwargs, unit, expected):
+def test_resample_origin(kwargs, unit):
     # GH 31809
     rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
     ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
 
-    exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
+    exp_rng = date_range(
+        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
+    ).as_unit(unit)
 
     resampled = ts.resample("5min", **kwargs).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit):
         ts.resample("5min", offset=offset)
 
 
-def test_resample_monthstart_origin():
-    # GH 53662
-    df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
-    result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
-    excepted = Series(
-        [10.0],
-        index=DatetimeIndex(
-            ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
-        ),
-    )
-    tm.assert_index_equal(result.index, excepted.index)
-
-    df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
-    result = df.resample(
-        "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
-    )["values"].sum()
-    expected = Series(
-        [0, 10.0],
-        index=DatetimeIndex(
-            ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
-        ),
-    )
-    tm.assert_index_equal(result.index, expected.index)
-
-
 def test_resample_origin_prime_freq(unit):
     # GH 31809
     start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
@@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit):
     tm.assert_index_equal(resampled.index, exp_rng)
 
     exp_rng = date_range(
-        "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
+        "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
     ).as_unit(unit)
     resampled = ts.resample("17min", origin="2000-01-01").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit):
     exp_rng = date_range(
         "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
     ).as_unit(unit)
-    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
-    resampled = ts.resample(
-        "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
-    ).mean()
+    # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case
+    resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
+    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
     with pytest.raises(ValueError, match=msg):
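The restored parametrization above relies on different origins collapsing onto the same 5-minute bin edges. A worked check of the test comment's claim (plain arithmetic, not part of the test): two origins are interchangeable whenever they differ by a whole number of bin widths.

    import pandas as pd

    # '1999-12-31 12:02:00' vs '1999-12-31 23:57:00': 715 minutes apart, i.e. 143 bins.
    delta = pd.Timestamp("1999-12-31 23:57:00") - pd.Timestamp("1999-12-31 12:02:00")
    print(delta / pd.Timedelta("5min"))  # 143.0

    # origin="epoch", offset="2m" anchors bins at 1970-01-01 00:02:00, which is
    # also a whole number of 5-minute steps away from 1999-12-31 23:57:00.
    delta2 = pd.Timestamp("1999-12-31 23:57:00") - (pd.Timestamp(0) + pd.Timedelta("2min"))
    print(delta2 % pd.Timedelta("5min") == pd.Timedelta(0))  # True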
32 changes: 13 additions & 19 deletions pandas/tests/resample/test_resampler_grouper.py
@@ -141,19 +141,6 @@ def test_groupby_with_origin():
     start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
     middle = "1/15/2000 00:00:00"
 
-    # test origin on 1970-01-01 00:00:00
-    rng = date_range("1970-01-01 00:00:00", end, freq="1231min")  # prime number
-    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
-    middle_ts = rng[len(rng) // 2]
-    ts2 = ts[middle_ts:end]
-
-    origin = Timestamp(0)
-    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
-    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
-    adjusted_count_ts = adjusted_count_ts[middle_ts:end]
-    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
-    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])
-
     rng = date_range(start, end, freq="1231min")  # prime number
     ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
     ts2 = ts[middle:end]
@@ -167,19 +154,26 @@
     with pytest.raises(AssertionError, match="Index are different"):
         tm.assert_index_equal(count_ts.index, count_ts2.index)
 
-    # test origin on 2049-10-18 20:00:00
+    # test origin on 1970-01-01 00:00:00
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
+
-    rng = date_range(start, "2049-10-18 20:00:00", freq="1231min")  # prime number
-    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
-    middle_ts = rng[len(rng) // 2]
-    ts2 = ts[middle_ts:end]
+    # test origin on 2049-10-18 20:00:00
     origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
     adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
     adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
-    adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
+    adjusted2_count_ts = adjusted2_count_ts[middle:end]
     adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
     tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
 
+    # both grouper use an adjusted timestamp that is a multiple of 1399 min
+    # they should be equals even if the adjusted_timestamp is in the future
+    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
 
 
 def test_nearest():
     # GH 17496
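A worked check of the restored comment about the "future" origin (again plain arithmetic, not part of the test): the 2049 anchor sits an exact number of 1399-minute bins after Timestamp(0), which is why both groupers lay down the same bin edges.

    import pandas as pd

    origin = pd.Timestamp(0)  # 1970-01-01 00:00:00
    origin_future = origin + pd.Timedelta("1399min") * 30_000
    print(origin_future)                                       # 2049-10-18 20:00:00, per the test comment
    print((origin_future - origin) / pd.Timedelta("1399min"))  # 30000.0 bins apart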