diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index f2bf446c3bb6d..35b4b4544cce7 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -554,6 +554,7 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where a datetime-like ``origin`` had no effect when its value was outside of the axis (:issue:`53662`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`)
 - Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 95754795faa73..9d09665b15be9 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9247,12 +9247,12 @@ def resample(
         2000-10-02 00:26:00    24
         Freq: 17T, dtype: int64
 
-        >>> ts.resample('17min', origin='2000-01-01').sum()
-        2000-10-01 23:24:00     3
-        2000-10-01 23:41:00    15
-        2000-10-01 23:58:00    45
-        2000-10-02 00:15:00    45
-        Freq: 17T, dtype: int64
+        >>> ts.resample('17W', origin='2000-01-01').sum()
+        2000-01-02      0
+        2000-04-30      0
+        2000-08-27      0
+        2000-12-24    108
+        Freq: 17W-SUN, dtype: int64
 
         If you want to adjust the start of the bins with an `offset` Timedelta, the two
         following lines are equivalent:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index f38cc21a1308a..4201887e13178 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -206,12 +206,12 @@ class Grouper:
     2000-10-02 00:26:00    24
     Freq: 17T, dtype: int64
 
-    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
-    2000-10-01 23:24:00     3
-    2000-10-01 23:41:00    15
-    2000-10-01 23:58:00    45
-    2000-10-02 00:15:00    45
-    Freq: 17T, dtype: int64
+    >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
+    2000-01-02      0
+    2000-04-30      0
+    2000-08-27      0
+    2000-12-24    108
+    Freq: 17W-SUN, dtype: int64
 
     If you want to adjust the start of the bins with an `offset` Timedelta, the two
     following lines are equivalent:
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index e4cebc01cccdd..53d587cdde182 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -2463,8 +2463,16 @@ def _get_timestamp_range_edges(
     """
     if isinstance(freq, Tick):
         index_tz = first.tz
-        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
+
+        if isinstance(origin, Timestamp) and origin.tz != index_tz:
             raise ValueError("The origin must have the same timezone as the index.")
+
+        elif isinstance(origin, Timestamp):
+            if origin <= first:
+                first = origin
+            elif origin >= last:
+                last = origin
+
         if origin == "epoch":
             # set the epoch based on the timezone to have similar bins results when
             # resampling on the same kind of indexes on different timezones
@@ -2486,6 +2494,9 @@ def _get_timestamp_range_edges(
             first = first.tz_localize(index_tz)
             last = last.tz_localize(index_tz)
     else:
+        if isinstance(origin, Timestamp):
+            first = origin
+
         first = first.normalize()
         last = last.normalize()
 
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 86a3017753844..12e15eab3aa64 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -790,24 +790,34 @@ def test_resample_offset(unit):
 
 
 @pytest.mark.parametrize(
-    "kwargs",
+    "kwargs, expected",
     [
-        {"origin": "1999-12-31 23:57:00"},
-        {"origin": Timestamp("1970-01-01 00:02:00")},
-        {"origin": "epoch", "offset": "2m"},
+        (
+            {"origin": "1999-12-31 23:57:00"},
+            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
+        ),
+        (
+            {"origin": Timestamp("1970-01-01 00:02:00")},
+            ["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
+        ),
+        (
+            {"origin": "epoch", "offset": "2m"},
+            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
+        ),
         # origin of '1999-31-12 12:02:00' should be equivalent for this case
-        {"origin": "1999-12-31 12:02:00"},
-        {"offset": "-3m"},
+        (
+            {"origin": "1999-12-31 12:02:00"},
+            ["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
+        ),
+        ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
     ],
 )
-def test_resample_origin(kwargs, unit):
+def test_resample_origin(kwargs, unit, expected):
     # GH 31809
     rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
     ts = Series(np.random.randn(len(rng)), index=rng)
 
-    exp_rng = date_range(
-        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
-    ).as_unit(unit)
+    exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
 
     resampled = ts.resample("5min", **kwargs).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -837,6 +847,31 @@ def test_resample_bad_offset(offset, unit):
         ts.resample("5min", offset=offset)
 
 
+def test_resample_monthstart_origin():
+    # GH 53662
+    df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
+    result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
+    expected = Series(
+        [10.0],
+        index=DatetimeIndex(
+            ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
+        ),
+    )
+    tm.assert_index_equal(result.index, expected.index)
+
+    df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
+    result = df.resample(
+        "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
+    )["values"].sum()
+    expected = Series(
+        [0, 10.0],
+        index=DatetimeIndex(
+            ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
+        ),
+    )
+    tm.assert_index_equal(result.index, expected.index)
+
+
 def test_resample_origin_prime_freq(unit):
     # GH 31809
     start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
@@ -868,7 +903,7 @@ def test_resample_origin_prime_freq(unit):
     tm.assert_index_equal(resampled.index, exp_rng)
 
     exp_rng = date_range(
-        "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
+        "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
     ).as_unit(unit)
     resampled = ts.resample("17min", origin="2000-01-01").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -887,14 +922,12 @@ def test_resample_origin_with_tz(unit):
     exp_rng = date_range(
         "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
     ).as_unit(unit)
-    resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
-    tm.assert_index_equal(resampled.index, exp_rng)
-
-    # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case
-    resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
+    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
-    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    resampled = ts.resample(
+        "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
+    ).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
     with pytest.raises(ValueError, match=msg):
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 23b4f4bcf01d1..0f702d528655c 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -141,6 +141,19 @@ def test_groupby_with_origin():
     start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
     middle = "1/15/2000 00:00:00"
 
+    # test origin on 1970-01-01 00:00:00
+    rng = date_range("1970-01-01 00:00:00", end, freq="1231min")  # prime number
+    ts = Series(np.random.randn(len(rng)), index=rng)
+    middle_ts = rng[len(rng) // 2]
+    ts2 = ts[middle_ts:end]
+
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle_ts:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])
+
     rng = date_range(start, end, freq="1231min")  # prime number
     ts = Series(np.random.randn(len(rng)), index=rng)
     ts2 = ts[middle:end]
@@ -154,26 +167,19 @@ def test_groupby_with_origin():
     with pytest.raises(AssertionError, match="Index are different"):
         tm.assert_index_equal(count_ts.index, count_ts2.index)
 
-    # test origin on 1970-01-01 00:00:00
-    origin = Timestamp(0)
-    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
-    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
-    adjusted_count_ts = adjusted_count_ts[middle:end]
-    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
-    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
-
     # test origin on 2049-10-18 20:00:00
+
+    rng = date_range(start, "2049-10-18 20:00:00", freq="1231min")  # prime number
+    ts = Series(np.random.randn(len(rng)), index=rng)
+    middle_ts = rng[len(rng) // 2]
+    ts2 = ts[middle_ts:end]
     origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
     adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
     adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
-    adjusted2_count_ts = adjusted2_count_ts[middle:end]
+    adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
     adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
     tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
 
-    # both grouper use an adjusted timestamp that is a multiple of 1399 min
-    # they should be equals even if the adjusted_timestamp is in the future
-    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
-
 
 def test_nearest():
     # GH 17496
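Illustrative sketch (not part of the patch): the snippet below mirrors the new ``test_resample_monthstart_origin`` test and shows the behavior this change targets, namely that a datetime-like ``origin`` lying outside the values on the axis now anchors the resample bins instead of being ignored. The expected bin label is taken from the test above and should be read as an assumption about the fixed behavior rather than verified output.

```python
from datetime import datetime

import pandas as pd

# Single observation on 1999-12-31; the requested origin (1999-11-01) lies
# before every value on the "ts" axis.
df = pd.DataFrame({"ts": [datetime(1999, 12, 31)], "values": [10.0]})

# With the fix, the 2-month bins are anchored at the origin, so the single
# bin is labelled "1999-11-01" (per test_resample_monthstart_origin) rather
# than the origin being silently dropped because it precedes the data.
result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
print(result)
```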