Backport PR pandas-dev#55077: Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (pandas-dev#53938)"

MarcoGorelli · meeseeksmachine · commit 34df28bf1773 · 2023-10-09T18:33:20.000Z
diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
@@ -15,6 +15,7 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
 - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
+- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside the bounds of the index being resampled (:issue:`55064`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9097,6 +9097,10 @@ def resample(
 
             .. versionadded:: 1.3.0
 
+            .. note::
+
+                Only takes effect for Tick-frequencies (i.e. fixed frequencies like
+                days, hours, and minutes, rather than months or quarters).
         offset : Timedelta or str, default is None
             An offset timedelta added to the origin.
 
@@ -9367,12 +9371,12 @@ def resample(
         2000-10-02 00:26:00    24
         Freq: 17T, dtype: int64
 
-        >>> ts.resample('17W', origin='2000-01-01').sum()
-        2000-01-02      0
-        2000-04-30      0
-        2000-08-27      0
-        2000-12-24    108
-        Freq: 17W-SUN, dtype: int64
+        >>> ts.resample('17min', origin='2000-01-01').sum()
+        2000-10-01 23:24:00     3
+        2000-10-01 23:41:00    15
+        2000-10-01 23:58:00    45
+        2000-10-02 00:15:00    45
+        Freq: 17min, dtype: int64
 
         If you want to adjust the start of the bins with an `offset` Timedelta, the two
         following lines are equivalent:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -207,12 +207,12 @@ class Grouper:
     2000-10-02 00:26:00    24
     Freq: 17T, dtype: int64
 
-    >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
-    2000-01-02      0
-    2000-04-30      0
-    2000-08-27      0
-    2000-12-24    108
-    Freq: 17W-SUN, dtype: int64
+    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
+    2000-10-01 23:24:00     3
+    2000-10-01 23:41:00    15
+    2000-10-01 23:58:00    45
+    2000-10-02 00:15:00    45
+    Freq: 17min, dtype: int64
 
     If you want to adjust the start of the bins with an `offset` Timedelta, the two
     following lines are equivalent:
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -2463,16 +2463,8 @@ def _get_timestamp_range_edges(
     """
     if isinstance(freq, Tick):
         index_tz = first.tz
-
-        if isinstance(origin, Timestamp) and origin.tz != index_tz:
+        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
             raise ValueError("The origin must have the same timezone as the index.")
-
-        elif isinstance(origin, Timestamp):
-            if origin <= first:
-                first = origin
-            elif origin >= last:
-                last = origin
-
         if origin == "epoch":
             # set the epoch based on the timezone to have similar bins results when
             # resampling on the same kind of indexes on different timezones
@@ -2494,9 +2486,6 @@ def _get_timestamp_range_edges(
             first = first.tz_localize(index_tz)
             last = last.tz_localize(index_tz)
     else:
-        if isinstance(origin, Timestamp):
-            first = origin
-
         first = first.normalize()
         last = last.normalize()
 
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -796,34 +796,24 @@ def test_resample_offset(unit):
 
 
 @pytest.mark.parametrize(
-    "kwargs, expected",
+    "kwargs",
     [
-        (
-            {"origin": "1999-12-31 23:57:00"},
-            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
-        ),
-        (
-            {"origin": Timestamp("1970-01-01 00:02:00")},
-            ["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
-        ),
-        (
-            {"origin": "epoch", "offset": "2m"},
-            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
-        ),
+        {"origin": "1999-12-31 23:57:00"},
+        {"origin": Timestamp("1970-01-01 00:02:00")},
+        {"origin": "epoch", "offset": "2m"},
         # origin of '1999-31-12 12:02:00' should be equivalent for this case
-        (
-            {"origin": "1999-12-31 12:02:00"},
-            ["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
-        ),
-        ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
+        {"origin": "1999-12-31 12:02:00"},
+        {"offset": "-3m"},
     ],
 )
-def test_resample_origin(kwargs, unit, expected):
+def test_resample_origin(kwargs, unit):
     # GH 31809
     rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
     ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
 
-    exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
+    exp_rng = date_range(
+        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
+    ).as_unit(unit)
 
     resampled = ts.resample("5min", **kwargs).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit):
         ts.resample("5min", offset=offset)
 
 
-def test_resample_monthstart_origin():
-    # GH 53662
-    df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
-    result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
-    excepted = Series(
-        [10.0],
-        index=DatetimeIndex(
-            ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
-        ),
-    )
-    tm.assert_index_equal(result.index, excepted.index)
-
-    df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
-    result = df.resample(
-        "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
-    )["values"].sum()
-    expected = Series(
-        [0, 10.0],
-        index=DatetimeIndex(
-            ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
-        ),
-    )
-    tm.assert_index_equal(result.index, expected.index)
-
-
 def test_resample_origin_prime_freq(unit):
     # GH 31809
     start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
@@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit):
     tm.assert_index_equal(resampled.index, exp_rng)
 
     exp_rng = date_range(
-        "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
+        "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
     ).as_unit(unit)
     resampled = ts.resample("17min", origin="2000-01-01").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit):
     exp_rng = date_range(
         "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
     ).as_unit(unit)
-    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
-    resampled = ts.resample(
-        "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
-    ).mean()
+    # origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case
+    resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
     with pytest.raises(ValueError, match=msg):
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
@@ -141,19 +141,6 @@ def test_groupby_with_origin():
     start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
     middle = "1/15/2000 00:00:00"
 
-    # test origin on 1970-01-01 00:00:00
-    rng = date_range("1970-01-01 00:00:00", end, freq="1231min")  # prime number
-    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
-    middle_ts = rng[len(rng) // 2]
-    ts2 = ts[middle_ts:end]
-
-    origin = Timestamp(0)
-    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
-    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
-    adjusted_count_ts = adjusted_count_ts[middle_ts:end]
-    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
-    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])
-
     rng = date_range(start, end, freq="1231min")  # prime number
     ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
     ts2 = ts[middle:end]
@@ -167,19 +154,26 @@ def test_groupby_with_origin():
     with pytest.raises(AssertionError, match="Index are different"):
         tm.assert_index_equal(count_ts.index, count_ts2.index)
 
-    # test origin on 2049-10-18 20:00:00
+    # test origin on 1970-01-01 00:00:00
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
 
-    rng = date_range(start, "2049-10-18 20:00:00", freq="1231min")  # prime number
-    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
-    middle_ts = rng[len(rng) // 2]
-    ts2 = ts[middle_ts:end]
+    # test origin on 2049-10-18 20:00:00
     origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
     adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
     adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
-    adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
+    adjusted2_count_ts = adjusted2_count_ts[middle:end]
     adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
     tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
 
+    # both groupers use an adjusted timestamp that is a multiple of 1399 min
+    # they should be equal even if the adjusted_timestamp is in the future
+    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
+
 
 def test_nearest():
     # GH 17496