Skip to content

Commit e74138c

Browse files
Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)" (#55077)
* Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)"

  This reverts commit dfc4c39.

* note that origin only takes effect for tick frequencies
* fixup doctest
* move whatsnew to v2.1.2

---------

Co-authored-by: Thomas Li <[email protected]>
1 parent 361b62e commit e74138c

File tree

6 files changed

+48
-93
lines changed

6 files changed

+48
-93
lines changed

doc/source/whatsnew/v2.1.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
1717
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
18+
- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
1819

1920
.. ---------------------------------------------------------------------------
2021
.. _whatsnew_212.bug_fixes:

pandas/core/generic.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -9221,6 +9221,10 @@ def resample(
92219221
92229222
.. versionadded:: 1.3.0
92239223
9224+
.. note::
9225+
9226+
Only takes effect for Tick-frequencies (i.e. fixed frequencies like
9227+
days, hours, and minutes, rather than months or quarters).
92249228
offset : Timedelta or str, default is None
92259229
An offset timedelta added to the origin.
92269230
@@ -9491,12 +9495,12 @@ def resample(
94919495
2000-10-02 00:26:00 24
94929496
Freq: 17min, dtype: int64
94939497
9494-
>>> ts.resample('17W', origin='2000-01-01').sum()
9495-
2000-01-02 0
9496-
2000-04-30 0
9497-
2000-08-27 0
9498-
2000-12-24 108
9499-
Freq: 17W-SUN, dtype: int64
9498+
>>> ts.resample('17min', origin='2000-01-01').sum()
9499+
2000-10-01 23:24:00 3
9500+
2000-10-01 23:41:00 15
9501+
2000-10-01 23:58:00 45
9502+
2000-10-02 00:15:00 45
9503+
Freq: 17min, dtype: int64
95009504
95019505
If you want to adjust the start of the bins with an `offset` Timedelta, the two
95029506
following lines are equivalent:

pandas/core/groupby/grouper.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -207,12 +207,12 @@ class Grouper:
207207
2000-10-02 00:26:00 24
208208
Freq: 17min, dtype: int64
209209
210-
>>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
211-
2000-01-02 0
212-
2000-04-30 0
213-
2000-08-27 0
214-
2000-12-24 108
215-
Freq: 17W-SUN, dtype: int64
210+
>>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
211+
2000-10-01 23:24:00 3
212+
2000-10-01 23:41:00 15
213+
2000-10-01 23:58:00 45
214+
2000-10-02 00:15:00 45
215+
Freq: 17min, dtype: int64
216216
217217
If you want to adjust the start of the bins with an `offset` Timedelta, the two
218218
following lines are equivalent:

pandas/core/resample.py

+1-12
Original file line numberDiff line numberDiff line change
@@ -2528,16 +2528,8 @@ def _get_timestamp_range_edges(
25282528
"""
25292529
if isinstance(freq, Tick):
25302530
index_tz = first.tz
2531-
2532-
if isinstance(origin, Timestamp) and origin.tz != index_tz:
2531+
if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
25332532
raise ValueError("The origin must have the same timezone as the index.")
2534-
2535-
elif isinstance(origin, Timestamp):
2536-
if origin <= first:
2537-
first = origin
2538-
elif origin >= last:
2539-
last = origin
2540-
25412533
if origin == "epoch":
25422534
# set the epoch based on the timezone to have similar bins results when
25432535
# resampling on the same kind of indexes on different timezones
@@ -2559,9 +2551,6 @@ def _get_timestamp_range_edges(
25592551
first = first.tz_localize(index_tz)
25602552
last = last.tz_localize(index_tz)
25612553
else:
2562-
if isinstance(origin, Timestamp):
2563-
first = origin
2564-
25652554
first = first.normalize()
25662555
last = last.normalize()
25672556

pandas/tests/resample/test_datetime_index.py

+17-50
Original file line numberDiff line numberDiff line change
@@ -796,34 +796,24 @@ def test_resample_offset(unit):
796796

797797

798798
@pytest.mark.parametrize(
799-
"kwargs, expected",
799+
"kwargs",
800800
[
801-
(
802-
{"origin": "1999-12-31 23:57:00"},
803-
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
804-
),
805-
(
806-
{"origin": Timestamp("1970-01-01 00:02:00")},
807-
["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
808-
),
809-
(
810-
{"origin": "epoch", "offset": "2m"},
811-
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
812-
),
801+
{"origin": "1999-12-31 23:57:00"},
802+
{"origin": Timestamp("1970-01-01 00:02:00")},
803+
{"origin": "epoch", "offset": "2m"},
813804
# origin of '1999-12-31 12:02:00' should be equivalent for this case
814-
(
815-
{"origin": "1999-12-31 12:02:00"},
816-
["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
817-
),
818-
({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
805+
{"origin": "1999-12-31 12:02:00"},
806+
{"offset": "-3m"},
819807
],
820808
)
821-
def test_resample_origin(kwargs, unit, expected):
809+
def test_resample_origin(kwargs, unit):
822810
# GH 31809
823811
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
824812
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
825813

826-
exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
814+
exp_rng = date_range(
815+
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
816+
).as_unit(unit)
827817

828818
resampled = ts.resample("5min", **kwargs).mean()
829819
tm.assert_index_equal(resampled.index, exp_rng)
@@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit):
853843
ts.resample("5min", offset=offset)
854844

855845

856-
def test_resample_monthstart_origin():
857-
# GH 53662
858-
df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
859-
result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
860-
excepted = Series(
861-
[10.0],
862-
index=DatetimeIndex(
863-
["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
864-
),
865-
)
866-
tm.assert_index_equal(result.index, excepted.index)
867-
868-
df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
869-
result = df.resample(
870-
"3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
871-
)["values"].sum()
872-
expected = Series(
873-
[0, 10.0],
874-
index=DatetimeIndex(
875-
["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
876-
),
877-
)
878-
tm.assert_index_equal(result.index, expected.index)
879-
880-
881846
def test_resample_origin_prime_freq(unit):
882847
# GH 31809
883848
start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
@@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit):
909874
tm.assert_index_equal(resampled.index, exp_rng)
910875

911876
exp_rng = date_range(
912-
"2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
877+
"2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
913878
).as_unit(unit)
914879
resampled = ts.resample("17min", origin="2000-01-01").mean()
915880
tm.assert_index_equal(resampled.index, exp_rng)
@@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit):
928893
exp_rng = date_range(
929894
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
930895
).as_unit(unit)
931-
resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
896+
resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
932897
tm.assert_index_equal(resampled.index, exp_rng)
933898

934-
resampled = ts.resample(
935-
"5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
936-
).mean()
899+
# origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case
900+
resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
901+
tm.assert_index_equal(resampled.index, exp_rng)
902+
903+
resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
937904
tm.assert_index_equal(resampled.index, exp_rng)
938905

939906
with pytest.raises(ValueError, match=msg):

pandas/tests/resample/test_resampler_grouper.py

+13-19
Original file line numberDiff line numberDiff line change
@@ -151,19 +151,6 @@ def test_groupby_with_origin():
151151
start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
152152
middle = "1/15/2000 00:00:00"
153153

154-
# test origin on 1970-01-01 00:00:00
155-
rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number
156-
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
157-
middle_ts = rng[len(rng) // 2]
158-
ts2 = ts[middle_ts:end]
159-
160-
origin = Timestamp(0)
161-
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
162-
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
163-
adjusted_count_ts = adjusted_count_ts[middle_ts:end]
164-
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
165-
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])
166-
167154
rng = date_range(start, end, freq="1231min") # prime number
168155
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
169156
ts2 = ts[middle:end]
@@ -177,19 +164,26 @@ def test_groupby_with_origin():
177164
with pytest.raises(AssertionError, match="Index are different"):
178165
tm.assert_index_equal(count_ts.index, count_ts2.index)
179166

180-
# test origin on 2049-10-18 20:00:00
167+
# test origin on 1970-01-01 00:00:00
168+
origin = Timestamp(0)
169+
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
170+
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
171+
adjusted_count_ts = adjusted_count_ts[middle:end]
172+
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
173+
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
181174

182-
rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number
183-
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
184-
middle_ts = rng[len(rng) // 2]
185-
ts2 = ts[middle_ts:end]
175+
# test origin on 2049-10-18 20:00:00
186176
origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
187177
adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
188178
adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
189-
adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
179+
adjusted2_count_ts = adjusted2_count_ts[middle:end]
190180
adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
191181
tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
192182

183+
# both groupers use an adjusted timestamp that is a multiple of 1399 min
184+
# they should be equal even if the adjusted_timestamp is in the future
185+
tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
186+
193187

194188
def test_nearest():
195189
# GH 17496

0 commit comments

Comments
 (0)