
Commit dfc4c39

BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)

* add missing origin check
* whats new
* resolve edge cases, fix tests
* update docs
* cleanup
* accommodate daylight savings time
* correct docs and remove nested checks
* slim down example
* add back tz aware test

1 parent: 1a4ac0e
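
For context, a minimal sketch of the scenario this commit fixes, modeled on the new test_resample_monthstart_origin added below; the snippet itself is illustrative and not part of the commit:

# GH 53662: before this fix, a Timestamp/string origin was ignored for
# non-Tick frequencies such as 'MS'/'2MS', so bins were anchored on the data
# rather than on the requested origin.
import pandas as pd
from datetime import datetime

df = pd.DataFrame({"ts": [datetime(1999, 12, 31)], "values": [10.0]})
result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()

# With the fix, the first bin edge is taken from the origin, so the new test
# expects DatetimeIndex(['1999-11-01'], freq='2MS') here.
print(result.index)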

File tree

6 files changed (+94 -43 lines)

doc/source/whatsnew/v2.1.0.rst

+1 -0

@@ -561,6 +561,7 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where a :class:`Datetimelike` ``origin`` has no effect when values are outside of the axis (:issue:`53662`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`)
 - Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
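
The whatsnew entry above also covers fixed (Tick) frequencies: per the updated test_resample_origin_prime_freq below, an origin that lies before the data now pulls the first bin edge back to the origin. A hedged sketch, with the series construction assumed from that test's start/end values:

import numpy as np
import pandas as pd

# 7-minute data between 2000-10-01 23:30 and 2000-10-02 00:30, as in the test.
rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
ts = pd.Series(np.random.randn(len(rng)), index=rng)

resampled = ts.resample("17min", origin="2000-01-01").mean()
# Per the updated expectation, the index now runs from 2000-01-01 00:00:00
# up to 2000-10-02 00:15:00 in 17-minute steps (mostly empty bins).
print(resampled.index[0], resampled.index[-1])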

pandas/core/generic.py

+6 -6

@@ -9247,12 +9247,12 @@ def resample(
         2000-10-02 00:26:00    24
         Freq: 17T, dtype: int64
 
-        >>> ts.resample('17min', origin='2000-01-01').sum()
-        2000-10-01 23:24:00     3
-        2000-10-01 23:41:00    15
-        2000-10-01 23:58:00    45
-        2000-10-02 00:15:00    45
-        Freq: 17T, dtype: int64
+        >>> ts.resample('17W', origin='2000-01-01').sum()
+        2000-01-02      0
+        2000-04-30      0
+        2000-08-27      0
+        2000-12-24    108
+        Freq: 17W-SUN, dtype: int64
 
         If you want to adjust the start of the bins with an `offset` Timedelta, the two
         following lines are equivalent:
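
A standalone run of the new docstring example, for reference. The series construction is taken from earlier in the same docstring (not shown in this hunk), so treat it as an assumption:

import numpy as np
import pandas as pd

rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
ts = pd.Series(np.arange(len(rng)) * 3, index=rng)  # values 0, 3, ..., 24 (sum 108)

# A non-Tick frequency keeps the example output short: four '17W' bins, with
# all of the data landing in the last one.
print(ts.resample("17W", origin="2000-01-01").sum())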

pandas/core/groupby/grouper.py

+6 -6

@@ -206,12 +206,12 @@ class Grouper:
     2000-10-02 00:26:00    24
     Freq: 17T, dtype: int64
 
-    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
-    2000-10-01 23:24:00     3
-    2000-10-01 23:41:00    15
-    2000-10-01 23:58:00    45
-    2000-10-02 00:15:00    45
-    Freq: 17T, dtype: int64
+    >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum()
+    2000-01-02      0
+    2000-04-30      0
+    2000-08-27      0
+    2000-12-24    108
+    Freq: 17W-SUN, dtype: int64
 
     If you want to adjust the start of the bins with an `offset` Timedelta, the two
     following lines are equivalent:
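
The Grouper docstring mirrors the resample example above; as a usage note, the two spellings are interchangeable (series construction again assumed from the docstring):

import numpy as np
import pandas as pd

rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

# Equivalent ways to apply the '17W' origin-anchored grouping.
a = ts.groupby(pd.Grouper(freq="17W", origin="2000-01-01")).sum()
b = ts.resample("17W", origin="2000-01-01").sum()
print(a.equals(b))  # True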

pandas/core/resample.py

+12 -1

@@ -2463,8 +2463,16 @@ def _get_timestamp_range_edges(
     """
     if isinstance(freq, Tick):
         index_tz = first.tz
-        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
+
+        if isinstance(origin, Timestamp) and origin.tz != index_tz:
             raise ValueError("The origin must have the same timezone as the index.")
+
+        elif isinstance(origin, Timestamp):
+            if origin <= first:
+                first = origin
+            elif origin >= last:
+                last = origin
+
         if origin == "epoch":
             # set the epoch based on the timezone to have similar bins results when
             # resampling on the same kind of indexes on different timezones
@@ -2486,6 +2494,9 @@ def _get_timestamp_range_edges(
             first = first.tz_localize(index_tz)
             last = last.tz_localize(index_tz)
     else:
+        if isinstance(origin, Timestamp):
+            first = origin
+
         first = first.normalize()
         last = last.normalize()
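
A simplified, standalone sketch of what the added branches do in _get_timestamp_range_edges; the helper name and signature below are hypothetical, and the sketch omits the timezone validation and the subsequent bin anchoring:

import pandas as pd


def sketch_range_edges(first, last, origin, is_tick):
    # Mirrors only the branching added in this commit.
    if is_tick:
        # Fixed (Tick) frequencies: an origin outside [first, last] widens the
        # range so the bins end up anchored on the origin.
        if isinstance(origin, pd.Timestamp):
            if origin <= first:
                first = origin
            elif origin >= last:
                last = origin
    else:
        # Non-fixed frequencies (e.g. 'MS'): the first edge simply becomes the
        # origin before both edges are normalized to midnight.
        if isinstance(origin, pd.Timestamp):
            first = origin
        first = first.normalize()
        last = last.normalize()
    return first, last


# The 'MS'-style case from the new test: a single point on 1999-12-31 with an
# origin of 1999-11-01 now yields a first edge at the origin.
print(sketch_range_edges(pd.Timestamp("1999-12-31"), pd.Timestamp("1999-12-31"),
                         pd.Timestamp("1999-11-01"), is_tick=False))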

pandas/tests/resample/test_datetime_index.py

+50 -17

@@ -790,24 +790,34 @@ def test_resample_offset(unit):
 
 
 @pytest.mark.parametrize(
-    "kwargs",
+    "kwargs, expected",
     [
-        {"origin": "1999-12-31 23:57:00"},
-        {"origin": Timestamp("1970-01-01 00:02:00")},
-        {"origin": "epoch", "offset": "2m"},
+        (
+            {"origin": "1999-12-31 23:57:00"},
+            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
+        ),
+        (
+            {"origin": Timestamp("1970-01-01 00:02:00")},
+            ["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
+        ),
+        (
+            {"origin": "epoch", "offset": "2m"},
+            ["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
+        ),
         # origin of '1999-31-12 12:02:00' should be equivalent for this case
-        {"origin": "1999-12-31 12:02:00"},
-        {"offset": "-3m"},
+        (
+            {"origin": "1999-12-31 12:02:00"},
+            ["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
+        ),
+        ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
     ],
 )
-def test_resample_origin(kwargs, unit):
+def test_resample_origin(kwargs, unit, expected):
     # GH 31809
     rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
     ts = Series(np.random.randn(len(rng)), index=rng)
 
-    exp_rng = date_range(
-        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
-    ).as_unit(unit)
+    exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)
 
     resampled = ts.resample("5min", **kwargs).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -837,6 +847,31 @@ def test_resample_bad_offset(offset, unit):
         ts.resample("5min", offset=offset)
 
 
+def test_resample_monthstart_origin():
+    # GH 53662
+    df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
+    result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
+    excepted = Series(
+        [10.0],
+        index=DatetimeIndex(
+            ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
+        ),
+    )
+    tm.assert_index_equal(result.index, excepted.index)
+
+    df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
+    result = df.resample(
+        "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
+    )["values"].sum()
+    expected = Series(
+        [0, 10.0],
+        index=DatetimeIndex(
+            ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
+        ),
+    )
+    tm.assert_index_equal(result.index, expected.index)
+
+
 def test_resample_origin_prime_freq(unit):
     # GH 31809
     start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
@@ -868,7 +903,7 @@ def test_resample_origin_prime_freq(unit):
     tm.assert_index_equal(resampled.index, exp_rng)
 
     exp_rng = date_range(
-        "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
+        "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
    ).as_unit(unit)
     resampled = ts.resample("17min", origin="2000-01-01").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
@@ -887,14 +922,12 @@ def test_resample_origin_with_tz(unit):
     exp_rng = date_range(
         "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
     ).as_unit(unit)
-    resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
-    tm.assert_index_equal(resampled.index, exp_rng)
-
-    # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case
-    resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
+    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
-    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    resampled = ts.resample(
+        "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz)
+    ).mean()
     tm.assert_index_equal(resampled.index, exp_rng)
 
     with pytest.raises(ValueError, match=msg):
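
On the timezone case exercised by the updated test_resample_origin_with_tz above: with a tz-aware index, a Timestamp origin carrying the same timezone is accepted and anchors the bins. A hedged sketch (the test's own series and timezone are defined outside the lines shown here, so this data is illustrative):

import numpy as np
import pandas as pd

tz = "UTC"
rng = pd.date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s", tz=tz)
ts = pd.Series(np.random.randn(len(rng)), index=rng)

resampled = ts.resample("5min", origin=pd.Timestamp("1999-12-31 23:57:00", tz=tz)).mean()
print(resampled.index[0])  # 1999-12-31 23:57:00+00:00 -- bins anchored on the origin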

pandas/tests/resample/test_resampler_grouper.py

+19 -13

@@ -141,6 +141,19 @@ def test_groupby_with_origin():
     start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
     middle = "1/15/2000 00:00:00"
 
+    # test origin on 1970-01-01 00:00:00
+    rng = date_range("1970-01-01 00:00:00", end, freq="1231min")  # prime number
+    ts = Series(np.random.randn(len(rng)), index=rng)
+    middle_ts = rng[len(rng) // 2]
+    ts2 = ts[middle_ts:end]
+
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle_ts:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])
+
     rng = date_range(start, end, freq="1231min")  # prime number
     ts = Series(np.random.randn(len(rng)), index=rng)
     ts2 = ts[middle:end]
@@ -154,26 +167,19 @@ def test_groupby_with_origin():
     with pytest.raises(AssertionError, match="Index are different"):
         tm.assert_index_equal(count_ts.index, count_ts2.index)
 
-    # test origin on 1970-01-01 00:00:00
-    origin = Timestamp(0)
-    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
-    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
-    adjusted_count_ts = adjusted_count_ts[middle:end]
-    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
-    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
-
     # test origin on 2049-10-18 20:00:00
+
+    rng = date_range(start, "2049-10-18 20:00:00", freq="1231min")  # prime number
+    ts = Series(np.random.randn(len(rng)), index=rng)
+    middle_ts = rng[len(rng) // 2]
+    ts2 = ts[middle_ts:end]
     origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
     adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
     adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
-    adjusted2_count_ts = adjusted2_count_ts[middle:end]
+    adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
     adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
     tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
 
-    # both grouper use an adjusted timestamp that is a multiple of 1399 min
-    # they should be equals even if the adjusted_timestamp is in the future
-    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
-
 
 def test_nearest():
     # GH 17496
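
A hedged sketch of what the reworked test_groupby_with_origin now checks: a Grouper origin placed before the data anchors the bins consistently, so a full series and its tail agree over the overlapping window. The grouping frequency below is an assumption; the test's own `freq` is defined outside the lines shown in this diff:

import numpy as np
import pandas as pd

end = "1/31/2000 00:00"
rng = pd.date_range("1970-01-01 00:00:00", end, freq="1231min")  # prime number
ts = pd.Series(np.random.randn(len(rng)), index=rng)
middle_ts = rng[len(rng) // 2]
ts2 = ts[middle_ts:end]

grouper = pd.Grouper(freq="1399min", origin=pd.Timestamp(0))  # freq assumed
full_counts = ts.groupby(grouper).agg("count")[middle_ts:end]
tail_counts = ts2.groupby(grouper).agg("count")
pd.testing.assert_series_equal(full_counts, tail_counts[middle_ts:end])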
