Skip to content

Commit 15b5946

Browse files
authored
ENH: add end and end_day origin for resample (#38408)
1 parent 461ce9a commit 15b5946

File tree

7 files changed

+169
-10
lines changed

7 files changed

+169
-10
lines changed

doc/source/user_guide/timeseries.rst

+28
Original file line numberDiff line numberDiff line change
@@ -1888,6 +1888,34 @@ Those two examples are equivalent for this time series:
18881888
18891889
Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries.
18901890

1891+
Backward resample
1892+
~~~~~~~~~~~~~~~~~
1893+
1894+
.. versionadded:: 1.3.0
1895+
1896+
Instead of adjusting the beginning of the bins, sometimes we need to fix the end of the bins and resample backward with a given ``freq``. Backward resampling sets ``closed`` to ``'right'`` by default, since the last value should be treated as the right edge of the last bin.
1897+
1898+
We can set ``origin`` to ``'end'``. The value at a given ``Timestamp`` index is the resample result over the interval from that ``Timestamp`` minus ``freq`` up to that ``Timestamp``, closed on the right.
1899+
1900+
.. ipython:: python
1901+
1902+
ts.resample('17min', origin='end').sum()
1903+
1904+
In addition, as the counterpart of the ``'start_day'`` option, ``'end_day'`` is supported. This sets the origin to the ceiling midnight of the largest ``Timestamp``.
1905+
1906+
.. ipython:: python
1907+
1908+
ts.resample('17min', origin='end_day').sum()
1909+
1910+
The above result uses ``2000-10-02 00:29:00`` as the last bin's right edge, as shown by the following computation.
1911+
1912+
.. ipython:: python
1913+
1914+
ceil_mid = rng.max().ceil('D')
1915+
freq = pd.offsets.Minute(17)
1916+
bin_res = ceil_mid - freq * ((ceil_mid - rng.max()) // freq)
1917+
bin_res
1918+
18911919
.. _timeseries.periods:
18921920

18931921
Time span representation

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ Other enhancements
4040
^^^^^^^^^^^^^^^^^^
4141

4242
- Added :meth:`MultiIndex.dtypes` (:issue:`37062`)
43+
- Added ``end`` and ``end_day`` options for ``origin`` in :meth:`DataFrame.resample` (:issue:`37804`)
4344
- Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
4445

4546
.. ---------------------------------------------------------------------------

pandas/core/generic.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -8050,7 +8050,8 @@ def resample(
80508050
level : str or int, optional
80518051
For a MultiIndex, level (name or number) to use for
80528052
resampling. `level` must be datetime-like.
8053-
origin : {{'epoch','start','start_day'}}, Timestamp or str, default 'start_day'
8053+
origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp
8054+
or str, default 'start_day'
80548055
The timestamp on which to adjust the grouping. The timezone of origin
80558056
must match the timezone of the index.
80568057
If a timestamp is not used, these values are also supported:
@@ -8061,6 +8062,11 @@ def resample(
80618062
80628063
.. versionadded:: 1.1.0
80638064
8065+
- 'end': `origin` is the last value of the timeseries
8066+
- 'end_day': `origin` is the ceiling midnight of the last day
8067+
8068+
.. versionadded:: 1.3.0
8069+
80648070
offset : Timedelta or str, default is None
80658071
An offset timedelta added to the origin.
80668072
@@ -8343,6 +8349,26 @@ def resample(
83438349
2000-10-02 00:21:00 24
83448350
Freq: 17T, dtype: int64
83458351
8352+
If you want to take the largest Timestamp as the end of the bins:
8353+
8354+
>>> ts.resample('17min', origin='end').sum()
8355+
2000-10-01 23:35:00 0
8356+
2000-10-01 23:52:00 18
8357+
2000-10-02 00:09:00 27
8358+
2000-10-02 00:26:00 63
8359+
Freq: 17T, dtype: int64
8360+
8361+
In contrast with `start_day`, you can use `end_day` to take the ceiling
8362+
midnight of the largest Timestamp as the end of the bins and drop the bins
8363+
not containing data:
8364+
8365+
>>> ts.resample('17min', origin='end_day').sum()
8366+
2000-10-01 23:38:00 3
8367+
2000-10-01 23:55:00 15
8368+
2000-10-02 00:12:00 45
8369+
2000-10-02 00:29:00 45
8370+
Freq: 17T, dtype: int64
8371+
83468372
To replace the use of the deprecated `base` argument, you can now use `offset`,
83478373
in this example it is equivalent to have `base=2`:
83488374

pandas/core/groupby/grouper.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ class Grouper:
8282
However, loffset is also deprecated for ``.resample(...)``
8383
See: :class:`DataFrame.resample`
8484
85-
origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day'
85+
origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp
86+
or str, default 'start_day'
8687
The timestamp on which to adjust the grouping. The timezone of origin must
8788
match the timezone of the index.
8889
If a timestamp is not used, these values are also supported:
@@ -93,6 +94,11 @@ class Grouper:
9394
9495
.. versionadded:: 1.1.0
9596
97+
- 'end': `origin` is the last value of the timeseries
98+
- 'end_day': `origin` is the ceiling midnight of the last day
99+
100+
.. versionadded:: 1.3.0
101+
96102
offset : Timedelta or str, default is None
97103
An offset timedelta added to the origin.
98104

pandas/core/resample.py

+26-6
Original file line numberDiff line numberDiff line change
@@ -1388,10 +1388,22 @@ def __init__(
13881388
if label is None:
13891389
label = "right"
13901390
else:
1391-
if closed is None:
1392-
closed = "left"
1393-
if label is None:
1394-
label = "left"
1391+
# The backward resample sets ``closed`` to ``'right'`` by default
1392+
# since the last value should be considered as the edge point for
1393+
# the last bin. When origin is "end" or "end_day", the value for a
1394+
# specific ``Timestamp`` index stands for the resample result from
1395+
# the current ``Timestamp`` minus ``freq`` to the current
1396+
# ``Timestamp`` with a right close.
1397+
if origin in ["end", "end_day"]:
1398+
if closed is None:
1399+
closed = "right"
1400+
if label is None:
1401+
label = "right"
1402+
else:
1403+
if closed is None:
1404+
closed = "left"
1405+
if label is None:
1406+
label = "left"
13951407

13961408
self.closed = closed
13971409
self.label = label
@@ -1404,14 +1416,15 @@ def __init__(
14041416
self.fill_method = fill_method
14051417
self.limit = limit
14061418

1407-
if origin in ("epoch", "start", "start_day"):
1419+
if origin in ("epoch", "start", "start_day", "end", "end_day"):
14081420
self.origin = origin
14091421
else:
14101422
try:
14111423
self.origin = Timestamp(origin)
14121424
except Exception as e:
14131425
raise ValueError(
1414-
"'origin' should be equal to 'epoch', 'start', 'start_day' or "
1426+
"'origin' should be equal to 'epoch', 'start', 'start_day', "
1427+
"'end', 'end_day' or "
14151428
f"should be a Timestamp convertible type. Got '{origin}' instead."
14161429
) from e
14171430

@@ -1846,6 +1859,13 @@ def _adjust_dates_anchored(
18461859
origin_nanos = first.value
18471860
elif isinstance(origin, Timestamp):
18481861
origin_nanos = origin.value
1862+
elif origin in ["end", "end_day"]:
1863+
origin = last if origin == "end" else last.ceil("D")
1864+
sub_freq_times = (origin.value - first.value) // freq.nanos
1865+
if closed == "left":
1866+
sub_freq_times += 1
1867+
first = origin - sub_freq_times * freq
1868+
origin_nanos = first.value
18491869
origin_nanos += offset.value if offset else 0
18501870

18511871
# GH 10117 & GH 19375. If first and last contain timezone information,

pandas/tests/resample/test_datetime_index.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -772,8 +772,9 @@ def test_resample_bad_origin(origin):
772772
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s")
773773
ts = Series(np.random.randn(len(rng)), index=rng)
774774
msg = (
775-
"'origin' should be equal to 'epoch', 'start', 'start_day' or "
776-
f"should be a Timestamp convertible type. Got '{origin}' instead."
775+
"'origin' should be equal to 'epoch', 'start', 'start_day', "
776+
"'end', 'end_day' or should be a Timestamp convertible type. Got "
777+
f"'{origin}' instead."
777778
)
778779
with pytest.raises(ValueError, match=msg):
779780
ts.resample("5min", origin=origin)

pandas/tests/resample/test_resample_api.py

+77
Original file line numberDiff line numberDiff line change
@@ -611,3 +611,80 @@ def test_resample_agg_readonly():
611611

612612
result = rs.agg("min")
613613
tm.assert_series_equal(result, expected)
614+
615+
616+
@pytest.mark.parametrize(
617+
"start,end,freq,data,resample_freq,origin,closed,exp_data,exp_end,exp_periods",
618+
[
619+
(
620+
"2000-10-01 23:30:00",
621+
"2000-10-02 00:26:00",
622+
"7min",
623+
[0, 3, 6, 9, 12, 15, 18, 21, 24],
624+
"17min",
625+
"end",
626+
None,
627+
[0, 18, 27, 63],
628+
"20001002 00:26:00",
629+
4,
630+
),
631+
(
632+
"20200101 8:26:35",
633+
"20200101 9:31:58",
634+
"77s",
635+
[1] * 51,
636+
"7min",
637+
"end",
638+
"right",
639+
[1, 6, 5, 6, 5, 6, 5, 6, 5, 6],
640+
"2020-01-01 09:30:45",
641+
10,
642+
),
643+
(
644+
"2000-10-01 23:30:00",
645+
"2000-10-02 00:26:00",
646+
"7min",
647+
[0, 3, 6, 9, 12, 15, 18, 21, 24],
648+
"17min",
649+
"end",
650+
"left",
651+
[0, 18, 27, 39, 24],
652+
"20001002 00:43:00",
653+
5,
654+
),
655+
(
656+
"2000-10-01 23:30:00",
657+
"2000-10-02 00:26:00",
658+
"7min",
659+
[0, 3, 6, 9, 12, 15, 18, 21, 24],
660+
"17min",
661+
"end_day",
662+
None,
663+
[3, 15, 45, 45],
664+
"2000-10-02 00:29:00",
665+
4,
666+
),
667+
],
668+
)
669+
def test_end_and_end_day_origin(
670+
start,
671+
end,
672+
freq,
673+
data,
674+
resample_freq,
675+
origin,
676+
closed,
677+
exp_data,
678+
exp_end,
679+
exp_periods,
680+
):
681+
rng = date_range(start, end, freq=freq)
682+
ts = Series(data, index=rng)
683+
684+
res = ts.resample(resample_freq, origin=origin, closed=closed).sum()
685+
expected = Series(
686+
exp_data,
687+
index=date_range(end=exp_end, freq=resample_freq, periods=exp_periods),
688+
)
689+
690+
tm.assert_series_equal(res, expected)

0 commit comments

Comments
 (0)