Skip to content

Commit 8c49bb6

Browse files
committed
ENH: add 'adjust_timestamp' argument to 'resample' and 'pd.Grouper'
1 parent a9d2450 commit 8c49bb6

File tree

9 files changed

+152
-13
lines changed

9 files changed

+152
-13
lines changed

doc/source/whatsnew/v1.0.0.rst

100755100644
File mode changed.

doc/source/whatsnew/v1.1.0.rst

+21
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,27 @@ For example:
3636
ser["2014"]
3737
ser.loc["May 2015"]
3838
39+
.. _whatsnew_110.grouper_adjust_timestamp:
40+
41+
Grouper now supports the argument adjust_timestamp
42+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43+
44+
:class:`Grouper` and :meth:`DataFrame.resample` now support the argument `adjust_timestamp`, the timestamp on which to adjust the grouping. (:issue:`31809`)
45+
46+
The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that evenly divide a day (like `90s` or `1min`). But it can create inconsistencies with frequencies that do not meet this criterion. To change this behavior you can now specify a fixed timestamp with `adjust_timestamp`.
47+
48+
For example:
49+
50+
.. ipython:: python
51+
52+
start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
53+
rng = pd.date_range(start, end, freq="1231min")
54+
ts = pd.Series(np.arange(len(rng)), index=rng)
55+
ts.groupby(pd.Grouper(freq="1399min")).agg("count")
56+
ts.groupby(pd.Grouper(freq="1399min", adjust_timestamp=pd.Timestamp("1970-01-01"))).agg("count")
57+
58+
..
59+
3960
.. _whatsnew_110.enhancements.other:
4061

4162
Other enhancements

pandas/core/generic.py

+5
Original file line numberDiff line numberDiff line change
@@ -7651,6 +7651,7 @@ def resample(
76517651
kind: Optional[str] = None,
76527652
loffset=None,
76537653
base: int = 0,
7654+
adjust_timestamp=None,
76547655
on=None,
76557656
level=None,
76567657
) -> "Resampler":
@@ -7691,6 +7692,9 @@ def resample(
76917692
For frequencies that evenly subdivide 1 day, the "origin" of the
76927693
aggregated intervals. For example, for '5min' frequency, base could
76937694
range from 0 through 4. Defaults to 0.
7695+
adjust_timestamp : Timestamp or str, default None
7696+
The timestamp on which to adjust the grouping. If None is passed,
7697+
the first day of the time series at midnight is used.
76947698
on : str, optional
76957699
For a DataFrame, column to use instead of index for resampling.
76967700
Column must be datetime-like.
@@ -7931,6 +7935,7 @@ def resample(
79317935
loffset=loffset,
79327936
convention=convention,
79337937
base=base,
7938+
adjust_timestamp=adjust_timestamp,
79347939
key=on,
79357940
level=level,
79367941
)

pandas/core/groupby/grouper.py

+7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ class Grouper:
6868
If grouper is PeriodIndex and `freq` parameter is passed.
6969
base : int, default 0
7070
Only when `freq` parameter is passed.
71+
For frequencies that evenly subdivide 1 day, the "origin" of the
72+
aggregated intervals. For example, for '5min' frequency, base could
73+
range from 0 through 4. Defaults to 0.
74+
adjust_timestamp : Timestamp or str, default None
75+
Only when `freq` parameter is passed.
76+
The timestamp on which to adjust the grouping. If None is passed, the
77+
first day of the time series at midnight is used.
7178
loffset : str, DateOffset, timedelta object
7279
Only when `freq` parameter is passed.
7380

pandas/core/resample.py

+48-12
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class Resampler(_GroupBy, ShallowMixin):
6666
"convention",
6767
"loffset",
6868
"base",
69+
"adjust_timestamp",
6970
"kind",
7071
]
7172

@@ -1309,6 +1310,7 @@ class TimeGrouper(Grouper):
13091310
"kind",
13101311
"convention",
13111312
"base",
1313+
"adjust_timestamp",
13121314
)
13131315

13141316
def __init__(
@@ -1323,6 +1325,7 @@ def __init__(
13231325
loffset=None,
13241326
kind=None,
13251327
convention=None,
1328+
adjust_timestamp=None,
13261329
base=0,
13271330
**kwargs,
13281331
):
@@ -1365,6 +1368,9 @@ def __init__(
13651368
self.fill_method = fill_method
13661369
self.limit = limit
13671370
self.base = base
1371+
if adjust_timestamp is not None:
1372+
adjust_timestamp = Timestamp(adjust_timestamp, "ns")
1373+
self.adjust_timestamp = adjust_timestamp
13681374

13691375
# always sort time groupers
13701376
kwargs["sort"] = True
@@ -1424,7 +1430,12 @@ def _get_time_bins(self, ax):
14241430
return binner, [], labels
14251431

14261432
first, last = _get_timestamp_range_edges(
1427-
ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base
1433+
ax.min(),
1434+
ax.max(),
1435+
self.freq,
1436+
closed=self.closed,
1437+
base=self.base,
1438+
adjust_timestamp=self.adjust_timestamp,
14281439
)
14291440
# GH #12037
14301441
# use first/last directly instead of call replace() on them
@@ -1562,10 +1573,15 @@ def _get_period_bins(self, ax):
15621573
bin_shift = 0
15631574

15641575
# GH 23882
1565-
if self.base:
1576+
if self.base or self.adjust_timestamp:
15661577
# get base adjusted bin edge labels
15671578
p_start, end = _get_period_range_edges(
1568-
start, end, self.freq, closed=self.closed, base=self.base
1579+
start,
1580+
end,
1581+
self.freq,
1582+
closed=self.closed,
1583+
base=self.base,
1584+
adjust_timestamp=self.adjust_timestamp,
15691585
)
15701586

15711587
# Get offset for bin edge (not label edge) adjustment
@@ -1617,7 +1633,9 @@ def _take_new_index(obj, indexer, new_index, axis=0):
16171633
raise ValueError("'obj' should be either a Series or a DataFrame")
16181634

16191635

1620-
def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
1636+
def _get_timestamp_range_edges(
1637+
first, last, offset, closed="left", base=0, adjust_timestamp=None
1638+
):
16211639
"""
16221640
Adjust the `first` Timestamp to the preceding Timestamp that resides on
16231641
the provided offset. Adjust the `last` Timestamp to the following
@@ -1637,6 +1655,9 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
16371655
Which side of bin interval is closed.
16381656
base : int, default 0
16391657
The "origin" of the adjusted Timestamps.
1658+
adjust_timestamp : pd.Timestamp, default None
1659+
The timestamp on which to adjust the grouping. If None is passed, the
1660+
first day of the time series at midnight is used.
16401661
16411662
Returns
16421663
-------
@@ -1652,7 +1673,12 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
16521673
last = last.tz_localize(None)
16531674

16541675
first, last = _adjust_dates_anchored(
1655-
first, last, offset, closed=closed, base=base
1676+
first,
1677+
last,
1678+
offset,
1679+
closed=closed,
1680+
base=base,
1681+
adjust_timestamp=adjust_timestamp,
16561682
)
16571683
if isinstance(offset, Day):
16581684
first = first.tz_localize(tz)
@@ -1673,7 +1699,9 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
16731699
return first, last
16741700

16751701

1676-
def _get_period_range_edges(first, last, offset, closed="left", base=0):
1702+
def _get_period_range_edges(
1703+
first, last, offset, closed="left", base=0, adjust_timestamp=None
1704+
):
16771705
"""
16781706
Adjust the provided `first` and `last` Periods to the respective Period of
16791707
the given offset that encompasses them.
@@ -1690,6 +1718,9 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0):
16901718
Which side of bin interval is closed.
16911719
base : int, default 0
16921720
The "origin" of the adjusted Periods.
1721+
adjust_timestamp : pd.Timestamp, default None
1722+
The timestamp on which to adjust the grouping. If None is passed, the
1723+
first day of the time series at midnight is used.
16931724
16941725
Returns
16951726
-------
@@ -1705,37 +1736,42 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0):
17051736
adjust_last = offset.is_on_offset(last)
17061737

17071738
first, last = _get_timestamp_range_edges(
1708-
first, last, offset, closed=closed, base=base
1739+
first, last, offset, closed=closed, base=base, adjust_timestamp=adjust_timestamp
17091740
)
17101741

17111742
first = (first + adjust_first * offset).to_period(offset)
17121743
last = (last - adjust_last * offset).to_period(offset)
17131744
return first, last
17141745

17151746

1716-
def _adjust_dates_anchored(first, last, offset, closed="right", base=0):
1747+
def _adjust_dates_anchored(
1748+
first, last, offset, closed="right", base=0, adjust_timestamp=None
1749+
):
17171750
# First and last offsets should be calculated from the start day to fix an
17181751
# error caused by resampling across multiple days when a one day period is
17191752
# not a multiple of the frequency.
17201753
#
17211754
# See https://github.com/pandas-dev/pandas/issues/8683
1755+
if adjust_timestamp is None:
1756+
adjust_timestamp_nanos = first.normalize().value
1757+
else:
1758+
adjust_timestamp_nanos = adjust_timestamp.value
17221759

17231760
# GH 10117 & GH 19375. If first and last contain timezone information,
17241761
# Perform the calculation in UTC in order to avoid localizing on an
17251762
# Ambiguous or Nonexistent time.
17261763
first_tzinfo = first.tzinfo
17271764
last_tzinfo = last.tzinfo
1728-
start_day_nanos = first.normalize().value
17291765
if first_tzinfo is not None:
17301766
first = first.tz_convert("UTC")
17311767
if last_tzinfo is not None:
17321768
last = last.tz_convert("UTC")
17331769

17341770
base_nanos = (base % offset.n) * offset.nanos // offset.n
1735-
start_day_nanos += base_nanos
1771+
adjust_timestamp_nanos += base_nanos
17361772

1737-
foffset = (first.value - start_day_nanos) % offset.nanos
1738-
loffset = (last.value - start_day_nanos) % offset.nanos
1773+
foffset = (first.value - adjust_timestamp_nanos) % offset.nanos
1774+
loffset = (last.value - adjust_timestamp_nanos) % offset.nanos
17391775

17401776
if closed == "right":
17411777
if foffset > 0:

pandas/tests/resample/test_datetime_index.py

+14
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,20 @@ def test_resample_base():
794794
tm.assert_index_equal(resampled.index, exp_rng)
795795

796796

797+
def test_resample_adjust_timestamp():
798+
rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s")
799+
ts = Series(np.random.randn(len(rng)), index=rng)
800+
801+
exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min")
802+
803+
resampled = ts.resample("5min", adjust_timestamp="12/31/1999 23:57:00").mean()
804+
tm.assert_index_equal(resampled.index, exp_rng)
805+
806+
offset_timestamp = pd.Timestamp(0) + pd.Timedelta("2min")
807+
resampled = ts.resample("5min", adjust_timestamp=offset_timestamp).mean()
808+
tm.assert_index_equal(resampled.index, exp_rng)
809+
810+
797811
def test_resample_float_base():
798812
# GH25161
799813
dt = pd.to_datetime(

pandas/tests/resample/test_resample_api.py

+7
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ def test_str():
2828
"label=left, convention=start, base=0]" in str(r)
2929
)
3030

31+
r = test_series.resample("H", adjust_timestamp="1970-01-01")
32+
assert (
33+
"DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, "
34+
"label=left, convention=start, base=0, "
35+
"adjust_timestamp=1970-01-01 00:00:00]" in str(r)
36+
)
37+
3138

3239
def test_api():
3340

pandas/tests/resample/test_resampler_grouper.py

+40
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from textwrap import dedent
22

33
import numpy as np
4+
import pytest
45

56
from pandas.util._test_decorators import async_mark
67

@@ -123,6 +124,45 @@ def test_groupby_resample_on_api_with_getitem():
123124
tm.assert_series_equal(result, exp)
124125

125126

127+
def test_groupby_with_adjust_timestamp():
128+
freq = "1399min" # prime number that is smaller than 24h
129+
start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
130+
middle = "1/15/2000 00:00:00"
131+
132+
rng = pd.date_range(start, end, freq="1231min") # prime number
133+
ts = pd.Series(np.random.randn(len(rng)), index=rng)
134+
ts2 = ts[middle:end]
135+
136+
# proves that grouper without a fixed adjust_timestamp does not work
137+
# when dealing with unusual frequencies
138+
simple_grouper = pd.Grouper(freq=freq)
139+
count_ts = ts.groupby(simple_grouper).agg("count")
140+
count_ts = count_ts[middle:end]
141+
count_ts2 = ts2.groupby(simple_grouper).agg("count")
142+
with pytest.raises(AssertionError):
143+
tm.assert_index_equal(count_ts.index, count_ts2.index)
144+
145+
# test adjust_timestamp on 1970-01-01 00:00:00
146+
adjust_timestamp = pd.Timestamp(0)
147+
adjusted_grouper = pd.Grouper(freq=freq, adjust_timestamp=adjust_timestamp)
148+
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
149+
adjusted_count_ts = adjusted_count_ts[middle:end]
150+
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
151+
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
152+
153+
# test adjust_timestamp on 2049-10-18 20:00:00
154+
adjust_timestamp_future = pd.Timestamp(0) + pd.Timedelta("1399min") * 30_000
155+
adjusted_grouper2 = pd.Grouper(freq=freq, adjust_timestamp=adjust_timestamp_future)
156+
adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
157+
adjusted2_count_ts = adjusted2_count_ts[middle:end]
158+
adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
159+
tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
160+
161+
# both groupers use an adjust_timestamp that is a multiple of 1399 min
162+
# they should be equal even if the adjust_timestamp is in the future
163+
tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
164+
165+
126166
def test_nearest():
127167

128168
# GH 17496

pandas/tests/resample/test_time_grouper.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def test_aggregate_normal(resample_method):
167167
("prod", dict(min_count=1), np.nan),
168168
],
169169
)
170-
def test_resample_entirly_nat_window(method, method_args, unit):
170+
def test_resample_entirely_nat_window(method, method_args, unit):
171171
s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4))
172172
result = methodcaller(method, **method_args)(s.resample("2d"))
173173
expected = pd.Series(
@@ -253,6 +253,15 @@ def test_repr():
253253
)
254254
assert result == expected
255255

256+
result = repr(Grouper(key="A", freq="H", adjust_timestamp="1970-01-01"))
257+
expected = (
258+
"TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
259+
"closed='left', label='left', how='mean', "
260+
"convention='e', base=0, "
261+
"adjust_timestamp=Timestamp('1970-01-01 00:00:00', freq='N'))"
262+
)
263+
assert result == expected
264+
256265

257266
@pytest.mark.parametrize(
258267
"method, method_args, expected_values",

0 commit comments

Comments
 (0)