ENH: add 'adjust_timestamp' argument to 'resample' and 'pd.Grouper'

hasB4K · hasB4K · commit 8c49bb624079 · 2020-02-08T23:11:06.000+01:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -36,6 +36,27 @@ For example:
    ser["2014"]
    ser.loc["May 2015"]
 
+.. _whatsnew_110.grouper_adjust_timestamp:
+
+Grouper now supports the argument adjust_timestamp
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Grouper` and :meth:`DataFrame.resample` now support the argument `adjust_timestamp`, the timestamp on which to adjust the grouping. (:issue:`31809`)
+
+The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criterion. To change this behavior you can now specify a fixed timestamp with `adjust_timestamp`.
+
+For example:
+
+.. ipython:: python
+
+   start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
+   rng = pd.date_range(start, end, freq="1231min")
+   ts = pd.Series(np.arange(len(rng)), index=rng)
+   ts.groupby(pd.Grouper(freq="1399min")).agg("count")
+   ts.groupby(pd.Grouper(freq="1399min", adjust_timestamp=pd.Timestamp("1970-01-01"))).agg("count")
+
+..
+
 .. _whatsnew_110.enhancements.other:
 
 Other enhancements
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -7651,6 +7651,7 @@ def resample(
         kind: Optional[str] = None,
         loffset=None,
         base: int = 0,
+        adjust_timestamp=None,
         on=None,
         level=None,
     ) -> "Resampler":
@@ -7691,6 +7692,9 @@ def resample(
             For frequencies that evenly subdivide 1 day, the "origin" of the
             aggregated intervals. For example, for '5min' frequency, base could
             range from 0 through 4. Defaults to 0.
+        adjust_timestamp : Timestamp or str, default None
+            The timestamp on which to adjust the grouping. If None is passed,
+            the first day of the time series at midnight is used.
         on : str, optional
             For a DataFrame, column to use instead of index for resampling.
             Column must be datetime-like.
@@ -7931,6 +7935,7 @@ def resample(
             loffset=loffset,
             convention=convention,
             base=base,
+            adjust_timestamp=adjust_timestamp,
             key=on,
             level=level,
         )
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -68,6 +68,13 @@ class Grouper:
         If grouper is PeriodIndex and `freq` parameter is passed.
     base : int, default 0
         Only when `freq` parameter is passed.
+        For frequencies that evenly subdivide 1 day, the "origin" of the
+        aggregated intervals. For example, for '5min' frequency, base could
+        range from 0 through 4. Defaults to 0.
+    adjust_timestamp : Timestamp or str, default None
+        Only when `freq` parameter is passed.
+        The timestamp on which to adjust the grouping. If None is passed, the
+        first day of the time series at midnight is used.
     loffset : str, DateOffset, timedelta object
         Only when `freq` parameter is passed.
 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -66,6 +66,7 @@ class Resampler(_GroupBy, ShallowMixin):
         "convention",
         "loffset",
         "base",
+        "adjust_timestamp",
         "kind",
     ]
 
@@ -1309,6 +1310,7 @@ class TimeGrouper(Grouper):
         "kind",
         "convention",
         "base",
+        "adjust_timestamp",
     )
 
     def __init__(
@@ -1323,6 +1325,7 @@ def __init__(
         loffset=None,
         kind=None,
         convention=None,
+        adjust_timestamp=None,
         base=0,
         **kwargs,
     ):
@@ -1365,6 +1368,9 @@ def __init__(
         self.fill_method = fill_method
         self.limit = limit
         self.base = base
+        if adjust_timestamp is not None:
+            adjust_timestamp = Timestamp(adjust_timestamp, "ns")
+        self.adjust_timestamp = adjust_timestamp
 
         # always sort time groupers
         kwargs["sort"] = True
@@ -1424,7 +1430,12 @@ def _get_time_bins(self, ax):
             return binner, [], labels
 
         first, last = _get_timestamp_range_edges(
-            ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base
+            ax.min(),
+            ax.max(),
+            self.freq,
+            closed=self.closed,
+            base=self.base,
+            adjust_timestamp=self.adjust_timestamp,
         )
         # GH #12037
         # use first/last directly instead of call replace() on them
@@ -1562,10 +1573,15 @@ def _get_period_bins(self, ax):
         bin_shift = 0
 
         # GH 23882
-        if self.base:
+        if self.base or self.adjust_timestamp:
             # get base adjusted bin edge labels
             p_start, end = _get_period_range_edges(
-                start, end, self.freq, closed=self.closed, base=self.base
+                start,
+                end,
+                self.freq,
+                closed=self.closed,
+                base=self.base,
+                adjust_timestamp=self.adjust_timestamp,
             )
 
             # Get offset for bin edge (not label edge) adjustment
@@ -1617,7 +1633,9 @@ def _take_new_index(obj, indexer, new_index, axis=0):
         raise ValueError("'obj' should be either a Series or a DataFrame")
 
 
-def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
+def _get_timestamp_range_edges(
+    first, last, offset, closed="left", base=0, adjust_timestamp=None
+):
     """
     Adjust the `first` Timestamp to the preceding Timestamp that resides on
     the provided offset. Adjust the `last` Timestamp to the following
@@ -1637,6 +1655,9 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
         Which side of bin interval is closed.
     base : int, default 0
         The "origin" of the adjusted Timestamps.
+    adjust_timestamp : pd.Timestamp, default None
+        The timestamp on which to adjust the grouping. If None is passed, the
+        first day of the time series at midnight is used.
 
     Returns
     -------
@@ -1652,7 +1673,12 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
             last = last.tz_localize(None)
 
         first, last = _adjust_dates_anchored(
-            first, last, offset, closed=closed, base=base
+            first,
+            last,
+            offset,
+            closed=closed,
+            base=base,
+            adjust_timestamp=adjust_timestamp,
         )
         if isinstance(offset, Day):
             first = first.tz_localize(tz)
@@ -1673,7 +1699,9 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
     return first, last
 
 
-def _get_period_range_edges(first, last, offset, closed="left", base=0):
+def _get_period_range_edges(
+    first, last, offset, closed="left", base=0, adjust_timestamp=None
+):
     """
     Adjust the provided `first` and `last` Periods to the respective Period of
     the given offset that encompasses them.
@@ -1690,6 +1718,9 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0):
         Which side of bin interval is closed.
     base : int, default 0
         The "origin" of the adjusted Periods.
+    adjust_timestamp : pd.Timestamp, default None
+        The timestamp on which to adjust the grouping. If None is passed, the
+        first day of the time series at midnight is used.
 
     Returns
     -------
@@ -1705,37 +1736,42 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0):
     adjust_last = offset.is_on_offset(last)
 
     first, last = _get_timestamp_range_edges(
-        first, last, offset, closed=closed, base=base
+        first, last, offset, closed=closed, base=base, adjust_timestamp=adjust_timestamp
     )
 
     first = (first + adjust_first * offset).to_period(offset)
     last = (last - adjust_last * offset).to_period(offset)
     return first, last
 
 
-def _adjust_dates_anchored(first, last, offset, closed="right", base=0):
+def _adjust_dates_anchored(
+    first, last, offset, closed="right", base=0, adjust_timestamp=None
+):
     # First and last offsets should be calculated from the start day to fix an
     # error cause by resampling across multiple days when a one day period is
     # not a multiple of the frequency.
     #
     # See https://github.com/pandas-dev/pandas/issues/8683
+    if adjust_timestamp is None:
+        adjust_timestamp_nanos = first.normalize().value
+    else:
+        adjust_timestamp_nanos = adjust_timestamp.value
 
     # GH 10117 & GH 19375. If first and last contain timezone information,
     # Perform the calculation in UTC in order to avoid localizing on an
     # Ambiguous or Nonexistent time.
     first_tzinfo = first.tzinfo
     last_tzinfo = last.tzinfo
-    start_day_nanos = first.normalize().value
     if first_tzinfo is not None:
         first = first.tz_convert("UTC")
     if last_tzinfo is not None:
         last = last.tz_convert("UTC")
 
     base_nanos = (base % offset.n) * offset.nanos // offset.n
-    start_day_nanos += base_nanos
+    adjust_timestamp_nanos += base_nanos
 
-    foffset = (first.value - start_day_nanos) % offset.nanos
-    loffset = (last.value - start_day_nanos) % offset.nanos
+    foffset = (first.value - adjust_timestamp_nanos) % offset.nanos
+    loffset = (last.value - adjust_timestamp_nanos) % offset.nanos
 
     if closed == "right":
         if foffset > 0:
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -794,6 +794,20 @@ def test_resample_base():
     tm.assert_index_equal(resampled.index, exp_rng)
 
 
+def test_resample_adjust_timestamp():
+    rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s")
+    ts = Series(np.random.randn(len(rng)), index=rng)
+
+    exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min")
+
+    resampled = ts.resample("5min", adjust_timestamp="12/31/1999 23:57:00").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    offset_timestamp = pd.Timestamp(0) + pd.Timedelta("2min")
+    resampled = ts.resample("5min", adjust_timestamp=offset_timestamp).mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+
 def test_resample_float_base():
     # GH25161
     dt = pd.to_datetime(
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
@@ -28,6 +28,13 @@ def test_str():
         "label=left, convention=start, base=0]" in str(r)
     )
 
+    r = test_series.resample("H", adjust_timestamp="1970-01-01")
+    assert (
+        "DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, "
+        "label=left, convention=start, base=0, "
+        "adjust_timestamp=1970-01-01 00:00:00]" in str(r)
+    )
+
 
 def test_api():
 
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
@@ -1,6 +1,7 @@
 from textwrap import dedent
 
 import numpy as np
+import pytest
 
 from pandas.util._test_decorators import async_mark
 
@@ -123,6 +124,45 @@ def test_groupby_resample_on_api_with_getitem():
     tm.assert_series_equal(result, exp)
 
 
+def test_groupby_with_adjust_timestamp():
+    freq = "1399min"  # prime number that is smaller than 24h
+    start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
+    middle = "1/15/2000 00:00:00"
+
+    rng = pd.date_range(start, end, freq="1231min")  # prime number
+    ts = pd.Series(np.random.randn(len(rng)), index=rng)
+    ts2 = ts[middle:end]
+
+    # without a fixed adjust_timestamp the bins are anchored on the first day of
+    # each series, so a slice of the series gets different bin edges
+    simple_grouper = pd.Grouper(freq=freq)
+    count_ts = ts.groupby(simple_grouper).agg("count")
+    count_ts = count_ts[middle:end]
+    count_ts2 = ts2.groupby(simple_grouper).agg("count")
+    with pytest.raises(AssertionError):
+        tm.assert_index_equal(count_ts.index, count_ts2.index)
+
+    # test adjust_timestamp on 1970-01-01 00:00:00
+    adjust_timestamp = pd.Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, adjust_timestamp=adjust_timestamp)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
+
+    # test adjust_timestamp on 2049-10-18 20:00:00
+    adjust_timestamp_future = pd.Timestamp(0) + pd.Timedelta("1399min") * 30_000
+    adjusted_grouper2 = pd.Grouper(freq=freq, adjust_timestamp=adjust_timestamp_future)
+    adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
+    adjusted2_count_ts = adjusted2_count_ts[middle:end]
+    adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
+    tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
+
+    # both groupers use an adjust_timestamp that is a multiple of 1399 min from
+    # the epoch, so they should be equal even if adjust_timestamp is in the future
+    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
+
+
 def test_nearest():
 
     # GH 17496
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
@@ -167,7 +167,7 @@ def test_aggregate_normal(resample_method):
         ("prod", dict(min_count=1), np.nan),
     ],
 )
-def test_resample_entirly_nat_window(method, method_args, unit):
+def test_resample_entirely_nat_window(method, method_args, unit):
     s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4))
     result = methodcaller(method, **method_args)(s.resample("2d"))
     expected = pd.Series(
@@ -253,6 +253,15 @@ def test_repr():
     )
     assert result == expected
 
+    result = repr(Grouper(key="A", freq="H", adjust_timestamp="1970-01-01"))
+    expected = (
+        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
+        "closed='left', label='left', how='mean', "
+        "convention='e', base=0, "
+        "adjust_timestamp=Timestamp('1970-01-01 00:00:00', freq='N'))"
+    )
+    assert result == expected
+
 
 @pytest.mark.parametrize(
     "method, method_args, expected_values",