From 3aa2ae3e1fb70fbbf21d317ed644cd8dfe2b1624 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 8 Feb 2020 19:50:02 +0100 Subject: [PATCH 01/26] ENH: add 'origin and 'offset' arguments to 'resample' and 'pd.Grouper' --- doc/source/user_guide/timeseries.rst | 9 +- doc/source/whatsnew/v1.1.0.rst | 49 ++++++ pandas/core/generic.py | 33 +++- pandas/core/groupby/grouper.py | 23 +++ pandas/core/resample.py | 160 +++++++++++++----- pandas/tests/resample/test_base.py | 3 +- pandas/tests/resample/test_datetime_index.py | 62 +++++-- pandas/tests/resample/test_period_index.py | 101 ++++++++--- pandas/tests/resample/test_resample_api.py | 7 + .../tests/resample/test_resampler_grouper.py | 40 +++++ pandas/tests/resample/test_time_grouper.py | 18 +- pandas/tests/resample/test_timedelta.py | 17 +- 12 files changed, 425 insertions(+), 97 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 6ba58310000cb..60eef91e0fd79 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1572,10 +1572,9 @@ end of the interval is closed: ts.resample('5Min', closed='left').mean() -Parameters like ``label`` and ``loffset`` are used to manipulate the resulting -labels. ``label`` specifies whether the result is labeled with the beginning or -the end of the interval. ``loffset`` performs a time adjustment on the output -labels. +Parameters like ``label`` are used to manipulate the resulting labels. +``label`` specifies whether the result is labeled with the beginning or +the end of the interval. .. ipython:: python @@ -1583,8 +1582,6 @@ labels. ts.resample('5Min', label='left').mean() - ts.resample('5Min', label='left', loffset='1s').mean() - .. 
warning:: The default values for ``label`` and ``closed`` is '**left**' for all diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 092bd3345efbc..da0f8683cdf0c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -152,6 +152,55 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') +.. _whatsnew_110.grouper_resample_origin: + +Grouper and resample now support the arguments origin and offset +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Grouper` and :meth:`DataFrame.resample` now support the arguments ``origin`` and ``offset``. They let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) + +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criterion. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. + +For example: + +.. ipython:: python + + start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" + middle = "1/15/2000 02:00" + rng = pd.date_range(start, end, freq="1231min") + ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + ts + ts.resample("2711min").sum() + ts.resample("2711min", origin="1970-01-01").sum() + +For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. + +If needed, you can instead adjust the bins with an ``offset`` that is added to the default ``origin``. +These two examples are equivalent for this time series: + +.. ipython:: python + + ts.resample("2711min", origin="1/10/2000 02:00:00").sum() + ts.resample("2711min", offset="2h").sum() + +The argument ``base`` is now deprecated in favor of ``offset``. (:issue:`31809`) + +.. 
ipython:: python + + # ts.resample("2711min", base=2).sum() + # becomes: + ts.resample("2711min", offset="2min").sum() + +The argument ``loffset`` is now deprecated. (:issue:`31809`) + +.. ipython:: python + + from pandas.tseries.frequencies import to_offset + loffset = "8H" + ts_out = ts.resample("2711min").sum() + ts_out.index = ts_out.index + to_offset(loffset) + ts_out + .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 792e5a1228fe6..62dfb5b66c583 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7760,9 +7760,11 @@ def resample( convention: str = "start", kind: Optional[str] = None, loffset=None, - base: int = 0, + base: Optional[int] = None, on=None, level=None, + origin=None, + offset=None, ) -> "Resampler": """ Resample time-series data. @@ -7797,17 +7799,35 @@ def resample( By default the input representation is retained. loffset : timedelta, default None Adjust the resampled time labels. + + .. deprecated:: 1.1.0 + You should add the loffset to the `df.index` after the resample. + like this: + ``df.index = df.index.to_timestamp() + to_offset(loffset)`` + (a more complete example is present below) + base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0. + + .. deprecated:: 1.1.0 + The new arguments that you should use are 'offset' or 'origin'. + ``df.resample(freq="3s", base=2)`` + becomes + ``df.resample(freq="3s", offset="2s")`` + on : str, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. - level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. + origin : pd.Timestamp, default None + The timestamp on which to adjust the grouping. If None is passed, + the first day of the time series at midnight is used. 
+ offset : pd.Timedelta, default is None + An offset timedelta added to the origin. Returns ------- @@ -8025,6 +8045,13 @@ def resample( 2000-01-02 22 140 2000-01-03 32 150 2000-01-04 36 90 + + To replace the use of the deprecated `loffset` argument: + >>> from pandas.tseries.frequencies import to_offset + >>> rng = pd.date_range("2000-01-01", "2000-01-01", freq="1s") + >>> ts = pd.Series(np.arange(len(rng)), index=rng) + >>> s = ts.resample("3s").mean() + >>> s.index = s.index + to_offset("8H") """ from pandas.core.resample import get_resampler @@ -8041,6 +8068,8 @@ def resample( base=base, key=on, level=level, + origin=origin, + offset=offset, ) def first(self: FrameOrSeries, offset) -> FrameOrSeries: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 948b4ba27f705..97bd4656b1d9b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -67,9 +67,32 @@ class Grouper: If grouper is PeriodIndex and `freq` parameter is passed. base : int, default 0 Only when `freq` parameter is passed. + For frequencies that evenly subdivide 1 day, the "origin" of the + aggregated intervals. For example, for '5min' frequency, base could + range from 0 through 4. Defaults to 0. + + .. deprecated:: 1.1.0 + The new arguments that you should use are 'offset' or 'origin'. + ``df.resample(freq="3s", base=2)`` + becomes + ``df.resample(freq="3s", offset="2s")`` + loffset : str, DateOffset, timedelta object Only when `freq` parameter is passed. + .. deprecated:: 1.1.0 + loffset is only working for ``.resample(...)`` and not for + Grouper (:issue:`28302`). + However, loffset is also deprecated for ``.resample(...)`` + See: :class:`DataFrame.resample` + + origin : Timestamp, default None + Only when `freq` parameter is passed. + The timestamp on which to adjust the grouping. If None is passed, the + first day of the time series at midnight is used. 
+ offset : pd.Timedelta, default is None + An offset timedelta added to the origin. + Returns ------- A specification for a groupby instruction diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b8c45f26301a4..53ff2e8246078 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2,11 +2,12 @@ from datetime import timedelta from textwrap import dedent from typing import Dict, no_type_check +import warnings import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import NaT, Period, Timestamp +from pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.compat.numpy import function as nv @@ -68,6 +69,8 @@ class Resampler(_GroupBy, ShallowMixin): "loffset", "base", "kind", + "origin", + "offset", ] def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): @@ -1257,7 +1260,7 @@ def get_resampler(obj, kind=None, **kwds): """ Create a TimeGrouper and return our resampler. 
""" - tg = TimeGrouper(**kwds) + tg = TimeGrouper(**kwds, _warning_stack_level=2) return tg._get_resampler(obj, kind=kind) @@ -1299,6 +1302,8 @@ class TimeGrouper(Grouper): "kind", "convention", "base", + "origin", + "offset", ) def __init__( @@ -1313,7 +1318,10 @@ def __init__( loffset=None, kind=None, convention=None, - base=0, + base=None, + origin=None, + offset=None, + _warning_stack_level=0, **kwargs, ): # Check for correctness of the keyword arguments which would @@ -1347,20 +1355,51 @@ def __init__( self.convention = convention or "E" self.convention = self.convention.lower() - if isinstance(loffset, str): - loffset = to_offset(loffset) - self.loffset = loffset - self.how = how self.fill_method = fill_method self.limit = limit - self.base = base + + self.origin = Timestamp(origin) if origin is not None else None + self.offset = Timedelta(offset) if offset is not None else None # always sort time groupers kwargs["sort"] = True + self._warn_if_loffset_or_base_is_used(base, loffset, _warning_stack_level) + if isinstance(loffset, str): + loffset = to_offset(loffset) + self.loffset = loffset + self.base = base if base else 0 + super().__init__(freq=freq, axis=axis, **kwargs) + @staticmethod + def _warn_if_loffset_or_base_is_used(base, loffset, _warning_stack_level): + if base is not None: + warnings.warn( + "'base' in .resample() and in Grouper() is deprecated.\n" + "The new arguments that you should use " + "are 'offset' or 'origin'.\n\n" + '>>> df.resample(freq="3s", base=2)\n' + "\nbecomes:\n\n" + '>>> df.resample(freq="3s", offset="2s")\n', + FutureWarning, + stacklevel=3 + _warning_stack_level, + ) + + if loffset is not None: + warnings.warn( + "'loffset' in .resample() and in Grouper() is deprecated.\n" + "Here an example to have the same behavior than loffset:\n\n" + '>>> df.resample(freq="3s", loffset="8H")\n' + "\nbecomes:\n\n" + ">>> from pandas.tseries.frequencies import to_offset\n" + '>>> df = df.resample(freq="3s").mean()\n' + '>>> df.index = 
df.index.to_timestamp() + to_offset("8H")\n', + FutureWarning, + stacklevel=3 + _warning_stack_level, + ) + def _get_resampler(self, obj, kind=None): """ Return my resampler or raise if we have an invalid axis. @@ -1414,7 +1453,13 @@ def _get_time_bins(self, ax): return binner, [], labels first, last = _get_timestamp_range_edges( - ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base + ax.min(), + ax.max(), + self.freq, + closed=self.closed, + base=self.base, + origin=self.origin, + offset=self.offset, ) # GH #12037 # use first/last directly instead of call replace() on them @@ -1506,6 +1551,9 @@ def _get_time_delta_bins(self, ax): # GH #33498 labels += self.loffset + if self.offset: + labels += self.offset + return binner, bins, labels def _get_time_period_bins(self, ax): @@ -1556,11 +1604,17 @@ def _get_period_bins(self, ax): end = ax.max().asfreq(self.freq, how="end") bin_shift = 0 - # GH 23882 - if self.base: + # GH 23882 & 31809 + if self.origin is not None or self.offset is not None or self.base: # get base adjusted bin edge labels p_start, end = _get_period_range_edges( - start, end, self.freq, closed=self.closed, base=self.base + start, + end, + self.freq, + closed=self.closed, + base=self.base, + origin=self.origin, + offset=self.offset, ) # Get offset for bin edge (not label edge) adjustment @@ -1612,7 +1666,9 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): +def _get_timestamp_range_edges( + first, last, freq, closed="left", base=0, origin=None, offset=None +): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following @@ -1626,19 +1682,24 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): The beginning Timestamp of the range to be adjusted. 
last : pd.Timestamp The ending Timestamp of the range to be adjusted. - offset : pd.DateOffset + freq : pd.DateOffset The dateoffset to which the Timestamps will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. base : int, default 0 The "origin" of the adjusted Timestamps. + origin : pd.Timestamp, default None + The timestamp on which to adjust the grouping. If None is passed, the + first day of the time series at midnight is used. + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. Returns ------- A tuple of length 2, containing the adjusted pd.Timestamp objects. """ - if isinstance(offset, Tick): - if isinstance(offset, Day): + if isinstance(freq, Tick): + if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last # might contain a DST transition (23H, 24H, or 25H). # So "pretend" the dates are naive when adjusting the endpoints @@ -1647,9 +1708,9 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): last = last.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, offset, closed=closed, base=base + first, last, freq, closed=closed, base=base, origin=origin, offset=offset, ) - if isinstance(offset, Day): + if isinstance(freq, Day): first = first.tz_localize(tz) last = last.tz_localize(tz) return first, last @@ -1659,16 +1720,18 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): last = last.normalize() if closed == "left": - first = Timestamp(offset.rollback(first)) + first = Timestamp(freq.rollback(first)) else: - first = Timestamp(first - offset) + first = Timestamp(first - freq) - last = Timestamp(last + offset) + last = Timestamp(last + freq) return first, last -def _get_period_range_edges(first, last, offset, closed="left", base=0): +def _get_period_range_edges( + first, last, freq, closed="left", base=0, origin=None, offset=None +): """ Adjust the provided `first` and `last` Periods to 
the respective Period of the given offset that encompasses them. @@ -1679,12 +1742,17 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): The beginning Period of the range to be adjusted. last : pd.Period The ending Period of the range to be adjusted. - offset : pd.DateOffset - The dateoffset to which the Periods will be adjusted. + freq : pd.DateOffset + The freq to which the Periods will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. base : int, default 0 The "origin" of the adjusted Periods. + origin : pd.Timestamp, default None + The timestamp on which to adjust the grouping. If None is passed, the + first day of the time series at midnight is used. + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. Returns ------- @@ -1693,55 +1761,61 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): if not all(isinstance(obj, Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period") - # GH 23882 + # GH 23882 & 31809 first = first.to_timestamp() last = last.to_timestamp() - adjust_first = not offset.is_on_offset(first) - adjust_last = offset.is_on_offset(last) + adjust_first = not freq.is_on_offset(first) + adjust_last = freq.is_on_offset(last) first, last = _get_timestamp_range_edges( - first, last, offset, closed=closed, base=base + first, last, freq, closed=closed, base=base, origin=origin, offset=offset, ) - first = (first + int(adjust_first) * offset).to_period(offset) - last = (last - int(adjust_last) * offset).to_period(offset) + first = (first + int(adjust_first) * freq).to_period(freq) + last = (last - int(adjust_last) * freq).to_period(freq) return first, last -def _adjust_dates_anchored(first, last, offset, closed="right", base=0): +def _adjust_dates_anchored( + first, last, freq, closed="right", base=0, origin=None, offset=None +): # First and last offsets should be calculated from the start 
day to fix an # error cause by resampling across multiple days when a one day period is - # not a multiple of the frequency. - # - # See https://github.com/pandas-dev/pandas/issues/8683 + # not a multiple of the frequency. See GH 8683 + # To handle frequencies that are not multiple or divisible by a day we let + # the possibility to define a fixed origin timestamp. See GH 31809 + if origin is None: + origin_nanos = first.normalize().value + else: + origin_nanos = origin.value + origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. If first and last contain timezone information, # Perform the calculation in UTC in order to avoid localizing on an # Ambiguous or Nonexistent time. first_tzinfo = first.tzinfo last_tzinfo = last.tzinfo - start_day_nanos = first.normalize().value if first_tzinfo is not None: first = first.tz_convert("UTC") if last_tzinfo is not None: last = last.tz_convert("UTC") - base_nanos = (base % offset.n) * offset.nanos // offset.n - start_day_nanos += base_nanos + base_nanos = (base % freq.n) * freq.nanos // freq.n + origin_nanos += base_nanos - foffset = (first.value - start_day_nanos) % offset.nanos - loffset = (last.value - start_day_nanos) % offset.nanos + foffset = (first.value - origin_nanos) % freq.nanos + loffset = (last.value - origin_nanos) % freq.nanos if closed == "right": if foffset > 0: # roll back fresult = first.value - foffset else: - fresult = first.value - offset.nanos + fresult = first.value - freq.nanos if loffset > 0: # roll forward - lresult = last.value + (offset.nanos - loffset) + lresult = last.value + (freq.nanos - loffset) else: # already the end of the road lresult = last.value @@ -1754,9 +1828,9 @@ def _adjust_dates_anchored(first, last, offset, closed="right", base=0): if loffset > 0: # roll forward - lresult = last.value + (offset.nanos - loffset) + lresult = last.value + (freq.nanos - loffset) else: - lresult = last.value + offset.nanos + lresult = last.value + freq.nanos fresult = Timestamp(fresult) 
lresult = Timestamp(lresult) if first_tzinfo is not None: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index d0559923fec51..697aeb6db9f92 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -209,7 +209,8 @@ def test_resample_loffset_arg_type(frame, create_index, arg): expected_index += timedelta(hours=2) expected = DataFrame({"value": expected_means}, index=expected_index) - result_agg = df.resample("2D", loffset="2H").agg(arg) + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 0c364d37f039e..92f979b35b13f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -420,7 +420,10 @@ def test_resample_loffset(loffset): rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample("5min", closed="right", label="right", loffset=loffset).mean() + with tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=loffset + ).mean() idx = date_range("1/1/2000", periods=4, freq="5min") expected = Series( [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], @@ -436,7 +439,8 @@ def test_resample_loffset(loffset): # to weekly result = ser.resample("w-sun").last() business_day_offset = BDay() - expected = ser.resample("w-sun", loffset=-business_day_offset).last() + with tm.assert_produces_warning(FutureWarning): + expected = ser.resample("w-sun", loffset=-business_day_offset).last() assert result.index[0] - business_day_offset == expected.index[0] @@ -445,9 +449,10 @@ def test_resample_loffset_upsample(): rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", 
freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample( - "5min", closed="right", label="right", loffset=timedelta(minutes=1) - ).ffill() + with tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() idx = date_range("1/1/2000", periods=4, freq="5min") expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) @@ -460,7 +465,8 @@ def test_resample_loffset_count(): rng = date_range(start_time, periods=100, freq="S") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample("10S", loffset="1s").count() + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").count() expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( seconds=1 @@ -471,7 +477,8 @@ def test_resample_loffset_count(): # Same issue should apply to .size() since it goes through # same code path - result = ts.resample("10S", loffset="1s").size() + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").size() tm.assert_series_equal(result, expected) @@ -795,11 +802,35 @@ def test_resample_base(): rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample("5min", base=2).mean() + with tm.assert_produces_warning(FutureWarning): + resampled = ts.resample("5min", base=2).mean() exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") tm.assert_index_equal(resampled.index, exp_rng) +def test_resample_offset(): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample("5min", offset="2min").mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_origin(): + rng = 
date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + + resampled = ts.resample("5min", origin="12/31/1999 23:57:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + offset_timestamp = pd.Timestamp(0) + pd.Timedelta("2min") + resampled = ts.resample("5min", origin=offset_timestamp).mean() + tm.assert_index_equal(resampled.index, exp_rng) + + def test_resample_float_base(): # GH25161 dt = pd.to_datetime( @@ -808,7 +839,8 @@ def test_resample_float_base(): s = Series(np.arange(3), index=dt) base = 17 + 43.51 / 60 - result = s.resample("3min", base=base).size() + with tm.assert_produces_warning(FutureWarning): + result = s.resample("3min", base=base).size() expected = Series( 3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"], freq="3min") ) @@ -1588,7 +1620,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): @pytest.mark.parametrize( - "first,last,offset,exp_first,exp_last", + "first,last,freq,exp_first,exp_last", [ ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), @@ -1598,17 +1630,17 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): ("1991-08", "1992-04", "M", "19910831", "19920531"), ], ) -def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): +def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last): first = pd.Period(first) first = first.to_timestamp(first.freq) last = pd.Period(last) last = last.to_timestamp(last.freq) - exp_first = pd.Timestamp(exp_first, freq=offset) - exp_last = pd.Timestamp(exp_last, freq=offset) + exp_first = pd.Timestamp(exp_first, freq=freq) + exp_last = pd.Timestamp(exp_last, freq=freq) - offset = pd.tseries.frequencies.to_offset(offset) - result = _get_timestamp_range_edges(first, last, offset) + freq = pd.tseries.frequencies.to_offset(freq) 
+ result = _get_timestamp_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index ebc75018bb52d..8b5f2ac7469a2 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -735,7 +735,8 @@ def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): expected_index += timedelta(hours=2) expected = DataFrame({"value": expected_means}, index=expected_index) - result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) if isinstance(agg_arg, list): expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) @@ -815,42 +816,86 @@ def test_resample_with_only_nat(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "start,end,start_freq,end_freq,base", + "start,end,start_freq,end_freq,base,offset", [ - ("19910905", "19910909 03:00", "H", "24H", 10), - ("19910905", "19910909 12:00", "H", "24H", 10), - ("19910905", "19910909 23:00", "H", "24H", 10), - ("19910905 10:00", "19910909", "H", "24H", 10), - ("19910905 10:00", "19910909 10:00", "H", "24H", 10), - ("19910905", "19910909 10:00", "H", "24H", 10), - ("19910905 12:00", "19910909", "H", "24H", 10), - ("19910905 12:00", "19910909 03:00", "H", "24H", 10), - ("19910905 12:00", "19910909 12:00", "H", "24H", 10), - ("19910905 12:00", "19910909 12:00", "H", "24H", 34), - ("19910905 12:00", "19910909 12:00", "H", "17H", 10), - ("19910905 12:00", "19910909 12:00", "H", "17H", 3), - ("19910905 12:00", "19910909 1:00", "H", "M", 3), - ("19910905", "19910913 06:00", "2H", "24H", 10), - ("19910905", "19910905 01:39", "Min", "5Min", 3), - ("19910905", "19910905 03:18", "2Min", "5Min", 3), + ("19910905", "19910909 03:00", "H", "24H", 10, "10H"), 
+ ("19910905", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 23:00", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34, "34H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3, "3H"), + ("19910905 12:00", "19910909 1:00", "H", "M", 3, "3H"), + ("19910905", "19910913 06:00", "2H", "24H", 10, "10H"), + ("19910905", "19910905 01:39", "Min", "5Min", 3, "3Min"), + ("19910905", "19910905 03:18", "2Min", "5Min", 3, "3Min"), ], ) - def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, base): + def test_resample_with_non_zero_base( + self, start, end, start_freq, end_freq, base, offset + ): # GH 23882 s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) - result = s.resample(end_freq, base=base).mean() + with tm.assert_produces_warning(FutureWarning): + result = s.resample(end_freq, base=base).mean() result = result.to_timestamp(end_freq) + + # test that the replacement argument `offset` works + result_offset = s.resample(end_freq, offset=offset).mean() + result_offset = result_offset.to_timestamp(end_freq) + tm.assert_series_equal(result, result_offset) + # to_timestamp casts 24H -> D result = result.asfreq(end_freq) if end_freq == "24H" else result - expected = s.to_timestamp().resample(end_freq, base=base).mean() + with tm.assert_produces_warning(FutureWarning): + expected = s.to_timestamp().resample(end_freq, base=base).mean() + if end_freq == "M": + # TODO: is non-tick the relevant characteristic? 
(GH 33815) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "start,end,start_freq,end_freq,offset", + [ + ("19910905", "19910909 03:00", "H", "24H", "10H"), + ("19910905", "19910909 12:00", "H", "24H", "10H"), + ("19910905", "19910909 23:00", "H", "24H", "10H"), + ("19910905 10:00", "19910909", "H", "24H", "10H"), + ("19910905 10:00", "19910909 10:00", "H", "24H", "10H"), + ("19910905", "19910909 10:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909", "H", "24H", "10H"), + ("19910905 12:00", "19910909 03:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), + ("19910905 12:00", "19910909 1:00", "H", "M", "3H"), + ("19910905", "19910913 06:00", "2H", "24H", "10H"), + ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), + ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), + ], + ) + def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): + # GH 23882 & 31809 + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) + result = s.resample(end_freq, offset=offset).mean() + result = result.to_timestamp(end_freq) + + expected = s.to_timestamp().resample(end_freq, offset=offset).mean() if end_freq == "M": - # TODO: is non-tick the relevant characteristic? + # TODO: is non-tick the relevant characteristic? 
(GH 33815) expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "first,last,offset,exp_first,exp_last", + "first,last,freq,exp_first,exp_last", [ ("19910905", "19920406", "D", "19910905", "19920406"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), @@ -866,15 +911,15 @@ def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, bas ("1991-08", "1992-04", "M", "1991-08", "1992-04"), ], ) - def test_get_period_range_edges(self, first, last, offset, exp_first, exp_last): + def test_get_period_range_edges(self, first, last, freq, exp_first, exp_last): first = pd.Period(first) last = pd.Period(last) - exp_first = pd.Period(exp_first, freq=offset) - exp_last = pd.Period(exp_last, freq=offset) + exp_first = pd.Period(exp_first, freq=freq) + exp_last = pd.Period(exp_last, freq=freq) - offset = pd.tseries.frequencies.to_offset(offset) - result = _get_period_range_edges(first, last, offset) + freq = pd.tseries.frequencies.to_offset(freq) + result = _get_period_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5044a18e33248..3cb5c83c0cdde 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -28,6 +28,13 @@ def test_str(): "label=left, convention=start, base=0]" in str(r) ) + r = test_series.resample("H", origin="1970-01-01") + assert ( + "DatetimeIndexResampler [freq=, axis=0, closed=left, " + "label=left, convention=start, base=0, " + "origin=1970-01-01 00:00:00]" in str(r) + ) + def test_api(): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 035698687cfc2..ffef46170b89d 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -1,6 +1,7 @@ from 
textwrap import dedent import numpy as np +import pytest from pandas.util._test_decorators import async_mark @@ -131,6 +132,45 @@ def test_groupby_resample_on_api_with_getitem(): tm.assert_series_equal(result, exp) +def test_groupby_with_origin(): + freq = "1399min" # prime number that is smaller than 24h + start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" + middle = "1/15/2000 00:00:00" + + rng = pd.date_range(start, end, freq="1231min") # prime number + ts = pd.Series(np.random.randn(len(rng)), index=rng) + ts2 = ts[middle:end] + + # proves that grouper without a fixed origin does not work + # when dealing with unusual frequencies + simple_grouper = pd.Grouper(freq=freq) + count_ts = ts.groupby(simple_grouper).agg("count") + count_ts = count_ts[middle:end] + count_ts2 = ts2.groupby(simple_grouper).agg("count") + with pytest.raises(AssertionError): + tm.assert_index_equal(count_ts.index, count_ts2.index) + + # test origin on 1970-01-01 00:00:00 + origin = pd.Timestamp(0) + adjusted_grouper = pd.Grouper(freq=freq, origin=origin) + adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") + adjusted_count_ts = adjusted_count_ts[middle:end] + adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") + tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2) + + # test origin on 2049-10-18 20:00:00 + origin_future = pd.Timestamp(0) + pd.Timedelta("1399min") * 30_000 + adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future) + adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count") + adjusted2_count_ts = adjusted2_count_ts[middle:end] + adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count") + tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2) + + # both grouper use an adjusted timestamp that is a multiple of 1399 min + # they should be equals even if the adjusted_timestamp is in the future + tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2) + + def test_nearest(): # GH 17496 diff --git 
a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 49ac5f81f9c02..55a51452f54b2 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -166,7 +166,7 @@ def test_aggregate_normal(resample_method): ("prod", dict(min_count=1), np.nan), ], ) -def test_resample_entirly_nat_window(method, method_args, unit): +def test_resample_entirely_nat_window(method, method_args, unit): s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) result = methodcaller(method, **method_args)(s.resample("2d")) expected = pd.Series( @@ -255,6 +255,14 @@ def test_repr(): ) assert result == expected + result = repr(Grouper(key="A", freq="H", origin="1970-01-01")) + expected = ( + "TimeGrouper(key='A', freq=, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', base=0, origin=Timestamp('1970-01-01 00:00:00'))" + ) + assert result == expected + @pytest.mark.parametrize( "method, method_args, expected_values", @@ -279,3 +287,11 @@ def test_upsample_sum(method, method_args, expected_values): result = methodcaller(method, **method_args)(resampled) expected = pd.Series(expected_values, index=index) tm.assert_series_equal(result, expected) + + +def test_deprecating_on_loffset_and_base(): + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", loffset="2s") + + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", base=2) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 1b4a625f078c9..4227e3d58eaf0 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -86,7 +86,22 @@ def test_resample_base_with_timedeltaindex(): rng = timedelta_range(start="0s", periods=25, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - with_base = ts.resample("2s", base=5).mean() + with tm.assert_produces_warning(FutureWarning): + 
with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() + + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") + + tm.assert_index_equal(without_base.index, exp_without_base) + tm.assert_index_equal(with_base.index, exp_with_base) + + +def test_resample_offset_with_timedeltaindex(): + rng = timedelta_range(start="0s", periods=25, freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with_base = ts.resample("2s", offset="5s").mean() without_base = ts.resample("2s").mean() exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") From c16733b05f9bd78752b64c30e839d48862a97eec Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 15 Feb 2020 23:19:15 +0100 Subject: [PATCH 02/26] ENH: change how the warning are handled for base and loffset --- pandas/core/generic.py | 3 + pandas/core/groupby/groupby.py | 3 + pandas/core/groupby/grouper.py | 6 ++ pandas/core/resample.py | 104 +++++++++------------ pandas/tests/resample/test_resample_api.py | 5 +- pandas/tests/resample/test_time_grouper.py | 4 +- 6 files changed, 60 insertions(+), 65 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62dfb5b66c583..0ad15348f8a0f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8054,6 +8054,9 @@ def resample( >>> s.index = s.index.to_timestamp() + to_offset("8H") """ from pandas.core.resample import get_resampler + from pandas.core.resample import _validate_resample_deprecated_args + + _validate_resample_deprecated_args(offset=offset, base=base, loffset=loffset) axis = self._get_axis_number(axis) return get_resampler( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b92ff1c7c8ca4..14cc1ec82d530 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1657,6 +1657,9 @@ def resample(self, rule, *args, **kwargs): 5 2000-01-01 00:00:20 5 1 """ 
from pandas.core.resample import get_resampler_for_grouping + from pandas.core.resample import _validate_resample_deprecated_args + + _validate_resample_deprecated_args(**kwargs) return get_resampler_for_grouping(self, rule, *args, **kwargs) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 97bd4656b1d9b..30bddb8e1567a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -153,6 +153,12 @@ class Grouper: def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: from pandas.core.resample import TimeGrouper + from pandas.core.resample import _validate_resample_deprecated_args + + if cls is not TimeGrouper: + # validate only when pd.Grouper is called, otherwise + # the warning is handled by the resample function + _validate_resample_deprecated_args(**kwargs) cls = TimeGrouper return super().__new__(cls) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 53ff2e8246078..9c379824f5f59 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -67,7 +67,6 @@ class Resampler(_GroupBy, ShallowMixin): "label", "convention", "loffset", - "base", "kind", "origin", "offset", @@ -247,7 +246,7 @@ def pipe(self, func, *args, **kwargs): >>> r = s.resample('2s') DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left, - label=left, convention=start, base=0] + label=left, convention=start] >>> r.agg(np.sum) 2013-01-01 00:00:00 3 @@ -1260,7 +1259,7 @@ def get_resampler(obj, kind=None, **kwds): """ Create a TimeGrouper and return our resampler. 
""" - tg = TimeGrouper(**kwds, _warning_stack_level=2) + tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) @@ -1301,7 +1300,6 @@ class TimeGrouper(Grouper): "loffset", "kind", "convention", - "base", "origin", "offset", ) @@ -1321,7 +1319,6 @@ def __init__( base=None, origin=None, offset=None, - _warning_stack_level=0, **kwargs, ): # Check for correctness of the keyword arguments which would @@ -1361,45 +1358,20 @@ def __init__( self.origin = Timestamp(origin) if origin is not None else None self.offset = Timedelta(offset) if offset is not None else None + if base and isinstance(freq, Tick): + # this conversion handle the default behavior of base + # and the special case of GH #10530 + self.offset = Timedelta(base * freq.nanos // freq.n) # always sort time groupers kwargs["sort"] = True - self._warn_if_loffset_or_base_is_used(base, loffset, _warning_stack_level) if isinstance(loffset, str): loffset = to_offset(loffset) self.loffset = loffset - self.base = base if base else 0 super().__init__(freq=freq, axis=axis, **kwargs) - @staticmethod - def _warn_if_loffset_or_base_is_used(base, loffset, _warning_stack_level): - if base is not None: - warnings.warn( - "'base' in .resample() and in Grouper() is deprecated.\n" - "The new arguments that you should use " - "are 'offset' or 'origin'.\n\n" - '>>> df.resample(freq="3s", base=2)\n' - "\nbecomes:\n\n" - '>>> df.resample(freq="3s", offset="2s")\n', - FutureWarning, - stacklevel=3 + _warning_stack_level, - ) - - if loffset is not None: - warnings.warn( - "'loffset' in .resample() and in Grouper() is deprecated.\n" - "Here an example to have the same behavior than loffset:\n\n" - '>>> df.resample(freq="3s", loffset="8H")\n' - "\nbecomes:\n\n" - ">>> from pandas.tseries.frequencies import to_offset\n" - '>>> df = df.resample(freq="3s").mean()\n' - '>>> df.index = df.index.to_timestamp() + to_offset("8H")\n', - FutureWarning, - stacklevel=3 + _warning_stack_level, - ) - def _get_resampler(self, obj, 
kind=None): """ Return my resampler or raise if we have an invalid axis. @@ -1457,7 +1429,6 @@ def _get_time_bins(self, ax): ax.max(), self.freq, closed=self.closed, - base=self.base, origin=self.origin, offset=self.offset, ) @@ -1544,15 +1515,12 @@ def _get_time_delta_bins(self, ax): end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side="left") - if self.base > 0: - # GH #10530 - labels += type(self.freq)(self.base) - if self.loffset: - # GH #33498 - labels += self.loffset - if self.offset: + # GH 10530 & 31809 labels += self.offset + if self.loffset: + # GH 33498 + labels += self.loffset return binner, bins, labels @@ -1605,14 +1573,13 @@ def _get_period_bins(self, ax): bin_shift = 0 # GH 23882 & 31809 - if self.origin is not None or self.offset is not None or self.base: + if self.origin is not None or self.offset is not None: # get base adjusted bin edge labels p_start, end = _get_period_range_edges( start, end, self.freq, closed=self.closed, - base=self.base, origin=self.origin, offset=self.offset, ) @@ -1651,6 +1618,34 @@ def _get_period_bins(self, ax): return binner, bins, labels +def _validate_resample_deprecated_args(offset=None, base=None, loffset=None, **kwds): + if base is not None: + warnings.warn( + "'base' in .resample() and in Grouper() is deprecated.\n" + "The new arguments that you should use are 'offset' or 'origin'.\n\n" + '>>> df.resample(freq="3s", base=2)\n' + "\nbecomes:\n\n" + '>>> df.resample(freq="3s", offset="2s")\n', + FutureWarning, + stacklevel=3, + ) + if offset is not None: + raise ValueError("offset and base cannot be present at the same time") + + if loffset is not None: + warnings.warn( + "'loffset' in .resample() and in Grouper() is deprecated.\n" + "Here an example to have the same behavior than loffset:\n\n" + '>>> df.resample(freq="3s", loffset="8H")\n' + "\nbecomes:\n\n" + ">>> from pandas.tseries.frequencies import to_offset\n" + '>>> df = df.resample(freq="3s").mean()\n' + '>>> df.index = 
df.index.to_timestamp() + to_offset("8H")\n', + FutureWarning, + stacklevel=3, + ) + + def _take_new_index(obj, indexer, new_index, axis=0): if isinstance(obj, ABCSeries): @@ -1667,7 +1662,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges( - first, last, freq, closed="left", base=0, origin=None, offset=None + first, last, freq, closed="left", origin=None, offset=None ): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on @@ -1686,8 +1681,6 @@ def _get_timestamp_range_edges( The dateoffset to which the Timestamps will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - base : int, default 0 - The "origin" of the adjusted Timestamps. origin : pd.Timestamp, default None The timestamp on which to adjust the grouping. If None is passed, the first day of the time series at midnight is used. @@ -1708,7 +1701,7 @@ def _get_timestamp_range_edges( last = last.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, base=base, origin=origin, offset=offset, + first, last, freq, closed=closed, origin=origin, offset=offset, ) if isinstance(freq, Day): first = first.tz_localize(tz) @@ -1729,9 +1722,7 @@ def _get_timestamp_range_edges( return first, last -def _get_period_range_edges( - first, last, freq, closed="left", base=0, origin=None, offset=None -): +def _get_period_range_edges(first, last, freq, closed="left", origin=None, offset=None): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them. @@ -1746,8 +1737,6 @@ def _get_period_range_edges( The freq to which the Periods will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - base : int, default 0 - The "origin" of the adjusted Periods. origin : pd.Timestamp, default None The timestamp on which to adjust the grouping. 
If None is passed, the first day of the time series at midnight is used. @@ -1768,7 +1757,7 @@ def _get_period_range_edges( adjust_last = freq.is_on_offset(last) first, last = _get_timestamp_range_edges( - first, last, freq, closed=closed, base=base, origin=origin, offset=offset, + first, last, freq, closed=closed, origin=origin, offset=offset, ) first = (first + int(adjust_first) * freq).to_period(freq) @@ -1776,9 +1765,7 @@ def _get_period_range_edges( return first, last -def _adjust_dates_anchored( - first, last, freq, closed="right", base=0, origin=None, offset=None -): +def _adjust_dates_anchored(first, last, freq, closed="right", origin=None, offset=None): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. See GH 8683 @@ -1800,9 +1787,6 @@ def _adjust_dates_anchored( if last_tzinfo is not None: last = last.tz_convert("UTC") - base_nanos = (base % freq.n) * freq.nanos // freq.n - origin_nanos += base_nanos - foffset = (first.value - origin_nanos) % freq.nanos loffset = (last.value - origin_nanos) % freq.nanos diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 3cb5c83c0cdde..4f85b0d4036f6 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -25,14 +25,13 @@ def test_str(): r = test_series.resample("H") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " - "label=left, convention=start, base=0]" in str(r) + "label=left, convention=start]" in str(r) ) r = test_series.resample("H", origin="1970-01-01") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " - "label=left, convention=start, base=0, " - "origin=1970-01-01 00:00:00]" in str(r) + "label=left, convention=start, origin=1970-01-01 00:00:00]" in str(r) ) diff --git a/pandas/tests/resample/test_time_grouper.py 
b/pandas/tests/resample/test_time_grouper.py index 55a51452f54b2..532ecdbd2950c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -251,7 +251,7 @@ def test_repr(): expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " - "convention='e', base=0)" + "convention='e')" ) assert result == expected @@ -259,7 +259,7 @@ def test_repr(): expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " - "convention='e', base=0, origin=Timestamp('1970-01-01 00:00:00'))" + "convention='e', origin=Timestamp('1970-01-01 00:00:00'))" ) assert result == expected From 9b000d7c01022285f0b51a983427b14eff706527 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 16 Feb 2020 00:43:22 +0100 Subject: [PATCH 03/26] TST: move deprecated tests of loffset and base into test_deprecated --- pandas/tests/resample/test_base.py | 24 -- pandas/tests/resample/test_datetime_index.py | 105 +------ pandas/tests/resample/test_deprecated.py | 263 ++++++++++++++++++ pandas/tests/resample/test_period_index.py | 69 +---- .../tests/resample/test_resampler_grouper.py | 2 + pandas/tests/resample/test_time_grouper.py | 8 - pandas/tests/resample/test_timedelta.py | 18 +- 7 files changed, 273 insertions(+), 216 deletions(-) create mode 100644 pandas/tests/resample/test_deprecated.py diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 697aeb6db9f92..42d8883e9ca90 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -194,30 +194,6 @@ def test_resample_empty_dtypes(index, dtype, resample_method): pass -@all_ts -@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) -def test_resample_loffset_arg_type(frame, create_index, arg): - # GH 13218, 15002 - df = frame - expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] - expected_index = 
create_index(df.index[0], periods=len(df.index) / 2, freq="2D") - - # loffset coerces PeriodIndex to DateTimeIndex - if isinstance(expected_index, PeriodIndex): - expected_index = expected_index.to_timestamp() - - expected_index += timedelta(hours=2) - expected = DataFrame({"value": expected_means}, index=expected_index) - - with tm.assert_produces_warning(FutureWarning): - result_agg = df.resample("2D", loffset="2H").agg(arg) - - if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - - tm.assert_frame_equal(result_agg, expected) - - @all_ts def test_apply_to_empty_series(empty_series_dti): # GH 14313 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 92f979b35b13f..f157306ae8d5b 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime from functools import partial from io import StringIO @@ -18,7 +18,7 @@ from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.tseries.offsets as offsets -from pandas.tseries.offsets import BDay, Minute +from pandas.tseries.offsets import Minute @pytest.fixture() @@ -412,77 +412,6 @@ def test_resample_frame_basic(): df.resample("W-WED", kind="period").mean() -@pytest.mark.parametrize( - "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] -) -def test_resample_loffset(loffset): - # GH 7687 - rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") - s = Series(np.random.randn(14), index=rng) - - with tm.assert_produces_warning(FutureWarning): - result = s.resample( - "5min", closed="right", label="right", loffset=loffset - ).mean() - idx = date_range("1/1/2000", periods=4, freq="5min") - expected = Series( - [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=idx + timedelta(minutes=1), - ) - 
tm.assert_series_equal(result, expected) - assert result.index.freq == Minute(5) - - # from daily - dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") - ser = Series(np.random.rand(len(dti)), dti) - - # to weekly - result = ser.resample("w-sun").last() - business_day_offset = BDay() - with tm.assert_produces_warning(FutureWarning): - expected = ser.resample("w-sun", loffset=-business_day_offset).last() - assert result.index[0] - business_day_offset == expected.index[0] - - -def test_resample_loffset_upsample(): - # GH 20744 - rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") - s = Series(np.random.randn(14), index=rng) - - with tm.assert_produces_warning(FutureWarning): - result = s.resample( - "5min", closed="right", label="right", loffset=timedelta(minutes=1) - ).ffill() - idx = date_range("1/1/2000", periods=4, freq="5min") - expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) - - tm.assert_series_equal(result, expected) - - -def test_resample_loffset_count(): - # GH 12725 - start_time = "1/1/2000 00:00:00" - rng = date_range(start_time, periods=100, freq="S") - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning): - result = ts.resample("10S", loffset="1s").count() - - expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( - seconds=1 - ) - expected = Series(10, index=expected_index) - - tm.assert_series_equal(result, expected) - - # Same issue should apply to .size() since it goes through - # same code path - with tm.assert_produces_warning(FutureWarning): - result = ts.resample("10S", loffset="1s").size() - - tm.assert_series_equal(result, expected) - - def test_resample_upsample(): # from daily dti = date_range( @@ -798,17 +727,9 @@ def test_resample_single_group(): tm.assert_series_equal(result, expected) -def test_resample_base(): - rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") - ts = 
Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning): - resampled = ts.resample("5min", base=2).mean() - exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") - tm.assert_index_equal(resampled.index, exp_rng) - - def test_resample_offset(): + # GH 31809 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) @@ -818,6 +739,8 @@ def test_resample_offset(): def test_resample_origin(): + # GH 31809 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) @@ -831,22 +754,6 @@ def test_resample_origin(): tm.assert_index_equal(resampled.index, exp_rng) -def test_resample_float_base(): - # GH25161 - dt = pd.to_datetime( - ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] - ) - s = Series(np.arange(3), index=dt) - - base = 17 + 43.51 / 60 - with tm.assert_produces_warnding(FutureWarning): - result = s.resample("3min", base=base).size() - expected = Series( - 3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"], freq="3min") - ) - tm.assert_series_equal(result, expected) - - def test_resample_daily_anchored(): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") ts = Series(np.random.randn(len(rng)), index=rng) diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py new file mode 100644 index 0000000000000..2ce95dd4fc710 --- /dev/null +++ b/pandas/tests/resample/test_deprecated.py @@ -0,0 +1,263 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.tseries.offsets import BDay, Minute + + 
+DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") + +all_ts = pytest.mark.parametrize( + "_index_factory,_series_name,_index_start,_index_end", + [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], +) + + +@pytest.fixture() +def _index_factory(): + return period_range + + +@pytest.fixture +def create_index(_index_factory): + def _create_index(*args, **kwargs): + """ return the _index_factory created using the args, kwargs """ + return _index_factory(*args, **kwargs) + + return _create_index + + +# new test to check that all FutureWarning are triggered +def test_deprecating_on_loffset_and_base(): + # GH 31809 + + idx = pd.date_range("1/1/2000", periods=4, freq="T") + df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"]) + + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", base=2) + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", loffset="2s") + with tm.assert_produces_warning(FutureWarning): + df.groupby("a").resample("3T", base=0).sum() + with tm.assert_produces_warning(FutureWarning): + df.groupby("a").resample("3T", loffset="0s").sum() + with tm.assert_produces_warning(FutureWarning): + df.resample("3T", base=0).sum() + with tm.assert_produces_warning(FutureWarning): + df.resample("3T", loffset="0s").sum() + + +# old tests from test_base.py: +@all_ts +@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_resample_loffset_arg_type(frame, create_index, arg): + # GH 13218, 15002 + df = frame + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + if isinstance(expected_index, PeriodIndex): + expected_index = expected_index.to_timestamp() + + 
expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H").agg(arg) + + if isinstance(arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + + tm.assert_frame_equal(result_agg, expected) + + +# old tests from test_datetime_index.py +@pytest.mark.parametrize( + "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] +) +def test_resample_loffset(loffset): + # GH 7687 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + with tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=loffset + ).mean() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1), + ) + tm.assert_series_equal(result, expected) + assert result.index.freq == Minute(5) + + # from daily + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") + ser = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = ser.resample("w-sun").last() + business_day_offset = BDay() + with tm.assert_produces_warning(FutureWarning): + expected = ser.resample("w-sun", loffset=-business_day_offset).last() + assert result.index[0] - business_day_offset == expected.index[0] + + +def test_resample_loffset_upsample(): + # GH 20744 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + with tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) + + 
tm.assert_series_equal(result, expected) + + +def test_resample_loffset_count(): + # GH 12725 + start_time = "1/1/2000 00:00:00" + rng = date_range(start_time, periods=100, freq="S") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").count() + + expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( + seconds=1 + ) + expected = Series(10, index=expected_index) + + tm.assert_series_equal(result, expected) + + # Same issue should apply to .size() since it goes through + # same code path + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").size() + + tm.assert_series_equal(result, expected) + + +def test_resample_base(): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + resampled = ts.resample("5min", base=2).mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_float_base(): + # GH25161 + dt = pd.to_datetime( + ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] + ) + s = Series(np.arange(3), index=dt) + + base = 17 + 43.51 / 60 + with tm.assert_produces_warning(FutureWarning): + result = s.resample("3min", base=base).size() + expected = Series( + 3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"], freq="3min") + ) + tm.assert_series_equal(result, expected) + + +# old tests from test_period_index.py +@pytest.mark.parametrize("kind", ["period", None, "timestamp"]) +@pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_loffset_returns_datetimeindex(frame, kind, agg_arg): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = frame + expected_means = 
[df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + expected_index = expected_index.to_timestamp() + expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + tm.assert_frame_equal(result_agg, expected) + + +@pytest.mark.parametrize( + "start,end,start_freq,end_freq,base,offset", + [ + ("19910905", "19910909 03:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 23:00", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34, "34H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3, "3H"), + ("19910905 12:00", "19910909 1:00", "H", "M", 3, "3H"), + ("19910905", "19910913 06:00", "2H", "24H", 10, "10H"), + ("19910905", "19910905 01:39", "Min", "5Min", 3, "3Min"), + ("19910905", "19910905 03:18", "2Min", "5Min", 3, "3Min"), + ], +) +def test_resample_with_non_zero_base(start, end, start_freq, end_freq, base, offset): + # GH 23882 + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) + with tm.assert_produces_warning(FutureWarning): + result = s.resample(end_freq, base=base).mean() + result = 
result.to_timestamp(end_freq) + + # test that the replacement argument `offset` works + result_offset = s.resample(end_freq, offset=offset).mean() + result_offset = result_offset.to_timestamp(end_freq) + tm.assert_series_equal(result, result_offset) + + # to_timestamp casts 24H -> D + result = result.asfreq(end_freq) if end_freq == "24H" else result + with tm.assert_produces_warning(FutureWarning): + expected = s.to_timestamp().resample(end_freq, base=base).mean() + if end_freq == "M": + # TODO: is non-tick the relevant characteristic? (GH 33815) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + +# tests from test_timedelta.py +def test_resample_base_with_timedeltaindex(): + # GH 10530 + rng = timedelta_range(start="0s", periods=25, freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() + + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") + + tm.assert_index_equal(without_base.index, exp_without_base) + tm.assert_index_equal(with_base.index, exp_with_base) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 8b5f2ac7469a2..3db9a91118ebc 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import dateutil import numpy as np @@ -719,28 +719,6 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample("7D").sum() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) - @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) - def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): - # make sure 
passing loffset returns DatetimeIndex in all cases - # basic method taken from Base.test_resample_loffset_arg_type() - df = frame - expected_means = [ - df.values[i : i + 2].mean() for i in range(0, len(df.values), 2) - ] - expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") - - # loffset coerces PeriodIndex to DateTimeIndex - expected_index = expected_index.to_timestamp() - expected_index += timedelta(hours=2) - expected = DataFrame({"value": expected_means}, index=expected_index) - - with tm.assert_produces_warning(FutureWarning): - result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) - if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - tm.assert_frame_equal(result_agg, expected) - @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): @@ -815,51 +793,6 @@ def test_resample_with_only_nat(self): result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "start,end,start_freq,end_freq,base,offset", - [ - ("19910905", "19910909 03:00", "H", "24H", 10, "10H"), - ("19910905", "19910909 12:00", "H", "24H", 10, "10H"), - ("19910905", "19910909 23:00", "H", "24H", 10, "10H"), - ("19910905 10:00", "19910909", "H", "24H", 10, "10H"), - ("19910905 10:00", "19910909 10:00", "H", "24H", 10, "10H"), - ("19910905", "19910909 10:00", "H", "24H", 10, "10H"), - ("19910905 12:00", "19910909", "H", "24H", 10, "10H"), - ("19910905 12:00", "19910909 03:00", "H", "24H", 10, "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", 10, "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", 34, "34H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", 10, "10H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", 3, "3H"), - ("19910905 12:00", "19910909 1:00", "H", "M", 3, "3H"), - ("19910905", "19910913 06:00", "2H", 
"24H", 10, "10H"), - ("19910905", "19910905 01:39", "Min", "5Min", 3, "3Min"), - ("19910905", "19910905 03:18", "2Min", "5Min", 3, "3Min"), - ], - ) - def test_resample_with_non_zero_base( - self, start, end, start_freq, end_freq, base, offset - ): - # GH 23882 - s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) - s = s + np.arange(len(s)) - with tm.assert_produces_warning(FutureWarning): - result = s.resample(end_freq, base=base).mean() - result = result.to_timestamp(end_freq) - - # test that the replacement argument `offset` works - result_offset = s.resample(end_freq, offset=offset).mean() - result_offset = result_offset.to_timestamp(end_freq) - tm.assert_series_equal(result, result_offset) - - # to_timestamp casts 24H -> D - result = result.asfreq(end_freq) if end_freq == "24H" else result - with tm.assert_produces_warning(FutureWarning): - expected = s.to_timestamp().resample(end_freq, base=base).mean() - if end_freq == "M": - # TODO: is non-tick the relevant characteristic? 
(GH 33815) - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "start,end,start_freq,end_freq,offset", [ diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index ffef46170b89d..cbf3a778f9ae0 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -133,6 +133,8 @@ def test_groupby_resample_on_api_with_getitem(): def test_groupby_with_origin(): + # GH 31809 + freq = "1399min" # prime number that is smaller than 24h start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" middle = "1/15/2000 00:00:00" diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 532ecdbd2950c..6aa4606486f09 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -287,11 +287,3 @@ def test_upsample_sum(method, method_args, expected_values): result = methodcaller(method, **method_args)(resampled) expected = pd.Series(expected_values, index=index) tm.assert_series_equal(result, expected) - - -def test_deprecating_on_loffset_and_base(): - with tm.assert_produces_warning(FutureWarning): - pd.Grouper(freq="10s", loffset="2s") - - with tm.assert_produces_warning(FutureWarning): - pd.Grouper(freq="10s", base=2) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 4227e3d58eaf0..0fbb60c176b30 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -80,24 +80,8 @@ def test_resample_timedelta_idempotency(): tm.assert_series_equal(result, expected) -def test_resample_base_with_timedeltaindex(): - - # GH 10530 - rng = timedelta_range(start="0s", periods=25, freq="s") - ts = Series(np.random.randn(len(rng)), index=rng) - - with tm.assert_produces_warning(FutureWarning): - with_base = ts.resample("2s", base=5).mean() - 
without_base = ts.resample("2s").mean() - - exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") - exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") - - tm.assert_index_equal(without_base.index, exp_without_base) - tm.assert_index_equal(with_base.index, exp_with_base) - - def test_resample_offset_with_timedeltaindex(): + # GH 10530 & 31809 rng = timedelta_range(start="0s", periods=25, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) From 6c03bf49b0233b7b2be1a3435161878727a696c9 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 16 Feb 2020 01:51:09 +0100 Subject: [PATCH 04/26] ENH: check if origin for resample is timezone aware when needed --- pandas/core/generic.py | 10 ++++++++-- pandas/core/groupby/grouper.py | 11 ++++++++--- pandas/core/resample.py | 13 +++++++++++-- pandas/tests/resample/test_base.py | 4 ++-- pandas/tests/resample/test_datetime_index.py | 17 +++++++++++++++++ pandas/tests/resample/test_deprecated.py | 2 +- 6 files changed, 47 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0ad15348f8a0f..0d6f2a14bc931 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7824,11 +7824,17 @@ def resample( For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. If None is passed, - the first day of the time series at midnight is used. + The timestamp on which to adjust the grouping. It must be timezone + aware if the index of the resampled data is. If None is passed, the + first day of the time series at midnight is used. + + .. versionadded:: 1.1.0 + offset : pd.Timedelta, default is None An offset timedelta added to the origin. + .. 
versionadded:: 1.1.0 + Returns ------- Resampler object diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 30bddb8e1567a..30b9741c8c240 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -86,13 +86,18 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : Timestamp, default None - Only when `freq` parameter is passed. - The timestamp on which to adjust the grouping. If None is passed, the + origin : pd.Timestamp, default None + The timestamp on which to adjust the grouping. It must be timezone + aware if the index of the resampled data is. If None is passed, the first day of the time series at midnight is used. + + .. versionadded:: 1.1.0 + offset : pd.Timedelta, default is None An offset timedelta added to the origin. + .. versionadded:: 1.1.0 + Returns ------- A specification for a groupby instruction diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9c379824f5f59..1d6ad1d78c5fb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1682,7 +1682,8 @@ def _get_timestamp_range_edges( closed : {'right', 'left'}, default None Which side of bin interval is closed. origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. If None is passed, the + The timestamp on which to adjust the grouping. It must be timezone + aware if the index of the resampled data is. If None is passed, the first day of the time series at midnight is used. offset : pd.Timedelta, default is None An offset timedelta added to the origin. @@ -1692,6 +1693,13 @@ def _get_timestamp_range_edges( A tuple of length 2, containing the adjusted pd.Timestamp objects. 
""" if isinstance(freq, Tick): + is_idx_tz_aware = first.tzinfo is not None or last.tzinfo is not None + if origin is not None and origin.tzinfo is None and is_idx_tz_aware: + raise ValueError( + "The origin must be timezone aware when the index " + "of the resampled data is." + ) + if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last # might contain a DST transition (23H, 24H, or 25H). @@ -1738,7 +1746,8 @@ def _get_period_range_edges(first, last, freq, closed="left", origin=None, offse closed : {'right', 'left'}, default None Which side of bin interval is closed. origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. If None is passed, the + The timestamp on which to adjust the grouping. It must be timezone + aware if the index of the resampled data is. If None is passed, the first day of the time series at midnight is used. offset : pd.Timedelta, default is None An offset timedelta added to the origin. diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 42d8883e9ca90..485535bec20d0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import numpy as np import pytest @@ -9,7 +9,7 @@ from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.period import period_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import _asfreq_compat diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index f157306ae8d5b..4669f947de324 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -754,6 +754,23 @@ def 
test_resample_origin(): tm.assert_index_equal(resampled.index, exp_rng) +def test_resample_origin_with_tz(): + # GH 31809 + msg = "The origin must be timezone aware when the index of the resampled data is." + + tz = "Europe/Paris" + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s", tz=tz) + ts = Series(np.random.randn(len(rng)), index=rng) + + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min", tz=tz) + + resampled = ts.resample("5min", origin="12/31/1999 23:57:00+00:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin="12/31/1999 23:57:00").mean() + + def test_resample_daily_anchored(): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") ts = Series(np.random.randn(len(rng)), index=rng) diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 2ce95dd4fc710..46f2fb16ff210 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -9,8 +9,8 @@ from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -from pandas.tseries.offsets import BDay, Minute +from pandas.tseries.offsets import BDay, Minute DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) From 532cdfe6d304ad2c140d1a0ea6325b347189419e Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 16 Feb 2020 14:06:21 +0100 Subject: [PATCH 05/26] DOC: add 'Use or to adjust the start of the bins' section into timeseries.rst --- doc/source/user_guide/timeseries.rst | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 60eef91e0fd79..d968232c10a4a 
100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1786,6 +1786,48 @@ natural and functions similarly to :py:func:`itertools.groupby`: See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. +.. _timeseries.adjust-the-start-of-the-bins: + +Use `origin` or `offset` to adjust the start of the bins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. + +For example: + +.. ipython:: python + + start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" + middle = "1/15/2000 02:00" + rng = pd.date_range(start, end, freq="1231min") + ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + ts + +Here we can see that, when not using ``origin``, the result after 1/15/2000 are not identical depending on the start of time series: + +.. ipython:: python + + ts.resample("2711min").sum() + ts[middle:end].resample("2711min").sum() + + +Here we can see that, when using ``origin``, the result after 1/15/2000 are identical depending on the start of time series: + +.. ipython:: python + + origin = pd.Timestamp("1970-01-01") + ts.resample("2711min", origin=origin).sum() + ts[middle:end].resample("2711min", origin=origin).sum() + + +If needed we can just adjust the bins with an offset that would be added to the default ``origin``. +Those two examples are equivalent for this time series: + +.. ipython:: python + + ts.resample("2711min", origin="1/10/2000 02:00:00").sum() + ts.resample("2711min", offset="2h").sum() + .. 
_timeseries.periods: From f158cdbba9145957197f22c72641fbcf9f80fa5e Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 16 Feb 2020 14:23:21 +0100 Subject: [PATCH 06/26] DOC: simplify doc for What's new and add a comment on the deprecation of loffset --- doc/source/user_guide/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index d968232c10a4a..45df4425d8d99 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1820,7 +1820,7 @@ Here we can see that, when using ``origin``, the result after 1/15/2000 are iden ts[middle:end].resample("2711min", origin=origin).sum() -If needed we can just adjust the bins with an offset that would be added to the default ``origin``. +If needed you can just adjust the bins with an offset that would be added to the default ``origin``. Those two examples are equivalent for this time series: .. ipython:: python From 78ed64c9872085f1d2b8c883c4f396cd8b41a6fc Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 16 Feb 2020 15:13:36 +0100 Subject: [PATCH 07/26] DOC: Add example for origin and offset in resample and in pd.Grouper --- pandas/core/generic.py | 92 ++++++++++++++++++++++++++++++---- pandas/core/groupby/groupby.py | 9 ---- pandas/core/groupby/grouper.py | 69 +++++++++++++++++++++++-- 3 files changed, 147 insertions(+), 23 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d6f2a14bc931..a5f5848d8bf05 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7802,9 +7802,7 @@ def resample( .. deprecated:: 1.1.0 You should add the loffset to the `df.index` after the resample. - like this: - ``df.index = df.index.to_timestamp() + to_offset(loffset)`` - (a more complete example is present below) + See below. base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the @@ -7813,9 +7811,6 @@ def resample( .. 
deprecated:: 1.1.0 The new arguments that you should use are 'offset' or 'origin'. - ``df.resample(freq="3s", base=2)`` - becomes - ``df.resample(freq="3s", offset="2s")`` on : str, optional For a DataFrame, column to use instead of index for resampling. @@ -8052,12 +8047,87 @@ def resample( 2000-01-03 32 150 2000-01-04 36 90 - To replace the use of the deprecated loffset argument: + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" + >>> rng = pd.date_range(start, end, freq="1231min") + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-01-10 02:00:00 0 + 2000-01-10 22:31:00 3 + 2000-01-11 19:02:00 6 + 2000-01-12 15:33:00 9 + 2000-01-13 12:04:00 12 + 2000-01-14 08:35:00 15 + 2000-01-15 05:06:00 18 + 2000-01-16 01:37:00 21 + 2000-01-16 22:08:00 24 + 2000-01-17 18:39:00 27 + 2000-01-18 15:10:00 30 + 2000-01-19 11:41:00 33 + Freq: 1231T, dtype: int64 + >>> ts.resample("2711min").sum() + 2000-01-10 00:00:00 9 + 2000-01-11 21:11:00 21 + 2000-01-13 18:22:00 33 + 2000-01-15 15:33:00 45 + 2000-01-17 12:44:00 57 + 2000-01-19 09:55:00 33 + Freq: 2711T, dtype: int64 + >>> ts.resample("2711min", origin=pd.Timestamp("1970-01-01")).sum() + 2000-01-08 11:44:00 0 + 2000-01-10 08:55:00 9 + 2000-01-12 06:06:00 21 + 2000-01-14 03:17:00 33 + 2000-01-16 00:28:00 72 + 2000-01-17 21:39:00 63 + Freq: 2711T, dtype: int64 + + If you want to adjust the start of the bins with an offset, the two following + lines are equivalent: + + >>> ts.resample("2711min", origin="1/10/2000 02:00:00").sum() + 2000-01-10 02:00:00 9 + 2000-01-11 23:11:00 21 + 2000-01-13 20:22:00 33 + 2000-01-15 17:33:00 45 + 2000-01-17 14:44:00 90 + Freq: 2711T, dtype: int64 + >>> ts.resample("2711min", offset="2h").sum() + 2000-01-10 02:00:00 9 + 2000-01-11 23:11:00 21 + 2000-01-13 20:22:00 33 + 2000-01-15 17:33:00 45 + 2000-01-17 14:44:00 90 + Freq: 2711T, dtype: int64 + + To replace the use of the deprecated `base` 
argument: + + >>> # ts.resample("2711min", base=2).sum() + >>> # becomes: + >>> ts.resample("2711min", offset="2min").sum() + 2000-01-10 00:02:00 9 + 2000-01-11 21:13:00 21 + 2000-01-13 18:24:00 33 + 2000-01-15 15:35:00 45 + 2000-01-17 12:46:00 57 + 2000-01-19 09:57:00 33 + Freq: 2711T, dtype: int64 + + To replace the use of the deprecated `loffset` argument: + >>> from pandas.tseries.frequencies import to_offset - >>> rng = pd.date_range("2000-01-01", "2000-01-01", freq="1s") - >>> ts = pd.Series(np.arange(len(rng)), index=rng) - >>> s = s.resample("3s").mean() - >>> s.index = s.index.to_timestamp() + to_offset("8H") + >>> loffset = "8H" + >>> ts_out = ts.resample("2711min").sum() + >>> ts_out.index = ts_out.index + to_offset(loffset) + >>> ts_out + 2000-01-10 08:00:00 9 + 2000-01-12 05:11:00 21 + 2000-01-14 02:22:00 33 + 2000-01-15 23:33:00 45 + 2000-01-17 20:44:00 57 + 2000-01-19 17:55:00 33 + Freq: 2711T, dtype: int64 """ from pandas.core.resample import get_resampler from pandas.core.resample import _validate_resample_deprecated_args diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 14cc1ec82d530..791e72a56aa20 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1646,15 +1646,6 @@ def resample(self, rule, *args, **kwargs): 0 2000-01-01 00:00:00 0 1 2000-01-01 00:03:00 0 2 5 2000-01-01 00:03:00 5 1 - - Add an offset of twenty seconds. - - >>> df.groupby('a').resample('3T', loffset='20s').sum() - a b - a - 0 2000-01-01 00:00:20 0 2 - 2000-01-01 00:03:20 0 1 - 5 2000-01-01 00:00:20 5 1 """ from pandas.core.resample import get_resampler_for_grouping from pandas.core.resample import _validate_resample_deprecated_args diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 30b9741c8c240..d5797112694f0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -73,9 +73,6 @@ class Grouper: .. 
deprecated:: 1.1.0 The new arguments that you should use are 'offset' or 'origin'. - ``df.resample(freq="3s", base=2)`` - becomes - ``df.resample(freq="3s", offset="2s")`` loffset : str, DateOffset, timedelta object Only when `freq` parameter is passed. @@ -151,6 +148,72 @@ class Grouper: 2000-01-02 0.5 15.0 2000-01-09 2.0 30.0 2000-01-16 3.0 40.0 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" + >>> rng = pd.date_range(start, end, freq="1231min") + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-01-10 02:00:00 0 + 2000-01-10 22:31:00 3 + 2000-01-11 19:02:00 6 + 2000-01-12 15:33:00 9 + 2000-01-13 12:04:00 12 + 2000-01-14 08:35:00 15 + 2000-01-15 05:06:00 18 + 2000-01-16 01:37:00 21 + 2000-01-16 22:08:00 24 + 2000-01-17 18:39:00 27 + 2000-01-18 15:10:00 30 + 2000-01-19 11:41:00 33 + Freq: 1231T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min")).sum() + 2000-01-10 00:00:00 9 + 2000-01-11 21:11:00 21 + 2000-01-13 18:22:00 33 + 2000-01-15 15:33:00 45 + 2000-01-17 12:44:00 57 + 2000-01-19 09:55:00 33 + Freq: 2711T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min", origin=pd.Timestamp("1970-01-01"))).sum() + 2000-01-08 11:44:00 0 + 2000-01-10 08:55:00 9 + 2000-01-12 06:06:00 21 + 2000-01-14 03:17:00 33 + 2000-01-16 00:28:00 72 + 2000-01-17 21:39:00 63 + Freq: 2711T, dtype: int64 + + If you want to adjust the start of the bins with an offset, the two following + lines are equivalent: + + >>> ts.groupby(pd.Grouper(freq="2711min", origin="1/10/2000 02:00:00")).sum() + 2000-01-10 02:00:00 9 + 2000-01-11 23:11:00 21 + 2000-01-13 20:22:00 33 + 2000-01-15 17:33:00 45 + 2000-01-17 14:44:00 90 + Freq: 2711T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min", offset="2h")).sum() + 2000-01-10 02:00:00 9 + 2000-01-11 23:11:00 21 + 2000-01-13 20:22:00 33 + 2000-01-15 17:33:00 45 + 2000-01-17 14:44:00 90 + Freq: 2711T, dtype: int64 + + To replace the use 
of the deprecated `base` argument: + >>> # ts.groupby(pd.Grouper(freq="2711min", base=2)).sum() + >>> # becomes: + >>> ts.groupby(pd.Grouper(freq="2711min", offset="2min")).sum() + 2000-01-10 00:02:00 9 + 2000-01-11 21:13:00 21 + 2000-01-13 18:24:00 33 + 2000-01-15 15:35:00 45 + 2000-01-17 12:46:00 57 + 2000-01-19 09:57:00 33 + Freq: 2711T, dtype: int64 """ _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") From 77f507d1ff81fea38d85e26211bd448aa5d4a19c Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Tue, 31 Mar 2020 16:36:51 +0200 Subject: [PATCH 08/26] CLN: review clean part one --- doc/source/user_guide/timeseries.rst | 6 ++++-- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/generic.py | 3 +++ pandas/core/groupby/grouper.py | 3 +++ pandas/core/resample.py | 12 ++++++++---- pandas/tests/resample/test_deprecated.py | 4 ---- 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 45df4425d8d99..f79f1c01ef3ee 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1791,7 +1791,9 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. Use `origin` or `offset` to adjust the start of the bins ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. +.. versionadded:: 1.1.0 + +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. 
This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. For example: @@ -1820,7 +1822,7 @@ Here we can see that, when using ``origin``, the result after 1/15/2000 are iden ts[middle:end].resample("2711min", origin=origin).sum() -If needed you can just adjust the bins with an offset that would be added to the default ``origin``. +If needed you can just adjust the bins with an ``offset`` that would be added to the default ``origin``. Those two examples are equivalent for this time series: .. ipython:: python diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index da0f8683cdf0c..7021157b1c836 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -157,7 +157,7 @@ For example: Grouper and resample now supports the arguments origin and offset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Grouper` and :class:`DataFrame.resample` now supports the argument `origin`. The timestamp on which to adjust the grouping. (:issue:`31809`) +:class:`Grouper` and :class:`DataFrame.resample` now supports the argument ``origin``. The timestamp on which to adjust the grouping. (:issue:`31809`) The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a5f5848d8bf05..3ac7731a52290 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8066,6 +8066,7 @@ def resample( 2000-01-18 15:10:00 30 2000-01-19 11:41:00 33 Freq: 1231T, dtype: int64 + >>> ts.resample("2711min").sum() 2000-01-10 00:00:00 9 2000-01-11 21:11:00 21 @@ -8074,6 +8075,7 @@ def resample( 2000-01-17 12:44:00 57 2000-01-19 09:55:00 33 Freq: 2711T, dtype: int64 + >>> ts.resample("2711min", origin=pd.Timestamp("1970-01-01")).sum() 2000-01-08 11:44:00 0 2000-01-10 08:55:00 9 @@ -8093,6 +8095,7 @@ def resample( 2000-01-15 17:33:00 45 2000-01-17 14:44:00 90 Freq: 2711T, dtype: int64 + >>> ts.resample("2711min", offset="2h").sum() 2000-01-10 02:00:00 9 2000-01-11 23:11:00 21 diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d5797112694f0..0f0a53a679bd0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -168,6 +168,7 @@ class Grouper: 2000-01-18 15:10:00 30 2000-01-19 11:41:00 33 Freq: 1231T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min")).sum() 2000-01-10 00:00:00 9 2000-01-11 21:11:00 21 @@ -176,6 +177,7 @@ class Grouper: 2000-01-17 12:44:00 57 2000-01-19 09:55:00 33 Freq: 2711T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min", origin=pd.Timestamp("1970-01-01"))).sum() 2000-01-08 11:44:00 0 2000-01-10 08:55:00 9 @@ -195,6 +197,7 @@ class Grouper: 2000-01-15 17:33:00 45 2000-01-17 14:44:00 90 Freq: 2711T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq="2711min", offset="2h")).sum() 2000-01-10 02:00:00 9 2000-01-11 23:11:00 21 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1d6ad1d78c5fb..1c88b323f436d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1359,14 +1359,18 @@ def __init__( self.origin = Timestamp(origin) if origin is not None else None self.offset = Timedelta(offset) if offset is not None else None if base and isinstance(freq, Tick): - # this 
conversion handle the default behavior of base - # and the special case of GH #10530 + # this conversion handle the default behavior of base and the + # special case of GH #10530. Indeed in case when dealing with + # a TimedeltaIndex base was treated as a 'pure' offset even though + # the default behavior of base was equivalent of a modulo on + # freq_nanos. self.offset = Timedelta(base * freq.nanos // freq.n) # always sort time groupers kwargs["sort"] = True if isinstance(loffset, str): + # loffset is deprecated since v1.1.0 (GH #31809) loffset = to_offset(loffset) self.loffset = loffset @@ -1693,8 +1697,8 @@ def _get_timestamp_range_edges( A tuple of length 2, containing the adjusted pd.Timestamp objects. """ if isinstance(freq, Tick): - is_idx_tz_aware = first.tzinfo is not None or last.tzinfo is not None - if origin is not None and origin.tzinfo is None and is_idx_tz_aware: + is_idx_tz_aware = first.tz is not None or last.tz is not None + if origin is not None and origin.tz is None and is_idx_tz_aware: raise ValueError( "The origin must be timezone aware when the index " "of the resampled data is." 
diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 46f2fb16ff210..d6689530ea88a 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -57,7 +57,6 @@ def test_deprecating_on_loffset_and_base(): df.resample("3T", loffset="0s").sum() -# old tests from test_base.py: @all_ts @pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) def test_resample_loffset_arg_type(frame, create_index, arg): @@ -82,7 +81,6 @@ def test_resample_loffset_arg_type(frame, create_index, arg): tm.assert_frame_equal(result_agg, expected) -# old tests from test_datetime_index.py @pytest.mark.parametrize( "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] ) @@ -180,7 +178,6 @@ def test_resample_float_base(): tm.assert_series_equal(result, expected) -# old tests from test_period_index.py @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) def test_loffset_returns_datetimeindex(frame, kind, agg_arg): @@ -246,7 +243,6 @@ def test_resample_with_non_zero_base(start, end, start_freq, end_freq, base, off tm.assert_series_equal(result, expected) -# tests from test_timedelta.py def test_resample_base_with_timedeltaindex(): # GH 10530 rng = timedelta_range(start="0s", periods=25, freq="s") From 426d8c7b1c6c7eb15c2f4c68aa34dc72c1b882c2 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Tue, 31 Mar 2020 17:11:25 +0200 Subject: [PATCH 09/26] CLN: review clean part two --- doc/source/user_guide/timeseries.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/resample.py | 12 ++++-------- pandas/tests/resample/test_datetime_index.py | 2 +- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f79f1c01ef3ee..f1e34e7598a4e 100644 --- 
a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1793,7 +1793,7 @@ Use `origin` or `offset` to adjust the start of the bins .. versionadded:: 1.1.0 -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. For example: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7021157b1c836..00e9247ae38d2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -191,7 +191,7 @@ The argument ``base`` is now deprecated in favor of ``offset``. (:issue:`31809`) # becomes: ts.resample("2711min", offset="2min").sum() -The argument ``loffset`` is now deprecated. (:issue:`31809`) +The argument ``loffset`` is now deprecated. You should now add an offset to the index DataFrame after being resampled. (:issue:`31809`) .. ipython:: python diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 0f0a53a679bd0..54ff11e8f79b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -84,8 +84,8 @@ class Grouper: See: :class:`DataFrame.resample` origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. It must be timezone - aware if the index of the resampled data is. 
If None is passed, the + The timestamp on which to adjust the grouping. The timezone of the + timestamp must match the timezone of the index. If None is passed, the first day of the time series at midnight is used. .. versionadded:: 1.1.0 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1c88b323f436d..9294b6df3d821 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1638,8 +1638,7 @@ def _validate_resample_deprecated_args(offset=None, base=None, loffset=None, **k if loffset is not None: warnings.warn( - "'loffset' in .resample() and in Grouper() is deprecated.\n" - "Here an example to have the same behavior than loffset:\n\n" + "'loffset' in .resample() and in Grouper() is deprecated.\n\n" '>>> df.resample(freq="3s", loffset="8H")\n' "\nbecomes:\n\n" ">>> from pandas.tseries.frequencies import to_offset\n" @@ -1699,10 +1698,7 @@ def _get_timestamp_range_edges( if isinstance(freq, Tick): is_idx_tz_aware = first.tz is not None or last.tz is not None if origin is not None and origin.tz is None and is_idx_tz_aware: - raise ValueError( - "The origin must be timezone aware when the index " - "of the resampled data is." - ) + raise ValueError("The origin must have the same timezone as the index.") if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last @@ -1750,8 +1746,8 @@ def _get_period_range_edges(first, last, freq, closed="left", origin=None, offse closed : {'right', 'left'}, default None Which side of bin interval is closed. origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. It must be timezone - aware if the index of the resampled data is. If None is passed, the + The timestamp on which to adjust the grouping. The timezone of the + timestamp must match the timezone of the index. If None is passed, the first day of the time series at midnight is used. offset : pd.Timedelta, default is None An offset timedelta added to the origin. 
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 4669f947de324..61e79c5a3741c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -756,7 +756,7 @@ def test_resample_origin(): def test_resample_origin_with_tz(): # GH 31809 - msg = "The origin must be timezone aware when the index of the resampled data is." + msg = "The origin must have the same timezone as the index." tz = "Europe/Paris" rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s", tz=tz) From bad3ed616a7b0841dea3967c8afbccccfa06830d Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Wed, 1 Apr 2020 00:21:44 +0200 Subject: [PATCH 10/26] DOC: update documentation to be more clearer (review part 3) --- doc/source/user_guide/timeseries.rst | 26 ++--- doc/source/whatsnew/v1.1.0.rst | 39 +++++--- pandas/core/generic.py | 137 ++++++++++++--------------- pandas/core/groupby/grouper.py | 119 +++++++++++------------ 4 files changed, 158 insertions(+), 163 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f1e34e7598a4e..db0fef8c7a85c 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1793,42 +1793,42 @@ Use `origin` or `offset` to adjust the start of the bins .. versionadded:: 1.1.0 -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. 
This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet these criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. For example: .. ipython:: python - start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" - middle = "1/15/2000 02:00" - rng = pd.date_range(start, end, freq="1231min") + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + middle = "2000-10-02 00:00:00" + rng = pd.date_range(start, end, freq="7min") ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts -Here we can see that, when not using ``origin``, the result after 1/15/2000 are not identical depending on the start of time series: +Here we can see that, when not using ``origin``, the results after "2000-10-02 00:00:00" are not identical depending on the start of the time series: .. ipython:: python - ts.resample("2711min").sum() - ts[middle:end].resample("2711min").sum() + ts.resample("17min").sum() + ts[middle:end].resample("17min").sum() -Here we can see that, when using ``origin``, the result after 1/15/2000 are identical depending on the start of time series: +Here we can see that, when using ``origin``, the results after "2000-10-02 00:00:00" are identical regardless of the start of the time series: .. ipython:: python origin = pd.Timestamp("1970-01-01") - ts.resample("2711min", origin=origin).sum() - ts[middle:end].resample("2711min", origin=origin).sum() + ts.resample("17min", origin=origin).sum() + ts[middle:end].resample("17min", origin=origin).sum() -If needed you can just adjust the bins with an ``offset`` that would be added to the default ``origin``. +If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. Those two examples are equivalent for this time series: ..
ipython:: python - ts.resample("2711min", origin="1/10/2000 02:00:00").sum() - ts.resample("2711min", offset="2h").sum() + ts.resample("17min", origin=start).sum() + ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() .. _timeseries.periods: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 00e9247ae38d2..a367e05b983a2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -165,13 +165,23 @@ For example: .. ipython:: python - start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" - middle = "1/15/2000 02:00" - rng = pd.date_range(start, end, freq="1231min") + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + middle = "2000-10-02 00:00:00" + rng = pd.date_range(start, end, freq="7min") ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts - ts.resample("2711min").sum() - ts.resample("2711min", origin="1970-01-01").sum() + +Resample with the default behavior (origin is 2000-10-01 00:00:00): + +.. ipython:: python + + ts.resample("17min").sum() + +Resample using a fixed origin: + +.. ipython:: python + + ts.resample("17min", origin="1970-01-01").sum() For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. @@ -180,24 +190,29 @@ Those two examples are equivalent for this time series: .. ipython:: python - ts.resample("2711min", origin="1/10/2000 02:00:00").sum() - ts.resample("2711min", offset="2h").sum() + ts.resample("17min", origin=start).sum() + ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() The argument ``base`` is now deprecated in favor of ``offset``. (:issue:`31809`) +.. ipython:: python + :okwarning: + + ts.resample("17min", base=2).sum() + +becomes: + .. ipython:: python - # ts.resample("2711min", base=2).sum() - # becomes: - ts.resample("2711min", offset="2min").sum() + ts.resample("17min", offset="2min").sum() The argument ``loffset`` is now deprecated. You should now add an offset to the index DataFrame after being resampled. (:issue:`31809`) .. 
ipython:: python from pandas.tseries.frequencies import to_offset - loffset = "8H" - ts_out = ts.resample("2711min").sum() + loffset = "19min" + ts_out = ts.resample("17min").sum() ts_out.index = ts_out.index + to_offset(loffset) ts_out diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3ac7731a52290..bf267a129e6c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8049,88 +8049,77 @@ def resample( If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" - >>> rng = pd.date_range(start, end, freq="1231min") + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts - 2000-01-10 02:00:00 0 - 2000-01-10 22:31:00 3 - 2000-01-11 19:02:00 6 - 2000-01-12 15:33:00 9 - 2000-01-13 12:04:00 12 - 2000-01-14 08:35:00 15 - 2000-01-15 05:06:00 18 - 2000-01-16 01:37:00 21 - 2000-01-16 22:08:00 24 - 2000-01-17 18:39:00 27 - 2000-01-18 15:10:00 30 - 2000-01-19 11:41:00 33 - Freq: 1231T, dtype: int64 - - >>> ts.resample("2711min").sum() - 2000-01-10 00:00:00 9 - 2000-01-11 21:11:00 21 - 2000-01-13 18:22:00 33 - 2000-01-15 15:33:00 45 - 2000-01-17 12:44:00 57 - 2000-01-19 09:55:00 33 - Freq: 2711T, dtype: int64 - - >>> ts.resample("2711min", origin=pd.Timestamp("1970-01-01")).sum() - 2000-01-08 11:44:00 0 - 2000-01-10 08:55:00 9 - 2000-01-12 06:06:00 21 - 2000-01-14 03:17:00 33 - 2000-01-16 00:28:00 72 - 2000-01-17 21:39:00 63 - Freq: 2711T, dtype: int64 - - If you want to adjust the start of the bins with an offset, the two following - lines are equivalent: - - >>> ts.resample("2711min", origin="1/10/2000 02:00:00").sum() - 2000-01-10 02:00:00 9 - 2000-01-11 23:11:00 21 - 2000-01-13 20:22:00 33 - 2000-01-15 17:33:00 45 - 2000-01-17 14:44:00 90 - Freq: 2711T, dtype: int64 - - >>> ts.resample("2711min", offset="2h").sum() - 2000-01-10 02:00:00 9 - 2000-01-11 
23:11:00 21 - 2000-01-13 20:22:00 33 - 2000-01-15 17:33:00 45 - 2000-01-17 14:44:00 90 - Freq: 2711T, dtype: int64 - - To replace the use of the deprecated `base` argument: - - >>> # ts.resample("2711min", base=2).sum() - >>> # becomes: - >>> ts.resample("2711min", offset="2min").sum() - 2000-01-10 00:02:00 9 - 2000-01-11 21:13:00 21 - 2000-01-13 18:24:00 33 - 2000-01-15 15:35:00 45 - 2000-01-17 12:46:00 57 - 2000-01-19 09:57:00 33 - Freq: 2711T, dtype: int64 + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7T, dtype: int64 + + >>> ts.resample("17min").sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17T, dtype: int64 + + >>> ts.resample("17min", origin=pd.Timestamp("1970-01-01")).sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17T, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.resample("17min", origin=start).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + >>> ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + >>> ts.resample("17min", offset="2min").sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17T, dtype: int64 To replace the use of the deprecated `loffset` argument: >>> from 
pandas.tseries.frequencies import to_offset - >>> loffset = "8H" - >>> ts_out = ts.resample("2711min").sum() + >>> loffset = "19min" + >>> ts_out = ts.resample("17min").sum() >>> ts_out.index = ts_out.index + to_offset(loffset) >>> ts_out - 2000-01-10 08:00:00 9 - 2000-01-12 05:11:00 21 - 2000-01-14 02:22:00 33 - 2000-01-15 23:33:00 45 - 2000-01-17 20:44:00 57 - 2000-01-19 17:55:00 33 - Freq: 2711T, dtype: int64 + 2000-10-01 23:33:00 0 + 2000-10-01 23:50:00 9 + 2000-10-02 00:07:00 21 + 2000-10-02 00:24:00 54 + 2000-10-02 00:41:00 24 + Freq: 17T, dtype: int64 """ from pandas.core.resample import get_resampler from pandas.core.resample import _validate_resample_deprecated_args diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 54ff11e8f79b0..9e4b072280fef 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -151,72 +151,63 @@ class Grouper: If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = "1/10/2000 02:00:00", "1/20/2000 02:00" - >>> rng = pd.date_range(start, end, freq="1231min") + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts - 2000-01-10 02:00:00 0 - 2000-01-10 22:31:00 3 - 2000-01-11 19:02:00 6 - 2000-01-12 15:33:00 9 - 2000-01-13 12:04:00 12 - 2000-01-14 08:35:00 15 - 2000-01-15 05:06:00 18 - 2000-01-16 01:37:00 21 - 2000-01-16 22:08:00 24 - 2000-01-17 18:39:00 27 - 2000-01-18 15:10:00 30 - 2000-01-19 11:41:00 33 - Freq: 1231T, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq="2711min")).sum() - 2000-01-10 00:00:00 9 - 2000-01-11 21:11:00 21 - 2000-01-13 18:22:00 33 - 2000-01-15 15:33:00 45 - 2000-01-17 12:44:00 57 - 2000-01-19 09:55:00 33 - Freq: 2711T, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq="2711min", origin=pd.Timestamp("1970-01-01"))).sum() - 2000-01-08 11:44:00 0 - 2000-01-10 08:55:00 9 - 2000-01-12 06:06:00 21 - 
2000-01-14 03:17:00 33 - 2000-01-16 00:28:00 72 - 2000-01-17 21:39:00 63 - Freq: 2711T, dtype: int64 - - If you want to adjust the start of the bins with an offset, the two following - lines are equivalent: - - >>> ts.groupby(pd.Grouper(freq="2711min", origin="1/10/2000 02:00:00")).sum() - 2000-01-10 02:00:00 9 - 2000-01-11 23:11:00 21 - 2000-01-13 20:22:00 33 - 2000-01-15 17:33:00 45 - 2000-01-17 14:44:00 90 - Freq: 2711T, dtype: int64 - - >>> ts.groupby(pd.Grouper(freq="2711min", offset="2h")).sum() - 2000-01-10 02:00:00 9 - 2000-01-11 23:11:00 21 - 2000-01-13 20:22:00 33 - 2000-01-15 17:33:00 45 - 2000-01-17 14:44:00 90 - Freq: 2711T, dtype: int64 - - To replace the use of the deprecated `base` argument: - >>> # ts.groupby(pd.Grouper(freq="2711min", base=2)).sum() - >>> # becomes: - >>> ts.groupby(pd.Grouper(freq="2711min", offset="2min")).sum() - 2000-01-10 00:02:00 9 - 2000-01-11 21:13:00 21 - 2000-01-13 18:24:00 33 - 2000-01-15 15:35:00 45 - 2000-01-17 12:46:00 57 - 2000-01-19 09:57:00 33 - Freq: 2711T, dtype: int64 + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq="17min")).sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq="17min", origin=pd.Timestamp("1970-01-01"))).sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17T, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.groupby(pd.Grouper(freq="17min", origin=start)).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 
24 + Freq: 17T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq="17min", offset=pd.Timedelta("23h30min"))).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + >>> ts.groupby(pd.Grouper(freq="17min", offset="2min")).sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17T, dtype: int64 """ _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") From 687429e955c919d8d92c06cbeeb20424c361bf56 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 4 Apr 2020 21:33:17 +0200 Subject: [PATCH 11/26] CLN: review fix - move warning of 'loffset' and 'base' into pd.Grouper --- pandas/core/generic.py | 3 -- pandas/core/groupby/groupby.py | 3 -- pandas/core/groupby/grouper.py | 43 ++++++++++++++++++++---- pandas/core/resample.py | 40 +++++----------------- pandas/tests/resample/test_deprecated.py | 6 ++-- 5 files changed, 48 insertions(+), 47 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bf267a129e6c1..a0bac98ac26ab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8122,9 +8122,6 @@ def resample( Freq: 17T, dtype: int64 """ from pandas.core.resample import get_resampler - from pandas.core.resample import _validate_resample_deprecated_args - - _validate_resample_deprecated_args(offset=offset, base=base, loffset=loffset) axis = self._get_axis_number(axis) return get_resampler( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 791e72a56aa20..c71085cd4918a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1648,9 +1648,6 @@ def resample(self, rule, *args, **kwargs): 5 2000-01-01 00:03:00 5 1 """ from pandas.core.resample import 
get_resampler_for_grouping - from pandas.core.resample import _validate_resample_deprecated_args - - _validate_resample_deprecated_args(**kwargs) return get_resampler_for_grouping(self, rule, *args, **kwargs) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9e4b072280fef..a9fb617528ed9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,7 +2,7 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. """ - +import warnings from typing import Dict, Hashable, List, Optional, Tuple import numpy as np @@ -215,12 +215,43 @@ class Grouper: def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: from pandas.core.resample import TimeGrouper - from pandas.core.resample import _validate_resample_deprecated_args - if cls is not TimeGrouper: - # validate only when pd.Grouper is called, otherwise - # the warning is handled by the resample function - _validate_resample_deprecated_args(**kwargs) + # Deprecation warning of `base` and `loffset` since v1.1.0: + # we are raising the warning here to be able to set the `stacklevel` + # properly since we need to raise the `base` and `loffset` deprecation + # warning from three different cases: + # core/generic.py::NDFrame.resample + # core/groupby/groupby.py::GroupBy.resample + # core/groupby/grouper.py::Grouper + # raising these warnings from TimeGrouper directly would fail the test: + # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base + + # hacky way to set the stacklevel: if cls is TimeGrouper it means + # that the call comes from a pandas internal call of resample, + # otherwise it comes from pd.Grouper + stacklevel = 4 if cls is TimeGrouper else 2 + if kwargs.get("base", None) is not None: + warnings.warn( + "'base' in .resample() and in Grouper() is deprecated.\n" + "The new arguments that you should use are 'offset' or 'origin'.\n" + '\n>>> df.resample(freq="3s", base=2)\n' + 
"\nbecomes:\n" + '\n>>> df.resample(freq="3s", offset="2s")\n', + FutureWarning, + stacklevel=stacklevel, + ) + + if kwargs.get("loffset", None) is not None: + warnings.warn( + "'loffset' in .resample() and in Grouper() is deprecated.\n" + '\n>>> df.resample(freq="3s", loffset="8H")\n' + "\nbecomes:\n" + "\n>>> from pandas.tseries.frequencies import to_offset" + '\n>>> df = df.resample(freq="3s").mean()' + '\n>>> df.index = df.index.to_timestamp() + to_offset("8H")\n', + FutureWarning, + stacklevel=stacklevel, + ) cls = TimeGrouper return super().__new__(cls) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9294b6df3d821..d6b9aaac5b021 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2,7 +2,6 @@ from datetime import timedelta from textwrap import dedent from typing import Dict, no_type_check -import warnings import numpy as np @@ -1358,6 +1357,14 @@ def __init__( self.origin = Timestamp(origin) if origin is not None else None self.offset = Timedelta(offset) if offset is not None else None + + # always sort time groupers + kwargs["sort"] = True + + # Handle deprecated arguments since v1.1.0 of `base` and `loffset` (GH #31809) + if base is not None and offset is not None: + raise ValueError("`offset` and `base` cannot be present at the same time") + if base and isinstance(freq, Tick): # this conversion handle the default behavior of base and the # special case of GH #10530. Indeed in case when dealing with @@ -1366,11 +1373,7 @@ def __init__( # freq_nanos. 
self.offset = Timedelta(base * freq.nanos // freq.n) - # always sort time groupers - kwargs["sort"] = True - if isinstance(loffset, str): - # loffset is deprecated since v1.1.0 (GH #31809) loffset = to_offset(loffset) self.loffset = loffset @@ -1622,33 +1625,6 @@ def _get_period_bins(self, ax): return binner, bins, labels -def _validate_resample_deprecated_args(offset=None, base=None, loffset=None, **kwds): - if base is not None: - warnings.warn( - "'base' in .resample() and in Grouper() is deprecated.\n" - "The new arguments that you should use are 'offset' or 'origin'.\n\n" - '>>> df.resample(freq="3s", base=2)\n' - "\nbecomes:\n\n" - '>>> df.resample(freq="3s", offset="2s")\n', - FutureWarning, - stacklevel=3, - ) - if offset is not None: - raise ValueError("offset and base cannot be present at the same time") - - if loffset is not None: - warnings.warn( - "'loffset' in .resample() and in Grouper() is deprecated.\n\n" - '>>> df.resample(freq="3s", loffset="8H")\n' - "\nbecomes:\n\n" - ">>> from pandas.tseries.frequencies import to_offset\n" - '>>> df = df.resample(freq="3s").mean()\n' - '>>> df.index = df.index.to_timestamp() + to_offset("8H")\n', - FutureWarning, - stacklevel=3, - ) - - def _take_new_index(obj, indexer, new_index, axis=0): if isinstance(obj, ABCSeries): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index d6689530ea88a..45de17bc921c7 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -40,13 +40,13 @@ def _create_index(*args, **kwargs): def test_deprecating_on_loffset_and_base(): # GH 31809 - idx = pd.date_range("1/1/2000", periods=4, freq="T") + idx = pd.date_range("2001-01-01", periods=4, freq="T") df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"]) with tm.assert_produces_warning(FutureWarning): - pd.Grouper(freq="10s", base=2) + pd.Grouper(freq="10s", base=0) with tm.assert_produces_warning(FutureWarning): - 
pd.Grouper(freq="10s", loffset="2s") + pd.Grouper(freq="10s", loffset="0s") with tm.assert_produces_warning(FutureWarning): df.groupby("a").resample("3T", base=0).sum() with tm.assert_produces_warning(FutureWarning): From 7d4de499823d47298729ea279b0d86afb7ba9877 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 4 Apr 2020 22:11:25 +0200 Subject: [PATCH 12/26] CLN: fix lint issue with isort --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index a9fb617528ed9..5660bfa9d2d28 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,8 +2,8 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. """ -import warnings from typing import Dict, Hashable, List, Optional, Tuple +import warnings import numpy as np From b83c5bff8b17b92792e74660cf9de6dbb076de9d Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Fri, 10 Apr 2020 00:18:25 +0200 Subject: [PATCH 13/26] Update pandas/core/generic.py Co-Authored-By: William Ayd --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a0bac98ac26ab..64e2d8b9c5f8b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7763,7 +7763,7 @@ def resample( base: Optional[int] = None, on=None, level=None, - origin=None, + origin: pd.Timestamp = None, offset=None, ) -> "Resampler": """ From 3e24d5339fd9d5663dd5285e2d337019f4b50e0f Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 11 Apr 2020 16:31:51 +0200 Subject: [PATCH 14/26] CLN: add TimestampCompatibleTypes and TimedeltaCompatibleTypes in pandas._typing --- pandas/_typing.py | 10 ++++++++++ pandas/core/generic.py | 10 ++++++---- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/resample.py | 22 ++++++++++++---------- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git 
a/pandas/_typing.py b/pandas/_typing.py index d225b845970cc..4f66c376dc44c 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,3 +1,4 @@ +from datetime import datetime, timedelta from pathlib import Path from typing import ( IO, @@ -43,6 +44,15 @@ PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar] +# timestamp and timedelta compatible types + +TimestampCompatibleTypes = Union[ + "Timestamp", datetime, np.datetime64, int, np.int64, float, str +] +TimedeltaCompatibleTypes = Union[ + "Timedelta", timedelta, np.timedelta64, int, np.int64, float, str +] + # other Dtype = Union[ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 64e2d8b9c5f8b..ab8ef5d2fbf76 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,6 +39,8 @@ Label, Level, Renamer, + TimedeltaCompatibleTypes, + TimestampCompatibleTypes, ValueKeyFunc, ) from pandas.compat import set_function_name @@ -7763,8 +7765,8 @@ def resample( base: Optional[int] = None, on=None, level=None, - origin: pd.Timestamp = None, - offset=None, + origin: Optional[TimestampCompatibleTypes] = None, + offset: Optional[TimedeltaCompatibleTypes] = None, ) -> "Resampler": """ Resample time-series data. @@ -7818,14 +7820,14 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : pd.Timestamp, default None + origin : Timestamp, str or datetime-like, default None The timestamp on which to adjust the grouping. It must be timezone aware if the index of the resampled data is. If None is passed, the first day of the time series at midnight is used. .. versionadded:: 1.1.0 - offset : pd.Timedelta, default is None + offset : Timedelta or str, default is None An offset timedelta added to the origin. .. 
versionadded:: 1.1.0 diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5660bfa9d2d28..580bae42b05d2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -83,14 +83,14 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : pd.Timestamp, default None + origin : Timestamp, str or datetime-like, default None The timestamp on which to adjust the grouping. The timezone of the timestamp must match the timezone of the index. If None is passed, the first day of the time series at midnight is used. .. versionadded:: 1.1.0 - offset : pd.Timedelta, default is None + offset : Timedelta or str, default is None An offset timedelta added to the origin. .. versionadded:: 1.1.0 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d6b9aaac5b021..3e2cc1cab5235 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,7 +1,7 @@ import copy from datetime import timedelta from textwrap import dedent -from typing import Dict, no_type_check +from typing import Dict, no_type_check, Optional import numpy as np @@ -9,6 +9,8 @@ from pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod from pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas._typing import TimestampCompatibleTypes, TimedeltaCompatibleTypes from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, doc @@ -1306,18 +1308,18 @@ class TimeGrouper(Grouper): def __init__( self, freq="Min", - closed=None, - label=None, + closed: Optional[str] = None, + label: Optional[str] = None, how="mean", axis=0, fill_method=None, limit=None, loffset=None, - kind=None, - convention=None, - base=None, - origin=None, - offset=None, + kind: Optional[str] = None, + convention: Optional[str] = None, + 
base: Optional[int] = None, + origin: Optional[TimestampCompatibleTypes] = None, + offset: Optional[TimedeltaCompatibleTypes] = None, **kwargs, ): # Check for correctness of the keyword arguments which would @@ -1722,8 +1724,8 @@ def _get_period_range_edges(first, last, freq, closed="left", origin=None, offse closed : {'right', 'left'}, default None Which side of bin interval is closed. origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. The timezone of the - timestamp must match the timezone of the index. If None is passed, the + The timestamp on which to adjust the grouping. It must be timezone + aware if the index of the resampled data is. If None is passed, the first day of the time series at midnight is used. offset : pd.Timedelta, default is None An offset timedelta added to the origin. From c2ee6612147a9025fab924292af29ab13886dca3 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 11 Apr 2020 17:05:09 +0200 Subject: [PATCH 15/26] CLN: fix lint issue with isort --- pandas/core/resample.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3e2cc1cab5235..861ac9645dfa5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,7 +1,7 @@ import copy from datetime import timedelta from textwrap import dedent -from typing import Dict, no_type_check, Optional +from typing import Dict, Optional, no_type_check import numpy as np @@ -9,8 +9,7 @@ from pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod from pandas._libs.tslibs.period import IncompatibleFrequency - -from pandas._typing import TimestampCompatibleTypes, TimedeltaCompatibleTypes +from pandas._typing import TimedeltaCompatibleTypes, TimestampCompatibleTypes from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, 
Substitution, doc From a6e94c0dba702ccf7ffdb71977de0c9dfbc7b8b9 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 11 Apr 2020 23:53:29 +0200 Subject: [PATCH 16/26] ENH: support 'epoch', 'start_day' and 'start' for origin --- pandas/core/generic.py | 2 +- pandas/core/resample.py | 46 ++++++++++++++-------- pandas/tests/resample/test_resample_api.py | 6 +-- pandas/tests/resample/test_time_grouper.py | 6 +-- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ab8ef5d2fbf76..502562509abc7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7765,7 +7765,7 @@ def resample( base: Optional[int] = None, on=None, level=None, - origin: Optional[TimestampCompatibleTypes] = None, + origin: Union[str, TimestampCompatibleTypes] = "start_day", offset: Optional[TimedeltaCompatibleTypes] = None, ) -> "Resampler": """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 861ac9645dfa5..779f8c903794a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,7 +1,7 @@ import copy from datetime import timedelta from textwrap import dedent -from typing import Dict, Optional, no_type_check +from typing import Dict, Optional, Union, no_type_check import numpy as np @@ -1317,7 +1317,7 @@ def __init__( kind: Optional[str] = None, convention: Optional[str] = None, base: Optional[int] = None, - origin: Optional[TimestampCompatibleTypes] = None, + origin: Union[str, TimestampCompatibleTypes] = "start_day", offset: Optional[TimedeltaCompatibleTypes] = None, **kwargs, ): @@ -1356,7 +1356,12 @@ def __init__( self.fill_method = fill_method self.limit = limit - self.origin = Timestamp(origin) if origin is not None else None + if origin in {"start_day", "start"}: + self.origin = origin + elif origin == "epoch": + self.origin = Timestamp(0, tz="UTC") + else: + self.origin = Timestamp(origin) self.offset = Timedelta(offset) if offset is not None else None # always sort time groupers
@@ -1580,9 +1585,11 @@ def _get_period_bins(self, ax): end = ax.max().asfreq(self.freq, how="end") bin_shift = 0 - # GH 23882 & 31809 - if self.origin is not None or self.offset is not None: - # get base adjusted bin edge labels + if isinstance(self.freq, Tick): + # GH 23882 & 31809: get adjusted bin edge labels with `origin` + # and `offset` support. This call only makes sense if the freq is a + # Tick since offset and origin are only used in those cases. + # Not doing this check could create an extra empty bin. p_start, end = _get_period_range_edges( start, end, @@ -1642,7 +1649,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges( - first, last, freq, closed="left", origin=None, offset=None + first, last, freq, closed="left", origin="start_day", offset=None ): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on @@ -1673,9 +1680,10 @@ def _get_timestamp_range_edges( A tuple of length 2, containing the adjusted pd.Timestamp objects. """ if isinstance(freq, Tick): - is_idx_tz_aware = first.tz is not None or last.tz is not None - if origin is not None and origin.tz is None and is_idx_tz_aware: - raise ValueError("The origin must have the same timezone as the index.") + if origin not in {"start", "start_day"}: + is_idx_tz_aware = first.tz is not None or last.tz is not None + if origin.tz is None and is_idx_tz_aware: + raise ValueError("The origin must have the same timezone as the index.") if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last @@ -1707,7 +1715,9 @@ def _get_timestamp_range_edges( return first, last -def _get_period_range_edges(first, last, freq, closed="left", origin=None, offset=None): +def _get_period_range_edges( + first, last, freq, closed="left", origin="start_day", offset=None +): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them.
@@ -1736,7 +1746,7 @@ def _get_period_range_edges(first, last, freq, closed="left", origin=None, offse if not all(isinstance(obj, Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period") - # GH 23882 & 31809 + # GH 23882 first = first.to_timestamp() last = last.to_timestamp() adjust_first = not freq.is_on_offset(first) @@ -1751,16 +1761,20 @@ def _get_period_range_edges(first, last, freq, closed="left", origin=None, offse return first, last -def _adjust_dates_anchored(first, last, freq, closed="right", origin=None, offset=None): +def _adjust_dates_anchored( + first, last, freq, closed="right", origin="start_day", offset=None +): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. See GH 8683 # To handle frequencies that are not multiple or divisible by a day we let # the possibility to define a fixed origin timestamp. See GH 31809 - if origin is None: - origin_nanos = first.normalize().value - else: + if isinstance(origin, Timestamp): origin_nanos = origin.value + elif origin == "start": + origin_nanos = first.value + else: # origin == "start_day" + origin_nanos = first.normalize().value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. 
If first and last contain timezone information, diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4f85b0d4036f6..73aa01cff84fa 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -25,13 +25,13 @@ def test_str(): r = test_series.resample("H") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " - "label=left, convention=start]" in str(r) + "label=left, convention=start, origin=start_day]" in str(r) ) - r = test_series.resample("H", origin="1970-01-01") + r = test_series.resample("H", origin="2000-01-01") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " - "label=left, convention=start, origin=1970-01-01 00:00:00]" in str(r) + "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) ) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 6aa4606486f09..26e429c47b494 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -251,15 +251,15 @@ def test_repr(): expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " - "convention='e')" + "convention='e', origin='start_day')" ) assert result == expected - result = repr(Grouper(key="A", freq="H", origin="1970-01-01")) + result = repr(Grouper(key="A", freq="H", origin="2000-01-01")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " - "convention='e', origin=Timestamp('1970-01-01 00:00:00'))" + "convention='e', origin=Timestamp('2000-01-01 00:00:00'))" ) assert result == expected From 53802e5a66e30c1934d73ddd9d7d0c3e42a5d55b Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 12 Apr 2020 02:48:45 +0200 Subject: [PATCH 17/26] DOC: add doc for origin that uses 'epoch', 'start' or 'start_day' --- doc/source/user_guide/timeseries.rst | 32 ++++++++++++++--------- 
doc/source/whatsnew/v1.1.0.rst | 28 ++++++++++---------- pandas/core/generic.py | 38 ++++++++++++++++++---------- pandas/core/groupby/grouper.py | 34 +++++++++++++++++-------- pandas/core/resample.py | 38 ++++++++++++++++------------ 5 files changed, 105 insertions(+), 65 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index db0fef8c7a85c..774cc5b419f9d 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1799,37 +1799,45 @@ For example: .. ipython:: python - start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" - middle = "2000-10-02 00:00:00" - rng = pd.date_range(start, end, freq="7min") + start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + middle = '2000-10-02 00:00:00' + rng = pd.date_range(start, end, freq='7min') ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts -Here we can see that, when not using ``origin``, the result after "2000-10-02 00:00:00" are not identical depending on the start of time series: +Here we can see that, when using ``origin`` with its default value (``'start_day'``), the result after ``'2000-10-02 00:00:00'`` are not identical depending on the start of time series: .. ipython:: python - ts.resample("17min").sum() - ts[middle:end].resample("17min").sum() + ts.resample('17min').sum() + ts[middle:end].resample('17min').sum() -Here we can see that, when using ``origin``, the result after "2000-10-02 00:00:00" are identical depending on the start of time series: +Here we can see that, when setting ``origin`` to ``'epoch'``, the result after ``'2000-10-02 00:00:00'`` are identical depending on the start of time series: .. 
ipython:: python - origin = pd.Timestamp("1970-01-01") - ts.resample("17min", origin=origin).sum() - ts[middle:end].resample("17min", origin=origin).sum() + ts.resample('17min', origin='epoch').sum() + ts[middle:end].resample('17min', origin='epoch').sum() +If needed you can use a custom timestamp for ``origin``: + +.. ipython:: python + + ts.resample('17min', origin='2001-01-01').sum() + ts[middle:end].resample('17min', origin=pd.Timestamp('2001-01-01')).sum() + If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. Those two examples are equivalent for this time series: .. ipython:: python - ts.resample("17min", origin=start).sum() - ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() + ts.resample('17min', origin='start').sum() + ts.resample('17min', offset='23h30min').sum() + +Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. .. _timeseries.periods: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a367e05b983a2..339e43ab1861d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -157,7 +157,7 @@ For example: Grouper and resample now supports the arguments origin and offset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Grouper` and :class:`DataFrame.resample` now supports the argument ``origin``. The timestamp on which to adjust the grouping. (:issue:`31809`) +:class:`Grouper` and :class:`DataFrame.resample` now supports the arguments ``origin`` and ``offset``. It let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). 
But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. @@ -165,23 +165,25 @@ For example: .. ipython:: python - start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" - middle = "2000-10-02 00:00:00" - rng = pd.date_range(start, end, freq="7min") + start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + middle = '2000-10-02 00:00:00' + rng = pd.date_range(start, end, freq='7min') ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts -Resample with the default behavior (origin is 2000-10-01 00:00:00): +Resample with the default behavior 'start_day' (origin is 2000-10-01 00:00:00): .. ipython:: python - ts.resample("17min").sum() + ts.resample('17min').sum() + ts.resample('17min', origin='start_day').sum() Resample using a fixed origin: .. ipython:: python - ts.resample("17min", origin="1970-01-01").sum() + ts.resample('17min', origin='epoch').sum() + ts.resample('17min', origin='2000-01-01').sum() For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. @@ -190,29 +192,29 @@ Those two examples are equivalent for this time series: .. ipython:: python - ts.resample("17min", origin=start).sum() - ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() + ts.resample('17min', origin='start').sum() + ts.resample('17min', offset='23h30min').sum() The argument ``base`` is now deprecated in favor of ``offset``. (:issue:`31809`) .. ipython:: python :okwarning: - ts.resample("17min", base=2).sum() + ts.resample('17min', base=2).sum() becomes: .. ipython:: python - ts.resample("17min", offset="2min").sum() + ts.resample('17min', offset='2min').sum() The argument ``loffset`` is now deprecated. You should now add an offset to the index DataFrame after being resampled. (:issue:`31809`) .. 
ipython:: python from pandas.tseries.frequencies import to_offset - loffset = "19min" - ts_out = ts.resample("17min").sum() + loffset = '19min' + ts_out = ts.resample('17min').sum() ts_out.index = ts_out.index + to_offset(loffset) ts_out diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 502562509abc7..268f29a31436c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7820,10 +7820,14 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : Timestamp, str or datetime-like, default None - The timestamp on which to adjust the grouping. It must be timezone - aware if the index of the resampled data is. If None is passed, the - first day of the time series at midnight is used. + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. It must be timezone aware if + the index of the resampled data is. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries .. 
versionadded:: 1.1.0 @@ -8051,8 +8055,8 @@ def resample( If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" - >>> rng = pd.date_range(start, end, freq="7min") + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -8066,7 +8070,7 @@ def resample( 2000-10-02 00:26:00 24 Freq: 7T, dtype: int64 - >>> ts.resample("17min").sum() + >>> ts.resample('17min').sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -8074,7 +8078,7 @@ def resample( 2000-10-02 00:22:00 24 Freq: 17T, dtype: int64 - >>> ts.resample("17min", origin=pd.Timestamp("1970-01-01")).sum() + >>> ts.resample('17min', origin='epoch').sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -8082,17 +8086,24 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 + If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.resample("17min", origin=start).sum() + >>> ts.resample('17min', origin='start').sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17T, dtype: int64 - >>> ts.resample("17min", offset=pd.Timedelta("23h30min")).sum() + >>> ts.resample('17min', offset='23h30min').sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -8101,7 +8112,8 @@ def resample( To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: - >>> ts.resample("17min", offset="2min").sum() + + >>> ts.resample('17min', offset='2min').sum() 2000-10-01 23:16:00 0 2000-10-01 
23:33:00 9 2000-10-01 23:50:00 36 @@ -8112,8 +8124,8 @@ def resample( To replace the use of the deprecated `loffset` argument: >>> from pandas.tseries.frequencies import to_offset - >>> loffset = "19min" - >>> ts_out = ts.resample("17min").sum() + >>> loffset = '19min' + >>> ts_out = ts.resample('17min').sum() >>> ts_out.index = ts_out.index + to_offset(loffset) >>> ts_out 2000-10-01 23:33:00 0 diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 580bae42b05d2..fc06e35e3e39f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -83,10 +83,14 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : Timestamp, str or datetime-like, default None - The timestamp on which to adjust the grouping. The timezone of the - timestamp must match the timezone of the index. If None is passed, the - first day of the time series at midnight is used. + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. It must be timezone aware if + the index of the resampled data is. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries .. 
versionadded:: 1.1.0 @@ -151,8 +155,8 @@ class Grouper: If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" - >>> rng = pd.date_range(start, end, freq="7min") + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -166,7 +170,7 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 7T, dtype: int64 - >>> ts.groupby(pd.Grouper(freq="17min")).sum() + >>> ts.groupby(pd.Grouper(freq='17min')).sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -174,7 +178,7 @@ class Grouper: 2000-10-02 00:22:00 24 Freq: 17T, dtype: int64 - >>> ts.groupby(pd.Grouper(freq="17min", origin=pd.Timestamp("1970-01-01"))).sum() + >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -182,17 +186,24 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 + If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.groupby(pd.Grouper(freq="17min", origin=start)).sum() + >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17T, dtype: int64 - >>> ts.groupby(pd.Grouper(freq="17min", offset=pd.Timedelta("23h30min"))).sum() + >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -201,7 +212,8 @@ class Grouper: To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have 
`base=2`: - >>> ts.groupby(pd.Grouper(freq="17min", offset="2min")).sum() + + >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() 2000-10-01 23:16:00 0 2000-10-01 23:33:00 9 2000-10-01 23:50:00 36 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 779f8c903794a..c60d646d9e6b7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1356,10 +1356,8 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in {"start_day", "start"}: + if origin in {"epoch", "start_day", "start"}: self.origin = origin - elif origin == "epoch": - self.origin = Timedelta(0, tz="UTC") else: self.origin = Timestamp(origin) self.offset = Timedelta(offset) if offset is not None else None @@ -1668,10 +1666,13 @@ def _get_timestamp_range_edges( The dateoffset to which the Timestamps will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. It must be timezone - aware if the index of the resampled data is. If None is passed, the - first day of the time series at midnight is used. + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. It must be timezone aware if + the index of the resampled data is. + If a timestamp is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries offset : pd.Timedelta, default is None An offset timedelta added to the origin. @@ -1680,7 +1681,7 @@ def _get_timestamp_range_edges( A tuple of length 2, containing the adjusted pd.Timestamp objects. 
""" if isinstance(freq, Tick): - if origin not in {"start", "start_day"}: + if origin not in {"epoch", "start", "start_day"}: is_idx_tz_aware = first.tz is not None or last.tz is not None if origin.tz is None and is_idx_tz_aware: raise ValueError("The origin must have the same timezone as the index.") @@ -1732,10 +1733,14 @@ def _get_period_range_edges( The freq to which the Periods will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - origin : pd.Timestamp, default None - The timestamp on which to adjust the grouping. It must be timezone - aware if the index of the resampled data is. If None is passed, the - first day of the time series at midnight is used. + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. It must be timezone aware if + the index of the resampled data is. + + If a timestamp is not used, these values are also supported: + - If 'epoch': `origin` is 1970-01-01 + - If 'start': then `origin` is the first value of the timeseries + - If 'start_day', then `origin` is the first day at midnight of the timeseries offset : pd.Timedelta, default is None An offset timedelta added to the origin. @@ -1769,12 +1774,13 @@ def _adjust_dates_anchored( # not a multiple of the frequency. See GH 8683 # To handle frequencies that are not multiple or divisible by a day we let # the possibility to define a fixed origin timestamp. See GH 31809 - if isinstance(origin, Timestamp): - origin_nanos = origin.value + origin_nanos = 0 # origin == "epoch" + if origin == "start_day": + origin_nanos = first.normalize().value elif origin == "start": origin_nanos = first.value - else: # origin == "start_day" - origin_nanos = first.normalize().value + elif isinstance(origin, Timestamp): + origin_nanos = origin.value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. 
If first and last contain timezone information, From 3fc2bf6ca2a86a2843683aa6efa6d141dac4035e Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 12 Apr 2020 02:49:04 +0200 Subject: [PATCH 18/26] TST: add test for origin that uses 'epoch', 'start' or 'start_day' --- pandas/tests/resample/test_datetime_index.py | 58 +++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 61e79c5a3741c..1ec2fd66b84de 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -740,31 +740,75 @@ def test_resample_offset(): def test_resample_origin(): # GH 31809 - - rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + exp_rng = date_range("1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min") - resampled = ts.resample("5min", origin="12/31/1999 23:57:00").mean() + resampled = ts.resample("5min", origin="1999-12-31 23:57:00").mean() tm.assert_index_equal(resampled.index, exp_rng) offset_timestamp = pd.Timestamp(0) + pd.Timedelta("2min") resampled = ts.resample("5min", origin=offset_timestamp).mean() tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + # origin of '1999-12-31 12:02:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", offset="-3m").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_origin_prime_freq(): + # GH 31809 + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + rng =
pd.date_range(start, end, freq="7min") + ts = Series(np.random.randn(len(rng)), index=rng) + + exp_rng = date_range("2000-10-01 23:14:00", "2000-10-02 00:22:00", freq="17min") + resampled = ts.resample("17min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", origin="start_day").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:30:00", "2000-10-02 00:21:00", freq="17min") + resampled = ts.resample("17min", origin="start").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", offset="23h30min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", origin="start_day", offset="23h30min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:18:00", "2000-10-02 00:26:00", freq="17min") + resampled = ts.resample("17min", origin="epoch").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min") + resampled = ts.resample("17min", origin="2000-01-01").mean() + tm.assert_index_equal(resampled.index, exp_rng) + def test_resample_origin_with_tz(): # GH 31809 msg = "The origin must have the same timezone as the index." 
tz = "Europe/Paris" - rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s", tz=tz) + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s", tz=tz) ts = Series(np.random.randn(len(rng)), index=rng) - exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min", tz=tz) + exp_rng = date_range("1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz) + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + # origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) - resampled = ts.resample("5min", origin="12/31/1999 23:57:00+00:00").mean() + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() tm.assert_index_equal(resampled.index, exp_rng) with pytest.raises(ValueError, match=msg): From 4ad979a6d1884bb066954d90301ffaabfdec16b0 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sun, 12 Apr 2020 14:09:31 +0200 Subject: [PATCH 19/26] BUG: fix a timezone bug between origin and index on df.resample --- pandas/core/generic.py | 4 +- pandas/core/groupby/grouper.py | 4 +- pandas/core/resample.py | 46 ++++++++++---------- pandas/tests/resample/test_datetime_index.py | 6 +++ 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 268f29a31436c..4f84b58a68773 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7821,8 +7821,8 @@ def resample( For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' - The timestamp on which to adjust the grouping. It must be timezone aware if - the index of the resampled data is. + The timestamp on which to adjust the grouping.
The timezone of origin + must match the timezone of the index. If a timestamp is not used, these values are also supported: - 'epoch': `origin` is 1970-01-01 diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc06e35e3e39f..9660fb9c2e1b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -84,8 +84,8 @@ class Grouper: See: :class:`DataFrame.resample` origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' - The timestamp on which to adjust the grouping. It must be timezone aware if - the index of the resampled data is. + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. If a timestamp is not used, these values are also supported: - 'epoch': `origin` is 1970-01-01 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index c60d646d9e6b7..653085f541d4d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1666,10 +1666,11 @@ def _get_timestamp_range_edges( The dateoffset to which the Timestamps will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' - The timestamp on which to adjust the grouping. It must be timezone aware if - the index of the resampled data is. + origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. If a timestamp is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 - 'start': `origin` is the first value of the timeseries - 'start_day': `origin` is the first day at midnight of the timeseries @@ -1680,17 +1681,15 @@ def _get_timestamp_range_edges( ------- A tuple of length 2, containing the adjusted pd.Timestamp objects. 
""" - if isinstance(freq, Tick): - if origin not in {"epoch", "start", "start_day"}: - is_idx_tz_aware = first.tz is not None or last.tz is not None - if origin.tz is None and is_idx_tz_aware: - raise ValueError("The origin must have the same timezone as the index.") + index_tz = first.tz + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): + raise ValueError("The origin must have the same timezone as the index.") + if isinstance(freq, Tick): if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last # might contain a DST transition (23H, 24H, or 25H). # So "pretend" the dates are naive when adjusting the endpoints - tz = first.tz first = first.tz_localize(None) last = last.tz_localize(None) @@ -1698,20 +1697,18 @@ def _get_timestamp_range_edges( first, last, freq, closed=closed, origin=origin, offset=offset, ) if isinstance(freq, Day): - first = first.tz_localize(tz) - last = last.tz_localize(tz) - return first, last - + first = first.tz_localize(index_tz) + last = last.tz_localize(index_tz) else: first = first.normalize() last = last.normalize() - if closed == "left": - first = Timestamp(freq.rollback(first)) - else: - first = Timestamp(first - freq) + if closed == "left": + first = Timestamp(freq.rollback(first)) + else: + first = Timestamp(first - freq) - last = Timestamp(last + freq) + last = Timestamp(last + freq) return first, last @@ -1733,14 +1730,15 @@ def _get_period_range_edges( The freq to which the Periods will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' - The timestamp on which to adjust the grouping. It must be timezone aware if - the index of the resampled data is. + origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. 
If a timestamp is not used, these values are also supported: - - If 'epoch': `origin` is 1970-01-01 - - If 'start': then `origin` is the first value of the timeseries - - If 'start_day', then `origin` is the first day at midnight of the timeseries + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries offset : pd.Timedelta, default is None An offset timedelta added to the origin. diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 1ec2fd66b84de..cf4588bca2970 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -814,6 +814,12 @@ def test_resample_origin_with_tz(): with pytest.raises(ValueError, match=msg): ts.resample("5min", origin="12/31/1999 23:57:00").mean() + # if the series is not tz aware, origin should not be tz aware + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin="12/31/1999 23:57:00+03:00").mean() + def test_resample_daily_anchored(): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") From 343a30a5e6a1c6443fcf448a6af15e6cfc3ec95a Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Fri, 1 May 2020 18:25:52 +0200 Subject: [PATCH 20/26] DOC: change doc after review --- doc/source/user_guide/timeseries.rst | 4 +-- doc/source/whatsnew/v1.1.0.rst | 39 ++++++---------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 774cc5b419f9d..076c1313eec4e 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1809,8 +1809,8 @@ Here we can see that, when using ``origin`` with its default value (``'start_day .. 
ipython:: python - ts.resample('17min').sum() - ts[middle:end].resample('17min').sum() + ts.resample('17min', origin='start_day').sum() + ts[middle:end].resample('17min', origin='start_day').sum() Here we can see that, when setting ``origin`` to ``'epoch'``, the result after ``'2000-10-02 00:00:00'`` are identical depending on the start of time series: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 339e43ab1861d..3a462ae50978e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -161,7 +161,12 @@ Grouper and resample now supports the arguments origin and offset The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. -For example: +Two arguments are now deprecated (more information in the documentation of :class:`DataFrame.resample`): + +- ``base`` should be replaced by ``offset``. +- ``loffset`` should be replaced by directly adding an offset to the index DataFrame after being resampled. + +Small example of the use of ``origin``: .. ipython:: python @@ -185,38 +190,10 @@ Resample using a fixed origin: ts.resample('17min', origin='epoch').sum() ts.resample('17min', origin='2000-01-01').sum() -For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. - -If needed you can just adjust the bins with an offset that would be added to the default ``origin``. -Those two examples are equivalent for this time series: +If needed you can adjust the bins with the argument ``offset`` (a Timedelta) that would be added to the default ``origin``. -.. 
ipython:: python - - ts.resample('17min', origin='start').sum() - ts.resample('17min', offset='23h30min').sum() - -The argument ``base`` is now deprecated in favor of ``offset``. (:issue:`31809`) - -.. ipython:: python - :okwarning: - - ts.resample('17min', base=2).sum() - -becomes: - -.. ipython:: python - - ts.resample('17min', offset='2min').sum() - -The argument ``loffset`` is now deprecated. You should now add an offset to the index DataFrame after being resampled. (:issue:`31809`) - -.. ipython:: python +For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. - from pandas.tseries.frequencies import to_offset - loffset = '19min' - ts_out = ts.resample('17min').sum() - ts_out.index = ts_out.index + to_offset(loffset) - ts_out .. _whatsnew_110.enhancements.other: From efb572e6437fa9b27b46ed336030ac32e63dacc6 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Fri, 1 May 2020 18:30:15 +0200 Subject: [PATCH 21/26] CLN: change typing for TimestampConvertibleTypes --- pandas/_typing.py | 6 +++--- pandas/core/generic.py | 8 ++++---- pandas/core/resample.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 4f66c376dc44c..71df27119bd96 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -44,12 +44,12 @@ PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar] -# timestamp and timedelta compatible types +# timestamp and timedelta convertible types -TimestampCompatibleTypes = Union[ +TimestampConvertibleTypes = Union[ "Timestamp", datetime, np.datetime64, int, np.int64, float, str ] -TimedeltaCompatibleTypes = Union[ +TimedeltaConvertibleTypes = Union[ "Timedelta", timedelta, np.timedelta64, int, np.int64, float, str ] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4f84b58a68773..26691c3f1cc0c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,8 +39,8 @@ Label, Level, Renamer, - 
TimedeltaCompatibleTypes, - TimestampCompatibleTypes, + TimedeltaConvertibleTypes, + TimestampConvertibleTypes, ValueKeyFunc, ) from pandas.compat import set_function_name @@ -7765,8 +7765,8 @@ def resample( base: Optional[int] = None, on=None, level=None, - origin: Union[str, TimestampCompatibleTypes] = "start_day", - offset: Optional[TimedeltaCompatibleTypes] = None, + origin: Union[str, TimestampConvertibleTypes] = "start_day", + offset: Optional[TimedeltaConvertibleTypes] = None, ) -> "Resampler": """ Resample time-series data. diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 653085f541d4d..c8ec7a310e90b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod from pandas._libs.tslibs.period import IncompatibleFrequency -from pandas._typing import TimedeltaCompatibleTypes, TimestampCompatibleTypes +from pandas._typing import TimedeltaConvertibleTypes, TimestampConvertibleTypes from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, doc @@ -1317,8 +1317,8 @@ def __init__( kind: Optional[str] = None, convention: Optional[str] = None, base: Optional[int] = None, - origin: Union[str, TimestampCompatibleTypes] = "start_day", - offset: Optional[TimedeltaCompatibleTypes] = None, + origin: Union[str, TimestampConvertibleTypes] = "start_day", + offset: Optional[TimedeltaConvertibleTypes] = None, **kwargs, ): # Check for correctness of the keyword arguments which would From fcdde91eb085278e7381e39e746776ac47afa400 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Fri, 1 May 2020 22:16:35 +0200 Subject: [PATCH 22/26] CLN: add nice message for ValueError of 'origin' and 'offset' in resample --- pandas/core/resample.py | 25 +++++++++++++++----- pandas/tests/resample/test_datetime_index.py | 25 
++++++++++++++++++++ pandas/tests/resample/test_deprecated.py | 5 +++- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index c8ec7a310e90b..de6392a8b77f3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1356,18 +1356,31 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in {"epoch", "start_day", "start"}: + if origin in {"epoch", "start", "start_day"}: self.origin = origin else: - self.origin = Timestamp(origin) - self.offset = Timedelta(offset) if offset is not None else None + try: + self.origin = Timestamp(origin) + except Exception as e: + raise ValueError( + "'origin' should be equal to 'epoch', 'start', 'start_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead." + ) from e + + try: + self.offset = Timedelta(offset) if offset is not None else None + except Exception as e: + raise ValueError( + "'offset' should be a Timedelta convertible type. " + f"Got '{offset}' instead." + ) from e # always sort time groupers kwargs["sort"] = True # Handle deprecated arguments since v1.1.0 of `base` and `loffset` (GH #31809) if base is not None and offset is not None: - raise ValueError("`offset` and `base` cannot be present at the same time") + raise ValueError("'offset' and 'base' cannot be present at the same time") if base and isinstance(freq, Tick): # this conversion handle the default behavior of base and the @@ -1584,8 +1597,8 @@ def _get_period_bins(self, ax): bin_shift = 0 if isinstance(self.freq, Tick): - # GH 23882 & 31809: get adjusted bin edge labels with `origin` - # and `origin` support. This call only makes sense if the freq is a + # GH 23882 & 31809: get adjusted bin edge labels with 'origin' + # and 'offset' support. This call only makes sense if the freq is a # Tick since offset and origin are only used in those cases. # Not doing this check could create an extra empty bin. 
p_start, end = _get_period_range_edges( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index cf4588bca2970..e91bd4c6734aa 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -763,6 +763,31 @@ def test_resample_origin(): tm.assert_index_equal(resampled.index, exp_rng) +@pytest.mark.parametrize( + "origin", + ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], +) +def test_resample_bad_origin(origin): + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + msg = ("'origin' should be equal to 'epoch', 'start', 'start_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead.") + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin=origin) + + +@pytest.mark.parametrize( + "offset", + ["invalid_value", "12dayys", "2000-30-30", object()], +) +def test_resample_bad_offset(offset): + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + msg = f"'offset' should be a Timedelta convertible type. Got '{offset}' instead." 
+ with pytest.raises(ValueError, match=msg): + ts.resample("5min", offset=offset) + + def test_resample_origin_prime_freq(): # GH 31809 start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 45de17bc921c7..542992a948123 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -55,6 +55,9 @@ def test_deprecating_on_loffset_and_base(): df.resample("3T", base=0).sum() with tm.assert_produces_warning(FutureWarning): df.resample("3T", loffset="0s").sum() + msg = "'offset' and 'base' cannot be present at the same time" + with pytest.raises(ValueError, match=msg): + df.groupby("a").resample("3T", base=0, offset=0).sum() @all_ts @@ -228,7 +231,7 @@ def test_resample_with_non_zero_base(start, end, start_freq, end_freq, base, off result = s.resample(end_freq, base=base).mean() result = result.to_timestamp(end_freq) - # test that the replacement argument `offset` works + # test that the replacement argument 'offset' works result_offset = s.resample(end_freq, offset=offset).mean() result_offset = result_offset.to_timestamp(end_freq) tm.assert_series_equal(result, result_offset) From 1fec9460f4c28e2a8c6af1ed68a2f73ce2c715f4 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 2 May 2020 21:12:51 +0200 Subject: [PATCH 23/26] BUG: fix a bug when resampling in DST context --- pandas/core/resample.py | 4 +- pandas/tests/resample/test_datetime_index.py | 67 ++++++++++++++++++-- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index de6392a8b77f3..755059bf0adf1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1356,7 +1356,7 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in {"epoch", "start", "start_day"}: + if origin in ("epoch", "start", "start_day"): self.origin = origin else: try: @@ -1705,6 +1705,8 @@ def 
_get_timestamp_range_edges( # So "pretend" the dates are naive when adjusting the endpoints first = first.tz_localize(None) last = last.tz_localize(None) + if isinstance(origin, Timestamp): + origin = origin.tz_localize(None) first, last = _adjust_dates_anchored( first, last, freq, closed=closed, origin=origin, offset=offset, diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index e91bd4c6734aa..7b62e047a0c8c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -764,21 +764,21 @@ def test_resample_origin(): @pytest.mark.parametrize( - "origin", - ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], + "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], ) def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - msg = ("'origin' should be equal to 'epoch', 'start', 'start_day' or " - f"should be a Timestamp convertible type. Got '{origin}' instead.") + msg = ( + "'origin' should be equal to 'epoch', 'start', 'start_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead." 
+ ) with pytest.raises(ValueError, match=msg): ts.resample("5min", origin=origin) @pytest.mark.parametrize( - "offset", - ["invalid_value", "12dayys", "2000-30-30", object()], + "offset", ["invalid_value", "12dayys", "2000-30-30", object()], ) def test_resample_bad_offset(offset): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") @@ -846,6 +846,61 @@ def test_resample_origin_with_tz(): ts.resample("5min", origin="12/31/1999 23:57:00+03:00").mean() +def test_resample_origin_with_day_freq_on_dst(): + # GH 31809 + tz = "dateutil//usr/share/zoneinfo/America/Chicago" + + def _create_series(values, timestamps, freq="D"): + return pd.Series( + values, + index=pd.DatetimeIndex( + [Timestamp(t, tz=tz) for t in timestamps], freq=freq, ambiguous=True + ), + ) + + # test classical behavior of origin in a DST context + start = pd.Timestamp("2013-11-02", tz=tz) + end = pd.Timestamp("2013-11-03 23:59", tz=tz) + rng = pd.date_range(start, end, freq="1h") + ts = pd.Series(np.ones(len(rng)), index=rng) + + expected = _create_series([24.0, 25.0], ["2013-11-02", "2013-11-03"]) + for origin in ["epoch", "start", "start_day", start, None]: + result = ts.resample("D", origin=origin).sum() + tm.assert_series_equal(result, expected) + + # test complex behavior of origin/offset in a DST context + start = pd.Timestamp("2013-11-03", tz=tz) + end = pd.Timestamp("2013-11-03 23:59", tz=tz) + rng = pd.date_range(start, end, freq="1h") + ts = pd.Series(np.ones(len(rng)), index=rng) + + expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] + expected = _create_series([23.0, 2.0], expected_ts) + result = ts.resample("D", origin="start", offset="-2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"] + expected = _create_series([22.0, 3.0], expected_ts, freq="24H") + result = ts.resample("24H", origin="start", offset="-2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = 
["2013-11-02 02:00-05:00", "2013-11-03 02:00-06:00"] + expected = _create_series([3.0, 22.0], expected_ts) + result = ts.resample("D", origin="start", offset="2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 23:00-05:00", "2013-11-03 23:00-06:00"] + expected = _create_series([24.0, 1.0], expected_ts) + result = ts.resample("D", origin="start", offset="-1H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 01:00-05:00", "2013-11-03 01:00:00-0500"] + expected = _create_series([1.0, 24.0], expected_ts) + result = ts.resample("D", origin="start", offset="1H").sum() + tm.assert_series_equal(result, expected) + + def test_resample_daily_anchored(): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") ts = Series(np.random.randn(len(rng)), index=rng) From 5695ffb74f9b3175d2b961e23a7cccdfb86e6e5b Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 2 May 2020 21:39:14 +0200 Subject: [PATCH 24/26] TST: fix deprecation test --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/tests/resample/test_deprecated.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3a462ae50978e..44797d3296c80 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -176,7 +176,7 @@ Small example of the use of ``origin``: ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts -Resample with the default behavior 'start_day' (origin is 2000-10-01 00:00:00): +Resample with the default behavior ``'start_day'`` (origin is ``2000-10-01 00:00:00``): .. 
ipython:: python diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 542992a948123..d51b36a9f546e 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -56,8 +56,9 @@ def test_deprecating_on_loffset_and_base(): with tm.assert_produces_warning(FutureWarning): df.resample("3T", loffset="0s").sum() msg = "'offset' and 'base' cannot be present at the same time" - with pytest.raises(ValueError, match=msg): - df.groupby("a").resample("3T", base=0, offset=0).sum() + with tm.assert_produces_warning(FutureWarning): + with pytest.raises(ValueError, match=msg): + df.groupby("a").resample("3T", base=0, offset=0).sum() @all_ts From de6b477eec7bc89fd15201900868904115e50e0b Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 2 May 2020 22:31:11 +0200 Subject: [PATCH 25/26] TST: using pytz instead of dateutil in test of test_resample_origin_with_day_freq_on_dst --- pandas/tests/resample/test_datetime_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 7b62e047a0c8c..fe005801aaa53 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -848,7 +848,7 @@ def test_resample_origin_with_tz(): def test_resample_origin_with_day_freq_on_dst(): # GH 31809 - tz = "dateutil//usr/share/zoneinfo/America/Chicago" + tz = "America/Chicago" def _create_series(values, timestamps, freq="D"): return pd.Series( From 05ddd9b6cd3b1769abade95e94c45728005a9a97 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Sat, 9 May 2020 23:50:56 +0200 Subject: [PATCH 26/26] CLN: remove unused import --- pandas/tests/resample/test_deprecated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index d51b36a9f546e..8b3adbf08d157 100644 --- 
a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -8,7 +8,7 @@ import pandas._testing as tm from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range -from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.indexes.timedeltas import timedelta_range from pandas.tseries.offsets import BDay, Minute