diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 6ba58310000cb..076c1313eec4e 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1572,10 +1572,9 @@ end of the interval is closed: ts.resample('5Min', closed='left').mean() -Parameters like ``label`` and ``loffset`` are used to manipulate the resulting -labels. ``label`` specifies whether the result is labeled with the beginning or -the end of the interval. ``loffset`` performs a time adjustment on the output -labels. +Parameters like ``label`` are used to manipulate the resulting labels. +``label`` specifies whether the result is labeled with the beginning or +the end of the interval. .. ipython:: python @@ -1583,8 +1582,6 @@ labels. ts.resample('5Min', label='left').mean() - ts.resample('5Min', label='left', loffset='1s').mean() - .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all @@ -1789,6 +1786,58 @@ natural and functions similarly to :py:func:`itertools.groupby`: See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. +.. _timeseries.adjust-the-start-of-the-bins: + +Use `origin` or `offset` to adjust the start of the bins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. + +For example: + +.. 
ipython:: python + + start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + middle = '2000-10-02 00:00:00' + rng = pd.date_range(start, end, freq='7min') + ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + ts + +Here we can see that, when using ``origin`` with its default value (``'start_day'``), the results after ``'2000-10-02 00:00:00'`` are not identical depending on the start of the time series: + +.. ipython:: python + + ts.resample('17min', origin='start_day').sum() + ts[middle:end].resample('17min', origin='start_day').sum() + + +Here we can see that, when setting ``origin`` to ``'epoch'``, the results after ``'2000-10-02 00:00:00'`` are identical regardless of the start of the time series: + +.. ipython:: python + + ts.resample('17min', origin='epoch').sum() + ts[middle:end].resample('17min', origin='epoch').sum() + + +If needed you can use a custom timestamp for ``origin``: + +.. ipython:: python + + ts.resample('17min', origin='2001-01-01').sum() + ts[middle:end].resample('17min', origin=pd.Timestamp('2001-01-01')).sum() + +If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. +Those two examples are equivalent for this time series: + +.. ipython:: python + + ts.resample('17min', origin='start').sum() + ts.resample('17min', offset='23h30min').sum() + + +Note the use of ``'start'`` for ``origin`` in the last example. In that case, ``origin`` will be set to the first value of the timeseries. .. _timeseries.periods: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 092bd3345efbc..44797d3296c80 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -152,6 +152,49 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') +.. 
_whatsnew_110.grouper_resample_origin: + +Grouper and resample now support the arguments origin and offset +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Grouper` and :meth:`DataFrame.resample` now support the arguments ``origin`` and ``offset``. It lets the user control the timestamp on which to adjust the grouping. (:issue:`31809`) + +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criterion. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. + +Two arguments are now deprecated (more information in the documentation of :meth:`DataFrame.resample`): + +- ``base`` should be replaced by ``offset``. +- ``loffset`` should be replaced by directly adding an offset to the index of the DataFrame after it has been resampled. + +Small example of the use of ``origin``: + +.. ipython:: python + + start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + middle = '2000-10-02 00:00:00' + rng = pd.date_range(start, end, freq='7min') + ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + ts + +Resample with the default behavior ``'start_day'`` (origin is ``2000-10-01 00:00:00``): + +.. ipython:: python + + ts.resample('17min').sum() + ts.resample('17min', origin='start_day').sum() + +Resample using a fixed origin: + +.. ipython:: python + + ts.resample('17min', origin='epoch').sum() + ts.resample('17min', origin='2000-01-01').sum() + +If needed you can adjust the bins with the argument ``offset`` (a Timedelta) that would be added to the default ``origin``. + +For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. + + .. 
_whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/_typing.py b/pandas/_typing.py index d225b845970cc..71df27119bd96 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,3 +1,4 @@ +from datetime import datetime, timedelta from pathlib import Path from typing import ( IO, @@ -43,6 +44,15 @@ PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar] +# timestamp and timedelta convertible types + +TimestampConvertibleTypes = Union[ + "Timestamp", datetime, np.datetime64, int, np.int64, float, str +] +TimedeltaConvertibleTypes = Union[ + "Timedelta", timedelta, np.timedelta64, int, np.int64, float, str +] + # other Dtype = Union[ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 792e5a1228fe6..26691c3f1cc0c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,6 +39,8 @@ Label, Level, Renamer, + TimedeltaConvertibleTypes, + TimestampConvertibleTypes, ValueKeyFunc, ) from pandas.compat import set_function_name @@ -7760,9 +7762,11 @@ def resample( convention: str = "start", kind: Optional[str] = None, loffset=None, - base: int = 0, + base: Optional[int] = None, on=None, level=None, + origin: Union[str, TimestampConvertibleTypes] = "start_day", + offset: Optional[TimedeltaConvertibleTypes] = None, ) -> "Resampler": """ Resample time-series data. @@ -7797,17 +7801,40 @@ def resample( By default the input representation is retained. loffset : timedelta, default None Adjust the resampled time labels. + + .. deprecated:: 1.1.0 + You should add the loffset to the `df.index` after the resample. + See below. + base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0. + + .. deprecated:: 1.1.0 + The new arguments that you should use are 'offset' or 'origin'. 
+ on : str, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. - level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + + .. versionadded:: 1.1.0 + + offset : Timedelta or str, default is None + An offset timedelta added to the origin. + + .. versionadded:: 1.1.0 Returns ------- @@ -8025,6 +8052,88 @@ def resample( 2000-01-02 22 140 2000-01-03 32 150 2000-01-04 36 90 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7T, dtype: int64 + + >>> ts.resample('17min').sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17T, dtype: int64 + + >>> ts.resample('17min', origin='epoch').sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17T, dtype: int64 + + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 + + If 
you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.resample('17min', origin='start').sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + >>> ts.resample('17min', offset='23h30min').sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + + >>> ts.resample('17min', offset='2min').sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17T, dtype: int64 + + To replace the use of the deprecated `loffset` argument: + + >>> from pandas.tseries.frequencies import to_offset + >>> loffset = '19min' + >>> ts_out = ts.resample('17min').sum() + >>> ts_out.index = ts_out.index + to_offset(loffset) + >>> ts_out + 2000-10-01 23:33:00 0 + 2000-10-01 23:50:00 9 + 2000-10-02 00:07:00 21 + 2000-10-02 00:24:00 54 + 2000-10-02 00:41:00 24 + Freq: 17T, dtype: int64 """ from pandas.core.resample import get_resampler @@ -8041,6 +8150,8 @@ def resample( base=base, key=on, level=level, + origin=origin, + offset=offset, ) def first(self: FrameOrSeries, offset) -> FrameOrSeries: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b92ff1c7c8ca4..c71085cd4918a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1646,15 +1646,6 @@ def resample(self, rule, *args, **kwargs): 0 2000-01-01 00:00:00 0 1 2000-01-01 00:03:00 0 2 5 2000-01-01 00:03:00 5 1 - - Add an offset of twenty seconds. 
- - >>> df.groupby('a').resample('3T', loffset='20s').sum() - a b - a - 0 2000-01-01 00:00:20 0 2 - 2000-01-01 00:03:20 0 1 - 5 2000-01-01 00:00:20 5 1 """ from pandas.core.resample import get_resampler_for_grouping diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 948b4ba27f705..9660fb9c2e1b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,8 +2,8 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. """ - from typing import Dict, Hashable, List, Optional, Tuple +import warnings import numpy as np @@ -67,9 +67,38 @@ class Grouper: If grouper is PeriodIndex and `freq` parameter is passed. base : int, default 0 Only when `freq` parameter is passed. + For frequencies that evenly subdivide 1 day, the "origin" of the + aggregated intervals. For example, for '5min' frequency, base could + range from 0 through 4. Defaults to 0. + + .. deprecated:: 1.1.0 + The new arguments that you should use are 'offset' or 'origin'. + loffset : str, DateOffset, timedelta object Only when `freq` parameter is passed. + .. deprecated:: 1.1.0 + loffset is only working for ``.resample(...)`` and not for + Grouper (:issue:`28302`). + However, loffset is also deprecated for ``.resample(...)`` + See: :class:`DataFrame.resample` + + origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + + .. versionadded:: 1.1.0 + + offset : Timedelta or str, default is None + An offset timedelta added to the origin. + + .. 
versionadded:: 1.1.0 + Returns ------- A specification for a groupby instruction @@ -123,6 +152,74 @@ class Grouper: 2000-01-02 0.5 15.0 2000-01-09 2.0 30.0 2000-01-16 3.0 40.0 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min')).sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17T, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + + >>> ts.groupby(pd.Grouper(freq='17min', 
offset='2min')).sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17T, dtype: int64 """ _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") @@ -131,6 +228,43 @@ def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: from pandas.core.resample import TimeGrouper + # Deprecation warning of `base` and `loffset` since v1.1.0: + # we are raising the warning here to be able to set the `stacklevel` + # properly since we need to raise the `base` and `loffset` deprecation + # warning from three different cases: + # core/generic.py::NDFrame.resample + # core/groupby/groupby.py::GroupBy.resample + # core/groupby/grouper.py::Grouper + # raising these warnings from TimeGrouper directly would fail the test: + # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base + + # hacky way to set the stacklevel: if cls is TimeGrouper it means + # that the call comes from a pandas internal call of resample, + # otherwise it comes from pd.Grouper + stacklevel = 4 if cls is TimeGrouper else 2 + if kwargs.get("base", None) is not None: + warnings.warn( + "'base' in .resample() and in Grouper() is deprecated.\n" + "The new arguments that you should use are 'offset' or 'origin'.\n" + '\n>>> df.resample(freq="3s", base=2)\n' + "\nbecomes:\n" + '\n>>> df.resample(freq="3s", offset="2s")\n', + FutureWarning, + stacklevel=stacklevel, + ) + + if kwargs.get("loffset", None) is not None: + warnings.warn( + "'loffset' in .resample() and in Grouper() is deprecated.\n" + '\n>>> df.resample(freq="3s", loffset="8H")\n' + "\nbecomes:\n" + "\n>>> from pandas.tseries.frequencies import to_offset" + '\n>>> df = df.resample(freq="3s").mean()' + '\n>>> df.index = df.index.to_timestamp() + to_offset("8H")\n', + FutureWarning, + stacklevel=stacklevel, + ) + cls = TimeGrouper return super().__new__(cls) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 
b8c45f26301a4..755059bf0adf1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,14 +1,15 @@ import copy from datetime import timedelta from textwrap import dedent -from typing import Dict, no_type_check +from typing import Dict, Optional, Union, no_type_check import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import NaT, Period, Timestamp +from pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._typing import TimedeltaConvertibleTypes, TimestampConvertibleTypes from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, doc @@ -66,8 +67,9 @@ class Resampler(_GroupBy, ShallowMixin): "label", "convention", "loffset", - "base", "kind", + "origin", + "offset", ] def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): @@ -244,7 +246,7 @@ def pipe(self, func, *args, **kwargs): >>> r = s.resample('2s') DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left, - label=left, convention=start, base=0] + label=left, convention=start] >>> r.agg(np.sum) 2013-01-01 00:00:00 3 @@ -1298,22 +1300,25 @@ class TimeGrouper(Grouper): "loffset", "kind", "convention", - "base", + "origin", + "offset", ) def __init__( self, freq="Min", - closed=None, - label=None, + closed: Optional[str] = None, + label: Optional[str] = None, how="mean", axis=0, fill_method=None, limit=None, loffset=None, - kind=None, - convention=None, - base=0, + kind: Optional[str] = None, + convention: Optional[str] = None, + base: Optional[int] = None, + origin: Union[str, TimestampConvertibleTypes] = "start_day", + offset: Optional[TimedeltaConvertibleTypes] = None, **kwargs, ): # Check for correctness of the keyword arguments which would @@ -1347,18 +1352,48 @@ def __init__( self.convention = convention or 
"E" self.convention = self.convention.lower() - if isinstance(loffset, str): - loffset = to_offset(loffset) - self.loffset = loffset - self.how = how self.fill_method = fill_method self.limit = limit - self.base = base + + if origin in ("epoch", "start", "start_day"): + self.origin = origin + else: + try: + self.origin = Timestamp(origin) + except Exception as e: + raise ValueError( + "'origin' should be equal to 'epoch', 'start', 'start_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead." + ) from e + + try: + self.offset = Timedelta(offset) if offset is not None else None + except Exception as e: + raise ValueError( + "'offset' should be a Timedelta convertible type. " + f"Got '{offset}' instead." + ) from e # always sort time groupers kwargs["sort"] = True + # Handle deprecated arguments since v1.1.0 of `base` and `loffset` (GH #31809) + if base is not None and offset is not None: + raise ValueError("'offset' and 'base' cannot be present at the same time") + + if base and isinstance(freq, Tick): + # this conversion handle the default behavior of base and the + # special case of GH #10530. Indeed in case when dealing with + # a TimedeltaIndex base was treated as a 'pure' offset even though + # the default behavior of base was equivalent of a modulo on + # freq_nanos. 
+ self.offset = Timedelta(base * freq.nanos // freq.n) + + if isinstance(loffset, str): + loffset = to_offset(loffset) + self.loffset = loffset + super().__init__(freq=freq, axis=axis, **kwargs) def _get_resampler(self, obj, kind=None): @@ -1414,7 +1449,12 @@ def _get_time_bins(self, ax): return binner, [], labels first, last = _get_timestamp_range_edges( - ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base + ax.min(), + ax.max(), + self.freq, + closed=self.closed, + origin=self.origin, + offset=self.offset, ) # GH #12037 # use first/last directly instead of call replace() on them @@ -1499,11 +1539,11 @@ def _get_time_delta_bins(self, ax): end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side="left") - if self.base > 0: - # GH #10530 - labels += type(self.freq)(self.base) + if self.offset: + # GH 10530 & 31809 + labels += self.offset if self.loffset: - # GH #33498 + # GH 33498 labels += self.loffset return binner, bins, labels @@ -1556,11 +1596,18 @@ def _get_period_bins(self, ax): end = ax.max().asfreq(self.freq, how="end") bin_shift = 0 - # GH 23882 - if self.base: - # get base adjusted bin edge labels + if isinstance(self.freq, Tick): + # GH 23882 & 31809: get adjusted bin edge labels with 'origin' + # and 'origin' support. This call only makes sense if the freq is a + # Tick since offset and origin are only used in those cases. + # Not doing this check could create an extra empty bin. 
p_start, end = _get_period_range_edges( - start, end, self.freq, closed=self.closed, base=self.base + start, + end, + self.freq, + closed=self.closed, + origin=self.origin, + offset=self.offset, ) # Get offset for bin edge (not label edge) adjustment @@ -1612,7 +1659,9 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): +def _get_timestamp_range_edges( + first, last, freq, closed="left", origin="start_day", offset=None +): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following @@ -1626,49 +1675,62 @@ def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): The beginning Timestamp of the range to be adjusted. last : pd.Timestamp The ending Timestamp of the range to be adjusted. - offset : pd.DateOffset + freq : pd.DateOffset The dateoffset to which the Timestamps will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - base : int, default 0 - The "origin" of the adjusted Timestamps. + origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. Returns ------- A tuple of length 2, containing the adjusted pd.Timestamp objects. 
""" - if isinstance(offset, Tick): - if isinstance(offset, Day): + index_tz = first.tz + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): + raise ValueError("The origin must have the same timezone as the index.") + + if isinstance(freq, Tick): + if isinstance(freq, Day): # _adjust_dates_anchored assumes 'D' means 24H, but first/last # might contain a DST transition (23H, 24H, or 25H). # So "pretend" the dates are naive when adjusting the endpoints - tz = first.tz first = first.tz_localize(None) last = last.tz_localize(None) + if isinstance(origin, Timestamp): + origin = origin.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, offset, closed=closed, base=base + first, last, freq, closed=closed, origin=origin, offset=offset, ) - if isinstance(offset, Day): - first = first.tz_localize(tz) - last = last.tz_localize(tz) - return first, last - + if isinstance(freq, Day): + first = first.tz_localize(index_tz) + last = last.tz_localize(index_tz) else: first = first.normalize() last = last.normalize() - if closed == "left": - first = Timestamp(offset.rollback(first)) - else: - first = Timestamp(first - offset) + if closed == "left": + first = Timestamp(freq.rollback(first)) + else: + first = Timestamp(first - freq) - last = Timestamp(last + offset) + last = Timestamp(last + freq) return first, last -def _get_period_range_edges(first, last, offset, closed="left", base=0): +def _get_period_range_edges( + first, last, freq, closed="left", origin="start_day", offset=None +): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them. @@ -1679,12 +1741,21 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): The beginning Period of the range to be adjusted. last : pd.Period The ending Period of the range to be adjusted. - offset : pd.DateOffset - The dateoffset to which the Periods will be adjusted. 
+ freq : pd.DateOffset + The freq to which the Periods will be adjusted. closed : {'right', 'left'}, default None Which side of bin interval is closed. - base : int, default 0 - The "origin" of the adjusted Periods. + origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + + If a timestamp is not used, these values are also supported: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + offset : pd.Timedelta, default is None + An offset timedelta added to the origin. Returns ------- @@ -1696,52 +1767,58 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): # GH 23882 first = first.to_timestamp() last = last.to_timestamp() - adjust_first = not offset.is_on_offset(first) - adjust_last = offset.is_on_offset(last) + adjust_first = not freq.is_on_offset(first) + adjust_last = freq.is_on_offset(last) first, last = _get_timestamp_range_edges( - first, last, offset, closed=closed, base=base + first, last, freq, closed=closed, origin=origin, offset=offset, ) - first = (first + int(adjust_first) * offset).to_period(offset) - last = (last - int(adjust_last) * offset).to_period(offset) + first = (first + int(adjust_first) * freq).to_period(freq) + last = (last - int(adjust_last) * freq).to_period(freq) return first, last -def _adjust_dates_anchored(first, last, offset, closed="right", base=0): +def _adjust_dates_anchored( + first, last, freq, closed="right", origin="start_day", offset=None +): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is - # not a multiple of the frequency. - # - # See https://github.com/pandas-dev/pandas/issues/8683 + # not a multiple of the frequency. 
See GH 8683 + # To handle frequencies that are not multiple or divisible by a day we let + # the possibility to define a fixed origin timestamp. See GH 31809 + origin_nanos = 0 # origin == "epoch" + if origin == "start_day": + origin_nanos = first.normalize().value + elif origin == "start": + origin_nanos = first.value + elif isinstance(origin, Timestamp): + origin_nanos = origin.value + origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. If first and last contain timezone information, # Perform the calculation in UTC in order to avoid localizing on an # Ambiguous or Nonexistent time. first_tzinfo = first.tzinfo last_tzinfo = last.tzinfo - start_day_nanos = first.normalize().value if first_tzinfo is not None: first = first.tz_convert("UTC") if last_tzinfo is not None: last = last.tz_convert("UTC") - base_nanos = (base % offset.n) * offset.nanos // offset.n - start_day_nanos += base_nanos - - foffset = (first.value - start_day_nanos) % offset.nanos - loffset = (last.value - start_day_nanos) % offset.nanos + foffset = (first.value - origin_nanos) % freq.nanos + loffset = (last.value - origin_nanos) % freq.nanos if closed == "right": if foffset > 0: # roll back fresult = first.value - foffset else: - fresult = first.value - offset.nanos + fresult = first.value - freq.nanos if loffset > 0: # roll forward - lresult = last.value + (offset.nanos - loffset) + lresult = last.value + (freq.nanos - loffset) else: # already the end of the road lresult = last.value @@ -1754,9 +1831,9 @@ def _adjust_dates_anchored(first, last, offset, closed="right", base=0): if loffset > 0: # roll forward - lresult = last.value + (offset.nanos - loffset) + lresult = last.value + (freq.nanos - loffset) else: - lresult = last.value + offset.nanos + lresult = last.value + freq.nanos fresult = Timestamp(fresult) lresult = Timestamp(lresult) if first_tzinfo is not None: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 
d0559923fec51..485535bec20d0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import numpy as np import pytest @@ -9,7 +9,7 @@ from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.period import period_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import _asfreq_compat @@ -194,29 +194,6 @@ def test_resample_empty_dtypes(index, dtype, resample_method): pass -@all_ts -@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) -def test_resample_loffset_arg_type(frame, create_index, arg): - # GH 13218, 15002 - df = frame - expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] - expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") - - # loffset coerces PeriodIndex to DateTimeIndex - if isinstance(expected_index, PeriodIndex): - expected_index = expected_index.to_timestamp() - - expected_index += timedelta(hours=2) - expected = DataFrame({"value": expected_means}, index=expected_index) - - result_agg = df.resample("2D", loffset="2H").agg(arg) - - if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - - tm.assert_frame_equal(result_agg, expected) - - @all_ts def test_apply_to_empty_series(empty_series_dti): # GH 14313 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 0c364d37f039e..fe005801aaa53 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime from functools import partial from io import StringIO @@ 
-18,7 +18,7 @@ from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.tseries.offsets as offsets -from pandas.tseries.offsets import BDay, Minute +from pandas.tseries.offsets import Minute @pytest.fixture() @@ -412,70 +412,6 @@ def test_resample_frame_basic(): df.resample("W-WED", kind="period").mean() -@pytest.mark.parametrize( - "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] -) -def test_resample_loffset(loffset): - # GH 7687 - rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") - s = Series(np.random.randn(14), index=rng) - - result = s.resample("5min", closed="right", label="right", loffset=loffset).mean() - idx = date_range("1/1/2000", periods=4, freq="5min") - expected = Series( - [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=idx + timedelta(minutes=1), - ) - tm.assert_series_equal(result, expected) - assert result.index.freq == Minute(5) - - # from daily - dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") - ser = Series(np.random.rand(len(dti)), dti) - - # to weekly - result = ser.resample("w-sun").last() - business_day_offset = BDay() - expected = ser.resample("w-sun", loffset=-business_day_offset).last() - assert result.index[0] - business_day_offset == expected.index[0] - - -def test_resample_loffset_upsample(): - # GH 20744 - rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") - s = Series(np.random.randn(14), index=rng) - - result = s.resample( - "5min", closed="right", label="right", loffset=timedelta(minutes=1) - ).ffill() - idx = date_range("1/1/2000", periods=4, freq="5min") - expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) - - tm.assert_series_equal(result, expected) - - -def test_resample_loffset_count(): - # GH 12725 - start_time = "1/1/2000 00:00:00" - rng = date_range(start_time, periods=100, freq="S") - ts = Series(np.random.randn(len(rng)), index=rng) - - result = 
ts.resample("10S", loffset="1s").count() - - expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( - seconds=1 - ) - expected = Series(10, index=expected_index) - - tm.assert_series_equal(result, expected) - - # Same issue should apply to .size() since it goes through - # same code path - result = ts.resample("10S", loffset="1s").size() - - tm.assert_series_equal(result, expected) - - def test_resample_upsample(): # from daily dti = date_range( @@ -791,27 +727,177 @@ def test_resample_single_group(): tm.assert_series_equal(result, expected) -def test_resample_base(): +def test_resample_offset(): + # GH 31809 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample("5min", base=2).mean() + resampled = ts.resample("5min", offset="2min").mean() exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") tm.assert_index_equal(resampled.index, exp_rng) -def test_resample_float_base(): - # GH25161 - dt = pd.to_datetime( - ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] - ) - s = Series(np.arange(3), index=dt) +def test_resample_origin(): + # GH 31809 + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) - base = 17 + 43.51 / 60 - result = s.resample("3min", base=base).size() - expected = Series( - 3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"], freq="3min") + exp_rng = date_range("1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min") + + resampled = ts.resample("5min", origin="1999-12-31 23:57:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + offset_timestamp = pd.Timestamp(0) + pd.Timedelta("2min") + resampled = ts.resample("5min", origin=offset_timestamp).mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + tm.assert_index_equal(resampled.index, 
exp_rng) + + # origin of '1999-12-31 12:02:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", offset="-3m").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + +@pytest.mark.parametrize( + "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], +) +def test_resample_bad_origin(origin): + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + msg = ( + "'origin' should be equal to 'epoch', 'start', 'start_day' or " + f"should be a Timestamp convertible type. Got '{origin}' instead." + ) + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin=origin) + + +@pytest.mark.parametrize( + "offset", ["invalid_value", "12dayys", "2000-30-30", object()], +) +def test_resample_bad_offset(offset): + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + msg = f"'offset' should be a Timedelta convertible type. Got '{offset}' instead." 
+ with pytest.raises(ValueError, match=msg): + ts.resample("5min", offset=offset) + + +def test_resample_origin_prime_freq(): + # GH 31809 + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + rng = pd.date_range(start, end, freq="7min") + ts = Series(np.random.randn(len(rng)), index=rng) + + exp_rng = date_range("2000-10-01 23:14:00", "2000-10-02 00:22:00", freq="17min") + resampled = ts.resample("17min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", origin="start_day").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:30:00", "2000-10-02 00:21:00", freq="17min") + resampled = ts.resample("17min", origin="start").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", offset="23h30min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + resampled = ts.resample("17min", origin="start_day", offset="23h30min").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:18:00", "2000-10-02 00:26:00", freq="17min") + resampled = ts.resample("17min", origin="epoch").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + exp_rng = date_range("2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min") + resampled = ts.resample("17min", origin="2000-01-01").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_origin_with_tz(): + # GH 31809 + msg = "The origin must have the same timezone as the index." 
+ + tz = "Europe/Paris" + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s", tz=tz) + ts = Series(np.random.randn(len(rng)), index=rng) + + exp_rng = date_range("1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz) + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + # origin of '1999-12-31 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin="12/31/1999 23:57:00").mean() + + # if the series is not tz aware, origin should not be tz aware + rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ts.resample("5min", origin="12/31/1999 23:57:00+03:00").mean() + + +def test_resample_origin_with_day_freq_on_dst(): + # GH 31809 + tz = "America/Chicago" + + def _create_series(values, timestamps, freq="D"): + return pd.Series( + values, + index=pd.DatetimeIndex( + [Timestamp(t, tz=tz) for t in timestamps], freq=freq, ambiguous=True + ), + ) + + # test classical behavior of origin in a DST context + start = pd.Timestamp("2013-11-02", tz=tz) + end = pd.Timestamp("2013-11-03 23:59", tz=tz) + rng = pd.date_range(start, end, freq="1h") + ts = pd.Series(np.ones(len(rng)), index=rng) + + expected = _create_series([24.0, 25.0], ["2013-11-02", "2013-11-03"]) + for origin in ["epoch", "start", "start_day", start, None]: + result = ts.resample("D", origin=origin).sum() + tm.assert_series_equal(result, expected) + + # test complex behavior of origin/offset in a DST context + start = pd.Timestamp("2013-11-03", tz=tz) + end = pd.Timestamp("2013-11-03 
23:59", tz=tz) + rng = pd.date_range(start, end, freq="1h") + ts = pd.Series(np.ones(len(rng)), index=rng) + + expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] + expected = _create_series([23.0, 2.0], expected_ts) + result = ts.resample("D", origin="start", offset="-2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"] + expected = _create_series([22.0, 3.0], expected_ts, freq="24H") + result = ts.resample("24H", origin="start", offset="-2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 02:00-05:00", "2013-11-03 02:00-06:00"] + expected = _create_series([3.0, 22.0], expected_ts) + result = ts.resample("D", origin="start", offset="2H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 23:00-05:00", "2013-11-03 23:00-06:00"] + expected = _create_series([24.0, 1.0], expected_ts) + result = ts.resample("D", origin="start", offset="-1H").sum() + tm.assert_series_equal(result, expected) + + expected_ts = ["2013-11-02 01:00-05:00", "2013-11-03 01:00:00-0500"] + expected = _create_series([1.0, 24.0], expected_ts) + result = ts.resample("D", origin="start", offset="1H").sum() tm.assert_series_equal(result, expected) @@ -1588,7 +1674,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): @pytest.mark.parametrize( - "first,last,offset,exp_first,exp_last", + "first,last,freq,exp_first,exp_last", [ ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), @@ -1598,17 +1684,17 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): ("1991-08", "1992-04", "M", "19910831", "19920531"), ], ) -def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): +def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last): first = pd.Period(first) first = first.to_timestamp(first.freq) last = pd.Period(last) last = 
last.to_timestamp(last.freq) - exp_first = pd.Timestamp(exp_first, freq=offset) - exp_last = pd.Timestamp(exp_last, freq=offset) + exp_first = pd.Timestamp(exp_first, freq=freq) + exp_last = pd.Timestamp(exp_last, freq=freq) - offset = pd.tseries.frequencies.to_offset(offset) - result = _get_timestamp_range_edges(first, last, offset) + freq = pd.tseries.frequencies.to_offset(freq) + result = _get_timestamp_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py new file mode 100644 index 0000000000000..8b3adbf08d157 --- /dev/null +++ b/pandas/tests/resample/test_deprecated.py @@ -0,0 +1,263 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.timedeltas import timedelta_range + +from pandas.tseries.offsets import BDay, Minute + +DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") + +all_ts = pytest.mark.parametrize( + "_index_factory,_series_name,_index_start,_index_end", + [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], +) + + +@pytest.fixture() +def _index_factory(): + return period_range + + +@pytest.fixture +def create_index(_index_factory): + def _create_index(*args, **kwargs): + """ return the _index_factory created using the args, kwargs """ + return _index_factory(*args, **kwargs) + + return _create_index + + +# new test to check that all FutureWarning are triggered +def test_deprecating_on_loffset_and_base(): + # GH 31809 + + idx = pd.date_range("2001-01-01", periods=4, freq="T") + df = 
pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"]) + + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", base=0) + with tm.assert_produces_warning(FutureWarning): + pd.Grouper(freq="10s", loffset="0s") + with tm.assert_produces_warning(FutureWarning): + df.groupby("a").resample("3T", base=0).sum() + with tm.assert_produces_warning(FutureWarning): + df.groupby("a").resample("3T", loffset="0s").sum() + with tm.assert_produces_warning(FutureWarning): + df.resample("3T", base=0).sum() + with tm.assert_produces_warning(FutureWarning): + df.resample("3T", loffset="0s").sum() + msg = "'offset' and 'base' cannot be present at the same time" + with tm.assert_produces_warning(FutureWarning): + with pytest.raises(ValueError, match=msg): + df.groupby("a").resample("3T", base=0, offset=0).sum() + + +@all_ts +@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_resample_loffset_arg_type(frame, create_index, arg): + # GH 13218, 15002 + df = frame + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + if isinstance(expected_index, PeriodIndex): + expected_index = expected_index.to_timestamp() + + expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H").agg(arg) + + if isinstance(arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + + tm.assert_frame_equal(result_agg, expected) + + +@pytest.mark.parametrize( + "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] +) +def test_resample_loffset(loffset): + # GH 7687 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + with 
tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=loffset + ).mean() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1), + ) + tm.assert_series_equal(result, expected) + assert result.index.freq == Minute(5) + + # from daily + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") + ser = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = ser.resample("w-sun").last() + business_day_offset = BDay() + with tm.assert_produces_warning(FutureWarning): + expected = ser.resample("w-sun", loffset=-business_day_offset).last() + assert result.index[0] - business_day_offset == expected.index[0] + + +def test_resample_loffset_upsample(): + # GH 20744 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + with tm.assert_produces_warning(FutureWarning): + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) + + tm.assert_series_equal(result, expected) + + +def test_resample_loffset_count(): + # GH 12725 + start_time = "1/1/2000 00:00:00" + rng = date_range(start_time, periods=100, freq="S") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").count() + + expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( + seconds=1 + ) + expected = Series(10, index=expected_index) + + tm.assert_series_equal(result, expected) + + # Same issue should apply to .size() since it goes through + # same code path + with tm.assert_produces_warning(FutureWarning): + result = ts.resample("10S", loffset="1s").size() + + 
tm.assert_series_equal(result, expected) + + +def test_resample_base(): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + resampled = ts.resample("5min", base=2).mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_float_base(): + # GH25161 + dt = pd.to_datetime( + ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] + ) + s = Series(np.arange(3), index=dt) + + base = 17 + 43.51 / 60 + with tm.assert_produces_warning(FutureWarning): + result = s.resample("3min", base=base).size() + expected = Series( + 3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"], freq="3min") + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("kind", ["period", None, "timestamp"]) +@pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_loffset_returns_datetimeindex(frame, kind, agg_arg): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = frame + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + expected_index = expected_index.to_timestamp() + expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + with tm.assert_produces_warning(FutureWarning): + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + tm.assert_frame_equal(result_agg, expected) + + +@pytest.mark.parametrize( + "start,end,start_freq,end_freq,base,offset", + [ + ("19910905", "19910909 03:00", "H", 
"24H", 10, "10H"), + ("19910905", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 23:00", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905", "19910909 10:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34, "34H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10, "10H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3, "3H"), + ("19910905 12:00", "19910909 1:00", "H", "M", 3, "3H"), + ("19910905", "19910913 06:00", "2H", "24H", 10, "10H"), + ("19910905", "19910905 01:39", "Min", "5Min", 3, "3Min"), + ("19910905", "19910905 03:18", "2Min", "5Min", 3, "3Min"), + ], +) +def test_resample_with_non_zero_base(start, end, start_freq, end_freq, base, offset): + # GH 23882 + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) + with tm.assert_produces_warning(FutureWarning): + result = s.resample(end_freq, base=base).mean() + result = result.to_timestamp(end_freq) + + # test that the replacement argument 'offset' works + result_offset = s.resample(end_freq, offset=offset).mean() + result_offset = result_offset.to_timestamp(end_freq) + tm.assert_series_equal(result, result_offset) + + # to_timestamp casts 24H -> D + result = result.asfreq(end_freq) if end_freq == "24H" else result + with tm.assert_produces_warning(FutureWarning): + expected = s.to_timestamp().resample(end_freq, base=base).mean() + if end_freq == "M": + # TODO: is non-tick the relevant characteristic? 
(GH 33815) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + +def test_resample_base_with_timedeltaindex(): + # GH 10530 + rng = timedelta_range(start="0s", periods=25, freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with tm.assert_produces_warning(FutureWarning): + with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() + + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") + + tm.assert_index_equal(without_base.index, exp_without_base) + tm.assert_index_equal(with_base.index, exp_with_base) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index ebc75018bb52d..3db9a91118ebc 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import dateutil import numpy as np @@ -719,27 +719,6 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample("7D").sum() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) - @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) - def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): - # make sure passing loffset returns DatetimeIndex in all cases - # basic method taken from Base.test_resample_loffset_arg_type() - df = frame - expected_means = [ - df.values[i : i + 2].mean() for i in range(0, len(df.values), 2) - ] - expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") - - # loffset coerces PeriodIndex to DateTimeIndex - expected_index = expected_index.to_timestamp() - expected_index += timedelta(hours=2) - expected = DataFrame({"value": expected_means}, index=expected_index) - - result_agg = df.resample("2D", loffset="2H", 
kind=kind).agg(agg_arg) - if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - tm.assert_frame_equal(result_agg, expected) - @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): @@ -815,42 +794,41 @@ def test_resample_with_only_nat(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "start,end,start_freq,end_freq,base", + "start,end,start_freq,end_freq,offset", [ - ("19910905", "19910909 03:00", "H", "24H", 10), - ("19910905", "19910909 12:00", "H", "24H", 10), - ("19910905", "19910909 23:00", "H", "24H", 10), - ("19910905 10:00", "19910909", "H", "24H", 10), - ("19910905 10:00", "19910909 10:00", "H", "24H", 10), - ("19910905", "19910909 10:00", "H", "24H", 10), - ("19910905 12:00", "19910909", "H", "24H", 10), - ("19910905 12:00", "19910909 03:00", "H", "24H", 10), - ("19910905 12:00", "19910909 12:00", "H", "24H", 10), - ("19910905 12:00", "19910909 12:00", "H", "24H", 34), - ("19910905 12:00", "19910909 12:00", "H", "17H", 10), - ("19910905 12:00", "19910909 12:00", "H", "17H", 3), - ("19910905 12:00", "19910909 1:00", "H", "M", 3), - ("19910905", "19910913 06:00", "2H", "24H", 10), - ("19910905", "19910905 01:39", "Min", "5Min", 3), - ("19910905", "19910905 03:18", "2Min", "5Min", 3), + ("19910905", "19910909 03:00", "H", "24H", "10H"), + ("19910905", "19910909 12:00", "H", "24H", "10H"), + ("19910905", "19910909 23:00", "H", "24H", "10H"), + ("19910905 10:00", "19910909", "H", "24H", "10H"), + ("19910905 10:00", "19910909 10:00", "H", "24H", "10H"), + ("19910905", "19910909 10:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909", "H", "24H", "10H"), + ("19910905 12:00", "19910909 03:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), + ("19910905 12:00", "19910909 12:00", 
"H", "17H", "10H"), + ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), + ("19910905 12:00", "19910909 1:00", "H", "M", "3H"), + ("19910905", "19910913 06:00", "2H", "24H", "10H"), + ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), + ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), ], ) - def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, base): - # GH 23882 + def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): + # GH 23882 & 31809 s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) - result = s.resample(end_freq, base=base).mean() + result = s.resample(end_freq, offset=offset).mean() result = result.to_timestamp(end_freq) - # to_timestamp casts 24H -> D - result = result.asfreq(end_freq) if end_freq == "24H" else result - expected = s.to_timestamp().resample(end_freq, base=base).mean() + + expected = s.to_timestamp().resample(end_freq, offset=offset).mean() if end_freq == "M": - # TODO: is non-tick the relevant characteristic? + # TODO: is non-tick the relevant characteristic? 
(GH 33815) expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "first,last,offset,exp_first,exp_last", + "first,last,freq,exp_first,exp_last", [ ("19910905", "19920406", "D", "19910905", "19920406"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), @@ -866,15 +844,15 @@ def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, bas ("1991-08", "1992-04", "M", "1991-08", "1992-04"), ], ) - def test_get_period_range_edges(self, first, last, offset, exp_first, exp_last): + def test_get_period_range_edges(self, first, last, freq, exp_first, exp_last): first = pd.Period(first) last = pd.Period(last) - exp_first = pd.Period(exp_first, freq=offset) - exp_last = pd.Period(exp_last, freq=offset) + exp_first = pd.Period(exp_first, freq=freq) + exp_last = pd.Period(exp_last, freq=freq) - offset = pd.tseries.frequencies.to_offset(offset) - result = _get_period_range_edges(first, last, offset) + freq = pd.tseries.frequencies.to_offset(freq) + result = _get_period_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5044a18e33248..73aa01cff84fa 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -25,7 +25,13 @@ def test_str(): r = test_series.resample("H") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " - "label=left, convention=start, base=0]" in str(r) + "label=left, convention=start, origin=start_day]" in str(r) + ) + + r = test_series.resample("H", origin="2000-01-01") + assert ( + "DatetimeIndexResampler [freq=, axis=0, closed=left, " + "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) ) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 035698687cfc2..cbf3a778f9ae0 100644 --- 
a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -1,6 +1,7 @@ from textwrap import dedent import numpy as np +import pytest from pandas.util._test_decorators import async_mark @@ -131,6 +132,47 @@ def test_groupby_resample_on_api_with_getitem(): tm.assert_series_equal(result, exp) +def test_groupby_with_origin(): + # GH 31809 + + freq = "1399min" # prime number that is smaller than 24h + start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" + middle = "1/15/2000 00:00:00" + + rng = pd.date_range(start, end, freq="1231min") # prime number + ts = pd.Series(np.random.randn(len(rng)), index=rng) + ts2 = ts[middle:end] + + # proves that grouper without a fixed origin does not work + # when dealing with unusual frequencies + simple_grouper = pd.Grouper(freq=freq) + count_ts = ts.groupby(simple_grouper).agg("count") + count_ts = count_ts[middle:end] + count_ts2 = ts2.groupby(simple_grouper).agg("count") + with pytest.raises(AssertionError): + tm.assert_index_equal(count_ts.index, count_ts2.index) + + # test origin on 1970-01-01 00:00:00 + origin = pd.Timestamp(0) + adjusted_grouper = pd.Grouper(freq=freq, origin=origin) + adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") + adjusted_count_ts = adjusted_count_ts[middle:end] + adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") + tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2) + + # test origin on 2049-10-18 20:00:00 + origin_future = pd.Timestamp(0) + pd.Timedelta("1399min") * 30_000 + adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future) + adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count") + adjusted2_count_ts = adjusted2_count_ts[middle:end] + adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count") + tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2) + + # both groupers use an adjusted timestamp that is a multiple of 1399 min + # they should be equal even if the adjusted_timestamp is 
in the future + tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2) + + def test_nearest(): # GH 17496 diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 49ac5f81f9c02..26e429c47b494 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -166,7 +166,7 @@ def test_aggregate_normal(resample_method): ("prod", dict(min_count=1), np.nan), ], ) -def test_resample_entirly_nat_window(method, method_args, unit): +def test_resample_entirely_nat_window(method, method_args, unit): s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) result = methodcaller(method, **method_args)(s.resample("2d")) expected = pd.Series( @@ -251,7 +251,15 @@ def test_repr(): expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, " "closed='left', label='left', how='mean', " - "convention='e', base=0)" + "convention='e', origin='start_day')" + ) + assert result == expected + + result = repr(Grouper(key="A", freq="H", origin="2000-01-01")) + expected = ( + "TimeGrouper(key='A', freq=, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', origin=Timestamp('2000-01-01 00:00:00'))" ) assert result == expected diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 1b4a625f078c9..0fbb60c176b30 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -80,13 +80,12 @@ def test_resample_timedelta_idempotency(): tm.assert_series_equal(result, expected) -def test_resample_base_with_timedeltaindex(): - - # GH 10530 +def test_resample_offset_with_timedeltaindex(): + # GH 10530 & 31809 rng = timedelta_range(start="0s", periods=25, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - with_base = ts.resample("2s", base=5).mean() + with_base = ts.resample("2s", offset="5s").mean() without_base = ts.resample("2s").mean() exp_without_base = 
timedelta_range(start="0s", end="25s", freq="2s")