From 4fbb93c2ec0e823fc2bd18445d99807e82bde0fa Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 00:23:28 -0500 Subject: [PATCH 01/32] ENH - account for base argument in period resample --- pandas/core/resample.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 125b441e5558a..27a4e5cd06274 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1520,8 +1520,14 @@ def _get_period_bins(self, ax): start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') + start, end = _get_range_edges(start.to_timestamp(), + end.to_timestamp(), + self.freq, + closed=self.closed, + base=self.base) + labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name) + freq=self.freq, name=ax.name)[:-1] i8 = memb.asi8 freq_mult = self.freq.n @@ -1531,6 +1537,7 @@ def _get_period_bins(self, ax): i8_extend = expected_bins_count - (i8[-1] - i8[0]) rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult + rng -= ((freq_mult - self.base) % freq_mult) bins = memb.searchsorted(rng, side='left') if nat_count > 0: From ac3f301c93a008c5f4382a9e409156df220d9006 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 01:07:08 -0500 Subject: [PATCH 02/32] BUG - closer on perfectly emulating previous behavior --- pandas/core/resample.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 27a4e5cd06274..eccd293a69629 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1517,17 +1517,19 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax.min().asfreq(self.freq, how=self.convention) - end = ax.max().asfreq(self.freq, how='end') - - start, end = _get_range_edges(start.to_timestamp(), - end.to_timestamp(), - self.freq, - 
closed=self.closed, - base=self.base) - - labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name)[:-1] + start = ax.min().asfreq(self.freq, how=self.convention).to_timestamp() + end = ax.max().asfreq(self.freq, how='end').to_timestamp() + + p_start, p_end = _get_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) + + i = None if self.freq.onOffset(start) else 1 + j = -1 if self.freq.onOffset(end) else None + labels = binner = PeriodIndex(start=p_start, end=p_end, + freq=self.freq, name=ax.name)[i:j] i8 = memb.asi8 freq_mult = self.freq.n From 2a72bac15379f9f3df7916006bd3eeaa8612457d Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 20:27:08 -0500 Subject: [PATCH 03/32] BUG - all original resample tests now pass --- pandas/core/resample.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index eccd293a69629..696d49d5bcdc2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1517,19 +1517,26 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax.min().asfreq(self.freq, how=self.convention).to_timestamp() - end = ax.max().asfreq(self.freq, how='end').to_timestamp() + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how='end') + if self.base: + start = start.to_timestamp() + end = end.to_timestamp() + + p_start, p_end = _get_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) + i = None if self.freq.onOffset(start) else 1 + j = -1 if self.freq.onOffset(end) else None + else: + p_start, p_end = start, end + i = j = None - p_start, p_end = _get_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base) - i = None if self.freq.onOffset(start) else 1 - j = -1 if self.freq.onOffset(end) else None labels = binner = PeriodIndex(start=p_start, 
end=p_end, - freq=self.freq, name=ax.name)[i:j] + freq=self.freq, name=ax.name)[slice(i, j)] i8 = memb.asi8 freq_mult = self.freq.n From cb03d4ea56db9090ef2d2358bf512ca66844c244 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 20:27:39 -0500 Subject: [PATCH 04/32] TST - add preliminary tests for pandas period resample with base --- pandas/tests/resample/test_period_index.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 99b8edd5dbbea..fce0fe8fb5199 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -740,3 +740,15 @@ def test_resample_with_only_nat(self): expected = DataFrame([], index=expected_index) result = frame.resample('1s').mean() assert_frame_equal(result, expected) + + def test_resample_with_non_zero_base(self): + # GH 23882 + s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, freq='H')) + pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') + tr = s.to_timestamp().resample('24H', base=10).mean() + assert_series_equal(pr, tr) + + s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, freq='2H')) + pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') + tr = s.to_timestamp().resample('24H', base=10).mean() + assert_series_equal(pr, tr) From 4833c97d561d732d1ac9362ed0370c2479360d42 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 20:29:52 -0500 Subject: [PATCH 05/32] CLN - pep8 adherence --- pandas/core/resample.py | 4 ++-- pandas/tests/resample/test_period_index.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 696d49d5bcdc2..7b813c222eb83 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1534,9 +1534,9 @@ def _get_period_bins(self, ax): p_start, p_end = start, end i = j = None - labels 
= binner = PeriodIndex(start=p_start, end=p_end, - freq=self.freq, name=ax.name)[slice(i, j)] + freq=self.freq, + name=ax.name)[slice(i, j)] i8 = memb.asi8 freq_mult = self.freq.n diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index fce0fe8fb5199..e99b9430d481d 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -743,12 +743,16 @@ def test_resample_with_only_nat(self): def test_resample_with_non_zero_base(self): # GH 23882 - s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, freq='H')) + s = pd.Series(range(100), index=pd.period_range('19910905', + periods=100, + freq='H')) pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') tr = s.to_timestamp().resample('24H', base=10).mean() assert_series_equal(pr, tr) - s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, freq='2H')) + s = pd.Series(range(100), index=pd.period_range('19910905', + periods=100, + freq='2H')) pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') tr = s.to_timestamp().resample('24H', base=10).mean() assert_series_equal(pr, tr) From 5a69414929109b2726d217f63d8e30c6aae8e5ce Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 27 Nov 2018 21:05:40 -0500 Subject: [PATCH 06/32] TST - add a couple more tests --- pandas/tests/resample/test_period_index.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index e99b9430d481d..76b927391701d 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -756,3 +756,17 @@ def test_resample_with_non_zero_base(self): pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') tr = s.to_timestamp().resample('24H', base=10).mean() assert_series_equal(pr, tr) + + s = pd.Series(range(100), index=pd.period_range('19910905', + 
periods=100, + freq='Min')) + pr = s.resample('5Min', base=3).mean().to_timestamp() + tr = s.to_timestamp().resample('5Min', base=3).mean() + assert_series_equal(pr, tr) + + s = pd.Series(range(100), index=pd.period_range('19910905', + periods=100, + freq='2Min')) + pr = s.resample('5Min', base=3).mean().to_timestamp() + tr = s.to_timestamp().resample('5Min', base=3).mean() + assert_series_equal(pr, tr) From 5987a2ecf1461af0cef8bcbdc1109d41b5b05ad4 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 28 Nov 2018 21:03:48 -0500 Subject: [PATCH 07/32] TST - parameterize tests --- pandas/tests/resample/test_period_index.py | 40 ++++++++-------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 76b927391701d..a9c6625945ff2 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -741,32 +741,22 @@ def test_resample_with_only_nat(self): result = frame.resample('1s').mean() assert_frame_equal(result, expected) - def test_resample_with_non_zero_base(self): + @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [ + ('19910905', '19910909 03:00', 'H', '24H', 10), + ('19910905', '19910909 12:00', 'H', '24H', 10), + ('19910905 12:00', '19910909 03:00', 'H', '24H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '24H', 10), + ('19910905', '19910913 06:00', '2H', '24H', 10), + ('19910905', '19910905 01:39', 'Min', '5Min', 3), + ('19910905', '19910905 03:18', '2Min', '5Min', 3), + ]) + def test_resample_with_non_zero_base(self, start, end, start_freq, + end_freq, base): # GH 23882 s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, - freq='H')) - pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') - tr = s.to_timestamp().resample('24H', base=10).mean() - assert_series_equal(pr, tr) - - s = pd.Series(range(100), index=pd.period_range('19910905', - periods=100, - 
freq='2H')) - pr = s.resample('24H', base=10).mean().to_timestamp().asfreq('24H') - tr = s.to_timestamp().resample('24H', base=10).mean() - assert_series_equal(pr, tr) - - s = pd.Series(range(100), index=pd.period_range('19910905', - periods=100, - freq='Min')) - pr = s.resample('5Min', base=3).mean().to_timestamp() - tr = s.to_timestamp().resample('5Min', base=3).mean() - assert_series_equal(pr, tr) - - s = pd.Series(range(100), index=pd.period_range('19910905', - periods=100, - freq='2Min')) - pr = s.resample('5Min', base=3).mean().to_timestamp() - tr = s.to_timestamp().resample('5Min', base=3).mean() + freq=start_freq)) + pr = (s.resample(end_freq, base=base).mean().to_timestamp() + .asfreq(end_freq)) # to_timestamp casts 24H -> D + tr = s.to_timestamp().resample(end_freq, base=base).mean() assert_series_equal(pr, tr) From 8f3c976839a33a008f250d9a4c7e7856602d77f8 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 2 Dec 2018 13:40:41 -0500 Subject: [PATCH 08/32] DOC- add whatsnew entry --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1127d02f0a822..3d199881f4d09 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -280,6 +280,7 @@ Other Enhancements - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). 
+- :meth:`TimeGrouper._get_period_bins` now accounts for the base argument passed through the :class:`PeriodIndexResampler` (:issue:`23882`) - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). From 58a59a69ff20d19a7da17a46af96bf277c2ba218 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 3 Dec 2018 20:22:59 -0500 Subject: [PATCH 09/32] DOC - add comments and modify whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/resample.py | 4 ++++ pandas/tests/resample/test_period_index.py | 8 ++++---- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3d199881f4d09..ddcf3270ba1cc 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -280,7 +280,7 @@ Other Enhancements - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). 
-- :meth:`TimeGrouper._get_period_bins` now accounts for the base argument passed through the :class:`PeriodIndexResampler` (:issue:`23882`) +- :class:`PeriodIndexResampler` aggregations will now respect the base argument in the same fashion as :class:`DatetimeIndexResampler` (:issue:`23882`) - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7b813c222eb83..f38bb71c6dda8 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1523,11 +1523,14 @@ def _get_period_bins(self, ax): start = start.to_timestamp() end = end.to_timestamp() + # get base adjusted bin edge labels p_start, p_end = _get_range_edges(start, end, self.freq, closed=self.closed, base=self.base) + + # compensate for edge labels being extened away from true labels i = None if self.freq.onOffset(start) else 1 j = -1 if self.freq.onOffset(end) else None else: @@ -1546,6 +1549,7 @@ def _get_period_bins(self, ax): i8_extend = expected_bins_count - (i8[-1] - i8[0]) rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult + # adjust bin edge indexes to account for base rng -= ((freq_mult - self.base) % freq_mult) bins = memb.searchsorted(rng, side='left') diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index a9c6625945ff2..b3a340c6386a3 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -756,7 +756,7 @@ def test_resample_with_non_zero_base(self, start, end, start_freq, s = pd.Series(range(100), index=pd.period_range('19910905', periods=100, freq=start_freq)) - pr = (s.resample(end_freq, base=base).mean().to_timestamp() - .asfreq(end_freq)) # to_timestamp casts 24H -> D - 
tr = s.to_timestamp().resample(end_freq, base=base).mean() - assert_series_equal(pr, tr) + result = (s.resample(end_freq, base=base).mean().to_timestamp() + .asfreq(end_freq)) # to_timestamp casts 24H -> D + expected = s.to_timestamp().resample(end_freq, base=base).mean() + assert_series_equal(result, expected) From 6b6d1a90ffe89639997128e41726cc1e11698a8a Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 3 Dec 2018 22:37:57 -0500 Subject: [PATCH 10/32] CLN - minor refactor of label creation and whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/resample.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ddcf3270ba1cc..82436b4a8e65e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -280,7 +280,7 @@ Other Enhancements - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :class:`PeriodIndexResampler` aggregations will now respect the base argument in the same fashion as :class:`DatetimeIndexResampler` (:issue:`23882`) +- :class:`PeriodIndexResampler` aggregations will now respect the ``base`` argument in the same fashion as :class:`DatetimeIndexResampler` (:issue:`23882`) - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f38bb71c6dda8..8ddc6bb11f385 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1530,16 +1530,16 @@ def _get_period_bins(self, ax): closed=self.closed, base=self.base) - # compensate for edge labels being extened away from true labels + # compensate if edge labels are extened away from true labels i = None if self.freq.onOffset(start) else 1 j = -1 if self.freq.onOffset(end) else None - else: - p_start, p_end = start, end - i = j = None - labels = binner = PeriodIndex(start=p_start, end=p_end, + labels = binner = PeriodIndex(start=p_start, end=p_end, freq=self.freq, name=ax.name)[slice(i, j)] + else: + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) i8 = memb.asi8 freq_mult = self.freq.n From ba10dcfa88e0541107837bb587157040d4a59332 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 3 Dec 2018 23:59:45 -0500 Subject: [PATCH 11/32] CLN - pep8 adherence --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8ddc6bb11f385..b4f269a631861 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1539,7 +1539,7 @@ def _get_period_bins(self, ax): name=ax.name)[slice(i, j)] else: labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name) + freq=self.freq, name=ax.name) i8 = memb.asi8 freq_mult = self.freq.n From b51841a34fc5765ce7562f0b993e60905453a790 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 4 Dec 2018 01:46:29 -0500 Subject: [PATCH 12/32] BUG - fix cases where period doesnt start at the 0 base of the freq --- pandas/core/resample.py | 171 ++++++++++----------- pandas/tests/resample/test_period_index.py | 13 +- 2 files changed, 87 insertions(+), 97 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 229396d809fe2..eb6d039792e41 100644 --- a/pandas/core/resample.py +++ 
b/pandas/core/resample.py @@ -26,7 +26,7 @@ from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.frequencies import is_subperiod, is_superperiod, to_offset from pandas.tseries.offsets import ( @@ -81,9 +81,7 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): - """ - Provide a nice str repr of our rolling object. - """ + """ provide a nice str repr of our rolling object """ attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k)) for k in self._attributes if getattr(self.groupby, k, None) is not None] @@ -102,7 +100,7 @@ def __getattr__(self, attr): def __iter__(self): """ - Resampler iterator. + Resampler iterator Returns ------- @@ -126,18 +124,14 @@ def ax(self): @property def _typ(self): - """ - Masquerade for compat as a Series or a DataFrame. - """ + """ masquerade for compat as a Series or a DataFrame """ if isinstance(self._selected_obj, pd.Series): return 'series' return 'dataframe' @property def _from_selection(self): - """ - Is the resampling from a DataFrame column or MultiIndex level. - """ + """ is the resampling from a DataFrame column or MultiIndex level """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error return (self.groupby is not None and @@ -146,7 +140,7 @@ def _from_selection(self): def _convert_obj(self, obj): """ - Provide any conversions for the object in order to correctly handle. + provide any conversions for the object in order to correctly handle Parameters ---------- @@ -164,17 +158,17 @@ def _get_binner_for_time(self): def _set_binner(self): """ - Setup our binners. 
- - Cache these as we are an immutable object + setup our binners + cache these as we are an immutable object """ + if self.binner is None: self.binner, self.grouper = self._get_binner() def _get_binner(self): """ - Create the BinGrouper, assume that self.set_grouper(obj) - has already been called. + create the BinGrouper, assume that self.set_grouper(obj) + has already been called """ binner, bins, binlabels = self._get_binner_for_time() @@ -182,31 +176,28 @@ def _get_binner(self): return binner, bin_grouper def _assure_grouper(self): - """ - Make sure that we are creating our binner & grouper. - """ + """ make sure that we are creating our binner & grouper """ self._set_binner() @Substitution(klass='Resampler', versionadded='.. versionadded:: 0.23.0', examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each 2-day period's maximum and minimum - value in one pass, you can do - - >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 1 - 2012-08-04 1 - """) +>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, +... index=pd.date_range('2012-08-02', periods=4)) +>>> df + A +2012-08-02 1 +2012-08-03 2 +2012-08-04 3 +2012-08-05 4 + +To get the difference between each 2-day period's maximum and minimum value in +one pass, you can do + +>>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A +2012-08-02 1 +2012-08-04 1""") @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super(Resampler, self).pipe(func, *args, **kwargs) @@ -279,7 +270,7 @@ def aggregate(self, func, *args, **kwargs): def transform(self, arg, *args, **kwargs): """ Call function producing a like-indexed Series on each group and return - a Series with the transformed values. 
+ a Series with the transformed values Parameters ---------- @@ -305,7 +296,8 @@ def _upsample(self, f, limit=None, fill_value=None): def _gotitem(self, key, ndim, subset=None): """ - Sub-classes to define. Return a sliced object. + sub-classes to define + return a sliced object Parameters ---------- @@ -328,9 +320,7 @@ def _gotitem(self, key, ndim, subset=None): return grouped def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): - """ - Re-evaluate the obj with a groupby aggregation. - """ + """ re-evaluate the obj with a groupby aggregation """ if grouper is None: self._set_binner() @@ -362,7 +352,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): def _apply_loffset(self, result): """ - If loffset is set, offset the result index. + if loffset is set, offset the result index This is NOT an idempotent routine, it will be applied exactly once to the result. @@ -387,15 +377,11 @@ def _apply_loffset(self, result): return result def _get_resampler_for_grouping(self, groupby, **kwargs): - """ - Return the correct class for resampling with groupby. - """ + """ return the correct class for resampling with groupby """ return self._resampler_for_grouping(self, groupby=groupby, **kwargs) def _wrap_result(self, result): - """ - Potentially wrap any results. - """ + """ potentially wrap any results """ if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection @@ -408,7 +394,7 @@ def _wrap_result(self, result): def pad(self, limit=None): """ - Forward fill the values. + Forward fill the values Parameters ---------- @@ -771,7 +757,8 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def asfreq(self, fill_value=None): """ - Return the values at the new freq, essentially a reindex. 
+ return the values at the new freq, + essentially a reindex Parameters ---------- @@ -790,7 +777,7 @@ def asfreq(self, fill_value=None): def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values. + Compute standard deviation of groups, excluding missing values Parameters ---------- @@ -802,12 +789,12 @@ def std(self, ddof=1, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values. + Compute variance of groups, excluding missing values Parameters ---------- ddof : integer, default 1 - degrees of freedom + degrees of freedom """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) @@ -876,10 +863,8 @@ def f(self, _method=method): def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): - """ - Potentially we might have a deprecation warning, show it - but call the appropriate methods anyhow. - """ + """ potentially we might have a deprecation warning, show it + but call the appropriate methods anyhow """ if how is not None: @@ -924,9 +909,8 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): class _GroupByMixin(GroupByMixin): - """ - Provide the groupby facilities. - """ + """ provide the groupby facilities """ + def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) @@ -947,8 +931,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, f, grouper=None, *args, **kwargs): """ - Dispatch to _upsample; we are stripping all of the _upsample kwargs and - performing the original function call on the grouped object. + dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object """ def func(x): @@ -982,7 +966,7 @@ def _get_binner_for_time(self): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function. 
+ Downsample the cython defined function Parameters ---------- @@ -1019,7 +1003,6 @@ def _downsample(self, how, **kwargs): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. - The range of a new index should not be outside specified range """ if self.closed == 'right': @@ -1030,8 +1013,6 @@ def _adjust_binner_for_upsample(self, binner): def _upsample(self, method, limit=None, fill_value=None): """ - Parameters - ---------- method : string {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'} method for upsampling limit : int, default None @@ -1084,6 +1065,7 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): Provides a resample of a groupby implementation .. versionadded:: 0.18.1 + """ @property def _constructor(self): @@ -1124,7 +1106,7 @@ def _convert_obj(self, obj): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function. + Downsample the cython defined function Parameters ---------- @@ -1161,8 +1143,6 @@ def _downsample(self, how, **kwargs): def _upsample(self, method, limit=None, fill_value=None): """ - Parameters - ---------- method : string {'backfill', 'bfill', 'pad', 'ffill'} method for upsampling limit : int, default None @@ -1197,9 +1177,10 @@ def _upsample(self, method, limit=None, fill_value=None): class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): """ - Provides a resample of a groupby implementation. + Provides a resample of a groupby implementation .. versionadded:: 0.18.1 + """ @property def _constructor(self): @@ -1218,7 +1199,6 @@ def _get_binner_for_time(self): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. 
- The range of a new index is allowed to be greater than original range so we don't need to change the length of a binner, GH 13022 """ @@ -1227,9 +1207,10 @@ def _adjust_binner_for_upsample(self, binner): class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): """ - Provides a resample of a groupby implementation. + Provides a resample of a groupby implementation .. versionadded:: 0.18.1 + """ @property def _constructor(self): @@ -1237,9 +1218,7 @@ def _constructor(self): def resample(obj, kind=None, **kwds): - """ - Create a TimeGrouper and return our resampler. - """ + """ create a TimeGrouper and return our resampler """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) @@ -1249,9 +1228,7 @@ def resample(obj, kind=None, **kwds): def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs): - """ - Return our appropriate resampler when grouping as well. - """ + """ return our appropriate resampler when grouping as well """ # .resample uses 'on' similar to how .groupby uses 'key' kwargs['key'] = kwargs.pop('on', None) @@ -1267,7 +1244,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, class TimeGrouper(Grouper): """ - Custom groupby class for time-interval grouping. + Custom groupby class for time-interval grouping Parameters ---------- @@ -1334,7 +1311,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', def _get_resampler(self, obj, kind=None): """ - Return my resampler or raise if we have an invalid axis. 
+ return my resampler or raise if we have an invalid axis Parameters ---------- @@ -1398,11 +1375,11 @@ def _get_time_bins(self, ax): # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = date_range(freq=self.freq, - start=first, - end=last, - tz=tz, - name=ax.name) + binner = labels = DatetimeIndex(freq=self.freq, + start=first, + end=last, + tz=tz, + name=ax.name) # GH 15549 # In edge case of tz-aware resapmling binner last index can be @@ -1484,10 +1461,10 @@ def _get_time_delta_bins(self, ax): return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = timedelta_range(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = TimedeltaIndex(start=start, + end=end, + freq=self.freq, + name=ax.name) end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side='left') @@ -1540,6 +1517,8 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels + freq_mult = self.freq.n + start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') if self.base: @@ -1554,18 +1533,22 @@ def _get_period_bins(self, ax): base=self.base) # compensate if edge labels are extened away from true labels - i = None if self.freq.onOffset(start) else 1 + i = None #if self.freq.onOffset(start) else 1 j = -1 if self.freq.onOffset(end) else None labels = binner = PeriodIndex(start=p_start, end=p_end, freq=self.freq, name=ax.name)[slice(i, j)] + start_offset = (pd.Period(start, self.freq) + - pd.Period(p_start, self.freq)) + # remove /freq_mult once period diff scaling bug is fixed + offset = int(start_offset.n / freq_mult) % freq_mult else: labels = binner = PeriodIndex(start=start, end=end, freq=self.freq, name=ax.name) + offset = 0 i8 = memb.asi8 - freq_mult = self.freq.n # when upsampling to subperiods, we need to 
generate enough bins expected_bins_count = len(binner) * freq_mult @@ -1573,7 +1556,7 @@ def _get_period_bins(self, ax): rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult # adjust bin edge indexes to account for base - rng -= ((freq_mult - self.base) % freq_mult) + rng -= offset bins = memb.searchsorted(rng, side='left') if nat_count > 0: @@ -1686,7 +1669,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): """ - Utility frequency conversion method for Series/DataFrame. + Utility frequency conversion method for Series/DataFrame """ if isinstance(obj.index, PeriodIndex): if method is not None: diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 8d763415e6640..0f07b60734a65 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -743,8 +743,16 @@ def test_resample_with_only_nat(self): @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [ ('19910905', '19910909 03:00', 'H', '24H', 10), ('19910905', '19910909 12:00', 'H', '24H', 10), + ('19910905', '19910909 23:00', 'H', '24H', 10), + ('19910905 10:00', '19910909', 'H', '24H', 10), + ('19910905 10:00', '19910909 10:00', 'H', '24H', 10), + ('19910905', '19910909 10:00', 'H', '24H', 10), + ('19910905 12:00', '19910909', 'H', '24H', 10), ('19910905 12:00', '19910909 03:00', 'H', '24H', 10), ('19910905 12:00', '19910909 12:00', 'H', '24H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '24H', 34), + ('19910905 12:00', '19910909 12:00', 'H', '17H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '17H', 3), ('19910905', '19910913 06:00', '2H', '24H', 10), ('19910905', '19910905 01:39', 'Min', '5Min', 3), ('19910905', '19910905 03:18', '2Min', '5Min', 3), @@ -752,9 +760,8 @@ def test_resample_with_only_nat(self): def test_resample_with_non_zero_base(self, start, end, start_freq, 
end_freq, base): # GH 23882 - s = pd.Series(range(100), index=pd.period_range('19910905', - periods=100, - freq=start_freq)) + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) result = (s.resample(end_freq, base=base).mean().to_timestamp() .asfreq(end_freq)) # to_timestamp casts 24H -> D expected = s.to_timestamp().resample(end_freq, base=base).mean() From 5afea5cdee500b3002d0105cca1805f23b3a7261 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 4 Dec 2018 01:47:36 -0500 Subject: [PATCH 13/32] CLN - remove unneccesary code --- pandas/core/resample.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index eb6d039792e41..e98dbb2a5ad15 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1533,12 +1533,11 @@ def _get_period_bins(self, ax): base=self.base) # compensate if edge labels are extened away from true labels - i = None #if self.freq.onOffset(start) else 1 j = -1 if self.freq.onOffset(end) else None labels = binner = PeriodIndex(start=p_start, end=p_end, freq=self.freq, - name=ax.name)[slice(i, j)] + name=ax.name)[:j] start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) # remove /freq_mult once period diff scaling bug is fixed From 5ea4d2cc6a8770cb75cc5e007d6ea8d62915739d Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 4 Dec 2018 01:50:37 -0500 Subject: [PATCH 14/32] CLN - pep8 adherence --- pandas/core/resample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e98dbb2a5ad15..092a6ccc59485 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1536,8 +1536,8 @@ def _get_period_bins(self, ax): j = -1 if self.freq.onOffset(end) else None labels = binner = PeriodIndex(start=p_start, end=p_end, - freq=self.freq, - name=ax.name)[:j] + freq=self.freq, + name=ax.name)[:j] start_offset = (pd.Period(start, 
self.freq) - pd.Period(p_start, self.freq)) # remove /freq_mult once period diff scaling bug is fixed From 99e32a761e29ae4b51fe0e5f72b8ae06d8f3f857 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 5 Dec 2018 20:10:54 -0500 Subject: [PATCH 15/32] BUG - reset resample file --- pandas/core/resample.py | 190 ++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 96 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 092a6ccc59485..f2cf17f8f060d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -26,7 +26,7 @@ from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import is_subperiod, is_superperiod, to_offset from pandas.tseries.offsets import ( @@ -81,7 +81,9 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): - """ provide a nice str repr of our rolling object """ + """ + Provide a nice str repr of our rolling object. + """ attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k)) for k in self._attributes if getattr(self.groupby, k, None) is not None] @@ -100,7 +102,7 @@ def __getattr__(self, attr): def __iter__(self): """ - Resampler iterator + Resampler iterator. Returns ------- @@ -124,14 +126,18 @@ def ax(self): @property def _typ(self): - """ masquerade for compat as a Series or a DataFrame """ + """ + Masquerade for compat as a Series or a DataFrame. + """ if isinstance(self._selected_obj, pd.Series): return 'series' return 'dataframe' @property def _from_selection(self): - """ is the resampling from a DataFrame column or MultiIndex level """ + """ + Is the resampling from a DataFrame column or MultiIndex level. 
+ """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error return (self.groupby is not None and @@ -140,7 +146,7 @@ def _from_selection(self): def _convert_obj(self, obj): """ - provide any conversions for the object in order to correctly handle + Provide any conversions for the object in order to correctly handle. Parameters ---------- @@ -158,17 +164,17 @@ def _get_binner_for_time(self): def _set_binner(self): """ - setup our binners - cache these as we are an immutable object - """ + Setup our binners. + Cache these as we are an immutable object + """ if self.binner is None: self.binner, self.grouper = self._get_binner() def _get_binner(self): """ - create the BinGrouper, assume that self.set_grouper(obj) - has already been called + Create the BinGrouper, assume that self.set_grouper(obj) + has already been called. """ binner, bins, binlabels = self._get_binner_for_time() @@ -176,28 +182,31 @@ def _get_binner(self): return binner, bin_grouper def _assure_grouper(self): - """ make sure that we are creating our binner & grouper """ + """ + Make sure that we are creating our binner & grouper. + """ self._set_binner() @Substitution(klass='Resampler', versionadded='.. versionadded:: 0.23.0', examples=""" ->>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, -... index=pd.date_range('2012-08-02', periods=4)) ->>> df - A -2012-08-02 1 -2012-08-03 2 -2012-08-04 3 -2012-08-05 4 - -To get the difference between each 2-day period's maximum and minimum value in -one pass, you can do - ->>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A -2012-08-02 1 -2012-08-04 1""") + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1 + """) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super(Resampler, self).pipe(func, *args, **kwargs) @@ -270,7 +279,7 @@ def aggregate(self, func, *args, **kwargs): def transform(self, arg, *args, **kwargs): """ Call function producing a like-indexed Series on each group and return - a Series with the transformed values + a Series with the transformed values. Parameters ---------- @@ -296,8 +305,7 @@ def _upsample(self, f, limit=None, fill_value=None): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -320,7 +328,9 @@ def _gotitem(self, key, ndim, subset=None): return grouped def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): - """ re-evaluate the obj with a groupby aggregation """ + """ + Re-evaluate the obj with a groupby aggregation. + """ if grouper is None: self._set_binner() @@ -352,7 +362,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): def _apply_loffset(self, result): """ - if loffset is set, offset the result index + If loffset is set, offset the result index. This is NOT an idempotent routine, it will be applied exactly once to the result. @@ -377,11 +387,15 @@ def _apply_loffset(self, result): return result def _get_resampler_for_grouping(self, groupby, **kwargs): - """ return the correct class for resampling with groupby """ + """ + Return the correct class for resampling with groupby. 
+ """ return self._resampler_for_grouping(self, groupby=groupby, **kwargs) def _wrap_result(self, result): - """ potentially wrap any results """ + """ + Potentially wrap any results. + """ if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection @@ -394,7 +408,7 @@ def _wrap_result(self, result): def pad(self, limit=None): """ - Forward fill the values + Forward fill the values. Parameters ---------- @@ -757,8 +771,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def asfreq(self, fill_value=None): """ - return the values at the new freq, - essentially a reindex + Return the values at the new freq, essentially a reindex. Parameters ---------- @@ -777,7 +790,7 @@ def asfreq(self, fill_value=None): def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values + Compute standard deviation of groups, excluding missing values. Parameters ---------- @@ -789,12 +802,12 @@ def std(self, ddof=1, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values + Compute variance of groups, excluding missing values. Parameters ---------- ddof : integer, default 1 - degrees of freedom + degrees of freedom """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) @@ -863,8 +876,10 @@ def f(self, _method=method): def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): - """ potentially we might have a deprecation warning, show it - but call the appropriate methods anyhow """ + """ + Potentially we might have a deprecation warning, show it + but call the appropriate methods anyhow. + """ if how is not None: @@ -909,8 +924,9 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): class _GroupByMixin(GroupByMixin): - """ provide the groupby facilities """ - + """ + Provide the groupby facilities. 
+ """ def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) @@ -931,8 +947,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, f, grouper=None, *args, **kwargs): """ - dispatch to _upsample; we are stripping all of the _upsample kwargs and - performing the original function call on the grouped object + Dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object. """ def func(x): @@ -966,7 +982,7 @@ def _get_binner_for_time(self): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. Parameters ---------- @@ -1003,6 +1019,7 @@ def _downsample(self, how, **kwargs): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index should not be outside specified range """ if self.closed == 'right': @@ -1013,6 +1030,8 @@ def _adjust_binner_for_upsample(self, binner): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'} method for upsampling limit : int, default None @@ -1065,7 +1084,6 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): Provides a resample of a groupby implementation .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1106,7 +1124,7 @@ def _convert_obj(self, obj): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. 
Parameters ---------- @@ -1143,6 +1161,8 @@ def _downsample(self, how, **kwargs): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill'} method for upsampling limit : int, default None @@ -1177,10 +1197,9 @@ def _upsample(self, method, limit=None, fill_value=None): class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1199,6 +1218,7 @@ def _get_binner_for_time(self): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index is allowed to be greater than original range so we don't need to change the length of a binner, GH 13022 """ @@ -1207,10 +1227,9 @@ def _adjust_binner_for_upsample(self, binner): class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1218,7 +1237,9 @@ def _constructor(self): def resample(obj, kind=None, **kwds): - """ create a TimeGrouper and return our resampler """ + """ + Create a TimeGrouper and return our resampler. + """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) @@ -1228,7 +1249,9 @@ def resample(obj, kind=None, **kwds): def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs): - """ return our appropriate resampler when grouping as well """ + """ + Return our appropriate resampler when grouping as well. 
+ """ # .resample uses 'on' similar to how .groupby uses 'key' kwargs['key'] = kwargs.pop('on', None) @@ -1244,7 +1267,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, class TimeGrouper(Grouper): """ - Custom groupby class for time-interval grouping + Custom groupby class for time-interval grouping. Parameters ---------- @@ -1311,7 +1334,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', def _get_resampler(self, obj, kind=None): """ - return my resampler or raise if we have an invalid axis + Return my resampler or raise if we have an invalid axis. Parameters ---------- @@ -1375,11 +1398,11 @@ def _get_time_bins(self, ax): # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = DatetimeIndex(freq=self.freq, - start=first, - end=last, - tz=tz, - name=ax.name) + binner = labels = date_range(freq=self.freq, + start=first, + end=last, + tz=tz, + name=ax.name) # GH 15549 # In edge case of tz-aware resapmling binner last index can be @@ -1461,10 +1484,10 @@ def _get_time_delta_bins(self, ax): return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = TimedeltaIndex(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = timedelta_range(start=start, + end=end, + freq=self.freq, + name=ax.name) end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side='left') @@ -1517,45 +1540,20 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - freq_mult = self.freq.n - start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') - if self.base: - start = start.to_timestamp() - end = end.to_timestamp() - # get base adjusted bin edge labels - p_start, p_end = _get_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base) - - # 
compensate if edge labels are extened away from true labels - j = -1 if self.freq.onOffset(end) else None - - labels = binner = PeriodIndex(start=p_start, end=p_end, - freq=self.freq, - name=ax.name)[:j] - start_offset = (pd.Period(start, self.freq) - - pd.Period(p_start, self.freq)) - # remove /freq_mult once period diff scaling bug is fixed - offset = int(start_offset.n / freq_mult) % freq_mult - else: - labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name) - offset = 0 + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) i8 = memb.asi8 + freq_mult = self.freq.n # when upsampling to subperiods, we need to generate enough bins expected_bins_count = len(binner) * freq_mult i8_extend = expected_bins_count - (i8[-1] - i8[0]) rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult - # adjust bin edge indexes to account for base - rng -= offset bins = memb.searchsorted(rng, side='left') if nat_count > 0: @@ -1668,7 +1666,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): """ - Utility frequency conversion method for Series/DataFrame + Utility frequency conversion method for Series/DataFrame. 
""" if isinstance(obj.index, PeriodIndex): if method is not None: From 126ae7a31d1c24de056758cad85bd7bf0b995ee3 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 5 Dec 2018 20:12:11 -0500 Subject: [PATCH 16/32] BUG - add original changes back in after master reset --- pandas/core/resample.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f2cf17f8f060d..fd2043385a78e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1540,20 +1540,45 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels + freq_mult = self.freq.n + start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') + if self.base: + start = start.to_timestamp() + end = end.to_timestamp() - labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name) + # get base adjusted bin edge labels + p_start, p_end = _get_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) + + # compensate if edge labels are extened away from true labels + j = -1 if self.freq.onOffset(end) else None + + labels = binner = PeriodIndex(start=p_start, end=p_end, + freq=self.freq, + name=ax.name)[:j] + start_offset = (pd.Period(start, self.freq) + - pd.Period(p_start, self.freq)) + # remove /freq_mult once period diff scaling bug is fixed + offset = int(start_offset.n / freq_mult) % freq_mult + else: + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) + offset = 0 i8 = memb.asi8 - freq_mult = self.freq.n # when upsampling to subperiods, we need to generate enough bins expected_bins_count = len(binner) * freq_mult i8_extend = expected_bins_count - (i8[-1] - i8[0]) rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult + # adjust bin edge indexes to account for base + rng -= offset bins = memb.searchsorted(rng, side='left') if 
nat_count > 0: From bf076de3a21cdb1e57d4ed959fbaba24e1f8470d Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Thu, 6 Dec 2018 22:59:39 -0500 Subject: [PATCH 17/32] BUG - add case back in where not start is not onOffset and add tests for this case --- pandas/core/resample.py | 6 ++++-- pandas/tests/resample/test_period_index.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fd2043385a78e..18805d2be8b97 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1544,6 +1544,8 @@ def _get_period_bins(self, ax): start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') + offset = 0 + if self.base: start = start.to_timestamp() end = end.to_timestamp() @@ -1556,11 +1558,12 @@ def _get_period_bins(self, ax): base=self.base) # compensate if edge labels are extened away from true labels + i = None if self.freq.onOffset(start) else 1 j = -1 if self.freq.onOffset(end) else None labels = binner = PeriodIndex(start=p_start, end=p_end, freq=self.freq, - name=ax.name)[:j] + name=ax.name)[i:j] start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) # remove /freq_mult once period diff scaling bug is fixed @@ -1568,7 +1571,6 @@ def _get_period_bins(self, ax): else: labels = binner = PeriodIndex(start=start, end=end, freq=self.freq, name=ax.name) - offset = 0 i8 = memb.asi8 diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 0f07b60734a65..ce3fb2618ef9c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -753,6 +753,7 @@ def test_resample_with_only_nat(self): ('19910905 12:00', '19910909 12:00', 'H', '24H', 34), ('19910905 12:00', '19910909 12:00', 'H', '17H', 10), ('19910905 12:00', '19910909 12:00', 'H', '17H', 3), + ('19910905 12:00', '19910909 1:00', 'H', 'M', 3), ('19910905', '19910913 06:00', '2H', '24H', 10), 
('19910905', '19910905 01:39', 'Min', '5Min', 3), ('19910905', '19910905 03:18', '2Min', '5Min', 3), @@ -762,7 +763,9 @@ def test_resample_with_non_zero_base(self, start, end, start_freq, # GH 23882 s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) - result = (s.resample(end_freq, base=base).mean().to_timestamp() - .asfreq(end_freq)) # to_timestamp casts 24H -> D + result = s.resample(end_freq, base=base).mean() + result = result.to_timestamp(end_freq) + # to_timestamp casts 24H -> D + result = result.asfreq(end_freq) if end_freq == '24H' else result expected = s.to_timestamp().resample(end_freq, base=base).mean() assert_series_equal(result, expected) From 47d7f7b65e86d5d06ee414dd2e1274efe3861ea8 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Thu, 6 Dec 2018 23:29:03 -0500 Subject: [PATCH 18/32] ENH - allow for edge compensation in _get_range_edges --- pandas/core/resample.py | 55 +++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 18805d2be8b97..c2aa52969f825 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1555,15 +1555,14 @@ def _get_period_bins(self, ax): end, self.freq, closed=self.closed, - base=self.base) - - # compensate if edge labels are extened away from true labels - i = None if self.freq.onOffset(start) else 1 - j = -1 if self.freq.onOffset(end) else None + base=self.base, + compensate_edges=True) labels = binner = PeriodIndex(start=p_start, end=p_end, freq=self.freq, - name=ax.name)[i:j] + name=ax.name) + + # Get offset for bin edge (not label edge) adjustment start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) # remove /freq_mult once period diff scaling bug is fixed @@ -1609,26 +1608,56 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_range_edges(first, last, 
offset, closed='left', base=0): +def _get_range_edges(first, last, offset, closed='left', base=0, + compensate_edges=False): + first_ = first + last_ = last + + additional_adjust = True if isinstance(offset, Tick): is_day = isinstance(offset, Day) day_nanos = delta_to_nanoseconds(timedelta(1)) # #1165 if (is_day and day_nanos % offset.nanos == 0) or not is_day: - return _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) + additional_adjust = False + # Don't return immediately because might need to compensate edges + first, last = _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) else: first = first.normalize() last = last.normalize() - if closed == 'left': - first = Timestamp(offset.rollback(first)) + if additional_adjust: + if closed == 'left': + first = Timestamp(offset.rollback(first)) + else: + first = Timestamp(first - offset) + + last = Timestamp(last + offset) + + if compensate_edges: + first, last = _compensate_edges(first, last, first_, last_, offset) + + return first, last + + +def _compensate_edges(adj_first, adj_last, orig_first, orig_last, offset): + # Implement this function in expanded form, instead of the shorter form + # to ensure that all these cases are covered in code coverage reports + + # first = adj_first + offset * (not offset.onOffset(orig_first)) + if not offset.onOffset(orig_first): + first = adj_first + offset else: - first = Timestamp(first - offset) + first = adj_first - last = Timestamp(last + offset) + # last = adj_last - offset * offset.onOffset(orig_last) + if offset.onOffset(orig_last): + last = adj_last - offset + else: + last = adj_last return first, last From eb0550195b00ae18582b7353aab6710f1c97f4f0 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 7 Dec 2018 17:46:42 -0500 Subject: [PATCH 19/32] BUG/CLN - offsets.Day(n>2) not properly anchoring dates, and make code more readable --- pandas/core/resample.py | 71 ++++++++++++++++------------------------- 1 file changed, 28 
insertions(+), 43 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index c2aa52969f825..dcc356ebdafa1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1546,30 +1546,24 @@ def _get_period_bins(self, ax): end = ax.max().asfreq(self.freq, how='end') offset = 0 + # GH 23882 if self.base: - start = start.to_timestamp() - end = end.to_timestamp() - # get base adjusted bin edge labels - p_start, p_end = _get_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base, - compensate_edges=True) - - labels = binner = PeriodIndex(start=p_start, end=p_end, - freq=self.freq, - name=ax.name) + p_start, end = _get_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) # Get offset for bin edge (not label edge) adjustment start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) # remove /freq_mult once period diff scaling bug is fixed offset = int(start_offset.n / freq_mult) % freq_mult - else: - labels = binner = PeriodIndex(start=start, end=end, - freq=self.freq, name=ax.name) + start = p_start + + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) i8 = memb.asi8 @@ -1608,18 +1602,27 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_range_edges(first, last, offset, closed='left', base=0, - compensate_edges=False): - first_ = first - last_ = last +def _get_range_edges(first, last, offset, closed='left', base=0): + if type(first) != type(last): + raise TypeError("'first' and 'last' must be same type") + + # make proper adjustments for Periods #GH 23882 + is_period = False + adjust_first = adjust_last = False + if isinstance(first, pd.Period): + is_period = True + first = first.to_timestamp() + last = last.to_timestamp() + adjust_first = not offset.onOffset(first) + adjust_last = offset.onOffset(last) additional_adjust = True if isinstance(offset, Tick): 
is_day = isinstance(offset, Day) day_nanos = delta_to_nanoseconds(timedelta(1)) - # #1165 - if (is_day and day_nanos % offset.nanos == 0) or not is_day: + # #1165 / #24127 + if (is_day and not offset.nanos % day_nanos) or not is_day: additional_adjust = False # Don't return immediately because might need to compensate edges first, last = _adjust_dates_anchored(first, last, offset, @@ -1637,27 +1640,9 @@ def _get_range_edges(first, last, offset, closed='left', base=0, last = Timestamp(last + offset) - if compensate_edges: - first, last = _compensate_edges(first, last, first_, last_, offset) - - return first, last - - -def _compensate_edges(adj_first, adj_last, orig_first, orig_last, offset): - # Implement this function in expanded form, instead of the shorter form - # to ensure that all these cases are covered in code coverage reports - - # first = adj_first + offset * (not offset.onOffset(orig_first)) - if not offset.onOffset(orig_first): - first = adj_first + offset - else: - first = adj_first - - # last = adj_last - offset * offset.onOffset(orig_last) - if offset.onOffset(orig_last): - last = adj_last - offset - else: - last = adj_last + if is_period: + first = (first + adjust_first * offset).to_period(offset) + last = (last - adjust_last * offset).to_period(offset) return first, last From f2b066117b4a39eeb190dacd1d86a64ba12366f9 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 7 Dec 2018 21:34:27 -0500 Subject: [PATCH 20/32] TST - update test to reflect more up-to-date assumption of what expected output should be --- pandas/tests/groupby/test_timegrouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 183ccfb5182a2..cb7b419710837 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -43,8 +43,8 @@ def test_groupby_with_timegrouper(self): expected = DataFrame( {'Quantity': 0}, - 
index=date_range('20130901 13:00:00', - '20131205 13:00:00', freq='5D', + index=date_range('20130901', + '20131205', freq='5D', name='Date', closed='left')) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') From 8020acbf2425858d6d8b46107557562d31e9719c Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 7 Dec 2018 23:29:44 -0500 Subject: [PATCH 21/32] CLN - rever changes for other resample fix, to be fixed in another PR --- pandas/core/resample.py | 4 ++-- pandas/tests/groupby/test_timegrouper.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9658af027bc04..0492ba6c8160a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1621,8 +1621,8 @@ def _get_range_edges(first, last, offset, closed='left', base=0): is_day = isinstance(offset, Day) day_nanos = delta_to_nanoseconds(timedelta(1)) - # #1165 / #24127 - if (is_day and not offset.nanos % day_nanos) or not is_day: + # #1165 + if (is_day and day_nanos % offset.nanos == 0) or not is_day: additional_adjust = False # Don't return immediately because might need to compensate edges first, last = _adjust_dates_anchored(first, last, offset, diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index cb7b419710837..183ccfb5182a2 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -43,8 +43,8 @@ def test_groupby_with_timegrouper(self): expected = DataFrame( {'Quantity': 0}, - index=date_range('20130901', - '20131205', freq='5D', + index=date_range('20130901 13:00:00', + '20131205 13:00:00', freq='5D', name='Date', closed='left')) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') From 334eb0b6a753b270dc31cde0138a067a82dca141 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 7 Dec 2018 23:43:21 -0500 Subject: [PATCH 22/32] CLN - split up range edge functions for timestamp and period --- 
pandas/core/resample.py | 73 +++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0492ba6c8160a..2d30015081058 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1389,9 +1389,10 @@ def _get_time_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_range_edges(ax.min(), ax.max(), self.freq, - closed=self.closed, - base=self.base) + first, last = _get_timestamp_range_edges(ax.min(), ax.max(), + self.freq, + closed=self.closed, + base=self.base) tz = ax.tz # GH #12037 # use first/last directly instead of call replace() on them @@ -1549,11 +1550,11 @@ def _get_period_bins(self, ax): # GH 23882 if self.base: # get base adjusted bin edge labels - p_start, end = _get_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base) + p_start, end = _get_period_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) # Get offset for bin edge (not label edge) adjustment start_offset = (pd.Period(start, self.freq) @@ -1602,51 +1603,51 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_range_edges(first, last, offset, closed='left', base=0): - if type(first) != type(last): - raise TypeError("'first' and 'last' must be same type") +def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): + if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type " + "Timestamp") - # make proper adjustments for Periods #GH 23882 - is_period = False - adjust_first = adjust_last = False - if isinstance(first, pd.Period): - is_period = True - first = first.to_timestamp() - last = last.to_timestamp() - adjust_first = not offset.onOffset(first) - adjust_last = offset.onOffset(last) - - additional_adjust = True if 
isinstance(offset, Tick): is_day = isinstance(offset, Day) day_nanos = delta_to_nanoseconds(timedelta(1)) # #1165 if (is_day and day_nanos % offset.nanos == 0) or not is_day: - additional_adjust = False - # Don't return immediately because might need to compensate edges - first, last = _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) + return _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) else: first = first.normalize() last = last.normalize() - if additional_adjust: - if closed == 'left': - first = Timestamp(offset.rollback(first)) - else: - first = Timestamp(first - offset) - - last = Timestamp(last + offset) + if closed == 'left': + first = Timestamp(offset.rollback(first)) + else: + first = Timestamp(first - offset) - if is_period: - first = (first + adjust_first * offset).to_period(offset) - last = (last - adjust_last * offset).to_period(offset) + last = Timestamp(last + offset) return first, last +def _get_period_range_edges(first, last, offset, closed='left', base=0): + if not all(isinstance(obj, pd.Period) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type Period") + + #GH 23882 + first = first.to_timestamp() + last = last.to_timestamp() + adjust_first = not offset.onOffset(first) + adjust_last = offset.onOffset(last) + + first, last = _get_timestamp_range_edges(first, last, offset, + closed=closed, base=base) + + first = (first + adjust_first * offset).to_period(offset) + last = (last - adjust_last * offset).to_period(offset) + return first, last + def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is From c14dbcea0071be8334511f22820c432e4be96280 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 7 Dec 2018 23:58:34 -0500 Subject: [PATCH 23/32] CLN - pep8 adherence --- pandas/core/resample.py | 3 ++- 
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2d30015081058..3528b3c67c535 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1635,7 +1635,7 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0): if not all(isinstance(obj, pd.Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period") - #GH 23882 + # GH 23882 first = first.to_timestamp() last = last.to_timestamp() adjust_first = not offset.onOffset(first) @@ -1648,6 +1648,7 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0): last = (last - adjust_last * offset).to_period(offset) return first, last + def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is From 8e07a953160322a2daf69b3effc2babc3d73074d Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 9 Dec 2018 22:27:51 -0500 Subject: [PATCH 24/32] TST/DOC - add tests for get range edges and update whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/tests/resample/test_datetime_index.py | 27 +++++++++++++++++++- pandas/tests/resample/test_period_index.py | 23 +++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 8a24a1a27955e..d9129c09d0478 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -352,7 +352,7 @@ Other Enhancements - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). 
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :class:`PeriodIndexResampler` aggregations will now respect the ``base`` argument in the same fashion as :class:`DatetimeIndexResampler` (:issue:`23882`) +- :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`) - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index b287eb468cd94..8fd0d0313a69e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -17,7 +17,8 @@ from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import timedelta_range -from pandas.core.resample import DatetimeIndex, TimeGrouper +from pandas.core.resample import ( + DatetimeIndex, TimeGrouper, _get_timestamp_range_edges) from pandas.tests.resample.test_base import ( Base, business_day_offset, downsample_methods, simple_date_range_series, simple_period_range_series) @@ -1463,3 +1464,27 @@ def f(data, add_arg): result = df.groupby("A").resample("D").agg(f, multiplier) expected = df.groupby("A").resample('D').mean().multiply(multiplier) assert_frame_equal(result, expected) + + @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ + ('19910905', '19920406', 'D', '19910905', '19920407'), + ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'), + ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', + '19920406 07:00'), + 
('19910906', '19920406', 'M', '19910831', '19920430'), + ('19910831', '19920430', 'M', '19910831', '19920531'), + ('1991-08', '1992-04', 'M', '19910831', '19920531'), + ]) + def test_get_timestamp_range_edges(self, first, last, offset, + exp_first, exp_last): + first = pd.Period(first) + first = first.to_timestamp(first.freq) + last = pd.Period(last) + last = last.to_timestamp(last.freq) + + exp_first = pd.Timestamp(exp_first, freq=offset) + exp_last = pd.Timestamp(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_timestamp_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index ce3fb2618ef9c..91e4237172c67 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -15,6 +15,7 @@ from pandas import DataFrame, Series, Timestamp from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range +from pandas.core.resample import _get_period_range_edges from pandas.tests.resample.test_base import ( Base, resample_methods, simple_period_range_series) import pandas.util.testing as tm @@ -769,3 +770,25 @@ def test_resample_with_non_zero_base(self, start, end, start_freq, result = result.asfreq(end_freq) if end_freq == '24H' else result expected = s.to_timestamp().resample(end_freq, base=base).mean() assert_series_equal(result, expected) + + @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ + ('19910905', '19920406', 'D', '19910905', '19920406'), + ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'), + ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', + '19920406 06:00'), + ('19910906', '19920406', 'M', '1991-09', '1992-04'), + ('19910831', '19920430', 'M', '1991-08', '1992-04'), + ('1991-08', '1992-04', 'M', '1991-08', '1992-04'), + ]) + 
def test_get_period_range_edges(self, first, last, offset, + exp_first, exp_last): + first = pd.Period(first) + last = pd.Period(last) + + exp_first = pd.Period(exp_first, freq=offset) + exp_last = pd.Period(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_period_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected From 0a9c7dc1d6c8955f4d12429091663db59e0c298a Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 10 Dec 2018 20:35:29 -0500 Subject: [PATCH 25/32] BUG - account for new period diff behavior --- pandas/core/resample.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3528b3c67c535..e7f8212a84562 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1559,8 +1559,7 @@ def _get_period_bins(self, ax): # Get offset for bin edge (not label edge) adjustment start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) - # remove /freq_mult once period diff scaling bug is fixed - offset = int(start_offset.n / freq_mult) % freq_mult + offset = start_offset.n % freq_mult start = p_start labels = binner = PeriodIndex(start=start, end=end, From cfefa36a494a8fff9e39db5c59b56a7119298bbe Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 11 Dec 2018 20:34:40 -0500 Subject: [PATCH 26/32] TST - add test for bad input to get range edges --- pandas/tests/resample/test_datetime_index.py | 9 +++++++++ pandas/tests/resample/test_period_index.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8fd0d0313a69e..64ae70cdd8125 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1488,3 +1488,12 @@ def test_get_timestamp_range_edges(self, first, last, offset, result = _get_timestamp_range_edges(first, last, offset) 
expected = (exp_first, exp_last) assert result == expected + + @pytest.mark.parametrize('first,last', [ + (pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), + (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), + (pd.Period('1991-09-05'), pd.Period('1992-04-06')), + ]) + def test_get_period_range_edges_bad_input(self, first, last): + with pytest.raises(TypeError, match='instances of type Timestamp'): + _get_timestamp_range_edges(first, last, pd.offsets.Day()) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 91e4237172c67..3f4469741fb8a 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -792,3 +792,12 @@ def test_get_period_range_edges(self, first, last, offset, result = _get_period_range_edges(first, last, offset) expected = (exp_first, exp_last) assert result == expected + + @pytest.mark.parametrize('first,last', [ + (pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), + (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), + (pd.Timestamp('1991-09-05'), pd.Timestamp('1992-04-06')), + ]) + def test_get_period_range_edges_bad_input(self, first, last): + with pytest.raises(TypeError, match='instances of type Period'): + _get_period_range_edges(first, last, pd.offsets.Day()) From 93eaab75bd8a686992d19638bd1894227765a03e Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 11 Dec 2018 20:37:01 -0500 Subject: [PATCH 27/32] TST - add one more test case to bad get_range_edges --- pandas/tests/resample/test_datetime_index.py | 1 + pandas/tests/resample/test_period_index.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 64ae70cdd8125..288e4c2f8d30d 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1493,6 +1493,7 @@ def test_get_timestamp_range_edges(self, first, last, offset, 
(pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), (pd.Period('1991-09-05'), pd.Period('1992-04-06')), + (pd.Timestamp('1991-09-05'), '1992-04-06'), ]) def test_get_period_range_edges_bad_input(self, first, last): with pytest.raises(TypeError, match='instances of type Timestamp'): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 3f4469741fb8a..e2b81067c5f61 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -797,6 +797,7 @@ def test_get_period_range_edges(self, first, last, offset, (pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), (pd.Timestamp('1991-09-05'), pd.Timestamp('1992-04-06')), + (pd.Period('1991-09-05'), '1992-04-06'), ]) def test_get_period_range_edges_bad_input(self, first, last): with pytest.raises(TypeError, match='instances of type Period'): From e5286f8c66a251b09f6dada789674e006563ae7c Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 12 Dec 2018 19:27:36 -0500 Subject: [PATCH 28/32] DOC - add docstrings --- pandas/core/resample.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e7f8212a84562..ae2de57adbad9 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1603,6 +1603,10 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): + """ + Adjust the provided Timestamp range edge values to the appropriate edge + values for the given offset parameters. 
+ """ if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type " "Timestamp") @@ -1631,6 +1635,10 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): def _get_period_range_edges(first, last, offset, closed='left', base=0): + """ + Adjust the provided Period range edge values to the appropriate edge + values for the given offset parameters. + """ if not all(isinstance(obj, pd.Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period") From bef91189f5a13f263ce31d6b85f81e6c907ac68f Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 12 Dec 2018 19:27:50 -0500 Subject: [PATCH 29/32] TST - remove unneccesary test --- pandas/tests/resample/test_datetime_index.py | 10 ---------- pandas/tests/resample/test_period_index.py | 10 ---------- 2 files changed, 20 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 288e4c2f8d30d..8fd0d0313a69e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1488,13 +1488,3 @@ def test_get_timestamp_range_edges(self, first, last, offset, result = _get_timestamp_range_edges(first, last, offset) expected = (exp_first, exp_last) assert result == expected - - @pytest.mark.parametrize('first,last', [ - (pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), - (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), - (pd.Period('1991-09-05'), pd.Period('1992-04-06')), - (pd.Timestamp('1991-09-05'), '1992-04-06'), - ]) - def test_get_period_range_edges_bad_input(self, first, last): - with pytest.raises(TypeError, match='instances of type Timestamp'): - _get_timestamp_range_edges(first, last, pd.offsets.Day()) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index e2b81067c5f61..91e4237172c67 100644 --- 
a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -792,13 +792,3 @@ def test_get_period_range_edges(self, first, last, offset, result = _get_period_range_edges(first, last, offset) expected = (exp_first, exp_last) assert result == expected - - @pytest.mark.parametrize('first,last', [ - (pd.Period('1991-09-05'), pd.Timestamp('1992-04-06')), - (pd.Timestamp('1991-09-05'), pd.Period('1992-04-06')), - (pd.Timestamp('1991-09-05'), pd.Timestamp('1992-04-06')), - (pd.Period('1991-09-05'), '1992-04-06'), - ]) - def test_get_period_range_edges_bad_input(self, first, last): - with pytest.raises(TypeError, match='instances of type Period'): - _get_period_range_edges(first, last, pd.offsets.Day()) From 9b7d261a3425fb9e44d382bd06adf9bf4bec87cc Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 12 Dec 2018 22:00:28 -0500 Subject: [PATCH 30/32] DOC - update docstrings --- pandas/core/resample.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ae2de57adbad9..a17598a09399d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1604,8 +1604,11 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): """ - Adjust the provided Timestamp range edge values to the appropriate edge - values for the given offset parameters. + Adjust the `first` Timestamp to the preceding Timestamp that resides on + the provided offset. Adjust the `last` Timestamp to the following + Timestamp that resides on the provided offset. Input Timestamps that + already reside on the offset will be adjusted depeding on the type of + offset and the `closed` parameter.
""" if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type " @@ -1636,8 +1639,8 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): def _get_period_range_edges(first, last, offset, closed='left', base=0): """ - Adjust the provided Period range edge values to the appropriate edge - values for the given offset parameters. + Adjust the provided `first` and `last` Periods to the respective Period of + the given offset that encompasses them. """ if not all(isinstance(obj, pd.Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period") From a49d1291f554e07c29c0ffbcd0db5af43a749703 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 12 Dec 2018 22:13:46 -0500 Subject: [PATCH 31/32] CLN - rename offset to bin_shift, so as not to confuse with pd.offsets --- pandas/core/resample.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 298f1c24ea530..bbcf42a3664dc 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1545,7 +1545,7 @@ def _get_period_bins(self, ax): start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') - offset = 0 + bin_shift = 0 # GH 23882 if self.base: @@ -1559,7 +1559,7 @@ def _get_period_bins(self, ax): # Get offset for bin edge (not label edge) adjustment start_offset = (pd.Period(start, self.freq) - pd.Period(p_start, self.freq)) - offset = start_offset.n % freq_mult + bin_shift = start_offset.n % freq_mult start = p_start labels = binner = PeriodIndex(start=start, end=end, @@ -1573,7 +1573,7 @@ def _get_period_bins(self, ax): rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult # adjust bin edge indexes to account for base - rng -= offset + rng -= bin_shift bins = memb.searchsorted(rng, side='left') if nat_count > 0: From 
8f1e29060bd0c2247fcd850cba0e353c3a4beee3 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Thu, 13 Dec 2018 08:37:41 -0500 Subject: [PATCH 32/32] DOC - add params and returns sections --- pandas/core/resample.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bbcf42a3664dc..7b842d141e839 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1609,6 +1609,23 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): Timestamp that resides on the provided offset. Input Timestamps that already reside on the offset will be adjusted depeding on the type of offset and the `closed` parameter. + + Parameters + ---------- + first : pd.Timestamp + The beginning Timestamp of the range to be adjusted. + last : pd.Timestamp + The ending Timestamp of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Timestamps will be adjusted. + closed : {'right', 'left'}, default 'left' + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Timestamps. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Timestamp objects. """ if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type " @@ -1641,6 +1658,23 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them. + + Parameters + ---------- + first : pd.Period + The beginning Period of the range to be adjusted. + last : pd.Period + The ending Period of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Periods will be adjusted. + closed : {'right', 'left'}, default 'left' + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Periods.
+ + Returns + ------- + A tuple of length 2, containing the adjusted pd.Period objects. """ if not all(isinstance(obj, pd.Period) for obj in [first, last]): raise TypeError("'first' and 'last' must be instances of type Period")