diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4a3122a78b234..eafe8d08aafaa 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -171,6 +171,82 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.period_index_resampling: + +``PeriodIndex`` resampling +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') + + In [2]: s = pd.Series(np.arange(12), index=pi) + + In [3]: resampled = s.resample('2Q').mean() + + In [4]: resampled + Out[4]: + 2017-03-31 1.0 + 2017-09-30 5.5 + 2018-03-31 10.0 + Freq: 2Q-DEC, dtype: float64 + + In [5]: resampled.index + Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') + +New Behavior: + +.. ipython:: python + + pi = pd.period_range('2017-01', periods=12, freq='M') + + s = pd.Series(np.arange(12), index=pi) + + resampled = s.resample('2Q').mean() + + resampled + + resampled.index + + +Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a ``DataFrame`` with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + In [2]: s = pd.Series(np.arange(10), index=pi) + + In [3]: s.resample('H').ohlc() + Out[3]: + 2000-01-01 00:00 0.0 + ... 
+ 2000-01-10 23:00 NaN + Freq: H, Length: 240, dtype: float64 + + In [4]: s.resample('M').ohlc() + Out[4]: + open high low close + 2000-01 0 9 0 9 + +New Behavior: + +.. ipython:: python + + pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + s = pd.Series(np.arange(10), index=pi) + + s.resample('H').ohlc() + + s.resample('M').ohlc() + .. _whatsnew_0210.api_breaking.deps: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 01c7e875b8ecc..083fbcaaabe46 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -14,7 +14,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds -from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.period import PeriodIndex import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler): def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby + def _get_binner_for_time(self): + if self.kind == 'timestamp': + return super(PeriodIndexResampler, self)._get_binner_for_time() + return self.groupby._get_period_bins(self.ax) + def _convert_obj(self, obj): obj = super(PeriodIndexResampler, self)._convert_obj(obj) - offset = to_offset(self.freq) - if offset.n > 1: - if self.kind == 'period': # pragma: no cover - print('Warning: multiple of frequency -> timestamps') - - # Cannot have multiple of periods, convert to timestamp + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index") + raise NotImplementedError(msg) + + if self.loffset is not None: + # Cannot apply loffset/timedelta to PeriodIndex -> convert to + # timestamps self.kind = 'timestamp' # convert to timestamp - if not (self.kind is None or self.kind == 'period'): - if self._from_selection: - # see GH 14008, GH 12871 - msg = ("Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index") - raise NotImplementedError(msg) - else: - obj = obj.to_timestamp(how=self.convention) + if self.kind == 'timestamp': + obj = obj.to_timestamp(how=self.convention) return obj - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) - if result is None: - result = self._downsample(arg, *args, **kwargs) - - result = self._apply_loffset(result) - return result - - agg = aggregate - - def _get_new_index(self): - """ return our new index """ - ax = self.ax - - if len(ax) == 0: - values = [] - else: - start = ax[0].asfreq(self.freq, how=self.convention) - end = ax[-1].asfreq(self.freq, how='end') - values = period_range(start, end, freq=self.freq).asi8 - - return ax._shallow_copy(values, freq=self.freq) - def _downsample(self, how, **kwargs): """ Downsample the cython defined function @@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs): how = self._is_cython_func(how) or how ax = self.ax - new_index = self._get_new_index() - - # Start vs. 
end of period - memb = ax.asfreq(self.freq, how=self.convention) - if is_subperiod(ax.freq, self.freq): # Downsampling - if len(new_index) == 0: - bins = [] - else: - i8 = memb.asi8 - rng = np.arange(i8[0], i8[-1] + 1) - bins = memb.searchsorted(rng, side='right') - grouper = BinGrouper(bins, new_index) - return self._groupby_and_aggregate(how, grouper=grouper) + return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): + if how == 'ohlc': + # GH #13083 + # upsampling to subperiods is handled as an asfreq, which works + # for pure aggregating/reducing methods + # OHLC reduces along the time dimension, but creates multiple + # values for each period -> handle by _groupby_and_aggregate() + return self._groupby_and_aggregate(how, grouper=self.grouper) return self.asfreq() elif ax.freq == self.freq: return self.asfreq() @@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None): .fillna """ - if self._from_selection: - raise ValueError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample( method, limit=limit, fill_value=fill_value) + self._set_binner() ax = self.ax obj = self.obj - new_index = self._get_new_index() + new_index = self.binner # Start vs. 
end of period memb = ax.asfreq(self.freq, how=self.convention) @@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax): return binner, bins, labels + def _get_period_bins(self, ax): + if not isinstance(ax, PeriodIndex): + raise TypeError('axis must be a PeriodIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + memb = ax.asfreq(self.freq, how=self.convention) + + # NaT handling as in pandas._libs.lib.generate_bins_dt64() + nat_count = 0 + if memb.hasnans: + nat_count = np.sum(memb._isnan) + memb = memb[~memb._isnan] + + # if index contains no valid (non-NaT) values, return empty index + if not len(memb): + binner = labels = PeriodIndex( + data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how='end') + + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) + + i8 = memb.asi8 + freq_mult = self.freq.n + + # when upsampling to subperiods, we need to generate enough bins + expected_bins_count = len(binner) * freq_mult + i8_extend = expected_bins_count - (i8[-1] - i8[0]) + rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) + rng += freq_mult + bins = memb.searchsorted(rng, side='left') + + if nat_count > 0: + # NaT handling as in pandas._libs.lib.generate_bins_dt64() + # shift bins by the number of NaT + bins += nat_count + bins = np.insert(bins, 0, nat_count) + binner = binner.insert(0, tslib.NaT) + labels = labels.insert(0, tslib.NaT) + + return binner, bins, labels + def _take_new_index(obj, indexer, new_index, axis=0): from pandas.core.api import Series, DataFrame diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 7449beb8f97df..cd15203eccd82 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -18,7 +18,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict -from 
pandas.core.base import SpecificationError +from pandas.core.base import SpecificationError, AbstractMethodError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS @@ -698,35 +698,58 @@ def create_index(self, *args, **kwargs): factory = self._index_factory() return factory(*args, **kwargs) - def test_asfreq_downsample(self): - s = self.create_series() - - result = s.resample('2D').asfreq() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index.freq = to_offset('2D') - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('2D').asfreq() - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index.freq = to_offset('2D') - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = self.create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('1H').asfreq() - new_index = self.create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index) - assert_frame_equal(result, expected) + @pytest.fixture + def _index_start(self): + return datetime(2005, 1, 1) + + @pytest.fixture + def _index_end(self): + return datetime(2005, 1, 10) + + @pytest.fixture + def _index_freq(self): + return 'D' + + @pytest.fixture + def index(self, _index_start, _index_end, _index_freq): + return self.create_index(_index_start, _index_end, freq=_index_freq) + + @pytest.fixture + def _series_name(self): + raise AbstractMethodError(self) + + @pytest.fixture + def _static_values(self, index): + return np.arange(len(index)) + + @pytest.fixture + def series(self, index, _series_name, _static_values): + return Series(_static_values, index=index, 
name=_series_name) + + @pytest.fixture + def frame(self, index, _static_values): + return DataFrame({'value': _static_values}, index=index) + + @pytest.fixture(params=[Series, DataFrame]) + def series_and_frame(self, request, index, _series_name, _static_values): + if request.param == Series: + return Series(_static_values, index=index, name=_series_name) + if request.param == DataFrame: + return DataFrame({'value': _static_values}, index=index) + + @pytest.mark.parametrize('freq', ['2D', '1H']) + def test_asfreq(self, series_and_frame, freq): + obj = series_and_frame + + result = obj.resample(freq).asfreq() + if freq == '2D': + new_index = obj.index.take(np.arange(0, len(obj.index), 2)) + new_index.freq = to_offset('2D') + else: + new_index = self.create_index(obj.index[0], obj.index[-1], + freq=freq) + expected = obj.reindex(new_index) + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -824,7 +847,7 @@ def test_resample_loffset_arg_type(self): periods=len(df.index) / 2, freq='2D') - # loffset coreces PeriodIndex to DateTimeIndex + # loffset coerces PeriodIndex to DateTimeIndex if isinstance(expected_index, PeriodIndex): expected_index = expected_index.to_timestamp() @@ -866,6 +889,10 @@ def test_apply_to_empty_series(self): class TestDatetimeIndex(Base): _index_factory = lambda x: date_range + @pytest.fixture + def _series_name(self): + return 'dti' + def setup_method(self, method): dti = DatetimeIndex(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq='Min') @@ -2214,57 +2241,35 @@ def test_resample_datetime_values(self): class TestPeriodIndex(Base): _index_factory = lambda x: period_range + @pytest.fixture + def _series_name(self): + return 'pi' + def create_series(self): + # TODO: replace calls to .create_series() by injecting the series + # fixture i = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq='D') return Series(np.arange(len(i)), index=i, name='pi') - 
def test_asfreq_downsample(self): - - # series - s = self.create_series() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') - - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 - result = s.resample('2D').asfreq() - assert_series_equal(result, expected) - - # frame - frame = s.to_frame('value') - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') - result = frame.resample('2D').asfreq() - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 - s = self.create_series() - new_index = date_range(s.index[0].to_timestamp(how='start'), - (s.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = s.to_timestamp().reindex(new_index).to_period() - result = s.resample('1H').asfreq() - assert_series_equal(result, expected) - - frame = s.to_frame('value') - new_index = date_range(frame.index[0].to_timestamp(how='start'), - (frame.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = frame.to_timestamp().reindex(new_index).to_period() - result = frame.resample('1H').asfreq() - assert_frame_equal(result, expected) + @pytest.mark.parametrize('freq', ['2D', '1H', '2H']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_asfreq(self, series_and_frame, freq, kind): + # GH 12884, 15944 + # make sure .asfreq() returns PeriodIndex (except kind='timestamp') + + obj = series_and_frame + if kind == 'timestamp': + expected = obj.to_timestamp().resample(freq).asfreq() + else: + start = obj.index[0].to_timestamp(how='start') + end = (obj.index[-1] + 1).to_timestamp(how='start') + new_index = date_range(start=start, end=end, freq=freq, + closed='left') + 
expected = obj.to_timestamp().reindex(new_index).to_period(freq) + result = obj.resample(freq, kind=kind).asfreq() + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -2285,8 +2290,9 @@ def test_asfreq_fill_value(self): result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0) assert_frame_equal(result, expected) - def test_selection(self): - index = self.create_series().index + @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W']) + @pytest.mark.parametrize('kind', [None, 'period', 'timestamp']) + def test_selection(self, index, freq, kind): # This is a bug, these should be implemented # GH 14008 df = pd.DataFrame({'date': index, @@ -2294,12 +2300,10 @@ def test_selection(self): index=pd.MultiIndex.from_arrays([ np.arange(len(index), dtype=np.int64), index], names=['v', 'd'])) - with pytest.raises(NotImplementedError): - df.resample('2D', on='date') - + df.resample(freq, on='date', kind=kind) with pytest.raises(NotImplementedError): - df.resample('2D', level='d') + df.resample(freq, level='d', kind=kind) def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') @@ -2366,15 +2370,14 @@ def test_not_subperiod(self): pytest.raises(ValueError, lambda: ts.resample('M').mean()) pytest.raises(ValueError, lambda: ts.resample('w-thu').mean()) - def test_basic_upsample(self): + @pytest.mark.parametrize('freq', ['D', '2D']) + def test_basic_upsample(self, freq): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') result = ts.resample('a-dec').mean() - resampled = result.resample('D', convention='end').ffill() - - expected = result.to_timestamp('D', how='end') - expected = expected.asfreq('D', 'ffill').to_period() - + resampled = result.resample(freq, convention='end').ffill() + expected = result.to_timestamp(freq, how='end') + expected = expected.asfreq(freq, 'ffill').to_period(freq) assert_series_equal(resampled, expected) def 
test_upsample_with_limit(self): @@ -2440,16 +2443,15 @@ def test_resample_basic(self): result2 = s.resample('T', kind='period').mean() assert_series_equal(result2, expected) - def test_resample_count(self): - + @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]), + ('2M', [31 + 29, 31 + 9])]) + def test_resample_count(self, freq, expected_vals): # GH12774 - series = pd.Series(1, index=pd.period_range(start='2000', - periods=100)) - result = series.resample('M').count() - - expected_index = pd.period_range(start='2000', freq='M', periods=4) - expected = pd.Series([31, 29, 31, 9], index=expected_index) - + series = pd.Series(1, index=pd.period_range(start='2000', periods=100)) + result = series.resample(freq).count() + expected_index = pd.period_range(start='2000', freq=freq, + periods=len(expected_vals)) + expected = pd.Series(expected_vals, index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self): @@ -2587,12 +2589,15 @@ def test_cant_fill_missing_dups(self): s = Series(np.random.randn(5), index=rng) pytest.raises(Exception, lambda: s.resample('A').ffill()) - def test_resample_5minute(self): + @pytest.mark.parametrize('freq', ['5min']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_resample_5minute(self, freq, kind): rng = period_range('1/1/2000', '1/5/2000', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts.resample('5min').mean() - expected = ts.to_timestamp().resample('5min').mean() + expected = ts.to_timestamp().resample(freq).mean() + if kind != 'timestamp': + expected = expected.to_period(freq) + result = ts.resample(freq, kind=kind).mean() assert_series_equal(result, expected) def test_upsample_daily_business_daily(self): @@ -2812,18 +2817,96 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample('7D').sum() assert_frame_equal(result, expected) - def test_apply_to_empty_series(self): - # GH 14313 - series = 
self.create_series()[:0] + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) + def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = frame + expected_means = [df.values[i:i + 2].mean() + for i in range(0, len(df.values), 2)] + expected_index = self.create_index(df.index[0], + periods=len(df.index) / 2, + freq='2D') - for freq in ['M', 'D', 'H']: - with pytest.raises(TypeError): - series.resample(freq).apply(lambda x: 1) + # loffset coerces PeriodIndex to DateTimeIndex + expected_index = expected_index.to_timestamp() + expected_index += timedelta(hours=2) + expected = DataFrame({'value': expected_means}, index=expected_index) + + result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result_how = df.resample('2D', how=agg_arg, loffset='2H', + kind=kind) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')]) + assert_frame_equal(result_agg, expected) + assert_frame_equal(result_how, expected) + + @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)]) + @pytest.mark.parametrize('kind', [None, 'period']) + def test_upsampling_ohlc(self, freq, period_mult, kind): + # GH 13083 + pi = PeriodIndex(start='2000', freq='D', periods=10) + s = Series(range(len(pi)), index=pi) + expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) + + # timestamp-based resampling doesn't include all sub-periods + # of the last original period, so extend accordingly: + new_index = PeriodIndex(start='2000', freq=freq, + periods=period_mult * len(pi)) + expected = expected.reindex(new_index) + result = s.resample(freq, kind=kind).ohlc() + assert_frame_equal(result, expected) + + 
@pytest.mark.parametrize('periods, values', + [([pd.NaT, '1970-01-01 00:00:00', pd.NaT, + '1970-01-01 00:00:02', '1970-01-01 00:00:03'], + [2, 3, 5, 7, 11]), + ([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT, + pd.NaT, pd.NaT, '1970-01-01 00:00:02', + '1970-01-01 00:00:03', pd.NaT, pd.NaT], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])]) + @pytest.mark.parametrize('freq, expected_values', + [('1s', [3, np.NaN, 7, 11]), + ('2s', [3, int((7 + 11) / 2)]), + ('3s', [int((3 + 7) / 2), 11])]) + def test_resample_with_nat(self, periods, values, freq, expected_values): + # GH 13224 + index = PeriodIndex(periods, freq='S') + frame = DataFrame(values, index=index) + + expected_index = period_range('1970-01-01 00:00:00', + periods=len(expected_values), freq=freq) + expected = DataFrame(expected_values, index=expected_index) + result = frame.resample(freq).mean() + assert_frame_equal(result, expected) + + def test_resample_with_only_nat(self): + # GH 13224 + pi = PeriodIndex([pd.NaT] * 3, freq='S') + frame = DataFrame([2, 3, 5], index=pi) + expected_index = PeriodIndex(data=[], freq=pi.freq) + expected = DataFrame([], index=expected_index) + result = frame.resample('1s').mean() + assert_frame_equal(result, expected) class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range + @pytest.fixture + def _index_start(self): + return '1 day' + + @pytest.fixture + def _index_end(self): + return '10 day' + + @pytest.fixture + def _series_name(self): + return 'tdi' + def create_series(self): i = timedelta_range('1 day', '10 day', freq='D') @@ -3167,13 +3250,6 @@ def test_fails_on_no_datetime_index(self): "instance of %r" % name): df.groupby(TimeGrouper('D')) - # PeriodIndex gives a specific error message - df = DataFrame({'a': np.random.randn(n)}, index=tm.makePeriodIndex(n)) - with tm.assert_raises_regex(TypeError, - "axis must be a DatetimeIndex, but " - "got an instance of 'PeriodIndex'"): - df.groupby(TimeGrouper('D')) - def test_aaa_group_order(self): # GH 12840 # check 
TimeGrouper perform stable sorts