From a0fffb67fc685128d3b0993ce98c5bc63f5db099 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Mon, 11 Apr 2016 20:50:59 -0400 Subject: [PATCH] PeriodIndex count & resample-on-same-freq fix --- doc/source/whatsnew/v0.18.1.txt | 3 + pandas/tseries/resample.py | 41 ++++++----- pandas/tseries/tests/test_resample.py | 97 +++++++++++++++++---------- 3 files changed, 83 insertions(+), 58 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 4cfe82214d0d0..feb63db27805e 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -215,6 +215,9 @@ Bug Fixes - ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are not even (:issue:`12203`) - Bug in ``groupby.transform(..)`` when ``axis=1`` is specified with a non-monotonic ordered index (:issue:`12713`) - Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that "Minute" freq is deprecated in v0.17.0, and recommended to use ``freq="T"`` instead (:issue:`11854`) +- Bug in ``PeriodIndex.resample(...).count()`` always raised a ``TypeError`` (:issue:`12774`) +- Bug in ``PeriodIndex.resample`` casting to ``DatetimeIndex`` when empty (:issue:`12868`) +- Bug in ``PeriodIndex.resample`` when resampling to existing frequency (:issue:`12770`) - Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`) - Bug in numpy compatibility of ``np.round()`` on a ``Series`` (:issue:`12600`) - Bug in ``Series`` construction with ``Categorical`` and ``dtype='category'`` is specified (:issue:`12574`) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 454eb6b3c165e..409d104e5eb71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -653,9 +653,6 @@ def _convert_obj(self, obj): # Cannot have multiple of periods, convert to timestamp self.kind = 'timestamp' - if not len(obj): - self.kind 
= 'timestamp' - # convert to timestamp if not (self.kind is None or self.kind == 'period'): obj = obj.to_timestamp(how=self.convention) @@ -673,18 +670,15 @@ def aggregate(self, arg, *args, **kwargs): def _get_new_index(self): """ return our new index """ ax = self.ax - ax_attrs = ax._get_attributes_dict() - ax_attrs['freq'] = self.freq - obj = self._selected_obj if len(ax) == 0: - new_index = PeriodIndex(data=[], **ax_attrs) - return obj.reindex(new_index) - - start = ax[0].asfreq(self.freq, how=self.convention) - end = ax[-1].asfreq(self.freq, how='end') + values = [] + else: + start = ax[0].asfreq(self.freq, how=self.convention) + end = ax[-1].asfreq(self.freq, how='end') + values = period_range(start, end, freq=self.freq).values - return period_range(start, end, **ax_attrs) + return ax._shallow_copy(values, freq=self.freq) def _downsample(self, how, **kwargs): """ @@ -705,7 +699,7 @@ def _downsample(self, how, **kwargs): new_index = self._get_new_index() if len(new_index) == 0: - return self._wrap_result(new_index) + return self._wrap_result(self._selected_obj.reindex(new_index)) # Start vs. end of period memb = ax.asfreq(self.freq, how=self.convention) @@ -718,6 +712,8 @@ def _downsample(self, how, **kwargs): return self._groupby_and_aggregate(grouper, how) elif is_superperiod(ax.freq, self.freq): return self.asfreq() + elif ax.freq == self.freq: + return self.asfreq() raise ValueError('Frequency {axfreq} cannot be ' 'resampled to {freq}'.format( @@ -743,23 +739,24 @@ def _upsample(self, method, limit=None): ax = self.ax obj = self.obj - new_index = self._get_new_index() - if len(new_index) == 0: - return self._wrap_result(new_index) - if not is_superperiod(ax.freq, self.freq): - return self.asfreq() + if len(new_index) == 0: + return self._wrap_result(self._selected_obj.reindex(new_index)) # Start vs. 
end of period memb = ax.asfreq(self.freq, how=self.convention) # Get the fill indexer indexer = memb.get_indexer(new_index, method=method, limit=limit) - return self._wrap_result(_take_new_index(obj, - indexer, - new_index, - axis=self.axis)) + return self._wrap_result(_take_new_index( + obj, indexer, new_index, axis=self.axis)) + + def _groupby_and_aggregate(self, grouper, how, *args, **kwargs): + if grouper is None: + return self._downsample(how, **kwargs) + return super(PeriodIndexResampler, self)._groupby_and_aggregate( + grouper, how, *args, **kwargs) class TimedeltaResampler(DatetimeIndexResampler): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index a9348eb11e13b..e450c802225f7 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -3,33 +3,34 @@ from datetime import datetime, timedelta from functools import partial -from pandas.compat import range, lrange, zip, product, OrderedDict +import nose import numpy as np +import pandas as pd +import pandas.tseries.offsets as offsets +import pandas.util.testing as tm from pandas import (Series, DataFrame, Panel, Index, isnull, notnull, Timestamp) - +from pandas.compat import range, lrange, zip, product, OrderedDict +from pandas.core.base import SpecificationError +from pandas.core.common import ABCSeries, ABCDataFrame from pandas.core.groupby import DataError +from pandas.tseries.frequencies import MONTHS, DAYS from pandas.tseries.index import date_range -from pandas.tseries.tdi import timedelta_range from pandas.tseries.offsets import Minute, BDay from pandas.tseries.period import period_range, PeriodIndex, Period from pandas.tseries.resample import (DatetimeIndex, TimeGrouper, DatetimeIndexResampler) -from pandas.tseries.frequencies import MONTHS, DAYS -from pandas.core.common import ABCSeries, ABCDataFrame -from pandas.core.base import SpecificationError - -import pandas.tseries.offsets as offsets -import pandas as pd - 
-import nose - +from pandas.tseries.tdi import timedelta_range from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal) -import pandas.util.testing as tm bday = BDay() +downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', + 'median', 'prod', 'ohlc'] +upsample_methods = ['count', 'size'] +series_methods = ['nunique'] +resample_methods = downsample_methods + upsample_methods + series_methods class TestResampleAPI(tm.TestCase): @@ -95,12 +96,13 @@ def test_api_changes_v018(self): self.assertRaises(ValueError, lambda: r.iat[0]) self.assertRaises(ValueError, lambda: r.ix[0]) self.assertRaises(ValueError, lambda: r.loc[ - Timestamp('2013-01-01 00:00:00', offset='H')]) + Timestamp('2013-01-01 00:00:00', offset='H')]) self.assertRaises(ValueError, lambda: r.at[ - Timestamp('2013-01-01 00:00:00', offset='H')]) + Timestamp('2013-01-01 00:00:00', offset='H')]) def f(): r[0] = 5 + self.assertRaises(ValueError, f) # str/repr @@ -144,7 +146,6 @@ def f(): # comparison ops for op in ['__lt__', '__le__', '__gt__', '__ge__', '__eq__', '__ne__']: - r = self.series.resample('H') with tm.assert_produces_warning(FutureWarning, @@ -259,6 +260,7 @@ def test_attribute_access(self): # setting def f(): r.F = 'bah' + self.assertRaises(ValueError, f) def test_api_compat_before_use(self): @@ -509,10 +511,10 @@ def test_agg_misc(self): # errors # invalid names in the agg specification for t in [r, g]: - def f(): r[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + self.assertRaises(SpecificationError, f) def test_agg_nested_dicts(self): @@ -679,7 +681,7 @@ def _ohlc(group): assert_series_equal(result, expected) except BaseException as exc: - exc.args += ('how=%s' % arg, ) + exc.args += ('how=%s' % arg,) raise def test_resample_how_callables(self): @@ -692,7 +694,6 @@ def fn(x, a=1): return str(type(x)) class fn_class: - def __call__(self, x): return str(type(x)) @@ -768,7 +769,7 @@ def test_resample_rounding(self): from 
pandas.compat import StringIO df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [ - 'date', 'time']}, index_col='timestamp') + 'date', 'time']}, index_col='timestamp') df.index.name = None result = df.resample('6s').sum() expected = DataFrame({'value': [ @@ -1061,10 +1062,10 @@ def test_resample_ohlc_dataframe(self): df.columns = [['a', 'b'], ['c', 'd']] res = df.resample('H').ohlc() - exp.columns = pd.MultiIndex.from_tuples([('a', 'c', 'open'), ( - 'a', 'c', 'high'), ('a', 'c', 'low'), ('a', 'c', 'close'), ( - 'b', 'd', 'open'), ('b', 'd', 'high'), ('b', 'd', 'low'), ( - 'b', 'd', 'close')]) + exp.columns = pd.MultiIndex.from_tuples([ + ('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'), + ('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'), + ('b', 'd', 'low'), ('b', 'd', 'close')]) assert_frame_equal(exp, res) # dupe columns fail atm @@ -1449,11 +1450,12 @@ def test_resample_anchored_multiday(self): # # See: https://github.com/pydata/pandas/issues/8683 - s = pd.Series(np.random.randn(5), - index=pd.date_range('2014-10-14 23:06:23.206', - periods=3, freq='400L') | - pd.date_range('2014-10-15 23:00:00', - periods=2, freq='2200L')) + index = pd.date_range( + '2014-10-14 23:06:23.206', periods=3, freq='400L' + ) | pd.date_range( + '2014-10-15 23:00:00', periods=2, freq='2200L') + + s = pd.Series(np.random.randn(5), index=index) # Ensure left closing works result = s.resample('2200L').mean() @@ -1763,7 +1765,6 @@ def _simple_pts(start, end, freq='D'): class TestResamplePeriodIndex(tm.TestCase): - _multiprocess_can_split_ = True def test_annual_upsample_D_s_f(self): @@ -1907,16 +1908,40 @@ def test_resample_basic(self): def test_resample_empty(self): - # GH12771 + # GH12771 & GH12868 index = PeriodIndex(start='2000', periods=0, freq='D', name='idx') s = Series(index=index) - result = s.resample('M').sum() - # after GH12774 is resolved, this should be a PeriodIndex - expected_index = DatetimeIndex([], name='idx') + expected_index = 
PeriodIndex([], name='idx', freq='M') expected = Series(index=expected_index) + + for method in resample_methods: + result = getattr(s.resample('M'), method)() + assert_series_equal(result, expected) + + def test_resample_count(self): + + # GH12774 + series = pd.Series(1, index=pd.period_range(start='2000', + periods=100)) + result = series.resample('M').count() + + expected_index = pd.period_range(start='2000', freq='M', periods=4) + expected = pd.Series([31, 29, 31, 9], index=expected_index) + assert_series_equal(result, expected) + def test_resample_same_freq(self): + + # GH12770 + series = pd.Series(range(3), index=pd.period_range( + start='2000', periods=3, freq='M')) + expected = series + + for method in resample_methods: + result = getattr(series.resample('M'), method)() + assert_series_equal(result, expected) + def test_with_local_timezone_pytz(self): # GH5430 tm._skip_if_no_pytz() @@ -2493,8 +2518,8 @@ def test_aggregate_with_nat(self): # GH 9925 self.assertEqual(dt_result.index.name, 'key') - # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' - # doesn't work yet + # if NaT is included, 'var', 'std', 'mean', 'first','last' + # and 'nth' doesn't work yet if __name__ == '__main__':