BUG: PeriodIndex count & resample-on-same-freq fix

max-sixty · jreback · commit 77be872f4fce · 2016-04-12T21:12:06.000-04:00
closes #12774 closes #12868 closes #12770 closes #12874
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -217,6 +217,9 @@ Bug Fixes
 - ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are not even (:issue:`12203`)
 - Bug in ``groupby.transform(..)`` when ``axis=1`` is specified with a non-monotonic ordered index (:issue:`12713`)
 - Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that "Minute" freq is deprecated in v0.17.0, and recommended to use ``freq="T"`` instead (:issue:`11854`)
+- Bug in ``.resample(...).count()`` with a ``PeriodIndex`` always raising a ``TypeError`` (:issue:`12774`)
+- Bug in ``.resample(...)`` with a ``PeriodIndex`` casting to a ``DatetimeIndex`` when empty (:issue:`12868`)
+- Bug in ``.resample(...)`` with a ``PeriodIndex`` when resampling to an existing frequency (:issue:`12770`)
 - Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)
 - Bug in numpy compatibility of ``np.round()`` on a ``Series`` (:issue:`12600`)
 - Bug in ``Series`` construction with ``Categorical`` and ``dtype='category'`` is specified (:issue:`12574`)
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
@@ -653,9 +653,6 @@ def _convert_obj(self, obj):
             # Cannot have multiple of periods, convert to timestamp
             self.kind = 'timestamp'
 
-        if not len(obj):
-            self.kind = 'timestamp'
-
         # convert to timestamp
         if not (self.kind is None or self.kind == 'period'):
             obj = obj.to_timestamp(how=self.convention)
@@ -673,18 +670,15 @@ def aggregate(self, arg, *args, **kwargs):
     def _get_new_index(self):
         """ return our new index """
         ax = self.ax
-        ax_attrs = ax._get_attributes_dict()
-        ax_attrs['freq'] = self.freq
-        obj = self._selected_obj
 
         if len(ax) == 0:
-            new_index = PeriodIndex(data=[], **ax_attrs)
-            return obj.reindex(new_index)
-
-        start = ax[0].asfreq(self.freq, how=self.convention)
-        end = ax[-1].asfreq(self.freq, how='end')
+            values = []
+        else:
+            start = ax[0].asfreq(self.freq, how=self.convention)
+            end = ax[-1].asfreq(self.freq, how='end')
+            values = period_range(start, end, freq=self.freq).values
 
-        return period_range(start, end, **ax_attrs)
+        return ax._shallow_copy(values, freq=self.freq)
 
     def _downsample(self, how, **kwargs):
         """
@@ -705,7 +699,7 @@ def _downsample(self, how, **kwargs):
 
         new_index = self._get_new_index()
         if len(new_index) == 0:
-            return self._wrap_result(new_index)
+            return self._wrap_result(self._selected_obj.reindex(new_index))
 
         # Start vs. end of period
         memb = ax.asfreq(self.freq, how=self.convention)
@@ -718,6 +712,8 @@ def _downsample(self, how, **kwargs):
             return self._groupby_and_aggregate(grouper, how)
         elif is_superperiod(ax.freq, self.freq):
             return self.asfreq()
+        elif ax.freq == self.freq:
+            return self.asfreq()
 
         raise ValueError('Frequency {axfreq} cannot be '
                          'resampled to {freq}'.format(
@@ -743,23 +739,24 @@ def _upsample(self, method, limit=None):
 
         ax = self.ax
         obj = self.obj
-
         new_index = self._get_new_index()
-        if len(new_index) == 0:
-            return self._wrap_result(new_index)
 
-        if not is_superperiod(ax.freq, self.freq):
-            return self.asfreq()
+        if len(new_index) == 0:
+            return self._wrap_result(self._selected_obj.reindex(new_index))
 
         # Start vs. end of period
         memb = ax.asfreq(self.freq, how=self.convention)
 
         # Get the fill indexer
         indexer = memb.get_indexer(new_index, method=method, limit=limit)
-        return self._wrap_result(_take_new_index(obj,
-                                                 indexer,
-                                                 new_index,
-                                                 axis=self.axis))
+        return self._wrap_result(_take_new_index(
+            obj, indexer, new_index, axis=self.axis))
+
+    def _groupby_and_aggregate(self, grouper, how, *args, **kwargs):
+        if grouper is None:
+            return self._downsample(how, **kwargs)
+        return super(PeriodIndexResampler, self)._groupby_and_aggregate(
+            grouper, how, *args, **kwargs)
 
 
 class TimedeltaResampler(DatetimeIndexResampler):
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -3,34 +3,37 @@
 from datetime import datetime, timedelta
 from functools import partial
 
-from pandas.compat import range, lrange, zip, product, OrderedDict
+import nose
 import numpy as np
 
+import pandas as pd
+import pandas.tseries.offsets as offsets
+import pandas.util.testing as tm
 from pandas import (Series, DataFrame, Panel, Index, isnull,
                     notnull, Timestamp)
-
+from pandas.compat import range, lrange, zip, product, OrderedDict
+from pandas.core.base import SpecificationError
+from pandas.core.common import ABCSeries, ABCDataFrame
 from pandas.core.groupby import DataError
+from pandas.tseries.frequencies import MONTHS, DAYS
 from pandas.tseries.index import date_range
-from pandas.tseries.tdi import timedelta_range
 from pandas.tseries.offsets import Minute, BDay
 from pandas.tseries.period import period_range, PeriodIndex, Period
 from pandas.tseries.resample import (DatetimeIndex, TimeGrouper,
                                      DatetimeIndexResampler)
-from pandas.tseries.frequencies import MONTHS, DAYS
-from pandas.core.common import ABCSeries, ABCDataFrame
-from pandas.core.base import SpecificationError
-
-import pandas.tseries.offsets as offsets
-import pandas as pd
-
-import nose
-
+from pandas.tseries.tdi import timedelta_range
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal)
-import pandas.util.testing as tm
 
 bday = BDay()
 
+# The various methods we support
+downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem',
+                      'median', 'prod', 'var', 'ohlc']
+upsample_methods = ['count', 'size']
+series_methods = ['nunique']
+resample_methods = downsample_methods + upsample_methods + series_methods
+
 
 class TestResampleAPI(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -95,12 +98,13 @@ def test_api_changes_v018(self):
         self.assertRaises(ValueError, lambda: r.iat[0])
         self.assertRaises(ValueError, lambda: r.ix[0])
         self.assertRaises(ValueError, lambda: r.loc[
-                          Timestamp('2013-01-01 00:00:00', offset='H')])
+            Timestamp('2013-01-01 00:00:00', offset='H')])
         self.assertRaises(ValueError, lambda: r.at[
-                          Timestamp('2013-01-01 00:00:00', offset='H')])
+            Timestamp('2013-01-01 00:00:00', offset='H')])
 
         def f():
             r[0] = 5
+
         self.assertRaises(ValueError, f)
 
         # str/repr
@@ -144,7 +148,6 @@ def f():
 
         # comparison ops
         for op in ['__lt__', '__le__', '__gt__', '__ge__', '__eq__', '__ne__']:
-
             r = self.series.resample('H')
 
             with tm.assert_produces_warning(FutureWarning,
@@ -259,6 +262,7 @@ def test_attribute_access(self):
         # setting
         def f():
             r.F = 'bah'
+
         self.assertRaises(ValueError, f)
 
     def test_api_compat_before_use(self):
@@ -509,10 +513,10 @@ def test_agg_misc(self):
         # errors
         # invalid names in the agg specification
         for t in [r, g]:
-
             def f():
                 r[['A']].agg({'A': ['sum', 'std'],
                               'B': ['mean', 'std']})
+
             self.assertRaises(SpecificationError, f)
 
     def test_agg_nested_dicts(self):
@@ -648,8 +652,7 @@ def test_resample_how(self):
         grouplist[1:6] = 1
         grouplist[6:11] = 2
         grouplist[11:] = 3
-        args = ['sum', 'mean', 'std', 'sem', 'max', 'min', 'median', 'first',
-                'last', 'ohlc']
+        args = downsample_methods
 
         def _ohlc(group):
             if isnull(group).all():
@@ -679,7 +682,7 @@ def _ohlc(group):
                     assert_series_equal(result, expected)
             except BaseException as exc:
 
-                exc.args += ('how=%s' % arg, )
+                exc.args += ('how=%s' % arg,)
                 raise
 
     def test_resample_how_callables(self):
@@ -692,7 +695,6 @@ def fn(x, a=1):
             return str(type(x))
 
         class fn_class:
-
             def __call__(self, x):
                 return str(type(x))
 
@@ -768,7 +770,7 @@ def test_resample_rounding(self):
 
         from pandas.compat import StringIO
         df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [
-                         'date', 'time']}, index_col='timestamp')
+            'date', 'time']}, index_col='timestamp')
         df.index.name = None
         result = df.resample('6s').sum()
         expected = DataFrame({'value': [
@@ -1061,10 +1063,10 @@ def test_resample_ohlc_dataframe(self):
 
         df.columns = [['a', 'b'], ['c', 'd']]
         res = df.resample('H').ohlc()
-        exp.columns = pd.MultiIndex.from_tuples([('a', 'c', 'open'), (
-            'a', 'c', 'high'), ('a', 'c', 'low'), ('a', 'c', 'close'), (
-                'b', 'd', 'open'), ('b', 'd', 'high'), ('b', 'd', 'low'), (
-                    'b', 'd', 'close')])
+        exp.columns = pd.MultiIndex.from_tuples([
+            ('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'),
+            ('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'),
+            ('b', 'd', 'low'), ('b', 'd', 'close')])
         assert_frame_equal(exp, res)
 
         # dupe columns fail atm
@@ -1333,7 +1335,7 @@ def test_resample_empty(self):
         # them to ensure they no longer do.  (GH #10228)
         for index in tm.all_timeseries_index_generator(0):
             for dtype in (np.float, np.int, np.object, 'datetime64[ns]'):
-                for how in ('count', 'mean', 'min', 'ohlc', 'last', 'prod'):
+                for how in downsample_methods + upsample_methods:
                     empty_series = pd.Series([], index, dtype)
                     try:
                         getattr(empty_series.resample('d'), how)()
@@ -1342,6 +1344,30 @@ def test_resample_empty(self):
                         # (ex: doing mean with dtype of np.object)
                         pass
 
+    def test_resample_empty_nunique(self):
+
+        # this routine should be folded into the above test when
+        # GH12886 is closed
+        for index in tm.all_timeseries_index_generator(0):
+            for dtype in (np.float, np.int, np.object, 'datetime64[ns]'):
+                for how in ['nunique']:
+
+                    empty_series = pd.Series([], index, dtype)
+
+                    def f():
+                        getattr(empty_series.resample('d'), how)()
+
+                    if isinstance(index,
+                                  (pd.DatetimeIndex, pd.TimedeltaIndex)):
+                        self.assertRaises(Exception, f)
+                    else:
+                        try:
+                            f()
+                        except DataError:
+                            # Ignore these since some combinations are invalid
+                            # (ex: doing mean with dtype of np.object)
+                            pass
+
     def test_resample_dtype_preservation(self):
 
         # GH 12202
@@ -1449,11 +1475,12 @@ def test_resample_anchored_multiday(self):
         #
         # See: https://github.com/pydata/pandas/issues/8683
 
-        s = pd.Series(np.random.randn(5),
-                      index=pd.date_range('2014-10-14 23:06:23.206',
-                                          periods=3, freq='400L') |
-                      pd.date_range('2014-10-15 23:00:00',
-                                    periods=2, freq='2200L'))
+        index = pd.date_range(
+            '2014-10-14 23:06:23.206', periods=3, freq='400L'
+        ) | pd.date_range(
+            '2014-10-15 23:00:00', periods=2, freq='2200L')
+
+        s = pd.Series(np.random.randn(5), index=index)
 
         # Ensure left closing works
         result = s.resample('2200L').mean()
@@ -1763,7 +1790,6 @@ def _simple_pts(start, end, freq='D'):
 
 
 class TestResamplePeriodIndex(tm.TestCase):
-
     _multiprocess_can_split_ = True
 
     def test_annual_upsample_D_s_f(self):
@@ -1907,16 +1933,40 @@ def test_resample_basic(self):
 
     def test_resample_empty(self):
 
-        # GH12771
+        # GH12771 & GH12868
         index = PeriodIndex(start='2000', periods=0, freq='D', name='idx')
         s = Series(index=index)
-        result = s.resample('M').sum()
 
-        # after GH12774 is resolved, this should be a PeriodIndex
-        expected_index = DatetimeIndex([], name='idx')
+        expected_index = PeriodIndex([], name='idx', freq='M')
         expected = Series(index=expected_index)
+
+        for method in resample_methods:
+            result = getattr(s.resample('M'), method)()
+            assert_series_equal(result, expected)
+
+    def test_resample_count(self):
+
+        # GH12774
+        series = pd.Series(1, index=pd.period_range(start='2000',
+                                                    periods=100))
+        result = series.resample('M').count()
+
+        expected_index = pd.period_range(start='2000', freq='M', periods=4)
+        expected = pd.Series([31, 29, 31, 9], index=expected_index)
+
         assert_series_equal(result, expected)
 
+    def test_resample_same_freq(self):
+
+        # GH12770
+        series = pd.Series(range(3), index=pd.period_range(
+            start='2000', periods=3, freq='M'))
+        expected = series
+
+        for method in resample_methods:
+            result = getattr(series.resample('M'), method)()
+            assert_series_equal(result, expected)
+
     def test_with_local_timezone_pytz(self):
         # GH5430
         tm._skip_if_no_pytz()
@@ -2493,8 +2543,8 @@ def test_aggregate_with_nat(self):
             # GH 9925
             self.assertEqual(dt_result.index.name, 'key')
 
-        # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth'
-        # doesn't work yet
+            # if NaT is included, 'var', 'std', 'mean', 'first', 'last'
+            # and 'nth' don't work yet
 
 
 if __name__ == '__main__':