From def74de1e4cee07fc3eed5c08f20b686ef15416a Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 7 Aug 2016 17:11:20 -0500 Subject: [PATCH 01/10] API: Expanded resample --- doc/source/whatsnew/v0.19.0.txt | 14 ++++++ pandas/core/generic.py | 15 +++++-- pandas/tseries/tests/test_resample.py | 62 ++++++++++++++++++++++----- 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 13dacf28b5988..62ca22f0a4a77 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -377,6 +377,20 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- the ``.resample()`` function now accepts a ``on=`` or ``key=`` parameter for resampling on a column or ``MultiIndex`` level (:issue:`13500`) + + .. ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + df.resample('M', level='d').sum() + - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0295afe990c8..8e2b6a503f83d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4038,7 +4038,7 @@ def between_time(self, start_time, end_time, include_start=True, def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, - limit=None, base=0): + limit=None, base=0, on=None, level=None): """ Convenience method for frequency conversion and resampling of regular time-series data. @@ -4059,7 +4059,12 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0 - + on : string, optional + For a DataFrame, column to use for resampling, rather than + the index + level : string or int, optional + For a MultiIndex, level (name or number) to use for + resampling To learn more about the offset strings, please see `this link `__. @@ -4164,12 +4169,16 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, """ from pandas.tseries.resample import (resample, _maybe_process_deprecations) + if is_list_like(on): + raise ValueError("Only a single column may be passed to on") + if is_list_like(level): + raise ValueError("Only a single column may be passed to level") axis = self._get_axis_number(axis) r = resample(self, freq=rule, label=label, closed=closed, axis=axis, kind=kind, loffset=loffset, convention=convention, - base=base) + base=base, key=on, level=level) return _maybe_process_deprecations(r, how=how, fill_method=fill_method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 85d8cd52e1866..3ef97e812b4b9 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -450,20 +450,30 @@ def test_agg(self): ('r2', 'B', 'sum')]) def test_agg_misc(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), index=pd.date_range('2010-01-01 09:00:00', periods=10, - freq='s')) + freq='s', + name='date')) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) + cases = [ + r, + df_col.resample('2s', on='date'), + df_mult.resample('2s', level='date'), + df.groupby(pd.Grouper(freq='2s')) + ] # passed lambda - for t in [r, g]: + for t in cases: result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) @@ -480,7 +490,7 @@ def test_agg_misc(self): ('result1', 'B'), ('result2', 'A'), ('result2', 'B')]) - for t in [r, g]: + for t in cases: result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), ('result2', np.mean)])) assert_frame_equal(result, expected, check_like=True) @@ -495,19 +505,19 @@ def test_agg_misc(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.agg(OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not - for t in [r, g]: - result = g[['A', 'B']].agg({'A': ['sum', 'std'], + for t in cases: + result = t[['A', 'B']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) # series like aggs - for t in [r, g]: + for t in cases: result = t['A'].agg({'A': ['sum', 'std']}) expected = pd.concat([t['A'].sum(), t['A'].std()], @@ -528,9 +538,9 @@ def test_agg_misc(self): # errors # invalid names in the agg specification - for t in [r, g]: + for t in cases: def f(): - r[['A']].agg({'A': ['sum', 'std'], + t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) self.assertRaises(SpecificationError, f) @@ -581,6 +591,36 @@ def test_agg_consistency(self): result = r.agg({'r1': 'mean', 'r2': 'sum'}) assert_frame_equal(result, expected) + def test_api_validation(self): + # GH 13500 + dates = pd.date_range('2015-01-01', freq='W', periods=10) + df = pd.DataFrame({'date': dates, + 'a': np.arange(10, dtype='int64')}, + index=pd.MultiIndex.from_arrays([ + np.arange(10), + dates], names=['v', 'd'])) + + exp_index = pd.date_range('2015-01-31', periods=3, + freq='M', name='date') + expected = pd.DataFrame({'a': [6, 22, 17]}, + index=exp_index) + + actual = df.resample('M', on='date').sum() + assert_frame_equal(actual, expected) + + actual = df.resample('M', level='d').sum() + expected.index.name = 'd' + assert_frame_equal(actual, expected) + + with tm.assertRaises(ValueError): + df.resample('M', on='date', level='d') + + with tm.assertRaises(ValueError): + df.resample('M', on=['a', 'date']) + + with tm.assertRaises(ValueError): + df.resample('M', level=['a', 'date']) + class Base(object): """ From c4db0e7ff45b2d60ebca59cba13868b0c251b7d8 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 11 Aug 2016 18:40:03 -0500 Subject: [PATCH 02/10] move error handling; doc fixups --- doc/source/timeseries.rst | 22 ++++++++++++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/generic.py | 5 ----- pandas/tseries/tests/test_resample.py | 13 ++++++++++--- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index b8f747757987c..dbaba03802970 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1473,6 +1473,28 @@ Furthermore, you can also specify multiple aggregation functions for each column r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) +If a ``DataFrame`` does not have a ``DatetimeIndex``, but instead you want +to resample based on column in the frame, it can passed to the ``on`` keyword. + +.. ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + +Similarly, if you instead want to resample by a level of ``MultiIndex``, its +name or location can be passed to the ``level`` keyword. + +.. ipython:: python + + df.resample(level='d').sum() + + .. _timeseries.periods: Time Span Representation diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 62ca22f0a4a77..13ac7f10b38a7 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -377,7 +377,7 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) -- the ``.resample()`` function now accepts a ``on=`` or ``key=`` parameter for resampling on a column or ``MultiIndex`` level (:issue:`13500`) +- the ``.resample()`` function now accepts a ``on=`` or ``level=`` parameter for resampling on a column or ``MultiIndex`` level (:issue:`13500`) .. ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e2b6a503f83d..d06a938308041 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4169,11 +4169,6 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, """ from pandas.tseries.resample import (resample, _maybe_process_deprecations) - if is_list_like(on): - raise ValueError("Only a single column may be passed to on") - if is_list_like(level): - raise ValueError("Only a single column may be passed to level") - axis = self._get_axis_number(axis) r = resample(self, freq=rule, label=label, closed=closed, axis=axis, kind=kind, loffset=loffset, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 3ef97e812b4b9..faada578a3d87 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -608,17 +608,24 @@ def test_api_validation(self): actual = df.resample('M', on='date').sum() assert_frame_equal(actual, expected) - actual = df.resample('M', level='d').sum() expected.index.name = 'd' + actual = df.resample('M', level='d').sum() + assert_frame_equal(actual, expected) + + actual = df.resample('M', level=1).sum() assert_frame_equal(actual, expected) + # non DatetimeIndex + with tm.assertRaises(TypeError): + df.resample('M', level='v') + with tm.assertRaises(ValueError): df.resample('M', on='date', level='d') - with tm.assertRaises(ValueError): + with tm.assertRaises(TypeError): df.resample('M', on=['a', 'date']) - with tm.assertRaises(ValueError): + with tm.assertRaises(KeyError): df.resample('M', level=['a', 'date']) From b55309a814b32c3383862ec86a14e15b0486783a Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 15 Aug 2016 21:22:45 -0500 Subject: [PATCH 03/10] wip --- pandas/core/generic.py | 12 +- pandas/tseries/resample.py | 1 + pandas/tseries/tests/test_resample.py | 375 +++++++++++++------------- 3 files changed, 202 insertions(+), 186 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d06a938308041..33ce8aa80a506 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4040,8 +4040,10 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None): """ - Convenience method for frequency conversion and resampling of regular - time-series data. + Convenience method for frequency conversion and resampling of time + series. Object must have a datetime-like index (DatetimeIndex, + PeriodIndex, or TimedeltaIndex), or pass datetime-like values + to the on or level keyword. Parameters ---------- @@ -4060,11 +4062,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0 on : string, optional - For a DataFrame, column to use for resampling, rather than - the index + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. level : string or int, optional For a MultiIndex, level (name or number) to use for - resampling + resampling. Level must be datetime-like. To learn more about the offset strings, please see `this link `__. diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 38c2e009a01f3..06889a2d83788 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1002,6 +1002,7 @@ def _get_resampler(self, obj, kind=None): TypeError if incompatible axis """ + import pdb; pdb.set_trace() self._set_grouper(obj) ax = self.ax diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index faada578a3d87..b9bd5fa35189c 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -371,18 +371,162 @@ def test_apply_without_aggregation(self): result = t.apply(lambda x: x) assert_series_equal(result, self.series) + def test_agg_consistency(self): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + + r = df.resample('3T') + + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) + assert_frame_equal(result, expected) + + + + +class Base(object): + """ + base class for resampling testing, calling + .create_series() generates a series of each index type + """ + + def create_index(self, *args, **kwargs): + """ return the _index_factory created using the args, kwargs """ + factory = self._index_factory() + return factory(*args, **kwargs) + + def test_asfreq_downsample(self): + s = self.create_series() + + result = s.resample('2D').asfreq() + expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) + expected.index.freq = to_offset('2D') + assert_series_equal(result, expected) + + frame = s.to_frame('value') + result = frame.resample('2D').asfreq() + expected = frame.reindex( + frame.index.take(np.arange(0, len(frame.index), 2))) + expected.index.freq = to_offset('2D') + assert_frame_equal(result, expected) + + def test_asfreq_upsample(self): + s = self.create_series() + + result = s.resample('1H').asfreq() + new_index = self.create_index(s.index[0], s.index[-1], freq='1H') + expected = s.reindex(new_index) + assert_series_equal(result, expected) + + frame = s.to_frame('value') + result = frame.resample('1H').asfreq() + new_index = self.create_index(frame.index[0], + frame.index[-1], freq='1H') + expected = frame.reindex(new_index) + assert_frame_equal(result, expected) + + def test_resample_interpolate(self): + # # 12925 + df = self.create_series().to_frame('value') + assert_frame_equal( + df.resample('1T').asfreq().interpolate(), + df.resample('1T').interpolate()) + + def test_raises_on_non_datetimelike_index(self): + # this is a non datetimelike index + xp = DataFrame() + self.assertRaises(TypeError, lambda: xp.resample('A').mean()) + + def test_resample_empty_series(self): + # GH12771 & GH12868 + + s = self.create_series()[:0] + + for freq in ['M', 'D', 'H']: + # need to test for ohlc from GH13083 + methods = [method for method in resample_methods + if method != 'ohlc'] + for method in methods: + result = getattr(s.resample(freq), method)() + + expected = s.copy() + expected.index = s.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) + + if (method == 'size' and + isinstance(result.index, PeriodIndex) and + freq in ['M', 'D']): + # GH12871 - TODO: name should propagate, but currently + # doesn't on lower / same frequency with PeriodIndex + assert_series_equal(result, expected, check_dtype=False, + check_names=False) + # this assert will break when fixed + self.assertTrue(result.name is None) + else: + assert_series_equal(result, expected, check_dtype=False) + + def test_resample_empty_dataframe(self): + # GH13212 + index = self.create_series().index[:0] + f = DataFrame(index=index) + + for freq in ['M', 'D', 'H']: + # count retains dimensions too + methods = downsample_methods + ['count'] + for method in methods: + result = getattr(f.resample(freq), method)() + + expected = f.copy() + expected.index = f.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) + assert_frame_equal(result, expected, check_dtype=False) + + # test size for GH13212 (currently stays as df) + + def test_resample_empty_dtypes(self): + + # Empty series were sometimes causing a segfault (for the functions + # with Cython bounds-checking disabled) or an IndexError. We just run + # them to ensure they no longer do. (GH #10228) + for index in tm.all_timeseries_index_generator(0): + for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): + for how in downsample_methods + upsample_methods: + empty_series = pd.Series([], index, dtype) + try: + getattr(empty_series.resample('d'), how)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass + def test_agg(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = self.create_series().index + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) + index=index) + df_col = df.reset_index() + print df_col + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) a_mean = r['A'].mean() a_std = r['A'].std() a_sum = r['A'].sum() @@ -393,12 +537,12 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - for t in [r, g]: + for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': np.mean, 'B': np.std}) assert_frame_equal(result, expected, check_like=True) @@ -406,20 +550,20 @@ def test_agg(self): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std']}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - for t in [r, g]: + for t in cases: result = t['A'].aggregate(['mean', 'sum']) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -428,7 +572,7 @@ def test_agg(self): ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -438,7 +582,7 @@ def test_agg(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) @@ -453,23 +597,22 @@ def test_agg_misc(self): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = self.create_series().index + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s', - name='date')) + index=index) df_col = df.reset_index() df_mult = df_col.copy() df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], names=['index', 'date']) - r = df.resample('2s') + r = df.resample('2D') cases = [ r, - df_col.resample('2s', on='date'), - df_mult.resample('2s', level='date'), - df.groupby(pd.Grouper(freq='2s')) + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) ] # passed lambda @@ -548,22 +691,30 @@ def f(): def test_agg_nested_dicts(self): np.random.seed(1234) + index = self.create_series().index + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] - for t in [r, g]: + for t in cases: def f(): t.aggregate({'r1': {'A': ['mean', 'sum']}, 'r2': {'B': ['mean', 'sum']}}) self.assertRaises(ValueError, f) - for t in [r, g]: + for t in cases: expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), t['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -577,43 +728,16 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) - def test_agg_consistency(self): - - # make sure that we are consistent across - # similar aggregations with and w/o selection list - df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) - - r = df.resample('3T') - - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) - assert_frame_equal(result, expected) - - def test_api_validation(self): + def test_selection_api_validation(self): # GH 13500 - dates = pd.date_range('2015-01-01', freq='W', periods=10) - df = pd.DataFrame({'date': dates, - 'a': np.arange(10, dtype='int64')}, - index=pd.MultiIndex.from_arrays([ - np.arange(10), - dates], names=['v', 'd'])) - - exp_index = pd.date_range('2015-01-31', periods=3, - freq='M', name='date') - expected = pd.DataFrame({'a': [6, 22, 17]}, - index=exp_index) - - actual = df.resample('M', on='date').sum() - assert_frame_equal(actual, expected) - - expected.index.name = 'd' - actual = df.resample('M', level='d').sum() - assert_frame_equal(actual, expected) - - actual = df.resample('M', level=1).sum() - assert_frame_equal(actual, expected) + index = self.create_series().index + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + df_exp = pd.DataFrame({'a': np.arange(len(index), dtype=np.int64)}, + index=index) # non DatetimeIndex with tm.assertRaises(TypeError): @@ -628,123 +752,12 @@ def test_api_validation(self): with tm.assertRaises(KeyError): df.resample('M', level=['a', 'date']) + exp = df_exp.resample('2D').sum() + exp.index.name = 'date' + assert_frame_equal(exp, df.resample('2D', on='date').sum()) -class Base(object): - """ - base class for resampling testing, calling - .create_series() generates a series of each index type - """ - - def create_index(self, *args, **kwargs): - """ return the _index_factory created using the args, kwargs """ - factory = self._index_factory() - return factory(*args, **kwargs) - - def test_asfreq_downsample(self): - s = self.create_series() - - result = s.resample('2D').asfreq() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index.freq = to_offset('2D') - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('2D').asfreq() - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index.freq = to_offset('2D') - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = self.create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('1H').asfreq() - new_index = self.create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index) - assert_frame_equal(result, expected) - - def test_resample_interpolate(self): - # # 12925 - df = self.create_series().to_frame('value') - assert_frame_equal( - df.resample('1T').asfreq().interpolate(), - df.resample('1T').interpolate()) - - def test_raises_on_non_datetimelike_index(self): - # this is a non datetimelike index - xp = DataFrame() - self.assertRaises(TypeError, lambda: xp.resample('A').mean()) - - def test_resample_empty_series(self): - # GH12771 & GH12868 - - s = self.create_series()[:0] - - for freq in ['M', 'D', 'H']: - # need to test for ohlc from GH13083 - methods = [method for method in resample_methods - if method != 'ohlc'] - for method in methods: - result = getattr(s.resample(freq), method)() - - expected = s.copy() - expected.index = s.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - self.assertEqual(result.index.freq, expected.index.freq) - - if (method == 'size' and - isinstance(result.index, PeriodIndex) and - freq in ['M', 'D']): - # GH12871 - TODO: name should propagate, but currently - # doesn't on lower / same frequency with PeriodIndex - assert_series_equal(result, expected, check_dtype=False, - check_names=False) - # this assert will break when fixed - self.assertTrue(result.name is None) - else: - assert_series_equal(result, expected, check_dtype=False) - - def test_resample_empty_dataframe(self): - # GH13212 - index = self.create_series().index[:0] - f = DataFrame(index=index) - - for freq in ['M', 'D', 'H']: - # count retains dimensions too - methods = downsample_methods + ['count'] - for method in methods: - result = getattr(f.resample(freq), method)() - - expected = f.copy() - expected.index = f.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - self.assertEqual(result.index.freq, expected.index.freq) - assert_frame_equal(result, expected, check_dtype=False) - - # test size for GH13212 (currently stays as df) - - def test_resample_empty_dtypes(self): - - # Empty series were sometimes causing a segfault (for the functions - # with Cython bounds-checking disabled) or an IndexError. We just run - # them to ensure they no longer do. (GH #10228) - for index in tm.all_timeseries_index_generator(0): - for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): - for how in downsample_methods + upsample_methods: - empty_series = pd.Series([], index, dtype) - try: - getattr(empty_series.resample('d'), how)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass + exp.index.name = 'd' + assert_frame_equal(exp, df.resample('2D', level='d').sum()) class TestDatetimeIndex(Base, tm.TestCase): From 7f9add404a3c5ead9286a35b237d52449ed20c9c Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 16 Aug 2016 06:40:50 -0500 Subject: [PATCH 04/10] more wip --- pandas/core/groupby.py | 9 +++++++-- pandas/tseries/resample.py | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c2ab406e1da65..d692ec2cb5f57 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -247,7 +247,7 @@ def _get_grouper(self, obj): sort=self.sort) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, sort=False): + def _set_grouper(self, obj, sort=False, converter=None): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -255,7 +255,10 @@ def _set_grouper(self, obj, sort=False): Parameters ---------- obj : the subject object - + sort : bool, default False + whether the resulting grouper should be sorted + converter : callable, optional + conversion to apply the grouper after selection """ if self.key is not None and self.level is not None: @@ -295,6 +298,8 @@ def _set_grouper(self, obj, sort=False): convert=False, is_copy=False) self.obj = obj + if converter is not None: + ax = converter(ax) self.grouper = ax return self.grouper diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 06889a2d83788..f13d0f9476b6d 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -87,7 +87,8 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.grouper = None if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True) + obj, converter = self._convert_obj(obj) + self.groupby._set_grouper(obj, sort=True, converter=converter) def __unicode__(self): """ provide a nice str repr of our rolling object """ @@ -203,13 +204,20 @@ def __setitem__(self, attr, value): def _convert_obj(self, obj): """ provide any conversions for the object in order to correctly handle + and returns a converter function to be applied to grouping selection Parameters ---------- obj : the object to be resampled + + Returns + ------- + obj : converted object + converter : callable, optional + converter to apply after selection """ obj = obj.consolidate() - return obj + return obj, None def _get_binner_for_time(self): raise AbstractMethodError(self) @@ -703,6 +711,7 @@ def _upsample(self, method, limit=None): .fillna """ + # import pdb; pdb.set_trace() self._set_binner() if self.axis: raise AssertionError('axis must be 0') @@ -751,7 +760,7 @@ def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby def _convert_obj(self, obj): - obj = super(PeriodIndexResampler, self)._convert_obj(obj) + obj, _ = super(PeriodIndexResampler, self)._convert_obj(obj) offset = to_offset(self.freq) if offset.n > 1: @@ -761,10 +770,11 @@ def _convert_obj(self, obj): # Cannot have multiple of periods, convert to timestamp self.kind = 'timestamp' + converter = None # convert to timestamp if not (self.kind is None or self.kind == 'period'): - obj = obj.to_timestamp(how=self.convention) - return obj + converter = lambda x: x.to_timestamp(how=self.convention) + return obj, converter def aggregate(self, arg, *args, **kwargs): result, how = self._aggregate(arg, *args, **kwargs) @@ -1002,7 +1012,6 @@ def _get_resampler(self, obj, kind=None): TypeError if incompatible axis """ - import pdb; pdb.set_trace() self._set_grouper(obj) ax = self.ax From 5fd97d9697cf105eac57745e0c8caa7996bab433 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 17 Aug 2016 19:13:05 -0500 Subject: [PATCH 05/10] add from_selection bookkeeping --- pandas/tseries/resample.py | 26 +++++++++++++++++++++++--- pandas/tseries/tests/test_resample.py | 14 ++++++++++---- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index f13d0f9476b6d..0cd24b2afea71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -64,7 +64,7 @@ class Resampler(_GroupBy): 'binner', 'grouper', 'groupby', 'sort', 'kind', 'squeeze', 'keys', 'group_keys', 'as_index', 'exclusions', - '_groupby'] + '_groupby', 'from_selection'] # don't raise deprecation warning on attributes starting with these # patterns - prevents warnings caused by IPython introspection @@ -85,8 +85,12 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.exclusions = set() self.binner = None self.grouper = None + self.from_selection = False if self.groupby is not None: + # bookeeping to disallow upsampling if not resampling on index + self.from_selection = (self.groupby.key is not None or + self.groupby.level is not None) obj, converter = self._convert_obj(obj) self.groupby._set_grouper(obj, sort=True, converter=converter) @@ -711,10 +715,14 @@ def _upsample(self, method, limit=None): .fillna """ - # import pdb; pdb.set_trace() self._set_binner() if self.axis: raise AssertionError('axis must be 0') + if self.from_selection: + raise NotImplementedError("Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to " + "datetime-like") ax = self.ax obj = self._selected_obj @@ -773,7 +781,13 @@ def _convert_obj(self, obj): converter = None # convert to timestamp if not (self.kind is None or self.kind == 'period'): - converter = lambda x: x.to_timestamp(how=self.convention) + # if periondindex is the actual index obj, just convert it + # otherwise, converter callback will be used on selection + if self.from_selection: + converter = lambda x: x.to_timestamp(how=self.convention) + else: + obj = obj.to_timestamp(how=self.convention) + return obj, converter def aggregate(self, arg, *args, **kwargs): @@ -850,6 +864,12 @@ def _upsample(self, method, limit=None): .fillna """ + # import pdb; pdb.set_trace() + if self.from_selection: + raise NotImplementedError("Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to " + "datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index b9bd5fa35189c..58c1d08144889 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -741,16 +741,22 @@ def test_selection_api_validation(self): # non DatetimeIndex with tm.assertRaises(TypeError): - df.resample('M', level='v') + df.resample('2D', level='v') with tm.assertRaises(ValueError): - df.resample('M', on='date', level='d') + df.resample('2D', on='date', level='d') with tm.assertRaises(TypeError): - df.resample('M', on=['a', 'date']) + df.resample('2D', on=['a', 'date']) with tm.assertRaises(KeyError): - df.resample('M', level=['a', 'date']) + df.resample('2D', level=['a', 'date']) + + with tm.assertRaises(NotImplementedError): + df.resample('2D', level='d').asfreq() + + with tm.assertRaises(NotImplementedError): + df.resample('2D', on='date').asfreq() exp = df_exp.resample('2D').sum() exp.index.name = 'date' From c7b299ea7df4fbd0735464517cea219b7fdecb36 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 18 Aug 2016 07:07:40 -0500 Subject: [PATCH 06/10] cleanup debugging --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/resample.py | 17 ++++++++--------- pandas/tseries/tests/test_resample.py | 11 ++++------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 13ac7f10b38a7..5523346c89e3d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -948,7 +948,7 @@ Bug Fixes - Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) - Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) - +- Bug in ``groupby`` where a ``TimeGrouper`` selection is used with the ``key`` or ``level`` arguments with a ``PeriodIndex`` (:issue:`14008`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 0cd24b2afea71..21d3ed988326d 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -719,10 +719,10 @@ def _upsample(self, method, limit=None): if self.axis: raise AssertionError('axis must be 0') if self.from_selection: - raise NotImplementedError("Upsampling from level= or on= selection " - "is not supported, use .set_index(...) " - "to explicitly set index to " - "datetime-like") + raise NotImplementedError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") ax = self.ax obj = self._selected_obj @@ -864,12 +864,11 @@ def _upsample(self, method, limit=None): .fillna """ - # import pdb; pdb.set_trace() if self.from_selection: - raise NotImplementedError("Upsampling from level= or on= selection " - "is not supported, use .set_index(...) " - "to explicitly set index to " - "datetime-like") + raise NotImplementedError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 58c1d08144889..45815e0235ab9 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -386,8 +386,6 @@ def test_agg_consistency(self): assert_frame_equal(result, expected) - - class Base(object): """ base class for resampling testing, calling @@ -515,7 +513,6 @@ def test_agg(self): columns=list('AB'), index=index) df_col = df.reset_index() - print df_col df_mult = df_col.copy() df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], names=['index', 'date']) @@ -733,11 +730,11 @@ def test_selection_api_validation(self): index = self.create_series().index df = pd.DataFrame({'date': index, 'a': np.arange(len(index), dtype=np.int64)}, - index=pd.MultiIndex.from_arrays([ - np.arange(len(index), dtype=np.int64), - index], names=['v', 'd'])) + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) df_exp = pd.DataFrame({'a': np.arange(len(index), dtype=np.int64)}, - index=index) + index=index) # non DatetimeIndex with tm.assertRaises(TypeError): From 384026b7aae713a6e6a85e67b02764812e0bc810 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 20 Aug 2016 08:41:53 -0500 Subject: [PATCH 07/10] remove PeriodIndex workaround --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/groupby.py | 4 - pandas/tseries/resample.py | 26 +-- pandas/tseries/tests/test_resample.py | 264 ++++++++++++++------------ 4 files changed, 156 insertions(+), 140 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5523346c89e3d..13ac7f10b38a7 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -948,7 +948,7 @@ Bug Fixes - Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) - Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) -- Bug in ``groupby`` where a ``TimeGrouper`` selection is used with the ``key`` or ``level`` arguments with a ``PeriodIndex`` (:issue:`14008`) + - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d692ec2cb5f57..bd2bad58f5ece 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -257,8 +257,6 @@ def _set_grouper(self, obj, sort=False, converter=None): obj : the subject object sort : bool, default False whether the resulting grouper should be sorted - converter : callable, optional - conversion to apply the grouper after selection """ if self.key is not None and self.level is not None: @@ -298,8 +296,6 @@ def _set_grouper(self, obj, sort=False, converter=None): convert=False, is_copy=False) self.obj = obj - if converter is not None: - ax = converter(ax) self.grouper = ax return self.grouper diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 21d3ed988326d..61f3399ace117 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -88,11 +88,13 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.from_selection = False if self.groupby is not None: - # bookeeping to disallow upsampling if not resampling on index + # upsampling and PeriodIndex resampling do not work + # if resampling on a column or mi level + # this is state used to catch and raise an error self.from_selection = (self.groupby.key is not None or self.groupby.level is not None) - obj, converter = self._convert_obj(obj) - self.groupby._set_grouper(obj, sort=True, converter=converter) + obj = self._convert_obj(obj) + self.groupby._set_grouper(obj, sort=True) def __unicode__(self): """ provide a nice str repr of our rolling object """ @@ -208,7 +210,6 @@ def __setitem__(self, attr, value): def _convert_obj(self, obj): """ provide any conversions for the object in order to correctly handle - and returns a converter function to be applied to grouping selection Parameters ---------- @@ -217,11 +218,9 @@ def _convert_obj(self, obj): Returns ------- obj : converted object - converter : callable, optional - converter to apply after selection """ obj = obj.consolidate() - return obj, None + return obj def _get_binner_for_time(self): raise AbstractMethodError(self) @@ -768,7 +767,7 @@ def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby def _convert_obj(self, obj): - obj, _ = super(PeriodIndexResampler, self)._convert_obj(obj) + obj = super(PeriodIndexResampler, self)._convert_obj(obj) offset = to_offset(self.freq) if offset.n > 1: @@ -778,17 +777,18 @@ def _convert_obj(self, obj): # Cannot have multiple of periods, convert to timestamp self.kind = 'timestamp' - converter = None # convert to timestamp if not (self.kind is None or self.kind == 'period'): - # if periondindex is the actual index obj, just convert it - # otherwise, converter callback will be used on selection if self.from_selection: - converter = lambda x: x.to_timestamp(how=self.convention) + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) to explicitly set index") + raise NotImplementedError(msg) else: obj = obj.to_timestamp(how=self.convention) - return obj, converter + return obj def aggregate(self, arg, *args, **kwargs): result, how = self._aggregate(arg, *args, **kwargs) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 45815e0235ab9..e7bc823c18001 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -385,129 +385,12 @@ def test_agg_consistency(self): result = r.agg({'r1': 'mean', 'r2': 'sum'}) assert_frame_equal(result, expected) - -class Base(object): - """ - base class for resampling testing, calling - .create_series() generates a series of each index type - """ - - def create_index(self, *args, **kwargs): - """ return the _index_factory created using the args, kwargs """ - factory = self._index_factory() - return factory(*args, **kwargs) - - def test_asfreq_downsample(self): - s = self.create_series() - - result = s.resample('2D').asfreq() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index.freq = to_offset('2D') - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('2D').asfreq() - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index.freq = to_offset('2D') - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = self.create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('1H').asfreq() - new_index = self.create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index) - assert_frame_equal(result, expected) - - def test_resample_interpolate(self): - # # 12925 - df = self.create_series().to_frame('value') - assert_frame_equal( - df.resample('1T').asfreq().interpolate(), - df.resample('1T').interpolate()) - - def test_raises_on_non_datetimelike_index(self): - # this is a non datetimelike index - xp = DataFrame() - self.assertRaises(TypeError, lambda: xp.resample('A').mean()) - - def test_resample_empty_series(self): - # GH12771 & GH12868 - - s = self.create_series()[:0] - - for freq in ['M', 'D', 'H']: - # need to test for ohlc from GH13083 - methods = [method for method in resample_methods - if method != 'ohlc'] - for method in methods: - result = getattr(s.resample(freq), method)() - - expected = s.copy() - expected.index = s.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - self.assertEqual(result.index.freq, expected.index.freq) - - if (method == 'size' and - isinstance(result.index, PeriodIndex) and - freq in ['M', 'D']): - # GH12871 - TODO: name should propagate, but currently - # doesn't on lower / same frequency with PeriodIndex - assert_series_equal(result, expected, check_dtype=False, - check_names=False) - # this assert will break when fixed - self.assertTrue(result.name is None) - else: - assert_series_equal(result, expected, check_dtype=False) - - def test_resample_empty_dataframe(self): - # GH13212 - index = self.create_series().index[:0] - f = DataFrame(index=index) - - for freq in ['M', 'D', 'H']: - # count retains dimensions too - methods = downsample_methods + ['count'] - for method in methods: - result = getattr(f.resample(freq), method)() - - expected = f.copy() - expected.index = f.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - self.assertEqual(result.index.freq, expected.index.freq) - assert_frame_equal(result, expected, check_dtype=False) - - # test size for GH13212 (currently stays as df) - - def test_resample_empty_dtypes(self): - - # Empty series were sometimes causing a segfault (for the functions - # with Cython bounds-checking disabled) or an IndexError. We just run - # them to ensure they no longer do. (GH #10228) - for index in tm.all_timeseries_index_generator(0): - for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): - for how in downsample_methods + upsample_methods: - empty_series = pd.Series([], index, dtype) - try: - getattr(empty_series.resample('d'), how)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass - def test_agg(self): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = self.create_series().index + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), @@ -594,7 +477,8 @@ def test_agg_misc(self): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = self.create_series().index + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), @@ -688,7 +572,8 @@ def f(): def test_agg_nested_dicts(self): np.random.seed(1234) - index = self.create_series().index + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), @@ -727,7 +612,8 @@ def f(): def test_selection_api_validation(self): # GH 13500 - index = self.create_series().index + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') df = pd.DataFrame({'date': index, 'a': np.arange(len(index), dtype=np.int64)}, index=pd.MultiIndex.from_arrays([ @@ -763,6 +649,124 @@ def test_selection_api_validation(self): assert_frame_equal(exp, df.resample('2D', level='d').sum()) +class Base(object): + """ + base class for resampling testing, calling + .create_series() generates a series of each index type + """ + + def create_index(self, *args, **kwargs): + """ return the _index_factory created using the args, kwargs """ + factory = self._index_factory() + return factory(*args, **kwargs) + + def test_asfreq_downsample(self): + s = self.create_series() + + result = s.resample('2D').asfreq() + expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) + expected.index.freq = to_offset('2D') + assert_series_equal(result, expected) + + frame = s.to_frame('value') + result = frame.resample('2D').asfreq() + expected = frame.reindex( + frame.index.take(np.arange(0, len(frame.index), 2))) + expected.index.freq = to_offset('2D') + assert_frame_equal(result, expected) + + def test_asfreq_upsample(self): + s = self.create_series() + + result = s.resample('1H').asfreq() + new_index = self.create_index(s.index[0], s.index[-1], freq='1H') + expected = s.reindex(new_index) + assert_series_equal(result, expected) + + frame = s.to_frame('value') + result = frame.resample('1H').asfreq() + new_index = self.create_index(frame.index[0], + frame.index[-1], freq='1H') + expected = frame.reindex(new_index) + assert_frame_equal(result, expected) + + def test_resample_interpolate(self): + # # 12925 + df = self.create_series().to_frame('value') + assert_frame_equal( + df.resample('1T').asfreq().interpolate(), + df.resample('1T').interpolate()) + + def test_raises_on_non_datetimelike_index(self): + # this is a non datetimelike index + xp = DataFrame() + self.assertRaises(TypeError, lambda: xp.resample('A').mean()) + + def test_resample_empty_series(self): + # GH12771 & GH12868 + + s = self.create_series()[:0] + + for freq in ['M', 'D', 'H']: + # need to test for ohlc from GH13083 + methods = [method for method in resample_methods + if method != 'ohlc'] + for method in methods: + result = getattr(s.resample(freq), method)() + + expected = s.copy() + expected.index = s.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) + + if (method == 'size' and + isinstance(result.index, PeriodIndex) and + freq in ['M', 'D']): + # GH12871 - TODO: name should propagate, but currently + # doesn't on lower / same frequency with PeriodIndex + assert_series_equal(result, expected, check_dtype=False, + check_names=False) + # this assert will break when fixed + self.assertTrue(result.name is None) + else: + assert_series_equal(result, expected, check_dtype=False) + + def test_resample_empty_dataframe(self): + # GH13212 + index = self.create_series().index[:0] + f = DataFrame(index=index) + + for freq in ['M', 'D', 'H']: + # count retains dimensions too + methods = downsample_methods + ['count'] + for method in methods: + result = getattr(f.resample(freq), method)() + + expected = f.copy() + expected.index = f.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) + assert_frame_equal(result, expected, check_dtype=False) + + # test size for GH13212 (currently stays as df) + + def test_resample_empty_dtypes(self): + + # Empty series were sometimes causing a segfault (for the functions + # with Cython bounds-checking disabled) or an IndexError. We just run + # them to ensure they no longer do. (GH #10228) + for index in tm.all_timeseries_index_generator(0): + for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): + for how in downsample_methods + upsample_methods: + empty_series = pd.Series([], index, dtype) + try: + getattr(empty_series.resample('d'), how)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass + + class TestDatetimeIndex(Base, tm.TestCase): _multiprocess_can_split_ = True _index_factory = lambda x: date_range @@ -2055,6 +2059,22 @@ def test_asfreq_upsample(self): result = frame.resample('1H').asfreq() assert_frame_equal(result, expected) + def test_selection(self): + index = self.create_series().index + # This is a bug, these should be implemented + # GH 14008 + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + + with tm.assertRaises(NotImplementedError): + df.resample('2D', on='date') + + with tm.assertRaises(NotImplementedError): + df.resample('2D', level='d') + def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') From e203fcfcb009c46fd8ad34c278579fb3f88e104a Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 20 Aug 2016 10:05:16 -0500 Subject: [PATCH 08/10] doc updates --- doc/source/timeseries.rst | 10 ++++++---- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/generic.py | 5 +++++ pandas/core/groupby.py | 2 +- pandas/tseries/resample.py | 5 ++--- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index dbaba03802970..9bc8b8f425458 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1473,8 +1473,9 @@ Furthermore, you can also specify multiple aggregation functions for each column r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) -If a ``DataFrame`` does not have a ``DatetimeIndex``, but instead you want -to resample based on column in the frame, it can passed to the ``on`` keyword. +If a ``DataFrame`` does not have a datetimelike index, but instead you want +to resample based on datetimelike column in the frame, it can passed to the +``on`` keyword. .. ipython:: python @@ -1487,8 +1488,9 @@ to resample based on column in the frame, it can passed to the ``on`` keyword. df df.resample('M', on='date').sum() -Similarly, if you instead want to resample by a level of ``MultiIndex``, its -name or location can be passed to the ``level`` keyword. +Similarly, if you instead want to resample by a datetimelike +level of ``MultiIndex``, its name or location can be passed to the +``level`` keyword. .. ipython:: python diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 13ac7f10b38a7..de13df873a3d8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -377,7 +377,7 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) -- the ``.resample()`` function now accepts a ``on=`` or ``level=`` parameter for resampling on a column or ``MultiIndex`` level (:issue:`13500`) +- the ``.resample()`` function now accepts a ``on=`` or ``level=`` parameter for resampling on a datetimelike column or ``MultiIndex`` level (:issue:`13500`) .. ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 33ce8aa80a506..c334a8ceea166 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4064,10 +4064,15 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, on : string, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. + + .. versionadded:: 0.19.0 + level : string or int, optional For a MultiIndex, level (name or number) to use for resampling. Level must be datetime-like. + .. versionadded:: 0.19.0 + To learn more about the offset strings, please see `this link `__. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bd2bad58f5ece..08299b84d09eb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -247,7 +247,7 @@ def _get_grouper(self, obj): sort=self.sort) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, sort=False, converter=None): + def _set_grouper(self, obj, sort=False): """ given an object and the specifications, setup the internal grouper for this particular specification diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 61f3399ace117..2900655dedb32 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -90,11 +90,10 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): if self.groupby is not None: # upsampling and PeriodIndex resampling do not work # if resampling on a column or mi level - # this is state used to catch and raise an error + # this state used to catch and raise an error self.from_selection = (self.groupby.key is not None or self.groupby.level is not None) - obj = self._convert_obj(obj) - self.groupby._set_grouper(obj, sort=True) + self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): """ provide a nice str repr of our rolling object """ From 10c728055df5a1ec6bc3478d535d552d2262423c Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 27 Aug 2016 10:17:46 -0500 Subject: [PATCH 09/10] NotImp -> ValueError --- pandas/tseries/resample.py | 30 +++++++++++++-------------- pandas/tseries/tests/test_resample.py | 7 +++++-- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 2900655dedb32..7621ac1f4461c 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -64,7 +64,7 @@ class Resampler(_GroupBy): 'binner', 'grouper', 'groupby', 'sort', 'kind', 'squeeze', 'keys', 'group_keys', 'as_index', 'exclusions', - '_groupby', 'from_selection'] + '_groupby', '_from_selection'] # don't raise deprecation warning on attributes starting with these # patterns - prevents warnings caused by IPython introspection @@ -85,14 +85,14 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.exclusions = set() self.binner = None self.grouper = None - self.from_selection = False + self._from_selection = False if self.groupby is not None: # upsampling and PeriodIndex resampling do not work # if resampling on a column or mi level # this state used to catch and raise an error - self.from_selection = (self.groupby.key is not None or - self.groupby.level is not None) + self._from_selection = (self.groupby.key is not None or + self.groupby.level is not None) self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): @@ -716,11 +716,11 @@ def _upsample(self, method, limit=None): self._set_binner() if self.axis: raise AssertionError('axis must be 0') - if self.from_selection: - raise NotImplementedError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") ax = self.ax obj = self._selected_obj @@ -778,7 +778,7 @@ def _convert_obj(self, obj): # convert to timestamp if not (self.kind is None or self.kind == 'period'): - if self.from_selection: + if self._from_selection: # see GH 14008, GH 12871 msg = ("Resampling from level= or on= selection" " with a PeriodIndex is not currently supported," @@ -863,11 +863,11 @@ def _upsample(self, method, limit=None): .fillna """ - if self.from_selection: - raise NotImplementedError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index e7bc823c18001..9c989ffc2d8b2 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -385,6 +385,8 @@ def test_agg_consistency(self): result = r.agg({'r1': 'mean', 'r2': 'sum'}) assert_frame_equal(result, expected) + # TODO: once GH 14008 is fixed, move these tests into + # `Base` test class def test_agg(self): # test with all three Resampler apis and TimeGrouper @@ -635,10 +637,11 @@ def test_selection_api_validation(self): with tm.assertRaises(KeyError): df.resample('2D', level=['a', 'date']) - with tm.assertRaises(NotImplementedError): + # upsampling not allowed + with tm.assertRaises(ValueError): df.resample('2D', level='d').asfreq() - with tm.assertRaises(NotImplementedError): + with tm.assertRaises(ValueError): df.resample('2D', on='date').asfreq() exp = df_exp.resample('2D').sum() From b8dd114256183259329aa7c1792cc7bc2f3a5ce5 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 27 Aug 2016 14:52:08 -0500 Subject: [PATCH 10/10] make _from_selection a property --- pandas/tseries/resample.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 7621ac1f4461c..0a0bcb0d4f3d8 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -64,7 +64,7 @@ class Resampler(_GroupBy): 'binner', 'grouper', 'groupby', 'sort', 'kind', 'squeeze', 'keys', 'group_keys', 'as_index', 'exclusions', - '_groupby', '_from_selection'] + '_groupby'] # don't raise deprecation warning on attributes starting with these # patterns - prevents warnings caused by IPython introspection @@ -85,14 +85,8 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.exclusions = set() self.binner = None self.grouper = None - self._from_selection = False if self.groupby is not None: - # upsampling and PeriodIndex resampling do not work - # if resampling on a column or mi level - # this state used to catch and raise an error - self._from_selection = (self.groupby.key is not None or - self.groupby.level is not None) self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): @@ -118,6 +112,15 @@ def _typ(self): return 'series' return 'dataframe' + @property + def _from_selection(self): + """ is the resampling from a DataFrame column or MultiIndex level """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise an error + return (self.groupby is not None and + (self.groupby.key is not None or + self.groupby.level is not None)) + def _deprecated(self, op): warnings.warn(("\n.resample() is now a deferred operation\n" "You called {op}(...) on this deferred object "