diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c82dc370e3e71..e8a3f52975bc0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -332,6 +332,7 @@ Bug Fixes +- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`) - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index adc17c7514832..895a376457f09 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -306,12 +306,18 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs): +def validate_groupby_func(name, args, kwargs, allowed=None): """ - 'args' and 'kwargs' should be empty because all of + 'args' and 'kwargs' should be empty, except for allowed + kwargs because all of their necessary parameters are explicitly listed in the function signature """ + if allowed is None: + allowed = [] + + kwargs = set(kwargs) - set(allowed) + if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall(( "numpy operations are not valid " diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 700e279cb0030..ddf6d95fa2ab4 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -19,6 +19,7 @@ is_categorical_dtype, is_datetimelike, is_datetime_or_timedelta_dtype, + is_datetime64_any_dtype, is_bool, is_integer_dtype, is_complex_dtype, is_bool_dtype, @@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, @Substitution(name='groupby', f=name) @Appender(_doc_template) @Appender(_local_template) - def f(self): + def f(self, **kwargs): + if 'numeric_only' not in kwargs: + kwargs['numeric_only'] = numeric_only self._set_group_selection() try: - return self._cython_agg_general(alias, numeric_only=numeric_only) + return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: @@ -127,7 +130,9 @@ def f(self): def _first_compat(x, axis=0): + def _first(x): + x = np.asarray(x) x = x[notnull(x)] if len(x) == 0: @@ -142,6 +147,7 @@ def _first(x): def _last_compat(x, axis=0): def _last(x): + x = np.asarray(x) x = x[notnull(x)] if len(x) == 0: @@ -775,7 +781,7 @@ def _try_cast(self, result, obj): return result def _cython_transform(self, how, numeric_only=True): - output = {} + output = collections.OrderedDict() for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -783,6 +789,8 @@ def _cython_transform(self, how, numeric_only=True): try: result, names = self.grouper.transform(obj.values, how) + except NotImplementedError: + continue except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs): For multiple groupings, the result index will be a MultiIndex """ - nv.validate_groupby_func('mean', args, kwargs) + nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean') + return self._cython_agg_general('mean', **kwargs) except GroupByError: raise except Exception: # pragma: no cover self._set_group_selection() - f = lambda x: x.mean(axis=self.axis) + f = lambda x: x.mean(axis=self.axis, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @Appender(_doc_template) - def median(self): + def median(self, **kwargs): """ Compute median of groups, excluding missing values For multiple groupings, the result index will be a MultiIndex """ try: - return self._cython_agg_general('median') + return self._cython_agg_general('median', **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1044,7 +1052,7 @@ def median(self): def f(x): if isinstance(x, np.ndarray): x = Series(x) - return x.median(axis=self.axis) + return x.median(axis=self.axis, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs): # TODO: implement at Cython level? nv.validate_groupby_func('std', args, kwargs) - return np.sqrt(self.var(ddof=ddof)) + return np.sqrt(self.var(ddof=ddof, **kwargs)) @Substitution(name='groupby') @Appender(_doc_template) @@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs): """ nv.validate_groupby_func('var', args, kwargs) if ddof == 1: - return self._cython_agg_general('var') + return self._cython_agg_general('var', **kwargs) else: self._set_group_selection() - f = lambda x: x.var(ddof=ddof) + f = lambda x: x.var(ddof=ddof, **kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True): @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs) + nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only']) if axis != 0: - return self.apply(lambda x: x.cumprod(axis=axis)) + return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) - return self._cython_transform('cumprod') + return self._cython_transform('cumprod', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs) + nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only']) if axis != 0: - return self.apply(lambda x: x.cumsum(axis=axis)) + return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) - return self._cython_transform('cumsum') + return self._cython_transform('cumsum', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) - def cummin(self, axis=0): + def cummin(self, axis=0, **kwargs): """Cumulative min for each group""" if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform('cummin') + return self._cython_transform('cummin', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) - def cummax(self, axis=0): + def cummax(self, axis=0, **kwargs): """Cumulative max for each group""" if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform('cummax') + return self._cython_transform('cummax', **kwargs) @Substitution(name='groupby') @Appender(_doc_template) @@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs): def _cython_operation(self, kind, values, how, axis): assert kind in ['transform', 'aggregate'] + # can we do this operation with our cython functions + # if not raise NotImplementedError + + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. adding datetimes + + # categoricals are only 1d, so we + # are not setup for dim transforming + if is_categorical_dtype(values): + raise NotImplementedError( + "categoricals are not support in cython ops ATM") + elif is_datetime64_any_dtype(values): + if how in ['add', 'prod', 'cumsum', 'cumprod']: + raise NotImplementedError( + "datetime64 type does not support {} " + "operations".format(how)) + elif is_timedelta64_dtype(values): + if how in ['prod', 'cumprod']: + raise NotImplementedError( + "timedelta64 type does not support {} " + "operations".format(how)) + arity = self._cython_arity.get(how, 1) vdim = values.ndim @@ -3155,9 +3185,9 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True): new_items, new_blocks = self._cython_agg_blocks( - how, numeric_only=numeric_only) + how, alt=alt, numeric_only=numeric_only) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, numeric_only=True): - data, agg_axis = self._get_data_to_aggregate() + def _cython_agg_blocks(self, how, alt=None, numeric_only=True): + # TODO: the actual managing of mgr_locs is a PITA + # here, it should happen via BlockManager.combine - new_blocks = [] + data, agg_axis = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) + new_blocks = [] + new_items = [] + deleted_items = [] for block in data.blocks: - result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis) + locs = block.mgr_locs.as_array + try: + result, _ = self.grouper.aggregate( + block.values, how, axis=agg_axis) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + + if alt is None: + # we cannot perform the operation + # in an alternate way, exclude the block + deleted_items.append(locs) + continue + + # call our grouper again with only this block + obj = self.obj[data.items[locs]] + s = groupby(obj, self.grouper) + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + result = result._data.blocks[0] # see if we can cast the block back to the original dtype result = block._try_coerce_and_cast_result(result) - newb = make_block(result, placement=block.mgr_locs) + new_items.append(locs) + newb = block.make_block_same_class(result) new_blocks.append(newb) if len(new_blocks) == 0: raise DataError('No numeric types to aggregate') - return data.items, new_blocks + # reset the locs in the blocks to correspond to our + # current ordering + indexer = np.concatenate(new_items) + new_items = data.items.take(np.sort(indexer)) + + if len(deleted_items): + + # we need to adjust the indexer to account for the + # items we have removed + # really should be done in internals :< + + deleted = np.concatenate(deleted_items) + ai = np.arange(len(data)) + mask = np.zeros(len(data)) + mask[deleted] = 1 + indexer = (ai - mask.cumsum())[indexer] + + offset = 0 + for b in new_blocks: + l = len(b.mgr_locs) + b.mgr_locs = indexer[offset:(offset + l)] + offset += l + + return new_items, new_blocks def _get_data_to_aggregate(self): obj = self._obj_with_exclusions diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b00dc62206f57..f8a1e5a684858 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2257,9 +2257,131 @@ def test_max_min_non_numeric(self): result = aa.groupby('nn').max() self.assertTrue('ss' in result) + result = aa.groupby('nn').max(numeric_only=False) + self.assertTrue('ss' in result) + result = aa.groupby('nn').min() self.assertTrue('ss' in result) + result = aa.groupby('nn').min(numeric_only=False) + self.assertTrue('ss' in result) + + def test_arg_passthru(self): + # make sure that we are passing thru kwargs + # to our agg functions + + # GH3668 + # GH5724 + df = pd.DataFrame( + {'group': [1, 1, 2], + 'int': [1, 2, 3], + 'float': [4., 5., 6.], + 'string': list('abc'), + 'category_string': pd.Series(list('abc')).astype('category'), + 'category_int': [7, 8, 9], + 'datetime': pd.date_range('20130101', periods=3), + 'datetimetz': pd.date_range('20130101', + periods=3, + tz='US/Eastern'), + 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, + columns=['group', 'int', 'float', 'string', + 'category_string', 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + + expected_columns_numeric = Index(['int', 'float', 'category_int']) + + # mean / median + expected = pd.DataFrame( + {'category_int': [7.5, 9], + 'float': [4.5, 6.], + 'timedelta': [pd.Timedelta('1.5s'), + pd.Timedelta('3s')], + 'int': [1.5, 3], + 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), + pd.Timestamp('2013-01-03 00:00:00')], + 'datetimetz': [ + pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), + pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, + index=Index([1, 2], name='group'), + columns=['int', 'float', 'category_int', + 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + assert_frame_equal(result.reindex_like(expected), expected) + + # TODO: min, max *should* handle + # categorical (ordered) dtype + expected_columns = Index(['int', 'float', 'string', + 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['min', 'max']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'string', + 'category_string', 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['first', 'last']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'string', + 'category_int', 'timedelta']) + for attr in ['sum']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int']) + for attr in ['prod', 'cumprod']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + # like min, max, but don't include strings + expected_columns = Index(['int', 'float', + 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['cummin', 'cummax']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int', + 'timedelta']) + for attr in ['cumsum']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + def test_cython_agg_boolean(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': np.random.randint(0, 2, 50).astype('bool')}) @@ -3436,6 +3558,7 @@ def test_int64_overflow(self): tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' ]].values)) tups = com._asarray_tuplesafe(tups) + expected = df.groupby(tups).sum()['values'] for k, v in compat.iteritems(expected):