Skip to content

Commit 045bcb5

Browse files
committed
BUG: make sure that we are passing thru kwargs to groupby
BUG: allow timedelta64 to work in groupby with numeric_only=False closes #5724
1 parent 6eb705f commit 045bcb5

File tree

4 files changed

+171
-27
lines changed

4 files changed

+171
-27
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,7 @@ Bug Fixes
323323

324324

325325

326+
- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
326327

327328

328329
- Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising ``IndexError`` (:issue:`14998`)

pandas/compat/numpy/function.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -306,12 +306,18 @@ def validate_expanding_func(name, args, kwargs):
306306
raise UnsupportedFunctionCall(msg)
307307

308308

309-
def validate_groupby_func(name, args, kwargs):
309+
def validate_groupby_func(name, args, kwargs, allowed=None):
310310
"""
311-
'args' and 'kwargs' should be empty because all of
311+
'args' and 'kwargs' should be empty, except for allowed
312+
kwargs because all of
312313
their necessary parameters are explicitly listed in
313314
the function signature
314315
"""
316+
if allowed is None:
317+
allowed = []
318+
319+
kwargs = set(kwargs) - set(allowed)
320+
315321
if len(args) + len(kwargs) > 0:
316322
raise UnsupportedFunctionCall((
317323
"numpy operations are not valid "

pandas/core/groupby.py

+81-25
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_categorical_dtype,
2020
is_datetimelike,
2121
is_datetime_or_timedelta_dtype,
22+
is_datetime64_any_dtype,
2223
is_bool, is_integer_dtype,
2324
is_complex_dtype,
2425
is_bool_dtype,
@@ -108,10 +109,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
108109
@Substitution(name='groupby', f=name)
109110
@Appender(_doc_template)
110111
@Appender(_local_template)
111-
def f(self):
112+
def f(self, **kwargs):
113+
if 'numeric_only' not in kwargs:
114+
kwargs['numeric_only'] = numeric_only
112115
self._set_group_selection()
113116
try:
114-
return self._cython_agg_general(alias, numeric_only=numeric_only)
117+
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
115118
except AssertionError as e:
116119
raise SpecificationError(str(e))
117120
except Exception:
@@ -126,7 +129,9 @@ def f(self):
126129

127130

128131
def _first_compat(x, axis=0):
132+
129133
def _first(x):
134+
130135
x = np.asarray(x)
131136
x = x[notnull(x)]
132137
if len(x) == 0:
@@ -141,6 +146,7 @@ def _first(x):
141146

142147
def _last_compat(x, axis=0):
143148
def _last(x):
149+
144150
x = np.asarray(x)
145151
x = x[notnull(x)]
146152
if len(x) == 0:
@@ -782,6 +788,8 @@ def _cython_transform(self, how, numeric_only=True):
782788

783789
try:
784790
result, names = self.grouper.transform(obj.values, how)
791+
except NotImplementedError:
792+
continue
785793
except AssertionError as e:
786794
raise GroupByError(str(e))
787795
output[name] = self._try_cast(result, obj)
@@ -791,7 +799,7 @@ def _cython_transform(self, how, numeric_only=True):
791799

792800
return self._wrap_transformed_output(output, names)
793801

794-
def _cython_agg_general(self, how, numeric_only=True):
802+
def _cython_agg_general(self, how, alt=None, numeric_only=True):
795803
output = {}
796804
for name, obj in self._iterate_slices():
797805
is_numeric = is_numeric_dtype(obj.dtype)
@@ -1014,26 +1022,26 @@ def mean(self, *args, **kwargs):
10141022
10151023
For multiple groupings, the result index will be a MultiIndex
10161024
"""
1017-
nv.validate_groupby_func('mean', args, kwargs)
1025+
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
10181026
try:
1019-
return self._cython_agg_general('mean')
1027+
return self._cython_agg_general('mean', **kwargs)
10201028
except GroupByError:
10211029
raise
10221030
except Exception: # pragma: no cover
10231031
self._set_group_selection()
1024-
f = lambda x: x.mean(axis=self.axis)
1032+
f = lambda x: x.mean(axis=self.axis, **kwargs)
10251033
return self._python_agg_general(f)
10261034

10271035
@Substitution(name='groupby')
10281036
@Appender(_doc_template)
1029-
def median(self):
1037+
def median(self, **kwargs):
10301038
"""
10311039
Compute median of groups, excluding missing values
10321040
10331041
For multiple groupings, the result index will be a MultiIndex
10341042
"""
10351043
try:
1036-
return self._cython_agg_general('median')
1044+
return self._cython_agg_general('median', **kwargs)
10371045
except GroupByError:
10381046
raise
10391047
except Exception: # pragma: no cover
@@ -1043,7 +1051,7 @@ def median(self):
10431051
def f(x):
10441052
if isinstance(x, np.ndarray):
10451053
x = Series(x)
1046-
return x.median(axis=self.axis)
1054+
return x.median(axis=self.axis, **kwargs)
10471055
return self._python_agg_general(f)
10481056

10491057
@Substitution(name='groupby')
@@ -1062,7 +1070,7 @@ def std(self, ddof=1, *args, **kwargs):
10621070

10631071
# TODO: implement at Cython level?
10641072
nv.validate_groupby_func('std', args, kwargs)
1065-
return np.sqrt(self.var(ddof=ddof))
1073+
return np.sqrt(self.var(ddof=ddof, **kwargs))
10661074

10671075
@Substitution(name='groupby')
10681076
@Appender(_doc_template)
@@ -1079,10 +1087,10 @@ def var(self, ddof=1, *args, **kwargs):
10791087
"""
10801088
nv.validate_groupby_func('var', args, kwargs)
10811089
if ddof == 1:
1082-
return self._cython_agg_general('var')
1090+
return self._cython_agg_general('var', **kwargs)
10831091
else:
10841092
self._set_group_selection()
1085-
f = lambda x: x.var(ddof=ddof)
1093+
f = lambda x: x.var(ddof=ddof, **kwargs)
10861094
return self._python_agg_general(f)
10871095

10881096
@Substitution(name='groupby')
@@ -1399,21 +1407,21 @@ def cumcount(self, ascending=True):
13991407
@Appender(_doc_template)
14001408
def cumprod(self, axis=0, *args, **kwargs):
14011409
"""Cumulative product for each group"""
1402-
nv.validate_groupby_func('cumprod', args, kwargs)
1410+
nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
14031411
if axis != 0:
1404-
return self.apply(lambda x: x.cumprod(axis=axis))
1412+
return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
14051413

1406-
return self._cython_transform('cumprod')
1414+
return self._cython_transform('cumprod', **kwargs)
14071415

14081416
@Substitution(name='groupby')
14091417
@Appender(_doc_template)
14101418
def cumsum(self, axis=0, *args, **kwargs):
14111419
"""Cumulative sum for each group"""
1412-
nv.validate_groupby_func('cumsum', args, kwargs)
1420+
nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
14131421
if axis != 0:
1414-
return self.apply(lambda x: x.cumsum(axis=axis))
1422+
return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
14151423

1416-
return self._cython_transform('cumsum')
1424+
return self._cython_transform('cumsum', **kwargs)
14171425

14181426
@Substitution(name='groupby')
14191427
@Appender(_doc_template)
@@ -1807,6 +1815,28 @@ def wrapper(*args, **kwargs):
18071815
def _cython_operation(self, kind, values, how, axis):
18081816
assert kind in ['transform', 'aggregate']
18091817

1818+
# can we do this operation with our cython functions
1819+
# if not raise NotImplementedError
1820+
1821+
# we raise NotImplemented if this is an invalid operation
1822+
# entirely, e.g. adding datetimes
1823+
1824+
# categoricals are only 1d, so we
1825+
# are not setup for dim transforming
1826+
if is_categorical_dtype(values):
1827+
raise NotImplementedError(
1828+
"categoricals are not support in cython ops ATM")
1829+
elif is_datetime64_any_dtype(values):
1830+
if how in ['add', 'prod', 'cumsum', 'cumprod']:
1831+
raise NotImplementedError(
1832+
"datetime64 type does not support {} "
1833+
"operations".format(how))
1834+
elif is_timedelta64_dtype(values):
1835+
if how in ['prod', 'cumprod']:
1836+
raise NotImplementedError(
1837+
"timedelta64 type does not support {} "
1838+
"operations".format(how))
1839+
18101840
arity = self._cython_arity.get(how, 1)
18111841

18121842
vdim = values.ndim
@@ -3134,9 +3164,9 @@ def _iterate_slices(self):
31343164
continue
31353165
yield val, slicer(val)
31363166

3137-
def _cython_agg_general(self, how, numeric_only=True):
3167+
def _cython_agg_general(self, how, alt=None, numeric_only=True):
31383168
new_items, new_blocks = self._cython_agg_blocks(
3139-
how, numeric_only=numeric_only)
3169+
how, alt=alt, numeric_only=numeric_only)
31403170
return self._wrap_agged_blocks(new_items, new_blocks)
31413171

31423172
def _wrap_agged_blocks(self, items, blocks):
@@ -3162,29 +3192,55 @@ def _wrap_agged_blocks(self, items, blocks):
31623192

31633193
_block_agg_axis = 0
31643194

3165-
def _cython_agg_blocks(self, how, numeric_only=True):
3195+
def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
3196+
# TODO: the actual managing of mgr_locs is a PITA
3197+
# here, it should happen via BlockManager.combine
3198+
31663199
data, agg_axis = self._get_data_to_aggregate()
31673200

31683201
new_blocks = []
31693202

31703203
if numeric_only:
31713204
data = data.get_numeric_data(copy=False)
31723205

3206+
offset = 0
3207+
new_items = []
31733208
for block in data.blocks:
31743209

3175-
result, _ = self.grouper.aggregate(
3176-
block.values, how, axis=agg_axis)
3210+
locs = block.mgr_locs.as_array
3211+
try:
3212+
result, _ = self.grouper.aggregate(
3213+
block.values, how, axis=agg_axis)
3214+
except NotImplementedError:
3215+
# generally if we have numeric_only=False
3216+
# and non-applicable functions
3217+
# try to python agg
3218+
3219+
if alt is None:
3220+
# we cannot perform the operation
3221+
# in an alternate way, exclude the block
3222+
continue
3223+
3224+
# call our grouper again with only this block
3225+
obj = self.obj.iloc[:, locs]
3226+
s = groupby(obj, self.grouper)
3227+
result = s.aggregate(lambda x: alt(x, axis=self.axis))
3228+
result = result._data.blocks[0]
31773229

31783230
# see if we can cast the block back to the original dtype
31793231
result = block._try_coerce_and_cast_result(result)
31803232

3181-
newb = make_block(result, placement=block.mgr_locs)
3233+
new_items.append(locs)
3234+
newb = block.make_block_same_class(
3235+
result,
3236+
placement=np.arange(offset, offset + len(locs)))
3237+
offset += len(locs)
31823238
new_blocks.append(newb)
31833239

31843240
if len(new_blocks) == 0:
31853241
raise DataError('No numeric types to aggregate')
31863242

3187-
return data.items, new_blocks
3243+
return data.items.take(np.concatenate(new_items)), new_blocks
31883244

31893245
def _get_data_to_aggregate(self):
31903246
obj = self._obj_with_exclusions

pandas/tests/groupby/test_groupby.py

+81
Original file line numberDiff line numberDiff line change
@@ -2260,6 +2260,86 @@ def test_max_min_non_numeric(self):
22602260
result = aa.groupby('nn').min()
22612261
self.assertTrue('ss' in result)
22622262

2263+
def test_arg_passthru(self):
2264+
# make sure that we are passing thru kwargs
2265+
# to our agg functions
2266+
2267+
# GH3668
2268+
# GH5724
2269+
df = pd.DataFrame({
2270+
'group': [1, 1, 2],
2271+
'int': [1, 2, 3],
2272+
'float': [1., 2., 3.],
2273+
'string': list('abc'),
2274+
'category': pd.Series(list('abc')).astype('category'),
2275+
'datetime': pd.date_range('20130101', periods=3),
2276+
'datetimetz': pd.date_range('20130101',
2277+
periods=3,
2278+
tz='US/Eastern'),
2279+
'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')})
2280+
2281+
# basic
2282+
result = df.groupby('group').mean()
2283+
expected = pd.DataFrame(
2284+
{'int': [1.5, 3],
2285+
'float': [1.5, 3.]},
2286+
index=Index([1, 2], name='group'))
2287+
assert_frame_equal(result.reindex_like(expected), expected)
2288+
2289+
# mean / median
2290+
expected = pd.DataFrame(
2291+
{'int': [1.5, 3],
2292+
'float': [1.5, 3.],
2293+
'timedelta': [pd.Timedelta('1.5s'),
2294+
pd.Timedelta('3s')],
2295+
'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
2296+
pd.Timestamp('2013-01-03 00:00:00')],
2297+
'datetimetz': [
2298+
pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
2299+
pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
2300+
index=Index([1, 2], name='group'))
2301+
for attr in ['mean', 'median']:
2302+
f = getattr(df.groupby('group'), attr)
2303+
result = f(numeric_only=False)
2304+
assert_frame_equal(result, expected)
2305+
2306+
expected_columns = Index(['datetime', 'datetimetz',
2307+
'float', 'int',
2308+
'string', 'timedelta'])
2309+
2310+
# TODO: min, max *should*
2311+
# categorical (ordered) dtype
2312+
for attr in ['min', 'max']:
2313+
f = getattr(df.groupby('group'), attr)
2314+
result = f(numeric_only=False)
2315+
tm.assert_index_equal(result.columns, expected_columns)
2316+
2317+
expected_columns = Index(['category', 'datetime', 'datetimetz',
2318+
'float', 'int',
2319+
'string', 'timedelta'])
2320+
for attr in ['first', 'last']:
2321+
f = getattr(df.groupby('group'), attr)
2322+
result = f(numeric_only=False)
2323+
tm.assert_index_equal(result.columns, expected_columns)
2324+
2325+
expected_columns = Index(['float', 'int', 'string', 'timedelta'])
2326+
for attr in ['sum']:
2327+
f = getattr(df.groupby('group'), attr)
2328+
result = f(numeric_only=False)
2329+
tm.assert_index_equal(result.columns, expected_columns)
2330+
2331+
expected_columns = Index(['float', 'int'])
2332+
for attr in ['prod', 'cumprod']:
2333+
f = getattr(df.groupby('group'), attr)
2334+
result = f(numeric_only=False)
2335+
tm.assert_index_equal(result.columns, expected_columns)
2336+
2337+
expected_columns = Index(['float', 'int', 'timedelta'])
2338+
for attr in ['cumsum']:
2339+
f = getattr(df.groupby('group'), attr)
2340+
result = f(numeric_only=False)
2341+
tm.assert_index_equal(result.columns, expected_columns)
2342+
22632343
def test_cython_agg_boolean(self):
22642344
frame = DataFrame({'a': np.random.randint(0, 5, 50),
22652345
'b': np.random.randint(0, 2, 50).astype('bool')})
@@ -3436,6 +3516,7 @@ def test_int64_overflow(self):
34363516
tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
34373517
]].values))
34383518
tups = com._asarray_tuplesafe(tups)
3519+
34393520
expected = df.groupby(tups).sum()['values']
34403521

34413522
for k, v in compat.iteritems(expected):

0 commit comments

Comments
 (0)