     is_categorical_dtype,
     is_datetimelike,
     is_datetime_or_timedelta_dtype,
+    is_datetime64_any_dtype,
     is_bool, is_integer_dtype,
     is_complex_dtype,
     is_bool_dtype,
@@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
     @Substitution(name='groupby', f=name)
     @Appender(_doc_template)
     @Appender(_local_template)
-    def f(self):
+    def f(self, **kwargs):
+        if 'numeric_only' not in kwargs:
+            kwargs['numeric_only'] = numeric_only
         self._set_group_selection()
         try:
-            return self._cython_agg_general(alias, numeric_only=numeric_only)
+            return self._cython_agg_general(alias, alt=npfunc, **kwargs)
         except AssertionError as e:
             raise SpecificationError(str(e))
         except Exception:
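As a rough usage sketch (not part of the patch; the frame and column names are invented), the reductions generated by _groupby_function now accept a numeric_only keyword and forward it, together with alt=npfunc as a fallback, into _cython_agg_general:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'ts': pd.to_datetime(['2017-01-01',
                                         '2017-01-02',
                                         '2017-01-03'])})

# numeric_only=True (the default for these reductions) keeps only 'val';
# numeric_only=False also aggregates the datetime column, falling back to
# npfunc when the cython path declines the block.
print(df.groupby('key').min(numeric_only=False))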
@@ -127,7 +130,9 @@ def f(self):


 def _first_compat(x, axis=0):
+
     def _first(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -142,6 +147,7 @@ def _first(x):

 def _last_compat(x, axis=0):
     def _last(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -775,14 +781,16 @@ def _try_cast(self, result, obj):
         return result

     def _cython_transform(self, how, numeric_only=True):
-        output = {}
+        output = collections.OrderedDict()
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
             if numeric_only and not is_numeric:
                 continue

             try:
                 result, names = self.grouper.transform(obj.values, how)
+            except NotImplementedError:
+                continue
             except AssertionError as e:
                 raise GroupByError(str(e))
             output[name] = self._try_cast(result, obj)
@@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True):

         return self._wrap_transformed_output(output, names)

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         output = {}
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs):

         For multiple groupings, the result index will be a MultiIndex
         """
-        nv.validate_groupby_func('mean', args, kwargs)
+        nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
         try:
-            return self._cython_agg_general('mean')
+            return self._cython_agg_general('mean', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
             self._set_group_selection()
-            f = lambda x: x.mean(axis=self.axis)
+            f = lambda x: x.mean(axis=self.axis, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def median(self):
+    def median(self, **kwargs):
         """
         Compute median of groups, excluding missing values

         For multiple groupings, the result index will be a MultiIndex
         """
         try:
-            return self._cython_agg_general('median')
+            return self._cython_agg_general('median', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
@@ -1044,7 +1052,7 @@ def median(self):
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
-                return x.median(axis=self.axis)
+                return x.median(axis=self.axis, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
@@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs):

         # TODO: implement at Cython level?
         nv.validate_groupby_func('std', args, kwargs)
-        return np.sqrt(self.var(ddof=ddof))
+        return np.sqrt(self.var(ddof=ddof, **kwargs))

     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs):
         """
         nv.validate_groupby_func('var', args, kwargs)
         if ddof == 1:
-            return self._cython_agg_general('var')
+            return self._cython_agg_general('var', **kwargs)
         else:
             self._set_group_selection()
-            f = lambda x: x.var(ddof=ddof)
+            f = lambda x: x.var(ddof=ddof, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
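A similarly hedged sketch (invented data) of the whitelisting change above: 'numeric_only' is now accepted by nv.validate_groupby_func for mean, and mean/median/var/std forward any extra keywords into the cython aggregation or the python fallback:

import pandas as pd

df = pd.DataFrame({'key': ['x', 'x', 'y'],
                   'val': [1.0, 3.0, 5.0]})

# both spellings should reach the same cython aggregation
print(df.groupby('key').mean())
print(df.groupby('key').mean(numeric_only=True))
print(df.groupby('key').median(numeric_only=True))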
@@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True):
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
         """Cumulative product for each group"""
-        nv.validate_groupby_func('cumprod', args, kwargs)
+        nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumprod(axis=axis))
+            return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))

-        return self._cython_transform('cumprod')
+        return self._cython_transform('cumprod', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumsum(self, axis=0, *args, **kwargs):
         """Cumulative sum for each group"""
-        nv.validate_groupby_func('cumsum', args, kwargs)
+        nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumsum(axis=axis))
+            return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))

-        return self._cython_transform('cumsum')
+        return self._cython_transform('cumsum', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummin(self, axis=0):
+    def cummin(self, axis=0, **kwargs):
         """Cumulative min for each group"""
         if axis != 0:
             return self.apply(lambda x: np.minimum.accumulate(x, axis))

-        return self._cython_transform('cummin')
+        return self._cython_transform('cummin', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummax(self, axis=0):
+    def cummax(self, axis=0, **kwargs):
         """Cumulative max for each group"""
         if axis != 0:
             return self.apply(lambda x: np.maximum.accumulate(x, axis))

-        return self._cython_transform('cummax')
+        return self._cython_transform('cummax', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
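The cumulative transforms follow the same pattern; a minimal sketch (invented data), assuming the keyword simply flows through to _cython_transform:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'val': [1, 2, 3, 4]})

# cumsum/cumprod whitelist 'numeric_only'; cummin/cummax now accept and
# forward extra keyword arguments as well
print(df.groupby('key').cumsum())
print(df.groupby('key')['val'].cummin())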
@@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs):
     def _cython_operation(self, kind, values, how, axis):
         assert kind in ['transform', 'aggregate']

+        # can we do this operation with our cython functions
+        # if not raise NotImplementedError
+
+        # we raise NotImplemented if this is an invalid operation
+        # entirely, e.g. adding datetimes
+
+        # categoricals are only 1d, so we
+        # are not setup for dim transforming
+        if is_categorical_dtype(values):
+            raise NotImplementedError(
+                "categoricals are not support in cython ops ATM")
+        elif is_datetime64_any_dtype(values):
+            if how in ['add', 'prod', 'cumsum', 'cumprod']:
+                raise NotImplementedError(
+                    "datetime64 type does not support {} "
+                    "operations".format(how))
+        elif is_timedelta64_dtype(values):
+            if how in ['prod', 'cumprod']:
+                raise NotImplementedError(
+                    "timedelta64 type does not support {} "
+                    "operations".format(how))
+
         arity = self._cython_arity.get(how, 1)

         vdim = values.ndim
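To make the guard concrete, a hypothetical call (column names invented) that now exercises the NotImplementedError path: cumsum on datetime64 values is rejected at the cython layer, and callers such as _cython_transform catch the exception and skip the offending block instead of erroring:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'ts': pd.to_datetime(['2017-01-01',
                                         '2017-01-02',
                                         '2017-01-03'])})

# the 'ts' block raises NotImplementedError inside _cython_operation and is
# skipped by _cython_transform, so only 'val' is accumulated
print(df.groupby('key').cumsum(numeric_only=False))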
@@ -3155,9 +3185,9 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         new_items, new_blocks = self._cython_agg_blocks(
-            how, numeric_only=numeric_only)
+            how, alt=alt, numeric_only=numeric_only)
         return self._wrap_agged_blocks(new_items, new_blocks)

     def _wrap_agged_blocks(self, items, blocks):
@@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks):

     _block_agg_axis = 0

-    def _cython_agg_blocks(self, how, numeric_only=True):
-        data, agg_axis = self._get_data_to_aggregate()
+    def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
+        # TODO: the actual managing of mgr_locs is a PITA
+        # here, it should happen via BlockManager.combine

-        new_blocks = []
+        data, agg_axis = self._get_data_to_aggregate()

         if numeric_only:
             data = data.get_numeric_data(copy=False)

+        new_blocks = []
+        new_items = []
+        deleted_items = []
         for block in data.blocks:

-            result, _ = self.grouper.aggregate(
-                block.values, how, axis=agg_axis)
+            locs = block.mgr_locs.as_array
+            try:
+                result, _ = self.grouper.aggregate(
+                    block.values, how, axis=agg_axis)
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+
+                if alt is None:
+                    # we cannot perform the operation
+                    # in an alternate way, exclude the block
+                    deleted_items.append(locs)
+                    continue
+
+                # call our grouper again with only this block
+                obj = self.obj[data.items[locs]]
+                s = groupby(obj, self.grouper)
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                result = result._data.blocks[0]

             # see if we can cast the block back to the original dtype
             result = block._try_coerce_and_cast_result(result)

-            newb = make_block(result, placement=block.mgr_locs)
+            new_items.append(locs)
+            newb = block.make_block_same_class(result)
             new_blocks.append(newb)

         if len(new_blocks) == 0:
             raise DataError('No numeric types to aggregate')

-        return data.items, new_blocks
+        # reset the locs in the blocks to correspond to our
+        # current ordering
+        indexer = np.concatenate(new_items)
+        new_items = data.items.take(np.sort(indexer))
+
+        if len(deleted_items):
+
+            # we need to adjust the indexer to account for the
+            # items we have removed
+            # really should be done in internals :<
+
+            deleted = np.concatenate(deleted_items)
+            ai = np.arange(len(data))
+            mask = np.zeros(len(data))
+            mask[deleted] = 1
+            indexer = (ai - mask.cumsum())[indexer]
+
+        offset = 0
+        for b in new_blocks:
+            l = len(b.mgr_locs)
+            b.mgr_locs = indexer[offset:(offset + l)]
+            offset += l
+
+        return new_items, new_blocks

     def _get_data_to_aggregate(self):
         obj = self._obj_with_exclusions
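Finally, an end-to-end sketch (invented data, expected behaviour hedged) of what the alt fallback in _cython_agg_blocks buys: blocks the cython aggregation cannot handle are re-aggregated in python with the alternate npfunc rather than dropped, and the mgr_locs bookkeeping above keeps the surviving columns aligned when a block does have to be excluded:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'txt': ['x', 'y', 'z']})

# 'add' is not implemented in cython for object blocks, so with
# numeric_only=False the 'txt' block should fall back to the alt function
# (np.sum, i.e. per-group string concatenation) while 'val' keeps the fast
# cython path
print(df.groupby('key').sum(numeric_only=False))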