Merge pull request #7000 from jreback/groupby_counts_agg

jreback · jreback · commit d2ead2cab735 · 2014-04-29T16:12:46.000-04:00
ENH/BUG: add count to grouper / ensure that grouper keys are not included in the returned
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -179,6 +179,8 @@ API Changes
   validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
 - Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
   ``data`` argument (:issue:`5357`)
+- groupby will no longer return the grouped column for non-cython functions (:issue:`5610`),
+  as it is already the index
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -110,12 +110,29 @@ API changes
 
   .. ipython:: python
 
-     DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
      g = df.groupby('A')
      g.nth(0)  # can also use negative ints
 
      g.nth(0, dropna='any')  # similar to old behaviour
 
+  groupby will no longer return the grouped column for non-cython functions (:issue:`5610`),
+  as it is already the index
+
+  .. ipython:: python
+
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
+     g = df.groupby('A')
+     g.count()
+     g.describe()
+
+  passing ``as_index=False`` will leave the grouped column in-place (this is not a change in 0.14.0)
+
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
+     g = df.groupby('A',as_index=False)
+     g.count()
+     g.describe()
+
 - Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
   by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -226,11 +226,13 @@ def describe(self):
         """
         # Hack?
         from pandas.core.frame import DataFrame
-        grouped = DataFrame(self.labels).groupby(0)
-        counts = grouped.count().values.squeeze()
+        counts = DataFrame({
+            'labels' : self.labels,
+            'values' : self.labels }
+                           ).groupby('labels').count().squeeze().values
         freqs = counts / float(counts.sum())
-        return DataFrame.from_dict({
+        return DataFrame({
             'counts': counts,
             'freqs': freqs,
             'levels': self.levels
-        }).set_index('levels')
+            }).set_index('levels')
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -611,11 +611,19 @@ def __neg__(self):
             arr = operator.inv(values)
         else:
             arr = operator.neg(values)
-        return self._wrap_array(arr, self.axes, copy=False)
+        return self.__array_wrap__(arr)
 
     def __invert__(self):
-        arr = operator.inv(_values_from_object(self))
-        return self._wrap_array(arr, self.axes, copy=False)
+        try:
+            arr = operator.inv(_values_from_object(self))
+            return self.__array_wrap__(arr)
+        except:
+
+            # inv fails with 0 len
+            if not np.prod(self.shape):
+                return self
+
+            raise
 
     def equals(self, other):
         """
@@ -707,15 +715,11 @@ def __abs__(self):
     #----------------------------------------------------------------------
     # Array Interface
 
-    def _wrap_array(self, arr, axes, copy=False):
-        d = self._construct_axes_dict_from(self, axes, copy=copy)
-        return self._constructor(arr, **d).__finalize__(self)
-
     def __array__(self, dtype=None):
         return _values_from_object(self)
 
-    def __array_wrap__(self, result):
-        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
+    def __array_wrap__(self, result, copy=False):
+        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=copy)
         return self._constructor(result, **d).__finalize__(self)
 
     # ideally we would define this to avoid the getattr checks, but
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -445,6 +445,23 @@ def _selection_list(self):
             return [self._selection]
         return self._selection
 
+    @cache_readonly
+    def _selected_obj(self):
+
+        if self._selection is None or isinstance(self.obj, Series):
+            return self.obj
+        else:
+            return self.obj[self._selection]
+
+    def _set_selection_from_grouper(self):
+        """ we may need to create a selection if we have non-level groupers """
+        grp = self.grouper
+        if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None:
+            ax = self.obj._info_axis
+            groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
+            if len(groupers):
+                self._selection = (ax-Index(groupers)).tolist()
+
     def _local_dir(self):
         return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
 
@@ -453,7 +470,6 @@ def __getattr__(self, attr):
             return object.__getattribute__(self, attr)
         if attr in self.obj:
             return self[attr]
-
         if hasattr(self.obj, attr):
             return self._make_wrapper(attr)
 
@@ -472,6 +488,10 @@ def _make_wrapper(self, name):
                                                      type(self).__name__))
             raise AttributeError(msg)
 
+        # need to set up the selection, as it is not passed directly
+        # but derived from the grouper
+        self._set_selection_from_grouper()
+
         f = getattr(self._selected_obj, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
@@ -503,7 +523,19 @@ def curried(x):
             try:
                 return self.apply(curried_with_axis)
             except Exception:
-                return self.apply(curried)
+                try:
+                    return self.apply(curried)
+                except Exception:
+
+                    # related to : GH3688
+                    # try item-by-item
+                    # this can be called recursively, so we need to raise ValueError
+                    # if we don't have this method, to indicate to aggregate that
+                    # this column should be marked as an error
+                    try:
+                        return self._aggregate_item_by_item(name, *args, **kwargs)
+                    except (AttributeError):
+                        raise ValueError
 
         return wrapper
 
@@ -624,6 +656,7 @@ def mean(self):
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
+            self._set_selection_from_grouper()
             f = lambda x: x.mean(axis=self.axis)
             return self._python_agg_general(f)
 
@@ -639,6 +672,7 @@ def median(self):
             raise
         except Exception:  # pragma: no cover
 
+            self._set_selection_from_grouper()
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
@@ -655,6 +689,7 @@ def std(self, ddof=1):
         if ddof == 1:
             return self._cython_agg_general('std')
         else:
+            self._set_selection_from_grouper()
             f = lambda x: x.std(ddof=ddof)
             return self._python_agg_general(f)
 
@@ -667,15 +702,26 @@ def var(self, ddof=1):
         if ddof == 1:
             return self._cython_agg_general('var')
         else:
+            self._set_selection_from_grouper()
             f = lambda x: x.var(ddof=ddof)
             return self._python_agg_general(f)
 
     def size(self):
         """
         Compute group sizes
+
         """
         return self.grouper.size()
 
+    def count(self, axis=0):
+        """
+        Number of non-null items in each group.
+        axis : axis number, default 0
+               the grouping axis
+        """
+        self._set_selection_from_grouper()
+        return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
+
     sum = _groupby_function('sum', 'add', np.sum)
     prod = _groupby_function('prod', 'prod', np.prod)
     min = _groupby_function('min', 'min', np.min, numeric_only=False)
@@ -685,14 +731,14 @@ def size(self):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
+
     def ohlc(self):
         """
         Compute sum of values, excluding missing values
-
         For multiple groupings, the result index will be a MultiIndex
-
         """
-        return self._cython_agg_general('ohlc')
+        return self._apply_to_column_groupbys(
+            lambda x: x._cython_agg_general('ohlc'))
 
     def nth(self, n, dropna=None):
         """
@@ -888,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs):
                 cumcounts[v] = arr[len(v)-1::-1]
         return cumcounts
 
-    @cache_readonly
-    def _selected_obj(self):
-        if self._selection is None or isinstance(self.obj, Series):
-            return self.obj
-        else:
-            return self.obj[self._selection]
-
     def _index_with_as_index(self, b):
         """
         Take boolean mask of index to be returned from apply, if as_index=True
@@ -990,12 +1029,23 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
                 result = result.reindex(ax)
             else:
                 result = result.reindex_axis(ax, axis=self.axis)
-        elif self.group_keys and self.as_index:
-            group_keys = keys
-            group_levels = self.grouper.levels
-            group_names = self.grouper.names
-            result = concat(values, axis=self.axis, keys=group_keys,
-                            levels=group_levels, names=group_names)
+
+        elif self.group_keys:
+
+            if self.as_index:
+
+                # possible MI return case
+                group_keys = keys
+                group_levels = self.grouper.levels
+                group_names = self.grouper.names
+                result = concat(values, axis=self.axis, keys=group_keys,
+                                levels=group_levels, names=group_names)
+            else:
+
+                # GH5610, returns a MI, with the first level being a
+                # range index
+                keys = list(range(len(values)))
+                result = concat(values, axis=self.axis, keys=keys)
         else:
             result = concat(values, axis=self.axis)
 
@@ -2187,6 +2237,9 @@ def true_and_notnull(x, *args, **kwargs):
         filtered = self._apply_filter(indices, dropna)
         return filtered
 
+    def _apply_to_column_groupbys(self, func):
+        """ return a pass thru """
+        return func(self)
 
 class NDFrameGroupBy(GroupBy):
 
@@ -2486,6 +2539,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         elif hasattr(self.grouper, 'groupings'):
             if len(self.grouper.groupings) > 1:
                 key_index = MultiIndex.from_tuples(keys, names=key_names)
+
             else:
                 ping = self.grouper.groupings[0]
                 if len(keys) == ping.ngroups:
@@ -2498,8 +2552,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                     # reorder the values
                     values = [values[i] for i in indexer]
                 else:
+
                     key_index = Index(keys, name=key_names[0])
 
+                # don't use the key indexer
+                if not self.as_index:
+                    key_index = None
+
             # make Nones an empty object
             if com._count_not_none(*values) != len(values):
                 v = next(v for v in values if v is not None)
@@ -2569,7 +2628,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                         # normally use vstack as its faster than concat
                         # and if we have mi-columns
-                        if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                        if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None:
                             stacked_values = np.vstack([np.asarray(x) for x in values])
                             result = DataFrame(stacked_values,index=key_index,columns=index)
                         else:
@@ -2889,16 +2948,6 @@ def _apply_to_column_groupbys(self, func):
              in self._iterate_column_groupbys()),
             keys=self._selected_obj.columns, axis=1)
 
-    def ohlc(self):
-        """
-        Compute sum of values, excluding missing values
-
-        For multiple groupings, the result index will be a MultiIndex
-        """
-        return self._apply_to_column_groupbys(
-            lambda x: x._cython_agg_general('ohlc'))
-
-
 from pandas.tools.plotting import boxplot_frame_groupby
 DataFrameGroupBy.boxplot = boxplot_frame_groupby
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -370,12 +370,12 @@ def __array__(self, result=None):
         """ the array interface, return my values """
         return self.values
 
-    def __array_wrap__(self, result):
+    def __array_wrap__(self, result, copy=False):
         """
         Gets called prior to a ufunc (and after)
         """
         return self._constructor(result, index=self.index,
-                                 copy=False).__finalize__(self)
+                                 copy=copy).__finalize__(self)
 
     def __contains__(self, key):
         return key in self.index
@@ -959,19 +959,6 @@ def iteritems(self):
     if compat.PY3:  # pragma: no cover
         items = iteritems
 
-    # inversion
-    def __neg__(self):
-        values = self.values
-        if values.dtype == np.bool_:
-            arr = operator.inv(values)
-        else:
-            arr = operator.neg(values)
-        return self._constructor(arr, self.index).__finalize__(self)
-
-    def __invert__(self):
-        arr = operator.inv(self.values)
-        return self._constructor(arr, self.index).__finalize__(self)
-
     #----------------------------------------------------------------------
     # unbox reductions
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py