ENH: infer selection_obj on groupby with an applied method (GH5610)

jreback · jreback · commit 6fa398e3290a · 2014-04-29T14:46:47.000-04:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -179,6 +179,8 @@ API Changes
   validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
 - Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
   ``data`` argument (:issue:`5357`)
+- groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
+  as it is already the index
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -110,12 +110,22 @@ API changes
 
   .. ipython:: python
 
-     DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
      g = df.groupby('A')
      g.nth(0)  # can also use negative ints
 
      g.nth(0, dropna='any')  # similar to old behaviour
 
+  groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
+  as it is already the index
+
+  .. ipython:: python
+
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
+     g = df.groupby('A')
+     g.count()
+     g.describe()
+
 - Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
   by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -445,6 +445,23 @@ def _selection_list(self):
             return [self._selection]
         return self._selection
 
+    @cache_readonly
+    def _selected_obj(self):
+
+        if self._selection is None or isinstance(self.obj, Series):
+            return self.obj
+        else:
+            return self.obj[self._selection]
+
+    def _set_selection_from_grouper(self):
+        """ we may need to create a selection if we have non-level groupers """
+        grp = self.grouper
+        if self._selection is None and getattr(grp,'groupings',None) is not None:
+            ax = self.obj._info_axis
+            groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
+            if len(groupers):
+                self._selection = (ax-Index(groupers)).tolist()
+
     def _local_dir(self):
         return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
 
@@ -453,7 +470,6 @@ def __getattr__(self, attr):
             return object.__getattribute__(self, attr)
         if attr in self.obj:
             return self[attr]
-
         if hasattr(self.obj, attr):
             return self._make_wrapper(attr)
 
@@ -472,6 +488,10 @@ def _make_wrapper(self, name):
                                                      type(self).__name__))
             raise AttributeError(msg)
 
+        # need to set up the selection here, as it is
+        # not passed directly but is inferred from the grouper
+        self._set_selection_from_grouper()
+
         f = getattr(self._selected_obj, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
@@ -503,7 +523,19 @@ def curried(x):
             try:
                 return self.apply(curried_with_axis)
             except Exception:
-                return self.apply(curried)
+                try:
+                    return self.apply(curried)
+                except Exception:
+
+                    # related to : GH3688
+                    # try item-by-item
+                    # this can be called recursively, so we need to raise ValueError
+                    # if we don't have this method, to indicate to the aggregation
+                    # that it should mark this column as an error
+                    try:
+                        return self._aggregate_item_by_item(name, *args, **kwargs)
+                    except (AttributeError):
+                        raise ValueError
 
         return wrapper
 
@@ -624,6 +656,7 @@ def mean(self):
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
+            self._set_selection_from_grouper()
             f = lambda x: x.mean(axis=self.axis)
             return self._python_agg_general(f)
 
@@ -639,6 +672,7 @@ def median(self):
             raise
         except Exception:  # pragma: no cover
 
+            self._set_selection_from_grouper()
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
@@ -655,6 +689,7 @@ def std(self, ddof=1):
         if ddof == 1:
             return self._cython_agg_general('std')
         else:
+            self._set_selection_from_grouper()
             f = lambda x: x.std(ddof=ddof)
             return self._python_agg_general(f)
 
@@ -667,6 +702,7 @@ def var(self, ddof=1):
         if ddof == 1:
             return self._cython_agg_general('var')
         else:
+            self._set_selection_from_grouper()
             f = lambda x: x.var(ddof=ddof)
             return self._python_agg_general(f)
 
@@ -677,12 +713,14 @@ def size(self):
         """
         return self.grouper.size()
 
-    def count(self):
+    def count(self, axis=0):
         """
         Number of non-null items in each group.
-
+        axis : axis number, default 0
+               the grouping axis
         """
-        return self._python_agg_general(lambda x: notnull(x).sum())
+        self._set_selection_from_grouper()
+        return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
 
     sum = _groupby_function('sum', 'add', np.sum)
     prod = _groupby_function('prod', 'prod', np.prod)
@@ -693,12 +731,14 @@ def count(self):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
+
     def ohlc(self):
         """
-        Deprecated, use .resample(how="ohlc") instead.
-
+        Compute open, high, low and close values, excluding missing values
+        For multiple groupings, the result index will be a MultiIndex
         """
-        raise AttributeError('ohlc is deprecated, use resample(how="ohlc").')
+        return self._apply_to_column_groupbys(
+            lambda x: x._cython_agg_general('ohlc'))
 
     def nth(self, n, dropna=None):
         """
@@ -894,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs):
                 cumcounts[v] = arr[len(v)-1::-1]
         return cumcounts
 
-    @cache_readonly
-    def _selected_obj(self):
-        if self._selection is None or isinstance(self.obj, Series):
-            return self.obj
-        else:
-            return self.obj[self._selection]
-
     def _index_with_as_index(self, b):
         """
         Take boolean mask of index to be returned from apply, if as_index=True
@@ -945,7 +978,6 @@ def _cython_agg_general(self, how, numeric_only=True):
                 result, names = self.grouper.aggregate(obj.values, how)
             except AssertionError as e:
                 raise GroupByError(str(e))
-            # infer old dytpe
             output[name] = self._try_cast(result, obj)
 
         if len(output) == 0:
@@ -954,8 +986,6 @@ def _cython_agg_general(self, how, numeric_only=True):
         return self._wrap_aggregated_output(output, names)
 
     def _python_agg_general(self, func, *args, **kwargs):
-        _dtype = kwargs.pop("_dtype", None)
-
         func = _intercept_function(func)
         f = lambda x: func(x, *args, **kwargs)
 
@@ -964,14 +994,7 @@ def _python_agg_general(self, func, *args, **kwargs):
         for name, obj in self._iterate_slices():
             try:
                 result, counts = self.grouper.agg_series(obj, f)
-
-                if _dtype is None:  # infer old dytpe
-                    output[name] = self._try_cast(result, obj)
-                elif _dtype is False:
-                    output[name] = result
-                else:
-                    output[name] = _possibly_downcast_to_dtype(result, _dtype)
-
+                output[name] = self._try_cast(result, obj)
             except TypeError:
                 continue
 
@@ -2203,6 +2226,9 @@ def true_and_notnull(x, *args, **kwargs):
         filtered = self._apply_filter(indices, dropna)
         return filtered
 
+    def _apply_to_column_groupbys(self, func):
+        """ pass-through: call func on this groupby object and return the result """
+        return func(self)
 
 class NDFrameGroupBy(GroupBy):
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1971,16 +1971,53 @@ def test_size(self):
             self.assertEquals(result[key], len(group))
 
     def test_count(self):
-        df = pd.DataFrame([[1, 2], [1, nan], [3, nan]], columns=['A', 'B'])
+
+        # GH5610
+        # count counts non-nulls
+        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], columns=['A', 'B', 'C'])
+
         count_as = df.groupby('A').count()
         count_not_as = df.groupby('A', as_index=False).count()
 
-        res = pd.DataFrame([[1, 1], [3, 0]], columns=['A', 'B'])
-        assert_frame_equal(count_not_as, res)
-        assert_frame_equal(count_as, res.set_index('A'))
+        expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], index=[1,3])
+        expected.index.name='A'
+        assert_frame_equal(count_not_as, expected.reset_index())
+        assert_frame_equal(count_as, expected)
 
         count_B = df.groupby('A')['B'].count()
-        assert_series_equal(count_B, res['B'])
+        assert_series_equal(count_B, expected['B'])
+
+    def test_non_cython_api(self):
+
+        # GH5610
+        # non-cython calls should not include the grouper
+
+        df = DataFrame([[1, 2, 'foo'], [1, nan, 'bar',], [3, nan, 'baz']], columns=['A', 'B','C'])
+        g = df.groupby('A')
+
+        # mad
+        expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3])
+        expected.index.name = 'A'
+        result = g.mad()
+        assert_frame_equal(result,expected)
+
+        # describe
+        expected = DataFrame(dict(B = concat([df.loc[[0,1],'B'].describe(),df.loc[[2],'B'].describe()],keys=[1,3])))
+        expected.index.names = ['A',None]
+        result = g.describe()
+        assert_frame_equal(result,expected)
+
+        # any
+        expected = DataFrame([[True, True],[False, True]],columns=['B','C'],index=[1,3])
+        expected.index.name = 'A'
+        result = g.any()
+        assert_frame_equal(result,expected)
+
+        # idxmax
+        expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3])
+        expected.index.name = 'A'
+        result = g.idxmax()
+        assert_frame_equal(result,expected)
 
     def test_grouping_ndarray(self):
         grouped = self.df.groupby(self.df['A'].values)
@@ -2937,7 +2974,7 @@ def test_groupby_with_timegrouper(self):
                 DT.datetime(2013,12,2,12,0),
                 DT.datetime(2013,9,2,14,0),
                 ]})
-        
+
         # GH 6908 change target column's order
         df_reordered = df_original.sort(columns='Quantity')
 
@@ -3949,8 +3986,14 @@ def test_frame_groupby_plot_boxplot(self):
         self.assertEqual(len(res), 2)
         tm.close()
 
+        # now works with GH 5610 as gender is excluded
+        res = df.groupby('gender').hist()
+        tm.close()
+
+        df2 = df.copy()
+        df2['gender2'] = df['gender']
         with tm.assertRaisesRegexp(TypeError, '.*str.+float'):
-            gb.hist()
+            df2.groupby('gender').hist()
 
     @slow
     def test_frame_groupby_hist(self):
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -1126,9 +1126,9 @@ def test_evenly_divisible_with_no_extra_bins(self):
         expected = DataFrame(
             [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
               'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
-            index=index).unstack().swaplevel(1,0).sortlevel()
+            index=index)
         result = df.resample('7D', how='count')
-        assert_series_equal(result,expected)
+        assert_frame_equal(result,expected)
 
         expected = DataFrame(
             [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,