BUG: make groupby play nice with sparse objects, modify SparseSeries.take to return SparseSeries, dictification tests, GH #666

wesm · wesm · commit 2e61d97ef633 · 2012-01-23T16:28:52.000-05:00
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -800,7 +800,7 @@ def load(path):
 
     Parameters
     ----------
-p    path : string
+    path : string
         File path
 
     Returns
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -161,6 +161,9 @@ def __getattr__(self, attr):
         raise AttributeError("'%s' object has no attribute '%s'" %
                              (type(self).__name__, attr))
 
+    def __getitem__(self, key):
+        raise NotImplementedError
+
     def _make_wrapper(self, name):
         f = getattr(self.obj, name)
         if not isinstance(f, types.MethodType):
@@ -294,7 +297,13 @@ def mean(self):
 
         For multiple groupings, the result index will be a MultiIndex
         """
-        return self._cython_agg_general('mean')
+        try:
+            return self._cython_agg_general('mean')
+        except GroupByError:
+            raise
+        except Exception:
+            f = lambda x: x.mean(axis=self.axis)
+            return self._python_agg_general(f)
 
     def std(self):
         """
@@ -1256,7 +1265,6 @@ def _aggregate_generic(self, func, *args, **kwargs):
 
         return result
 
-
 class NDArrayGroupBy(GroupBy):
     pass
 
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -263,14 +263,15 @@ def _get_val_at(self, loc):
         else:
             return _gin.get_value_at(self, sp_loc)
 
-    def take(self, indices):
+    def take(self, indices, axis=0):
         """
         Sparse-compatible version of ndarray.take
 
         Returns
         -------
         taken : ndarray
         """
+        assert(axis == 0)
         indices = np.asarray(indices, dtype=int)
 
         n = len(self)
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -543,6 +543,30 @@ def _rename_columns_inplace(self, mapper):
         self.columns = new_columns
         self._series = new_series
 
+    def take(self, indices, axis=0):
+        """
+        Analogous to ndarray.take, return SparseDataFrame corresponding to
+        requested indices along an axis
+
+        Parameters
+        ----------
+        indices : list / array of ints
+        axis : {0, 1}
+
+        Returns
+        -------
+        taken : SparseDataFrame
+        """
+        new_values = self.values.take(indices, axis=axis)
+        if axis == 0:
+            new_columns = self.columns
+            new_index = self.index.take(indices)
+        else:
+            new_columns = self.columns.take(indices)
+            new_index = self.index
+        return self._constructor(new_values, index=new_index,
+                                 columns=new_columns)
+
     def add_prefix(self, prefix):
         f = (('%s' % prefix) + '%s').__mod__
         return self.rename(columns=f)
diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py
@@ -409,6 +409,18 @@ def sparse_reindex(self, new_index):
                             sparse_index=new_index,
                             fill_value=self.fill_value)
 
+    def take(self, indices, axis=0):
+        """
+        Sparse-compatible version of ndarray.take
+
+        Returns
+        -------
+        taken : SparseSeries
+        """
+        new_values = SparseArray.take(self, indices)
+        new_index = self.index.take(indices)
+        return self._constructor(new_values, index=new_index)
+
     def cumsum(self, axis=0, dtype=None, out=None):
         """
         Cumulative sum of values. Preserves locations of NaN values
diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py
@@ -352,7 +352,8 @@ def _compare_with_dense(sp):
             def _compare(idx):
                 dense_result = dense.take(idx).values
                 sparse_result = sp.take(idx)
-                assert_almost_equal(dense_result, sparse_result)
+                self.assert_(isinstance(sparse_result, SparseSeries))
+                assert_almost_equal(dense_result, sparse_result.values)
 
             _compare([1., 2., 3., 4., 5., 0.])
             _compare([7, 2, 9, 0, 4])
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1285,14 +1285,35 @@ def test_groupby_list_infer_array_like(self):
         result = df.groupby(['foo', 'bar']).mean()
         expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
 
-
-class TestPanelGroupBy(unittest.TestCase):
-
-    def setUp(self):
+    def test_dictify(self):
+        dict(iter(self.df.groupby('A')))
+        dict(iter(self.df.groupby(['A', 'B'])))
+        dict(iter(self.df['C'].groupby(self.df['A'])))
+        dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']])))
+        dict(iter(self.df.groupby('A')['C']))
+        dict(iter(self.df.groupby(['A', 'B'])['C']))
+
+    def test_sparse_friendly(self):
+        sdf = self.df[['C', 'D']].to_sparse()
+        panel = tm.makePanel()
+        tm.add_nans(panel)
+
+        def _check_work(gp):
+            gp.mean()
+            gp.agg(np.mean)
+            dict(iter(gp))
+
+        # it works!
+        _check_work(sdf.groupby(lambda x: x // 2))
+        _check_work(sdf['C'].groupby(lambda x: x // 2))
+        _check_work(sdf.groupby(self.df['A']))
+
+        # do this someday
+        # _check_work(panel.groupby(lambda x: x.month, axis=1))
+
+    def test_panel_groupby(self):
         self.panel = tm.makePanel()
         tm.add_nans(self.panel)
-
-    def test_groupby(self):
         grouped = self.panel.groupby({'ItemA' : 0, 'ItemB' : 0, 'ItemC' : 1},
                                      axis='items')
         agged = grouped.agg(np.mean)