BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack (#16616)

kernc · jreback · commit d3be81ad595c · 2017-09-26T15:01:39.000-04:00
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -607,7 +607,7 @@ Sparse
 
 - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
 - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)
-
+- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -125,10 +125,16 @@ def f(self, other):
     return f
 
 
-def maybe_to_categorical(array):
-    """ coerce to a categorical if a series is given """
+def _maybe_to_categorical(array):
+    """
+    Coerce to a categorical if a series is given.
+
+    Internal use ONLY.
+    """
     if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
         return array._values
+    elif isinstance(array, np.ndarray):
+        return Categorical(array)
     return array
 
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -56,7 +56,7 @@
 
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import maybe_convert_indices, length_of_indexer
-from pandas.core.categorical import Categorical, maybe_to_categorical
+from pandas.core.categorical import Categorical, _maybe_to_categorical
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.io.formats.printing import pprint_thing
 
@@ -1484,6 +1484,35 @@ def equals(self, other):
             return False
         return array_equivalent(self.values, other.values)
 
+    def _unstack(self, unstacker_func, new_columns):
+        """Return a list of unstacked blocks of self
+
+        Parameters
+        ----------
+        unstacker_func : callable
+            Partially applied unstacker.
+        new_columns : Index
+            All columns of the unstacked BlockManager.
+
+        Returns
+        -------
+        blocks : list of Block
+            New blocks of unstacked values.
+        mask : array_like of bool
+            The mask of columns of `blocks` we should keep.
+        """
+        unstacker = unstacker_func(self.values.T)
+        new_items = unstacker.get_new_columns()
+        new_placement = new_columns.get_indexer(new_items)
+        new_values, mask = unstacker.get_new_values()
+
+        mask = mask.any(0)
+        new_values = new_values.T[mask]
+        new_placement = new_placement[mask]
+
+        blocks = [make_block(new_values, placement=new_placement)]
+        return blocks, mask
+
     def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
         """
         compute the quantiles of the
@@ -1712,6 +1741,38 @@ def _slice(self, slicer):
     def _try_cast_result(self, result, dtype=None):
         return result
 
+    def _unstack(self, unstacker_func, new_columns):
+        """Return a list of unstacked blocks of self
+
+        Parameters
+        ----------
+        unstacker_func : callable
+            Partially applied unstacker.
+        new_columns : Index
+            All columns of the unstacked BlockManager.
+
+        Returns
+        -------
+        blocks : list of Block
+            New blocks of unstacked values.
+        mask : array_like of bool
+            The mask of columns of `blocks` we should keep.
+        """
+        # NonConsolidatable blocks can have a single item only, so we return
+        # one block per item
+        unstacker = unstacker_func(self.values.T)
+        new_items = unstacker.get_new_columns()
+        new_placement = new_columns.get_indexer(new_items)
+        new_values, mask = unstacker.get_new_values()
+
+        mask = mask.any(0)
+        new_values = new_values.T[mask]
+        new_placement = new_placement[mask]
+
+        blocks = [self.make_block_same_class(vals, [place])
+                  for vals, place in zip(new_values, new_placement)]
+        return blocks, mask
+
 
 class NumericBlock(Block):
     __slots__ = ()
@@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
     def __init__(self, values, placement, fastpath=False, **kwargs):
 
         # coerce to categorical if we can
-        super(CategoricalBlock, self).__init__(maybe_to_categorical(values),
+        super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
                                                fastpath=True,
                                                placement=placement, **kwargs)
 
@@ -4192,6 +4253,38 @@ def canonicalize(block):
         return all(block.equals(oblock)
                    for block, oblock in zip(self_blocks, other_blocks))
 
+    def unstack(self, unstacker_func):
+        """Return a blockmanager with all blocks unstacked.
+
+        Parameters
+        ----------
+        unstacker_func : callable
+            A (partially-applied) ``pd.core.reshape._Unstacker`` class.
+
+        Returns
+        -------
+        unstacked : BlockManager
+        """
+        dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
+        new_columns = dummy.get_new_columns()
+        new_index = dummy.get_new_index()
+        new_blocks = []
+        columns_mask = []
+
+        for blk in self.blocks:
+            blocks, mask = blk._unstack(
+                partial(unstacker_func,
+                        value_columns=self.items[blk.mgr_locs.indexer]),
+                new_columns)
+
+            new_blocks.extend(blocks)
+            columns_mask.extend(mask)
+
+        new_columns = new_columns[columns_mask]
+
+        bm = BlockManager(new_blocks, [new_columns, new_index])
+        return bm
+
 
 class SingleBlockManager(BlockManager):
     """ manage a single block with """
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -2,6 +2,7 @@
 # pylint: disable=W0703,W0622,W0613,W0201
 from pandas.compat import range, text_type, zip
 from pandas import compat
+from functools import partial
 import itertools
 import re
 
@@ -10,7 +11,7 @@
 from pandas.core.dtypes.common import (
     _ensure_platform_int,
     is_list_like, is_bool_dtype,
-    needs_i8_conversion)
+    needs_i8_conversion, is_sparse)
 from pandas.core.dtypes.cast import maybe_promote
 from pandas.core.dtypes.missing import notna
 import pandas.core.dtypes.concat as _concat
@@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
                  fill_value=None):
 
         self.is_categorical = None
+        self.is_sparse = is_sparse(values)
         if values.ndim == 1:
             if isinstance(values, Categorical):
                 self.is_categorical = values
                 values = np.array(values)
+            elif self.is_sparse:
+                # XXX: Makes SparseArray *dense*, but it's supposedly
+                # a single column at a time, so it's "doable"
+                values = values.values
             values = values[:, np.newaxis]
         self.values = values
         self.value_columns = value_columns
@@ -177,7 +183,8 @@ def get_result(self):
                                   ordered=ordered)
                       for i in range(values.shape[-1])]
 
-        return DataFrame(values, index=index, columns=columns)
+        klass = SparseDataFrame if self.is_sparse else DataFrame
+        return klass(values, index=index, columns=columns)
 
     def get_new_values(self):
         values = self.values
@@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):
 
 
 def _unstack_frame(obj, level, fill_value=None):
-    from pandas.core.internals import BlockManager, make_block
-
     if obj._is_mixed_type:
-        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
-                               obj.index, level=level,
-                               value_columns=obj.columns)
-        new_columns = unstacker.get_new_columns()
-        new_index = unstacker.get_new_index()
-        new_axes = [new_columns, new_index]
-
-        new_blocks = []
-        mask_blocks = []
-        for blk in obj._data.blocks:
-            blk_items = obj._data.items[blk.mgr_locs.indexer]
-            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
-                                    value_columns=blk_items,
-                                    fill_value=fill_value)
-            new_items = bunstacker.get_new_columns()
-            new_placement = new_columns.get_indexer(new_items)
-            new_values, mask = bunstacker.get_new_values()
-
-            mblk = make_block(mask.T, placement=new_placement)
-            mask_blocks.append(mblk)
-
-            newb = make_block(new_values.T, placement=new_placement)
-            new_blocks.append(newb)
-
-        result = DataFrame(BlockManager(new_blocks, new_axes))
-        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
-        return result.loc[:, mask_frame.sum(0) > 0]
+        unstacker = partial(_Unstacker, index=obj.index,
+                            level=level, fill_value=fill_value)
+        blocks = obj._data.unstack(unstacker)
+        klass = type(obj)
+        return klass(blocks)
     else:
         unstacker = _Unstacker(obj.values, obj.index, level=level,
                                value_columns=obj.columns,
@@ -559,7 +542,9 @@ def factorize(index):
         mask = notna(new_values)
         new_values = new_values[mask]
         new_index = new_index[mask]
-    return Series(new_values, index=new_index)
+
+    klass = type(frame)._constructor_sliced
+    return klass(new_values, index=new_index)
 
 
 def stack_multiple(frame, level, dropna=True):
diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py
@@ -0,0 +1,38 @@
+import pytest
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def sparse_df():
+    return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}})  # eye
+
+
+@pytest.fixture
+def multi_index3():
+    return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
+
+
+def test_sparse_frame_stack(sparse_df, multi_index3):
+    ss = sparse_df.stack()
+    expected = pd.SparseSeries(np.ones(3), index=multi_index3)
+    tm.assert_sp_series_equal(ss, expected)
+
+
+def test_sparse_frame_unstack(sparse_df):
+    mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
+    sparse_df.index = mi
+    arr = np.array([[1, np.nan, np.nan],
+                    [np.nan, 1, np.nan],
+                    [np.nan, np.nan, 1]])
+    unstacked_df = pd.DataFrame(arr, index=mi).unstack()
+    unstacked_sdf = sparse_df.unstack()
+
+    tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values)
+
+
+def test_sparse_series_unstack(sparse_df, multi_index3):
+    frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack()
+    tm.assert_sp_frame_equal(frame, sparse_df)