Skip to content

Commit d3be81a

Browse files
kernc authored and jreback committed
BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack (#16616)
1 parent 66f4cc1 commit d3be81a

File tree

5 files changed

+159
-37
lines changed

5 files changed

+159
-37
lines changed

doc/source/whatsnew/v0.21.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,7 @@ Sparse
607607

608608
- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
609609
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)
610-
610+
- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)
611611

612612
Reshaping
613613
^^^^^^^^^

pandas/core/categorical.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,16 @@ def f(self, other):
125125
return f
126126

127127

128-
def maybe_to_categorical(array):
129-
""" coerce to a categorical if a series is given """
128+
def _maybe_to_categorical(array):
    """
    Unwrap or coerce `array` for storage in a categorical block.

    A Series or CategoricalIndex is replaced by its ``_values``; a NumPy
    ndarray is wrapped in a new ``Categorical``; anything else is handed
    back untouched.

    Internal use ONLY.
    """
    has_backing_values = isinstance(array, (ABCSeries, ABCCategoricalIndex))
    if has_backing_values:
        return array._values

    if isinstance(array, np.ndarray):
        return Categorical(array)

    return array
133139

134140

pandas/core/internals.py

+95-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656

5757
from pandas.core.index import Index, MultiIndex, _ensure_index
5858
from pandas.core.indexing import maybe_convert_indices, length_of_indexer
59-
from pandas.core.categorical import Categorical, maybe_to_categorical
59+
from pandas.core.categorical import Categorical, _maybe_to_categorical
6060
from pandas.core.indexes.datetimes import DatetimeIndex
6161
from pandas.io.formats.printing import pprint_thing
6262

@@ -1484,6 +1484,35 @@ def equals(self, other):
14841484
return False
14851485
return array_equivalent(self.values, other.values)
14861486

1487+
def _unstack(self, unstacker_func, new_columns):
    """Unstack the values of this block into a single new block.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager; used to look up where
        this block's unstacked columns are placed.

    Returns
    -------
    blocks : list of Block
        One-element list holding the block of unstacked values.
    mask : array_like of bool
        Per unstacked column, whether the unstacker's mask has any entry
        set; only those columns are kept in `blocks`.
    """
    unstacker = unstacker_func(self.values.T)
    positions = new_columns.get_indexer(unstacker.get_new_columns())
    unstacked, filled = unstacker.get_new_values()

    # Reduce the 2-D mask to one flag per unstacked column
    keep = filled.any(0)

    # Values come back column-oriented, so transpose before selecting
    kept_block = make_block(unstacked.T[keep], placement=positions[keep])
    return [kept_block], keep
1515+
14871516
def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
14881517
"""
14891518
compute the quantiles of the
@@ -1712,6 +1741,38 @@ def _slice(self, slicer):
17121741
def _try_cast_result(self, result, dtype=None):
17131742
return result
17141743

1744+
def _unstack(self, unstacker_func, new_columns):
    """Unstack the values of this block into one new block per column.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager; used to look up where
        each unstacked column is placed.

    Returns
    -------
    blocks : list of Block
        Blocks of the same class as this one, one for every kept column.
    mask : array_like of bool
        Per unstacked column, whether the unstacker's mask has any entry
        set; only those columns produce a block.
    """
    unstacker = unstacker_func(self.values.T)
    positions = new_columns.get_indexer(unstacker.get_new_columns())
    unstacked, filled = unstacker.get_new_values()

    # Reduce the 2-D mask to one flag per unstacked column
    keep = filled.any(0)
    kept_values = unstacked.T[keep]
    kept_positions = positions[keep]

    # NonConsolidatable blocks can have a single item only, so every kept
    # column gets a block of its own
    blocks = []
    for column_values, position in zip(kept_values, kept_positions):
        blocks.append(self.make_block_same_class(column_values, [position]))
    return blocks, keep
1775+
17151776

17161777
class NumericBlock(Block):
17171778
__slots__ = ()
@@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
22272288
def __init__(self, values, placement, fastpath=False, **kwargs):
22282289

22292290
# coerce to categorical if we can
2230-
super(CategoricalBlock, self).__init__(maybe_to_categorical(values),
2291+
super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
22312292
fastpath=True,
22322293
placement=placement, **kwargs)
22332294

@@ -4192,6 +4253,38 @@ def canonicalize(block):
41924253
return all(block.equals(oblock)
41934254
for block, oblock in zip(self_blocks, other_blocks))
41944255

4256+
def unstack(self, unstacker_func):
    """Build a new BlockManager by unstacking every block of this one.

    Parameters
    ----------
    unstacker_func : callable
        A (partially-applied) ``pandas.core.reshape.reshape._Unstacker``
        class; it must still accept the values and ``value_columns``.

    Returns
    -------
    unstacked : BlockManager
    """
    # An unstacker over empty data is only used to derive the new axes
    axes_probe = unstacker_func(np.empty((0, 0)), value_columns=self.items)
    new_columns = axes_probe.get_new_columns()
    new_index = axes_probe.get_new_index()

    new_blocks = []
    columns_mask = []
    for blk in self.blocks:
        # Each block is unstacked against just the items it holds
        blk_items = self.items[blk.mgr_locs.indexer]
        blk_unstacker = partial(unstacker_func, value_columns=blk_items)
        unstacked_blocks, blk_mask = blk._unstack(blk_unstacker, new_columns)

        new_blocks.extend(unstacked_blocks)
        columns_mask.extend(blk_mask)

    # NOTE(review): the per-block masks are concatenated in block iteration
    # order, while `new_columns` is indexed positionally -- confirm the two
    # orders agree when a block's items are not contiguous.
    kept_columns = new_columns[columns_mask]

    return BlockManager(new_blocks, [kept_columns, new_index])
4287+
41954288

41964289
class SingleBlockManager(BlockManager):
41974290
""" manage a single block with """

pandas/core/reshape/reshape.py

+17-32
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# pylint: disable=W0703,W0622,W0613,W0201
33
from pandas.compat import range, text_type, zip
44
from pandas import compat
5+
from functools import partial
56
import itertools
67
import re
78

@@ -10,7 +11,7 @@
1011
from pandas.core.dtypes.common import (
1112
_ensure_platform_int,
1213
is_list_like, is_bool_dtype,
13-
needs_i8_conversion)
14+
needs_i8_conversion, is_sparse)
1415
from pandas.core.dtypes.cast import maybe_promote
1516
from pandas.core.dtypes.missing import notna
1617
import pandas.core.dtypes.concat as _concat
@@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
7576
fill_value=None):
7677

7778
self.is_categorical = None
79+
self.is_sparse = is_sparse(values)
7880
if values.ndim == 1:
7981
if isinstance(values, Categorical):
8082
self.is_categorical = values
8183
values = np.array(values)
84+
elif self.is_sparse:
85+
# XXX: Makes SparseArray *dense*, but it's supposedly
86+
# a single column at a time, so it's "doable"
87+
values = values.values
8288
values = values[:, np.newaxis]
8389
self.values = values
8490
self.value_columns = value_columns
@@ -177,7 +183,8 @@ def get_result(self):
177183
ordered=ordered)
178184
for i in range(values.shape[-1])]
179185

180-
return DataFrame(values, index=index, columns=columns)
186+
klass = SparseDataFrame if self.is_sparse else DataFrame
187+
return klass(values, index=index, columns=columns)
181188

182189
def get_new_values(self):
183190
values = self.values
@@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):
469476

470477

471478
def _unstack_frame(obj, level, fill_value=None):
472-
from pandas.core.internals import BlockManager, make_block
473-
474479
if obj._is_mixed_type:
475-
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
476-
obj.index, level=level,
477-
value_columns=obj.columns)
478-
new_columns = unstacker.get_new_columns()
479-
new_index = unstacker.get_new_index()
480-
new_axes = [new_columns, new_index]
481-
482-
new_blocks = []
483-
mask_blocks = []
484-
for blk in obj._data.blocks:
485-
blk_items = obj._data.items[blk.mgr_locs.indexer]
486-
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
487-
value_columns=blk_items,
488-
fill_value=fill_value)
489-
new_items = bunstacker.get_new_columns()
490-
new_placement = new_columns.get_indexer(new_items)
491-
new_values, mask = bunstacker.get_new_values()
492-
493-
mblk = make_block(mask.T, placement=new_placement)
494-
mask_blocks.append(mblk)
495-
496-
newb = make_block(new_values.T, placement=new_placement)
497-
new_blocks.append(newb)
498-
499-
result = DataFrame(BlockManager(new_blocks, new_axes))
500-
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
501-
return result.loc[:, mask_frame.sum(0) > 0]
480+
unstacker = partial(_Unstacker, index=obj.index,
481+
level=level, fill_value=fill_value)
482+
blocks = obj._data.unstack(unstacker)
483+
klass = type(obj)
484+
return klass(blocks)
502485
else:
503486
unstacker = _Unstacker(obj.values, obj.index, level=level,
504487
value_columns=obj.columns,
@@ -559,7 +542,9 @@ def factorize(index):
559542
mask = notna(new_values)
560543
new_values = new_values[mask]
561544
new_index = new_index[mask]
562-
return Series(new_values, index=new_index)
545+
546+
klass = type(frame)._constructor_sliced
547+
return klass(new_values, index=new_index)
563548

564549

565550
def stack_multiple(frame, level, dropna=True):

pandas/tests/sparse/test_reshape.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import pytest
2+
import numpy as np
3+
4+
import pandas as pd
5+
import pandas.util.testing as tm
6+
7+
8+
@pytest.fixture
def sparse_df():
    """Sparse frame with a single 1 on each diagonal position (eye-like)."""
    diagonal_entries = {0: {0: 1}, 1: {1: 1}, 2: {2: 1}}
    return pd.SparseDataFrame(diagonal_entries)
11+
12+
13+
@pytest.fixture
def multi_index3():
    """MultiIndex built from the tuples (0, 0), (1, 1) and (2, 2)."""
    diagonal_pairs = [(0, 0), (1, 1), (2, 2)]
    return pd.MultiIndex.from_tuples(diagonal_pairs)
16+
17+
18+
def test_sparse_frame_stack(sparse_df, multi_index3):
    """Stacking the sparse frame must give a SparseSeries of three ones."""
    expected = pd.SparseSeries(np.ones(3), index=multi_index3)
    result = sparse_df.stack()
    tm.assert_sp_series_equal(result, expected)
22+
23+
24+
def test_sparse_frame_unstack(sparse_df):
    """Unstacked sparse frame values must equal the dense frame's values."""
    mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
    sparse_df.index = mi

    # Dense counterpart of the fixture: ones on the diagonal, NaN elsewhere
    nan = np.nan
    dense_values = np.array([[1, nan, nan],
                             [nan, 1, nan],
                             [nan, nan, 1]])
    dense_result = pd.DataFrame(dense_values, index=mi).unstack()
    sparse_result = sparse_df.unstack()

    tm.assert_numpy_array_equal(dense_result.values, sparse_result.values)
34+
35+
36+
def test_sparse_series_unstack(sparse_df, multi_index3):
    """Unstacking a SparseSeries of ones must reproduce the sparse frame."""
    series = pd.SparseSeries(np.ones(3), index=multi_index3)
    tm.assert_sp_frame_equal(series.unstack(), sparse_df)

0 commit comments

Comments
 (0)