Skip to content

BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack #16616

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Sep 26, 2017
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ Sparse

- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)

- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)

Reshaping
^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array._values
elif isinstance(array, np.ndarray):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is this hit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in NonConsolidatingBlock._unstack() calling .make_block_same_class(), making a CategoricalBlock from unstacked values (an ndarray). Otherwise fails frame.test_reshape.TestDataFrameReshape.test_unstack_preserve_dtypes.
Since the function is named maybe_to_categorical and accepts argument array, the change seems like making perfect sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment to the doc-string that this is only an internal method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May I prefix it with an underscore? It's used in a single place only.

return Categorical(array)
return array


Expand Down
93 changes: 93 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,6 +1484,35 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self

Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.

Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask

def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
"""
compute the quantiles of the
Expand Down Expand Up @@ -1712,6 +1741,38 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self

Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.

Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
# NonConsolidatable blocks can have a single item only, so we return
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add doc-string

# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask


class NumericBlock(Block):
__slots__ = ()
Expand Down Expand Up @@ -4167,6 +4228,38 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
"""Return a blockmanager with all blocks unstacked.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change unstacker_t as above


Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.

Returns
-------
unstacked : BlockManager
"""
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
new_blocks = []
columns_mask = []

for blk in self.blocks:
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)

new_blocks.extend(blocks)
columns_mask.extend(mask)

new_columns = new_columns[columns_mask]

bm = BlockManager(new_blocks, [new_columns, new_index])
return bm


class SingleBlockManager(BlockManager):
""" manage a single block with """
Expand Down
49 changes: 17 additions & 32 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas import compat
from functools import partial
import itertools
import re

Expand All @@ -10,7 +11,7 @@
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_list_like, is_bool_dtype,
needs_i8_conversion)
needs_i8_conversion, is_sparse)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.missing import notna
import pandas.core.dtypes.concat as _concat
Expand Down Expand Up @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a TODO? or a comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment with a mild TODO hint at whomever will be refactoring the whole thing eventually to take the note into consideration.

# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
Expand Down Expand Up @@ -177,7 +183,8 @@ def get_result(self):
ordered=ordered)
for i in range(values.shape[-1])]

return DataFrame(values, index=index, columns=columns)
klass = SparseDataFrame if self.is_sparse else DataFrame
return klass(values, index=index, columns=columns)

def get_new_values(self):
values = self.values
Expand Down Expand Up @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block

if obj._is_mixed_type:
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
obj.index, level=level,
value_columns=obj.columns)
new_columns = unstacker.get_new_columns()
new_index = unstacker.get_new_index()
new_axes = [new_columns, new_index]

new_blocks = []
mask_blocks = []
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()

mblk = make_block(mask.T, placement=new_placement)
mask_blocks.append(mblk)

newb = make_block(new_values.T, placement=new_placement)
new_blocks.append(newb)

result = DataFrame(BlockManager(new_blocks, new_axes))
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
return result.loc[:, mask_frame.sum(0) > 0]
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
klass = type(obj)
return klass(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
Expand Down Expand Up @@ -559,7 +542,9 @@ def factorize(index):
mask = notna(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)

klass = type(frame)._constructor_sliced
return klass(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_reshape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture
def sparse_df():
return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}}) # eye


@pytest.fixture
def multi_index3():
return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])


def test_sparse_frame_stack(sparse_df, multi_index3):
ss = sparse_df.stack()
expected = pd.SparseSeries(np.ones(3), index=multi_index3)
tm.assert_sp_series_equal(ss, expected)


def test_sparse_frame_unstack(sparse_df):
mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
sparse_df.index = mi
arr = np.array([[1, np.nan, np.nan],
[np.nan, 1, np.nan],
[np.nan, np.nan, 1]])
unstacked_df = pd.DataFrame(arr, index=mi).unstack()
unstacked_sdf = sparse_df.unstack()

tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values)


def test_sparse_series_unstack(sparse_df, multi_index3):
frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack()
tm.assert_sp_frame_equal(frame, sparse_df)