-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack #16616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
d4618d3
9ae1c10
f621ea6
369e315
a27c047
0acb6f3
8aad74c
cef5f47
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1484,6 +1484,35 @@ def equals(self, other): | |
return False | ||
return array_equivalent(self.values, other.values) | ||
|
||
def _unstack(self, unstacker_func, new_columns):
    """Unstack the values of this block into one new block.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called here with the
        transposed values of this block.
    new_columns : Index
        All columns of the unstacked BlockManager.

    Returns
    -------
    blocks : list of Block
        Single-element list with the block of unstacked values.
    mask : array_like of bool
        The mask of columns of `blocks` we should keep.
    """
    helper = unstacker_func(self.values.T)
    unstacked_items = helper.get_new_columns()
    # Position of every unstacked item within the full set of new columns.
    locs = new_columns.get_indexer(unstacked_items)
    unstacked, keep = helper.get_new_values()

    # Collapse the mask along axis 0; columns without a single True
    # entry are dropped from both the values and their placement.
    keep = keep.any(0)
    block = make_block(unstacked.T[keep], placement=locs[keep])
    return [block], keep
|
||
def quantile(self, qs, interpolation='linear', axis=0, mgr=None): | ||
""" | ||
compute the quantiles of the | ||
|
@@ -1712,6 +1741,38 @@ def _slice(self, slicer): | |
def _try_cast_result(self, result, dtype=None): | ||
return result | ||
|
||
def _unstack(self, unstacker_func, new_columns): | ||
"""Return a list of unstacked blocks of self | ||
|
||
Parameters | ||
---------- | ||
unstacker_func : callable | ||
Partially applied unstacker. | ||
new_columns : Index | ||
All columns of the unstacked BlockManager. | ||
|
||
Returns | ||
------- | ||
blocks : list of Block | ||
New blocks of unstacked values. | ||
mask : array_like of bool | ||
The mask of columns of `blocks` we should keep. | ||
""" | ||
# NonConsolidatable blocks can have a single item only, so we return | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add doc-string |
||
# one block per item | ||
unstacker = unstacker_func(self.values.T) | ||
new_items = unstacker.get_new_columns() | ||
new_placement = new_columns.get_indexer(new_items) | ||
new_values, mask = unstacker.get_new_values() | ||
|
||
mask = mask.any(0) | ||
new_values = new_values.T[mask] | ||
new_placement = new_placement[mask] | ||
|
||
blocks = [self.make_block_same_class(vals, [place]) | ||
for vals, place in zip(new_values, new_placement)] | ||
return blocks, mask | ||
|
||
|
||
class NumericBlock(Block): | ||
__slots__ = () | ||
|
@@ -4167,6 +4228,38 @@ def canonicalize(block): | |
return all(block.equals(oblock) | ||
for block, oblock in zip(self_blocks, other_blocks)) | ||
|
||
def unstack(self, unstacker_func): | ||
"""Return a blockmanager with all blocks unstacked. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. change unstacker_t as above |
||
|
||
Parameters | ||
---------- | ||
unstacker_func : callable | ||
A (partially-applied) ``pd.core.reshape._Unstacker`` class. | ||
|
||
Returns | ||
------- | ||
unstacked : BlockManager | ||
""" | ||
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) | ||
new_columns = dummy.get_new_columns() | ||
new_index = dummy.get_new_index() | ||
new_blocks = [] | ||
columns_mask = [] | ||
|
||
for blk in self.blocks: | ||
blocks, mask = blk._unstack( | ||
partial(unstacker_func, | ||
value_columns=self.items[blk.mgr_locs.indexer]), | ||
new_columns) | ||
|
||
new_blocks.extend(blocks) | ||
columns_mask.extend(mask) | ||
|
||
new_columns = new_columns[columns_mask] | ||
|
||
bm = BlockManager(new_blocks, [new_columns, new_index]) | ||
return bm | ||
|
||
|
||
class SingleBlockManager(BlockManager): | ||
""" manage a single block with """ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
# pylint: disable=W0703,W0622,W0613,W0201 | ||
from pandas.compat import range, text_type, zip | ||
from pandas import compat | ||
from functools import partial | ||
import itertools | ||
import re | ||
|
||
|
@@ -10,7 +11,7 @@ | |
from pandas.core.dtypes.common import ( | ||
_ensure_platform_int, | ||
is_list_like, is_bool_dtype, | ||
needs_i8_conversion) | ||
needs_i8_conversion, is_sparse) | ||
from pandas.core.dtypes.cast import maybe_promote | ||
from pandas.core.dtypes.missing import notna | ||
import pandas.core.dtypes.concat as _concat | ||
|
@@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None, | |
fill_value=None): | ||
|
||
self.is_categorical = None | ||
self.is_sparse = is_sparse(values) | ||
if values.ndim == 1: | ||
if isinstance(values, Categorical): | ||
self.is_categorical = values | ||
values = np.array(values) | ||
elif self.is_sparse: | ||
# XXX: Makes SparseArray *dense*, but it's supposedly | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is this a TODO? or a comment? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A comment with a mild TODO hint, so that whoever eventually refactors the whole thing takes the note into consideration. |
||
# a single column at a time, so it's "doable" | ||
values = values.values | ||
values = values[:, np.newaxis] | ||
self.values = values | ||
self.value_columns = value_columns | ||
|
@@ -177,7 +183,8 @@ def get_result(self): | |
ordered=ordered) | ||
for i in range(values.shape[-1])] | ||
|
||
return DataFrame(values, index=index, columns=columns) | ||
klass = SparseDataFrame if self.is_sparse else DataFrame | ||
return klass(values, index=index, columns=columns) | ||
|
||
def get_new_values(self): | ||
values = self.values | ||
|
@@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None): | |
|
||
|
||
def _unstack_frame(obj, level, fill_value=None): | ||
from pandas.core.internals import BlockManager, make_block | ||
|
||
if obj._is_mixed_type: | ||
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy | ||
obj.index, level=level, | ||
value_columns=obj.columns) | ||
new_columns = unstacker.get_new_columns() | ||
new_index = unstacker.get_new_index() | ||
new_axes = [new_columns, new_index] | ||
|
||
new_blocks = [] | ||
mask_blocks = [] | ||
for blk in obj._data.blocks: | ||
blk_items = obj._data.items[blk.mgr_locs.indexer] | ||
bunstacker = _Unstacker(blk.values.T, obj.index, level=level, | ||
value_columns=blk_items, | ||
fill_value=fill_value) | ||
new_items = bunstacker.get_new_columns() | ||
new_placement = new_columns.get_indexer(new_items) | ||
new_values, mask = bunstacker.get_new_values() | ||
|
||
mblk = make_block(mask.T, placement=new_placement) | ||
mask_blocks.append(mblk) | ||
|
||
newb = make_block(new_values.T, placement=new_placement) | ||
new_blocks.append(newb) | ||
|
||
result = DataFrame(BlockManager(new_blocks, new_axes)) | ||
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) | ||
return result.loc[:, mask_frame.sum(0) > 0] | ||
unstacker = partial(_Unstacker, index=obj.index, | ||
level=level, fill_value=fill_value) | ||
blocks = obj._data.unstack(unstacker) | ||
klass = type(obj) | ||
return klass(blocks) | ||
else: | ||
unstacker = _Unstacker(obj.values, obj.index, level=level, | ||
value_columns=obj.columns, | ||
|
@@ -559,7 +542,9 @@ def factorize(index): | |
mask = notna(new_values) | ||
new_values = new_values[mask] | ||
new_index = new_index[mask] | ||
return Series(new_values, index=new_index) | ||
|
||
klass = type(frame)._constructor_sliced | ||
return klass(new_values, index=new_index) | ||
|
||
|
||
def stack_multiple(frame, level, dropna=True): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import pytest | ||
import numpy as np | ||
|
||
import pandas as pd | ||
import pandas.util.testing as tm | ||
|
||
|
||
@pytest.fixture
def sparse_df():
    """Return a 3x3 SparseDataFrame with ones on the diagonal ("eye").

    Only the diagonal entries are present in the input mapping.
    """
    diagonal = {i: {i: 1} for i in range(3)}
    return pd.SparseDataFrame(diagonal)
|
||
|
||
@pytest.fixture
def multi_index3():
    """Return a two-level MultiIndex of the pairs (0, 0), (1, 1), (2, 2)."""
    pairs = [(i, i) for i in range(3)]
    return pd.MultiIndex.from_tuples(pairs)
|
||
|
||
def test_sparse_frame_stack(sparse_df, multi_index3):
    """Stacking the sparse frame must give a SparseSeries of three ones."""
    expected = pd.SparseSeries(np.ones(3), index=multi_index3)
    result = sparse_df.stack()
    tm.assert_sp_series_equal(result, expected)
|
||
|
||
def test_sparse_frame_unstack(sparse_df):
    """Unstacking a sparse frame must match unstacking its dense twin."""
    index = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
    sparse_df.index = index

    # Dense reference data: ones on the diagonal, NaN everywhere else.
    dense = np.full((3, 3), np.nan)
    np.fill_diagonal(dense, 1)

    from_dense = pd.DataFrame(dense, index=index).unstack()
    from_sparse = sparse_df.unstack()

    tm.assert_numpy_array_equal(from_dense.values, from_sparse.values)
|
||
|
||
def test_sparse_series_unstack(sparse_df, multi_index3):
    """Unstacking a SparseSeries of ones must rebuild the sparse frame."""
    series = pd.SparseSeries(np.ones(3), index=multi_index3)
    result = series.unstack()
    tm.assert_sp_frame_equal(result, sparse_df)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how is this hit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in `NonConsolidatingBlock._unstack()` calling `.make_block_same_class()`, making a `CategoricalBlock` from unstacked `values` (an `ndarray`). Otherwise `frame.test_reshape.TestDataFrameReshape.test_unstack_preserve_dtypes` fails.
Since the function is named `maybe_to_categorical` and accepts the argument `array`, the change seems to make perfect sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you add a comment to the doc-string that this is only an internal method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
May I prefix it with an underscore? It's used in a single place only.