Skip to content

Commit 737cb7d

Browse files
TomAugspurgerJustinZhengBC
authored andcommitted
ENH: Support EAs in Series.unstack (pandas-dev#23284)
1 parent 31eee47 commit 737cb7d

File tree

11 files changed

+248
-51
lines changed

11 files changed

+248
-51
lines changed

asv_bench/benchmarks/reshape.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,33 @@ def time_unstack(self):
4949

5050
class Unstack(object):
5151

52-
def setup(self):
52+
params = ['int', 'category']
53+
54+
def setup(self, dtype):
5355
m = 100
5456
n = 1000
5557

5658
levels = np.arange(m)
5759
index = MultiIndex.from_product([levels] * 2)
5860
columns = np.arange(n)
59-
values = np.arange(m * m * n).reshape(m * m, n)
61+
if dtype == 'int':
62+
values = np.arange(m * m * n).reshape(m * m, n)
63+
else:
64+
# the category branch is ~20x slower than int. So we
65+
# cut down the size a bit. Now it's only ~3x slower.
66+
n = 50
67+
columns = columns[:n]
68+
indices = np.random.randint(0, 52, size=(m * m, n))
69+
values = np.take(list(string.ascii_letters), indices)
70+
values = [pd.Categorical(v) for v in values.T]
71+
6072
self.df = DataFrame(values, index, columns)
6173
self.df2 = self.df.iloc[:-1]
6274

63-
def time_full_product(self):
75+
def time_full_product(self, dtype):
6476
self.df.unstack()
6577

66-
def time_without_last_row(self):
78+
def time_without_last_row(self, dtype):
6779
self.df2.unstack()
6880

6981

doc/source/whatsnew/v0.24.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
853853
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
854854
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
855855
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
856-
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
856+
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
857857
- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
858858

859859
.. _whatsnew_0240.api.incompatibilities:
@@ -1090,6 +1090,7 @@ Categorical
10901090
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
10911091
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
10921092
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
1093+
- In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
10931094
- Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)
10941095

10951096
Datetimelike

pandas/core/internals/blocks.py

+76-7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- coding: utf-8 -*-
2+
import functools
23
import warnings
34
import inspect
45
import re
@@ -34,6 +35,7 @@
3435
is_numeric_v_string_like, is_extension_type,
3536
is_extension_array_dtype,
3637
is_list_like,
38+
is_sparse,
3739
is_re,
3840
is_re_compilable,
3941
pandas_dtype)
@@ -632,7 +634,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
632634
return self
633635

634636
if klass is None:
635-
if dtype == np.object_:
637+
if is_sparse(self.values):
638+
# special case sparse, Series[Sparse].astype(object) is sparse
639+
klass = ExtensionBlock
640+
elif is_object_dtype(dtype):
636641
klass = ObjectBlock
637642
elif is_extension_array_dtype(dtype):
638643
klass = ExtensionBlock
@@ -1429,7 +1434,7 @@ def equals(self, other):
14291434
return False
14301435
return array_equivalent(self.values, other.values)
14311436

1432-
def _unstack(self, unstacker_func, new_columns):
1437+
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
14331438
"""Return a list of unstacked blocks of self
14341439
14351440
Parameters
@@ -1438,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns):
14381443
Partially applied unstacker.
14391444
new_columns : Index
14401445
All columns of the unstacked BlockManager.
1446+
n_rows : int
1447+
Only used in ExtensionBlock.unstack
1448+
fill_value : int
1449+
Only used in ExtensionBlock.unstack
14411450
14421451
Returns
14431452
-------
@@ -1731,7 +1740,7 @@ def _slice(self, slicer):
17311740
def _try_cast_result(self, result, dtype=None):
17321741
return result
17331742

1734-
def _unstack(self, unstacker_func, new_columns):
1743+
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
17351744
"""Return a list of unstacked blocks of self
17361745
17371746
Parameters
@@ -1740,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns):
17401749
Partially applied unstacker.
17411750
new_columns : Index
17421751
All columns of the unstacked BlockManager.
1752+
n_rows : int
1753+
Only used in ExtensionBlock.unstack
1754+
fill_value : int
1755+
Only used in ExtensionBlock.unstack
17431756
17441757
Returns
17451758
-------
@@ -1751,18 +1764,50 @@ def _unstack(self, unstacker_func, new_columns):
17511764
# NonConsolidatable blocks can have a single item only, so we return
17521765
# one block per item
17531766
unstacker = unstacker_func(self.values.T)
1754-
new_items = unstacker.get_new_columns()
1755-
new_placement = new_columns.get_indexer(new_items)
1756-
new_values, mask = unstacker.get_new_values()
17571767

1758-
mask = mask.any(0)
1768+
new_placement, new_values, mask = self._get_unstack_items(
1769+
unstacker, new_columns
1770+
)
1771+
17591772
new_values = new_values.T[mask]
17601773
new_placement = new_placement[mask]
17611774

17621775
blocks = [self.make_block_same_class(vals, [place])
17631776
for vals, place in zip(new_values, new_placement)]
17641777
return blocks, mask
17651778

1779+
def _get_unstack_items(self, unstacker, new_columns):
1780+
"""
1781+
Get the placement, values, and mask for a Block unstack.
1782+
1783+
This is shared between ObjectBlock and ExtensionBlock. They
1784+
differ in that ObjectBlock passes the values, while ExtensionBlock
1785+
passes the dummy ndarray of positions to be used by a take
1786+
later.
1787+
1788+
Parameters
1789+
----------
1790+
unstacker : pandas.core.reshape.reshape._Unstacker
1791+
new_columns : Index
1792+
All columns of the unstacked BlockManager.
1793+
1794+
Returns
1795+
-------
1796+
new_placement : ndarray[int]
1797+
The placement of the new columns in `new_columns`.
1798+
new_values : Union[ndarray, ExtensionArray]
1799+
The first return value from _Unstacker.get_new_values.
1800+
mask : ndarray[bool]
1801+
The second return value from _Unstacker.get_new_values.
1802+
"""
1803+
# shared with ExtensionBlock
1804+
new_items = unstacker.get_new_columns()
1805+
new_placement = new_columns.get_indexer(new_items)
1806+
new_values, mask = unstacker.get_new_values()
1807+
1808+
mask = mask.any(0)
1809+
return new_placement, new_values, mask
1810+
17661811

17671812
class ExtensionBlock(NonConsolidatableMixIn, Block):
17681813
"""Block for holding extension types.
@@ -1950,6 +1995,30 @@ def shift(self, periods, axis=0):
19501995
def _ftype(self):
19511996
return getattr(self.values, '_pandas_ftype', Block._ftype)
19521997

1998+
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
1999+
# ExtensionArray-safe unstack.
2000+
# We override ObjectBlock._unstack, which unstacks directly on the
2001+
# values of the array. For EA-backed blocks, this would require
2002+
# converting to a 2-D ndarray of objects.
2003+
# Instead, we unstack an ndarray of integer positions, followed by
2004+
# a `take` on the actual values.
2005+
dummy_arr = np.arange(n_rows)
2006+
dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
2007+
unstacker = dummy_unstacker(dummy_arr)
2008+
2009+
new_placement, new_values, mask = self._get_unstack_items(
2010+
unstacker, new_columns
2011+
)
2012+
2013+
blocks = [
2014+
self.make_block_same_class(
2015+
self.values.take(indices, allow_fill=True,
2016+
fill_value=fill_value),
2017+
[place])
2018+
for indices, place in zip(new_values.T, new_placement)
2019+
]
2020+
return blocks, mask
2021+
19532022

19542023
class NumericBlock(Block):
19552024
__slots__ = ()

pandas/core/internals/managers.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1405,18 +1405,21 @@ def canonicalize(block):
14051405
return all(block.equals(oblock)
14061406
for block, oblock in zip(self_blocks, other_blocks))
14071407

1408-
def unstack(self, unstacker_func):
1408+
def unstack(self, unstacker_func, fill_value):
14091409
"""Return a blockmanager with all blocks unstacked.
14101410
14111411
Parameters
14121412
----------
14131413
unstacker_func : callable
14141414
A (partially-applied) ``pd.core.reshape._Unstacker`` class.
1415+
fill_value : Any
1416+
fill_value for newly introduced missing values.
14151417
14161418
Returns
14171419
-------
14181420
unstacked : BlockManager
14191421
"""
1422+
n_rows = self.shape[-1]
14201423
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
14211424
new_columns = dummy.get_new_columns()
14221425
new_index = dummy.get_new_index()
@@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func):
14271430
blocks, mask = blk._unstack(
14281431
partial(unstacker_func,
14291432
value_columns=self.items[blk.mgr_locs.indexer]),
1430-
new_columns)
1433+
new_columns,
1434+
n_rows,
1435+
fill_value
1436+
)
14311437

14321438
new_blocks.extend(blocks)
14331439
columns_mask.extend(mask)

pandas/core/reshape/reshape.py

+55-26
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
from pandas.core.dtypes.cast import maybe_promote
1313
from pandas.core.dtypes.common import (
1414
ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
15-
is_object_dtype, is_sparse, needs_i8_conversion)
15+
is_object_dtype, needs_i8_conversion)
1616
from pandas.core.dtypes.missing import notna
1717

1818
from pandas import compat
1919
import pandas.core.algorithms as algos
20-
from pandas.core.arrays import Categorical, SparseArray
20+
from pandas.core.arrays import SparseArray
2121
from pandas.core.arrays.categorical import _factorize_from_iterable
2222
from pandas.core.frame import DataFrame
2323
from pandas.core.index import Index, MultiIndex
@@ -82,28 +82,15 @@ class _Unstacker(object):
8282
def __init__(self, values, index, level=-1, value_columns=None,
8383
fill_value=None, constructor=None):
8484

85-
self.is_categorical = None
86-
self.is_sparse = is_sparse(values)
8785
if values.ndim == 1:
88-
if isinstance(values, Categorical):
89-
self.is_categorical = values
90-
values = np.array(values)
91-
elif self.is_sparse:
92-
# XXX: Makes SparseArray *dense*, but it's supposedly
93-
# a single column at a time, so it's "doable"
94-
values = values.values
9586
values = values[:, np.newaxis]
9687
self.values = values
9788
self.value_columns = value_columns
9889
self.fill_value = fill_value
9990

10091
if constructor is None:
101-
if self.is_sparse:
102-
self.constructor = SparseDataFrame
103-
else:
104-
self.constructor = DataFrame
105-
else:
106-
self.constructor = constructor
92+
constructor = DataFrame
93+
self.constructor = constructor
10794

10895
if value_columns is None and values.shape[1] != 1: # pragma: no cover
10996
raise ValueError('must pass column labels for multi-column data')
@@ -174,14 +161,6 @@ def get_result(self):
174161
columns = self.get_new_columns()
175162
index = self.get_new_index()
176163

177-
# may need to coerce categoricals here
178-
if self.is_categorical is not None:
179-
categories = self.is_categorical.categories
180-
ordered = self.is_categorical.ordered
181-
values = [Categorical(values[:, i], categories=categories,
182-
ordered=ordered)
183-
for i in range(values.shape[-1])]
184-
185164
return self.constructor(values, index=index, columns=columns)
186165

187166
def get_new_values(self):
@@ -339,6 +318,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
339318
if isinstance(data, Series):
340319
dummy = data.copy()
341320
dummy.index = dummy_index
321+
342322
unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
343323
new_levels = clevels
344324
new_names = cnames
@@ -394,6 +374,8 @@ def unstack(obj, level, fill_value=None):
394374
else:
395375
return obj.T.stack(dropna=False)
396376
else:
377+
if is_extension_array_dtype(obj.dtype):
378+
return _unstack_extension_series(obj, level, fill_value)
397379
unstacker = _Unstacker(obj.values, obj.index, level=level,
398380
fill_value=fill_value,
399381
constructor=obj._constructor_expanddim)
@@ -404,7 +386,8 @@ def _unstack_frame(obj, level, fill_value=None):
404386
if obj._is_mixed_type:
405387
unstacker = partial(_Unstacker, index=obj.index,
406388
level=level, fill_value=fill_value)
407-
blocks = obj._data.unstack(unstacker)
389+
blocks = obj._data.unstack(unstacker,
390+
fill_value=fill_value)
408391
return obj._constructor(blocks)
409392
else:
410393
unstacker = _Unstacker(obj.values, obj.index, level=level,
@@ -414,6 +397,52 @@ def _unstack_frame(obj, level, fill_value=None):
414397
return unstacker.get_result()
415398

416399

400+
def _unstack_extension_series(series, level, fill_value):
401+
"""
402+
Unstack an ExtensionArray-backed Series.
403+
404+
The ExtensionDtype is preserved.
405+
406+
Parameters
407+
----------
408+
series : Series
409+
A Series with an ExtensionArray for values
410+
level : Any
411+
The level name or number.
412+
fill_value : Any
413+
The user-level (not physical storage) fill value to use for
414+
missing values introduced by the reshape. Passed to
415+
``series.values.take``.
416+
417+
Returns
418+
-------
419+
DataFrame
420+
Each column of the DataFrame will have the same dtype as
421+
the input Series.
422+
"""
423+
# Implementation note: the basic idea is to
424+
# 1. Do a regular unstack on a dummy array of integers
425+
# 2. Followup with a columnwise take.
426+
# We use the dummy take to discover newly-created missing values
427+
# introduced by the reshape.
428+
from pandas.core.reshape.concat import concat
429+
430+
dummy_arr = np.arange(len(series))
431+
# fill_value=-1, since we will do a series.values.take later
432+
result = _Unstacker(dummy_arr, series.index,
433+
level=level, fill_value=-1).get_result()
434+
435+
out = []
436+
values = series.values
437+
438+
for col, indices in result.iteritems():
439+
out.append(Series(values.take(indices.values,
440+
allow_fill=True,
441+
fill_value=fill_value),
442+
name=col, index=result.index))
443+
return concat(out, axis='columns', copy=False, keys=result.columns)
444+
445+
417446
def stack(frame, level=-1, dropna=True):
418447
"""
419448
Convert DataFrame to Series with multi-level Index. Columns become the

0 commit comments

Comments
 (0)