Skip to content

Commit 7208610

Browse files
toobaz authored and jreback committed
BUG: fix unstacking with unused levels in columns/unstacked index level (pandas-dev#18460)
closes pandas-dev#17845 closes pandas-dev#18562
1 parent c26c49d commit 7208610

File tree

3 files changed: 89 additions (+89) and 18 deletions (-18).

doc/source/whatsnew/v0.23.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -484,6 +484,8 @@ Reshaping
484484
^^^^^^^^^
485485

486486
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
487+
- Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
488+
- Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
487489
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
488490
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
489491
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)

pandas/core/reshape/reshape.py

+19 -18
Original file line number | Diff line number | Diff line change
@@ -112,18 +112,19 @@ def __init__(self, values, index, level=-1, value_columns=None,
112112
if value_columns is None and values.shape[1] != 1: # pragma: no cover
113113
raise ValueError('must pass column labels for multi-column data')
114114

115-
self.index = index
115+
self.index = index.remove_unused_levels()
116116

117117
self.level = self.index._get_level_number(level)
118118

119119
# when index includes `nan`, need to lift levels/strides by 1
120120
self.lift = 1 if -1 in self.index.labels[self.level] else 0
121121

122-
self.new_index_levels = list(index.levels)
123-
self.new_index_names = list(index.names)
122+
self.new_index_levels = list(self.index.levels)
123+
self.new_index_names = list(self.index.names)
124124

125125
self.removed_name = self.new_index_names.pop(self.level)
126126
self.removed_level = self.new_index_levels.pop(self.level)
127+
self.removed_level_full = index.levels[self.level]
127128

128129
self._make_sorted_values_labels()
129130
self._make_selectors()
@@ -173,21 +174,10 @@ def _make_selectors(self):
173174
self.compressor = comp_index.searchsorted(np.arange(ngroups))
174175

175176
def get_result(self):
176-
# TODO: find a better way than this masking business
177-
178-
values, value_mask = self.get_new_values()
177+
values, _ = self.get_new_values()
179178
columns = self.get_new_columns()
180179
index = self.get_new_index()
181180

182-
# filter out missing levels
183-
if values.shape[1] > 0:
184-
col_inds, obs_ids = compress_group_index(self.sorted_labels[-1])
185-
# rare case, level values not observed
186-
if len(obs_ids) < self.full_shape[1]:
187-
inds = (value_mask.sum(0) > 0).nonzero()[0]
188-
values = algos.take_nd(values, inds, axis=1)
189-
columns = columns[inds]
190-
191181
# may need to coerce categoricals here
192182
if self.is_categorical is not None:
193183
categories = self.is_categorical.categories
@@ -275,17 +265,28 @@ def get_new_columns(self):
275265
width = len(self.value_columns)
276266
propagator = np.repeat(np.arange(width), stride)
277267
if isinstance(self.value_columns, MultiIndex):
278-
new_levels = self.value_columns.levels + (self.removed_level,)
268+
new_levels = self.value_columns.levels + (self.removed_level_full,)
279269
new_names = self.value_columns.names + (self.removed_name,)
280270

281271
new_labels = [lab.take(propagator)
282272
for lab in self.value_columns.labels]
283273
else:
284-
new_levels = [self.value_columns, self.removed_level]
274+
new_levels = [self.value_columns, self.removed_level_full]
285275
new_names = [self.value_columns.name, self.removed_name]
286276
new_labels = [propagator]
287277

288-
new_labels.append(np.tile(np.arange(stride) - self.lift, width))
278+
# The two indices differ only if the unstacked level had unused items:
279+
if len(self.removed_level_full) != len(self.removed_level):
280+
# In this case, we remap the new labels to the original level:
281+
repeater = self.removed_level_full.get_indexer(self.removed_level)
282+
if self.lift:
283+
repeater = np.insert(repeater, 0, -1)
284+
else:
285+
# Otherwise, we just use each level item exactly once:
286+
repeater = np.arange(stride) - self.lift
287+
288+
# The entire level is then just a repetition of the single chunk:
289+
new_labels.append(np.tile(repeater, width))
289290
return MultiIndex(levels=new_levels, labels=new_labels,
290291
names=new_names, verify_integrity=False)
291292

pandas/tests/frame/test_reshape.py

+68
Original file line number | Diff line number | Diff line change
@@ -560,6 +560,74 @@ def test_unstack_dtypes(self):
560560
assert left.shape == (3, 2)
561561
tm.assert_frame_equal(left, right)
562562

563+
def test_unstack_unused_levels(self):
564+
# GH 17845: unused labels in index make unstack() cast int to float
565+
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
566+
df = pd.DataFrame([[1, 0]] * 3, index=idx)
567+
568+
result = df.unstack()
569+
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
570+
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
571+
columns=exp_col)
572+
tm.assert_frame_equal(result, expected)
573+
assert((result.columns.levels[1] == idx.levels[1]).all())
574+
575+
# Unused items on both levels
576+
levels = [[0, 1, 7], [0, 1, 2, 3]]
577+
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
578+
idx = pd.MultiIndex(levels, labels)
579+
block = np.arange(4).reshape(2, 2)
580+
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
581+
result = df.unstack()
582+
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
583+
axis=1),
584+
columns=idx)
585+
tm.assert_frame_equal(result, expected)
586+
assert((result.columns.levels[1] == idx.levels[1]).all())
587+
588+
# With mixed dtype and NaN
589+
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
590+
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
591+
idx = pd.MultiIndex(levels, labels)
592+
data = np.arange(8)
593+
df = pd.DataFrame(data.reshape(4, 2), index=idx)
594+
595+
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
596+
[np.nan, 'a', 2], [np.nan, 5, 1]),
597+
(1, [8, 11, 1, 4, 12, 15, 13, 16],
598+
[np.nan, 5, 1], [np.nan, 'a', 2]))
599+
for level, idces, col_level, idx_level in cases:
600+
result = df.unstack(level=level)
601+
exp_data = np.zeros(18) * np.nan
602+
exp_data[idces] = data
603+
cols = pd.MultiIndex.from_product([[0, 1], col_level])
604+
expected = pd.DataFrame(exp_data.reshape(3, 6),
605+
index=idx_level, columns=cols)
606+
# Broken (GH 18455):
607+
# tm.assert_frame_equal(result, expected)
608+
diff = result - expected
609+
assert(diff.sum().sum() == 0)
610+
assert((diff + 1).sum().sum() == 8)
611+
612+
assert((result.columns.levels[1] == idx.levels[level]).all())
613+
614+
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
615+
def test_unstack_unused_level(self, cols):
616+
# GH 18562 : unused labels on the unstacked level
617+
df = pd.DataFrame([[2010, 'a', 'I'],
618+
[2011, 'b', 'II']],
619+
columns=['A', 'B', 'C'])
620+
621+
ind = df.set_index(['A', 'B', 'C'], drop=False)
622+
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
623+
result = selection.unstack()
624+
625+
expected = ind.iloc[[0]][cols]
626+
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
627+
names=[None, 'C'])
628+
expected.index = expected.index.droplevel('C')
629+
tm.assert_frame_equal(result, expected)
630+
563631
def test_unstack_nan_index(self): # GH7466
564632
cast = lambda val: '{0:1}'.format('' if val != val else val)
565633
nan = np.nan

0 commit comments

Comments
 (0)