Skip to content

Commit 7dd32be

Browse files
committed
BUG: avoid unnecessary casting when unstacking index with unused levels
closes #17845
1 parent b45325e commit 7dd32be

File tree

2 files changed

+64
-18
lines changed

2 files changed

+64
-18
lines changed

pandas/core/reshape/reshape.py

+14 -18
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ def __init__(self, values, index, level=-1, value_columns=None,
8989
if value_columns is None and values.shape[1] != 1: # pragma: no cover
9090
raise ValueError('must pass column labels for multi-column data')
9191

92-
self.index = index
92+
self.index = index.remove_unused_levels()
9393

9494
if isinstance(self.index, MultiIndex):
9595
if index._reference_duplicate_name(level):
@@ -102,11 +102,12 @@ def __init__(self, values, index, level=-1, value_columns=None,
102102
# when index includes `nan`, need to lift levels/strides by 1
103103
self.lift = 1 if -1 in self.index.labels[self.level] else 0
104104

105-
self.new_index_levels = list(index.levels)
106-
self.new_index_names = list(index.names)
105+
self.new_index_levels = list(self.index.levels)
106+
self.new_index_names = list(self.index.names)
107107

108108
self.removed_name = self.new_index_names.pop(self.level)
109109
self.removed_level = self.new_index_levels.pop(self.level)
110+
self.removed_level_full = index.levels[self.level]
110111

111112
self._make_sorted_values_labels()
112113
self._make_selectors()
@@ -156,21 +157,10 @@ def _make_selectors(self):
156157
self.compressor = comp_index.searchsorted(np.arange(ngroups))
157158

158159
def get_result(self):
159-
# TODO: find a better way than this masking business
160-
161-
values, value_mask = self.get_new_values()
160+
values, _ = self.get_new_values()
162161
columns = self.get_new_columns()
163162
index = self.get_new_index()
164163

165-
# filter out missing levels
166-
if values.shape[1] > 0:
167-
col_inds, obs_ids = compress_group_index(self.sorted_labels[-1])
168-
# rare case, level values not observed
169-
if len(obs_ids) < self.full_shape[1]:
170-
inds = (value_mask.sum(0) > 0).nonzero()[0]
171-
values = algos.take_nd(values, inds, axis=1)
172-
columns = columns[inds]
173-
174164
# may need to coerce categoricals here
175165
if self.is_categorical is not None:
176166
categories = self.is_categorical.categories
@@ -259,17 +249,23 @@ def get_new_columns(self):
259249
width = len(self.value_columns)
260250
propagator = np.repeat(np.arange(width), stride)
261251
if isinstance(self.value_columns, MultiIndex):
262-
new_levels = self.value_columns.levels + (self.removed_level,)
252+
new_levels = self.value_columns.levels + (self.removed_level_full,)
263253
new_names = self.value_columns.names + (self.removed_name,)
264254

265255
new_labels = [lab.take(propagator)
266256
for lab in self.value_columns.labels]
267257
else:
268-
new_levels = [self.value_columns, self.removed_level]
258+
new_levels = [self.value_columns, self.removed_level_full]
269259
new_names = [self.value_columns.name, self.removed_name]
270260
new_labels = [propagator]
271261

272-
new_labels.append(np.tile(np.arange(stride) - self.lift, width))
262+
if len(self.removed_level_full) != len(self.removed_level):
263+
repeater = self.removed_level_full.get_indexer(self.removed_level)
264+
if self.lift:
265+
repeater = np.insert(repeater, 0, -1)
266+
else:
267+
repeater = np.arange(stride) - self.lift
268+
new_labels.append(np.tile(repeater, width))
273269
return MultiIndex(levels=new_levels, labels=new_labels,
274270
names=new_names, verify_integrity=False)
275271

pandas/tests/frame/test_reshape.py

+50
Original file line number | Diff line number | Diff line change
@@ -536,6 +536,56 @@ def test_unstack_dtypes(self):
536536
assert left.shape == (3, 2)
537537
tm.assert_frame_equal(left, right)
538538

539+
def test_unstack_unused_levels(self):
540+
# GH 17845: sliced columns of int DataFrame
541+
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
542+
df = pd.DataFrame([[1, 0]] * 3, index=idx)
543+
544+
result = df.unstack()
545+
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
546+
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
547+
columns=exp_col)
548+
tm.assert_frame_equal(result, expected)
549+
assert((result.columns.levels[1] == idx.levels[1]).all())
550+
551+
# Unused items on both levels
552+
levels = [[0, 1, 7], [0, 1, 2, 3]]
553+
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
554+
idx = pd.MultiIndex(levels, labels)
555+
block = np.arange(4).reshape(2, 2)
556+
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
557+
result = df.unstack()
558+
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 - 1],
559+
axis=1),
560+
columns=idx)
561+
assert((result.columns.levels[1] == idx.levels[1]).all())
562+
563+
# With mixed dtype and NaN
564+
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
565+
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
566+
idx = pd.MultiIndex(levels, labels)
567+
data = np.arange(8)
568+
df = pd.DataFrame(data.reshape(4, 2), index=idx)
569+
570+
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
571+
[np.nan, 'a', 2], [np.nan, 5, 1]),
572+
(1, [8, 11, 1, 4, 12, 15, 13, 16],
573+
[np.nan, 5, 1], [np.nan, 'a', 2]))
574+
for level, idces, col_level, idx_level in cases:
575+
result = df.unstack(level=level)
576+
exp_data = np.zeros(18) * np.nan
577+
exp_data[idces] = data
578+
cols = pd.MultiIndex.from_product([[0, 1], col_level])
579+
expected = pd.DataFrame(exp_data.reshape(3, 6),
580+
index=idx_level, columns=cols)
581+
# Broken (GH 18455):
582+
# tm.assert_frame_equal(result, expected)
583+
diff = result - expected
584+
assert(diff.sum().sum() == 0)
585+
assert((diff + 1).sum().sum() == 8)
586+
587+
assert((result.columns.levels[1] == idx.levels[level]).all())
588+
539589
def test_unstack_non_unique_index_names(self):
540590
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
541591
names=['c1', 'c1'])

0 commit comments

Comments
 (0)