diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index dc305f36f32ec..dc3df2f16c6b3 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -444,6 +444,8 @@ Reshaping ^^^^^^^^^ - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) +- Bug in :func:`DataFrame.unstack` which casts int to float if ``index`` is a ``MultiIndex`` with unused levels (:issue:`17845`) +- Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d6aed064e49f8..cadd63cc7c665 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -89,18 +89,19 @@ def __init__(self, values, index, level=-1, value_columns=None, if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') - self.index = index + self.index = index.remove_unused_levels() self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.labels[self.level] else 0 - self.new_index_levels = list(index.levels) - self.new_index_names = list(index.names) + self.new_index_levels = list(self.index.levels) + self.new_index_names = list(self.index.names) self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) + self.removed_level_full = index.levels[self.level] self._make_sorted_values_labels() self._make_selectors() @@ -150,21 
+151,10 @@ def _make_selectors(self): self.compressor = comp_index.searchsorted(np.arange(ngroups)) def get_result(self): - # TODO: find a better way than this masking business - - values, value_mask = self.get_new_values() + values, _ = self.get_new_values() columns = self.get_new_columns() index = self.get_new_index() - # filter out missing levels - if values.shape[1] > 0: - col_inds, obs_ids = compress_group_index(self.sorted_labels[-1]) - # rare case, level values not observed - if len(obs_ids) < self.full_shape[1]: - inds = (value_mask.sum(0) > 0).nonzero()[0] - values = algos.take_nd(values, inds, axis=1) - columns = columns[inds] - # may need to coerce categoricals here if self.is_categorical is not None: categories = self.is_categorical.categories @@ -253,17 +243,28 @@ def get_new_columns(self): width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): - new_levels = self.value_columns.levels + (self.removed_level,) + new_levels = self.value_columns.levels + (self.removed_level_full,) new_names = self.value_columns.names + (self.removed_name,) new_labels = [lab.take(propagator) for lab in self.value_columns.labels] else: - new_levels = [self.value_columns, self.removed_level] + new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] new_labels = [propagator] - new_labels.append(np.tile(np.arange(stride) - self.lift, width)) + # The two indices differ only if the unstacked level had unused items: + if len(self.removed_level_full) != len(self.removed_level): + # In this case, we remap the new labels to the original level: + repeater = self.removed_level_full.get_indexer(self.removed_level) + if self.lift: + repeater = np.insert(repeater, 0, -1) + else: + # Otherwise, we just use each level item exactly once: + repeater = np.arange(stride) - self.lift + + # The entire level is then just a repetition of the single chunk: + 
new_labels.append(np.tile(repeater, width)) return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5ff4f58774322..7907486c7c98d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,74 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_unused_levels(self): + # GH 17845: unused labels in index make unstack() cast int to float + idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] + df = pd.DataFrame([[1, 0]] * 3, index=idx) + + result = df.unstack() + exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']]) + expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'], + columns=exp_col) + tm.assert_frame_equal(result, expected) + assert((result.columns.levels[1] == idx.levels[1]).all()) + + # Unused items on both levels + levels = [[0, 1, 7], [0, 1, 2, 3]] + labels = [[0, 0, 1, 1], [0, 2, 0, 2]] + idx = pd.MultiIndex(levels, labels) + block = np.arange(4).reshape(2, 2) + df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) + result = df.unstack() + expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1], + axis=1), + columns=idx) + tm.assert_frame_equal(result, expected) + assert((result.columns.levels[1] == idx.levels[1]).all()) + + # With mixed dtype and NaN + levels = [['a', 2, 'c'], [1, 3, 5, 7]] + labels = [[0, -1, 1, 1], [0, 2, -1, 2]] + idx = pd.MultiIndex(levels, labels) + data = np.arange(8) + df = pd.DataFrame(data.reshape(4, 2), index=idx) + + cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11], + [np.nan, 'a', 2], [np.nan, 5, 1]), + (1, [8, 11, 1, 4, 12, 15, 13, 16], + [np.nan, 5, 1], [np.nan, 'a', 2])) + for level, idces, col_level, idx_level in cases: + result = df.unstack(level=level) + exp_data = np.zeros(18) * np.nan + exp_data[idces] = data + cols = 
pd.MultiIndex.from_product([[0, 1], col_level]) + expected = pd.DataFrame(exp_data.reshape(3, 6), + index=idx_level, columns=cols) + # Broken (GH 18455): + # tm.assert_frame_equal(result, expected) + diff = result - expected + assert(diff.sum().sum() == 0) + assert((diff + 1).sum().sum() == 8) + + assert((result.columns.levels[1] == idx.levels[level]).all()) + + @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)]) + def test_unstack_unused_level(self, cols): + # GH 18562 : unused labels on the unstacked level + df = pd.DataFrame([[2010, 'a', 'I'], + [2011, 'b', 'II']], + columns=['A', 'B', 'C']) + + ind = df.set_index(['A', 'B', 'C'], drop=False) + selection = ind.loc[(slice(None), slice(None), 'I'), cols] + result = selection.unstack() + + expected = ind.iloc[[0]][cols] + expected.columns = MultiIndex.from_product([expected.columns, ['I']], + names=[None, 'C']) + expected.index = expected.index.droplevel('C') + tm.assert_frame_equal(result, expected) + def test_unstack_nan_index(self): # GH7466 cast = lambda val: '{0:1}'.format('' if val != val else val) nan = np.nan