diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 8d6e3cb0512b4..ef1a9c011d19c 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -113,7 +113,8 @@ Bug Fixes - Bug where ``get_data_google``returned object dtypes (:issue:`3995`) - +- Bug in ``DataFrame.stack(..., dropna=False)`` when the DataFrame's ``columns`` is a ``MultiIndex`` + whose ``labels`` do not reference all its ``levels``. (:issue:`8844`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index d576f788a831f..5ed823d690028 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -648,7 +648,9 @@ def _convert_level_number(level_num, columns): # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] - levsize = len(level_vals) + level_labels = sorted(set(this.columns.labels[-1])) + level_vals_used = level_vals[level_labels] + levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) @@ -661,7 +663,7 @@ def _convert_level_number(level_num, columns): elif slice_len != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) - value_slice = chunk.reindex(columns=level_vals).values + value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values @@ -685,7 +687,7 @@ def _convert_level_number(level_num, columns): new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level_num]) - new_labels.append(np.tile(np.arange(levsize), N)) + new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index fc031afe728dc..b1d6ce4cf19ae 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12266,6 +12266,56 @@ def test_stack_datetime_column_multiIndex(self): expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) assert_frame_equal(result, expected) + def test_stack_partial_multiIndex(self): + # GH 8844 + def _test_stack_with_multiindex(multiindex): + df = DataFrame(np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), + columns=multiindex) + for level in (-1, 0, 1, [0, 1], [1, 0]): + result = df.stack(level=level, dropna=False) + + if isinstance(level, int): + # Stacking a single level should not make any all-NaN rows, + # so df.stack(level=level, dropna=False) should be the same + # as df.stack(level=level, dropna=True). + expected = df.stack(level=level, dropna=True) + if isinstance(expected, Series): + assert_series_equal(result, expected) + else: + assert_frame_equal(result, expected) + + df.columns = MultiIndex.from_tuples(df.columns.get_values(), + names=df.columns.names) + expected = df.stack(level=level, dropna=False) + if isinstance(expected, Series): + assert_series_equal(result, expected) + else: + assert_frame_equal(result, expected) + + full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'), + ('A', 'y'), + ('C', 'x'), ('C', 'u')], + names=['Upper', 'Lower']) + for multiindex_columns in ([0, 1, 2, 3, 4], + [0, 1, 2, 3], [0, 1, 2, 4], + [0, 1, 2], [1, 2, 3], [2, 3, 4], + [0, 1], [0, 2], [0, 3], + [0], [2], [4]): + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) + if len(multiindex_columns) > 1: + multiindex_columns.reverse() + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) + + df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) + result = df.stack(dropna=False) + expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]], + index=MultiIndex(levels=[[0, 1], ['u', 'x', 'y', 'z']], + labels=[[0, 0, 1, 1], [1, 3, 1, 3]], + names=[None, 'Lower']), + columns=Index(['B', 'C'], name='Upper'), + dtype=df.dtypes[0]) + assert_frame_equal(result, expected) + def test_repr_with_mi_nat(self): df = DataFrame({'X': [1, 2]}, index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])