diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index de2261a79da47..13d61957eea00 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -561,6 +561,7 @@ Bug Fixes - Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`) - Bug in ``offsets.generate_range`` where ``start`` and ``end`` have finer precision than ``offset`` (:issue:`9907`) - Bug in ``pd.rolling_*`` where ``Series.name`` would be lost in the output (:issue:`10565`) +- Bug in ``stack`` when index or columns are not unique. (:issue:`10417`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index f782aa38bc965..fecfe5cd82c6d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -461,6 +461,12 @@ def stack(frame, level=-1, dropna=True): ------- stacked : Series """ + def factorize(index): + if index.is_unique: + return index, np.arange(len(index)) + cat = Categorical(index, ordered=True) + return cat.categories, cat.codes + N, K = frame.shape if isinstance(frame.columns, MultiIndex): if frame.columns._reference_duplicate_name(level): @@ -475,20 +481,22 @@ def stack(frame, level=-1, dropna=True): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) - new_levels.append(frame.columns) - new_labels = [lab.repeat(K) for lab in frame.index.labels] - new_labels.append(np.tile(np.arange(K), N).ravel()) + + clev, clab = factorize(frame.columns) + new_levels.append(clev) + new_labels.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) else: - ilabels = np.arange(N).repeat(K) - clabels = np.tile(np.arange(K), N).ravel() - new_index = MultiIndex(levels=[frame.index, frame.columns], - labels=[ilabels, clabels], + levels, (ilab, clab) = \ + zip(*map(factorize, (frame.index, frame.columns))) + labels = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex(levels=levels, + labels=labels, names=[frame.index.name, frame.columns.name], verify_integrity=False) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a7ef49c41a011..65ba5fd036a35 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -964,6 +964,44 @@ def test_stack(self): result = self.ymd.unstack(0).stack(-2) expected = self.ymd.unstack(0).stack(0) + # GH10417 + def check(left, right): + assert_series_equal(left, right) + self.assertFalse(left.index.is_unique) + li, ri = left.index, right.index + for i in range(ri.nlevels): + tm.assert_numpy_array_equal(li.levels[i], ri.levels[i]) + tm.assert_numpy_array_equal(li.labels[i], ri.labels[i]) + + df = DataFrame(np.arange(12).reshape(4, 3), + index=list('abab'), + columns=['1st', '2nd', '3rd']) + + mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']], + labels=[np.tile(np.arange(2).repeat(3), 2), + np.tile(np.arange(3), 4)]) + + left, right = df.stack(), Series(np.arange(12), index=mi) + check(left, right) + + df.columns = ['1st', '2nd', '1st'] + mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], + labels=[np.tile(np.arange(2).repeat(3), 2), + np.tile([0, 1, 0], 4)]) + + left, right = df.stack(), Series(np.arange(12), index=mi) + check(left, right) + + tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2) + df.index = MultiIndex.from_tuples(tpls) + mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']], + labels=[np.tile(np.arange(2).repeat(3), 2), + np.repeat([1, 0, 1], [3, 6, 3]), + np.tile([0, 1, 0], 4)]) + + left, right = df.stack(), Series(np.arange(12), index=mi) + check(left, right) + def test_unstack_odd_failure(self): data = """day,time,smoker,sum,len Fri,Dinner,No,8.25,3.