From 6315d071fcafa5e90758140399d5fd8d83ac0012 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Apr 2017 01:24:06 +0200 Subject: [PATCH 1/5] REF: Avoid duplication in reset_index() when reinserting index columns --- pandas/core/frame.py | 60 ++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 06bd8f8fc51bc..5601751a8bfd0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,43 +3020,33 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: - names = self.index.names - zipped = lzip(self.index.levels, self.index.labels) - - multi_col = isinstance(self.columns, MultiIndex) - for i, (lev, lab) in reversed(list(enumerate(zipped))): - col_name = names[i] - if col_name is None: - col_name = 'level_%d' % i - - if multi_col: - if col_fill is None: - col_name = tuple([col_name] * self.columns.nlevels) - else: - name_lst = [col_fill] * self.columns.nlevels - lev_num = self.columns._get_level_number(col_level) - name_lst[lev_num] = col_name - col_name = tuple(name_lst) - - # to ndarray and maybe infer different dtype - level_values = _maybe_casted_values(lev, lab) - if level is None or i in level: - new_obj.insert(0, col_name, level_values) + names = [n if n is not None else ('level_%d' % i) + for (i, n) in enumerate(self.index.names)] + to_insert = lzip(self.index.levels, self.index.labels) elif not drop: - name = self.index.name - if name is None or name == 'index': - name = 'index' if 'index' not in self else 'level_0' - if isinstance(self.columns, MultiIndex): - if col_fill is None: - name = tuple([name] * self.columns.nlevels) - else: - name_lst = [col_fill] * self.columns.nlevels - lev_num = self.columns._get_level_number(col_level) - name_lst[lev_num] = name - name = tuple(name_lst) - values = _maybe_casted_values(self.index) - new_obj.insert(0, name, values) + default = 'index' if 'index' 
not in self else 'level_0' + names = [default] if self.index.name is None else [self.index.name] + to_insert = ((self.index, None),) + + if not drop: + multi_col = isinstance(self.columns, MultiIndex) + for i, (lev, lab) in reversed(list(enumerate(to_insert))): + col_name = names[i] + + if multi_col: + if col_fill is None: + col_name = tuple([col_name] * self.columns.nlevels) + else: + name_lst = [col_fill] * self.columns.nlevels + lev_num = self.columns._get_level_number(col_level) + name_lst[lev_num] = col_name + col_name = tuple(name_lst) + + # to ndarray and maybe infer different dtype + level_values = _maybe_casted_values(lev, lab) + if level is None or i in level: + new_obj.insert(0, col_name, level_values) new_obj.index = new_index if not inplace: From e12bca109658435a709dda0858869323a7af6783 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Apr 2017 01:39:19 +0200 Subject: [PATCH 2/5] ENH: allow tuple index names to be interpreted as full column keys --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 2 +- pandas/tests/test_multilevel.py | 12 +++++++----- 3 files changed, 9 insertions(+), 6 deletions(-) mode change 100755 => 100644 pandas/tests/test_multilevel.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 86a598183517c..4c2dda48a7d10 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -490,6 +490,7 @@ Other Enhancements - ``Series.interpolate()`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - Addition of a ``level`` keyword to ``DataFrame/Series.rename`` to rename labels in the specified level of a MultiIndex (:issue:`4160`). +- ``DataFrame.reset_index()`` will now interpret a tuple ``index.name`` as a key spanning across levels of ``columns``, if this is a ``MultiIndex`` (:issue:`16164`) - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. 
See the :ref:`Timedelta docs ` (:issue:`15136`) - ``.select_dtypes()`` now allows the string ``datetimetz`` to generically select datetimes with tz (:issue:`14910`) - The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5601751a8bfd0..c9cb8439a205d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3034,7 +3034,7 @@ def _maybe_casted_values(index, labels=None): for i, (lev, lab) in reversed(list(enumerate(to_insert))): col_name = names[i] - if multi_col: + if multi_col and not isinstance(col_name, tuple): if col_fill is None: col_name = tuple([col_name] * self.columns.nlevels) else: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py old mode 100755 new mode 100644 index 1a4603978ce38..ce7c677ccffe3 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2242,16 +2242,18 @@ def test_reset_index_multiindex_columns(self): levels = [['A', ''], ['B', 'b']] df = pd.DataFrame([[0, 2], [1, 3]], columns=pd.MultiIndex.from_tuples(levels)) - expected = df.copy() - df.index.name = 'A' - result = df[['B']].reset_index() - tm.assert_frame_equal(result, expected) + result = df[['B']].rename_axis('A').reset_index() + tm.assert_frame_equal(result, df) # gh-16120: already existing column with tm.assert_raises_regex(ValueError, ("cannot insert \('A', ''\), " "already exists")): - df.reset_index() + df.rename_axis('A').reset_index() + + # gh-16164: multiindex (tuple) full key + result = df.set_index([('A', '')]).reset_index() + tm.assert_frame_equal(result, df) def test_set_index_period(self): # GH 6631 From c958de761cf61dd1ce61e3f9cf3334796d684571 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Apr 2017 08:07:56 +0200 Subject: [PATCH 3/5] TST: additional test for reset_index with tuple-named index level --- pandas/tests/test_multilevel.py | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index ce7c677ccffe3..43c973114cc47 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2255,6 +2255,14 @@ def test_reset_index_multiindex_columns(self): result = df.set_index([('A', '')]).reset_index() tm.assert_frame_equal(result, df) + # with additional (unnamed) index level + idx_col = pd.DataFrame([[0], [1]], + columns=pd.MultiIndex.from_tuples([('level_0', + '')])) + expected = pd.concat([idx_col, df[[('B', 'b'), ('A', '')]]], axis=1) + result = df.set_index([('B', 'b')], append=True).reset_index() + tm.assert_frame_equal(result, expected) + def test_set_index_period(self): # GH 6631 df = DataFrame(np.random.random(6)) From 3b0bb1f150d1d93ab193b36c3119c32d13569acd Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Apr 2017 12:22:36 +0200 Subject: [PATCH 4/5] ENH: Handle tuples shorter than nlevels gracefully --- pandas/core/frame.py | 26 ++++++++++++++++---------- pandas/tests/test_multilevel.py | 23 +++++++++++++++++++++++ 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c9cb8439a205d..1520f71137cf9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3032,21 +3032,27 @@ def _maybe_casted_values(index, labels=None): if not drop: multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): - col_name = names[i] - - if multi_col and not isinstance(col_name, tuple): + name = names[i] + if multi_col: + col_name = (list(name) if isinstance(name, tuple) + else [name]) if col_fill is None: - col_name = tuple([col_name] * self.columns.nlevels) - else: - name_lst = [col_fill] * self.columns.nlevels - lev_num = self.columns._get_level_number(col_level) - name_lst[lev_num] = col_name - col_name = tuple(name_lst) + if len(col_name) not in (1, self.columns.nlevels): + raise ValueError("col_fill=None is 
incompatible " + "with incomplete column name " + "{}".format(name)) + col_fill = col_name[0] + + lev_num = self.columns._get_level_number(col_level) + name_lst = [col_fill] * lev_num + col_name + missing = self.columns.nlevels - len(name_lst) + name_lst += [col_fill] * missing + name = tuple(name_lst) # to ndarray and maybe infer different dtype level_values = _maybe_casted_values(lev, lab) if level is None or i in level: - new_obj.insert(0, col_name, level_values) + new_obj.insert(0, name, level_values) new_obj.index = new_index if not inplace: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 43c973114cc47..6d1d25c69f6b0 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2263,6 +2263,29 @@ def test_reset_index_multiindex_columns(self): result = df.set_index([('B', 'b')], append=True).reset_index() tm.assert_frame_equal(result, expected) + # with index name which is a too long tuple... + with tm.assert_raises_regex(ValueError, + ("Item must have length equal to number " + "of levels.")): + df.rename_axis([('C', 'c', 'i')]).reset_index() + # or too short... + levels = [['A', 'a', ''], ['B', 'b', 'i']] + df2 = pd.DataFrame([[0, 2], [1, 3]], + columns=pd.MultiIndex.from_tuples(levels)) + idx_col = pd.DataFrame([[0], [1]], + columns=pd.MultiIndex.from_tuples([('C', + 'c', + 'ii')])) + expected = pd.concat([idx_col, df2], axis=1) + result = df2.rename_axis([('C', 'c')]).reset_index(col_fill='ii') + tm.assert_frame_equal(result, expected) + + # ... 
which is incompatible with col_fill=None + with tm.assert_raises_regex(ValueError, + ("col_fill=None is incompatible with " + "incomplete column name \('C', 'c'\)")): + df2.rename_axis([('C', 'c')]).reset_index(col_fill=None) + def test_set_index_period(self): # GH 6631 df = DataFrame(np.random.random(6)) From 9e1bdba29260db49ca6f5fdbf1d13114ea1153e0 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Apr 2017 12:38:29 +0200 Subject: [PATCH 5/5] REF: reorganize reinsertion code --- pandas/core/frame.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1520f71137cf9..9a62259202653 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3019,17 +3019,17 @@ def _maybe_casted_values(index, labels=None): if len(level) < len(self.index.levels): new_index = self.index.droplevel(level) - if not drop: + if not drop: + if isinstance(self.index, MultiIndex): names = [n if n is not None else ('level_%d' % i) for (i, n) in enumerate(self.index.names)] to_insert = lzip(self.index.levels, self.index.labels) + else: + default = 'index' if 'index' not in self else 'level_0' + names = ([default] if self.index.name is None + else [self.index.name]) + to_insert = ((self.index, None),) - elif not drop: - default = 'index' if 'index' not in self else 'level_0' - names = [default] if self.index.name is None else [self.index.name] - to_insert = ((self.index, None),) - - if not drop: multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): name = names[i]