From e98e53dcb3de039d43f1bafedeba8b60e15bceef Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Sat, 2 Jan 2016 17:38:10 -0500 Subject: [PATCH 1/2] ENH: allow index of col names in set_index GH10797 --- doc/source/whatsnew/v0.18.0.txt | 1 + pandas/core/frame.py | 13 ++++++++++--- pandas/tests/test_frame.py | 17 +++++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 94c2dddbe1ef0..66846cfc7dd39 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -110,6 +110,7 @@ Other enhancements - ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method. (:issue:`11778`) - ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the values it contains (:issue:`11597`) +- ``set_index`` now accepts indexes of column labels in the keys parameter (:issue:`10797`) .. _whatsnew_0180.enhancements.rounding: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b66c51bc4411e..06e7e42c00742 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2727,11 +2727,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): """ Set the DataFrame index (row labels) using one or more existing - columns. By default yields a new object. + columns and/or new arrays of values. By default yields a new object. Parameters ---------- - keys : column label or list of column labels / arrays + keys : column label (str), Index, Series, array, or a list of these things + Existing columns to set as the index (when given columns labels) + and/or new values to set as the index. 
If an Index is given, it's + values will be used as the index if its length is the same as the + length of the DataFrame; otherwise, it's values will be assumed to + be column labels. drop : boolean, default True Delete columns to be used as the new index append : boolean, default False @@ -2748,12 +2753,14 @@ def set_index(self, keys, drop=True, append=False, inplace=False, >>> indexed_df = df.set_index(['A', 'B']) >>> indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]]) >>> indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]]) + >>> indexed_df4 = df.set_index(df.columns[:2]) Returns ------- dataframe : DataFrame """ - if not isinstance(keys, list): + if not isinstance(keys, list) and not (isinstance(keys, Index) and + len(keys) != len(self.index)): keys = [keys] if inplace: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b6b81caccf9d5..380dcf1ef1c90 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2583,6 +2583,23 @@ def test_set_index_empty_column(self): result = df.set_index(['a', 'x']) repr(result) + def test_set_index_with_col_label_index(self): + # GH10797: It should be possible to use an index of column labels as the + # `keys` parameter in set_index(). 
+ df = DataFrame({'col1': [1, 2, 3, 4, 5, 6], + 'col2': ['a', 'b', 'c', 'a', 'b', 'c'], + 'col3': [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}) + expected_index = MultiIndex(levels=[['a', 'b', 'c'], [0.0, 1.0, 2.0]], + labels=[[0, 1, 2, 0, 1, 2], + [0, 0, 1, 1, 2, 2]], + names=['col2', 'col3']) + expected_df = DataFrame(data={'col1': [1, 2, 3, 4, 5, 6]}, + index=expected_index) + list_df = df.set_index(['col2', 'col3']) + assert_frame_equal(expected_df, list_df) + index_df = df.set_index(df.columns[1:]) + assert_frame_equal(expected_df, index_df) + def test_set_columns(self): cols = Index(np.arange(len(self.mixed_frame.columns))) self.mixed_frame.columns = cols From 557c2ad164790e227b919ede4b255a6ad363d719 Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Sun, 3 Jan 2016 20:01:33 -0500 Subject: [PATCH 2/2] ENH: only treat index like col name list when slice of column index GH10797 --- doc/source/whatsnew/v0.18.0.txt | 2 +- pandas/core/frame.py | 25 ++++++++++++++++++------- pandas/tests/test_frame.py | 25 ++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 66846cfc7dd39..0b946596ec31a 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -110,7 +110,7 @@ Other enhancements - ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method. 
(:issue:`11778`) - ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the values it contains (:issue:`11597`) -- ``set_index`` now accepts indexes of column labels in the keys parameter (:issue:`10797`) +- ``set_index`` now interprets views of the columns index passed to the keys parameter as lists of existing columns to use as the index (:issue:`10797`) .. _whatsnew_0180.enhancements.rounding: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 06e7e42c00742..5006d7b564ec1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2733,10 +2733,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False, ---------- keys : column label (str), Index, Series, array, or a list of these things Existing columns to set as the index (when given columns labels) - and/or new values to set as the index. If an Index is given, it's - values will be used as the index if its length is the same as the - length of the DataFrame; otherwise, it's values will be assumed to - be column labels. + and/or new values to set as new index values. If an Index is given, + it will be used as a new index unless it is a view of the column + index, in which case it will be interpreted as a set of existing + columns to set as the index. 
drop : boolean, default True Delete columns to be used as the new index append : boolean, default False @@ -2759,9 +2759,20 @@ def set_index(self, keys, drop=True, append=False, inplace=False, ------- dataframe : DataFrame """ - if not isinstance(keys, list) and not (isinstance(keys, Index) and - len(keys) != len(self.index)): - keys = [keys] + if not isinstance(keys, list): + if isinstance(keys, Index): + # if the index is a slice of the column index, treat it like + # a list of column labels; otherwise, treat it like a new index + keys_base = keys.base + while isinstance(keys_base, Index): + keys_base = keys_base.base + cols_base = self.columns.base + while isinstance(cols_base, Index): + cols_base = cols_base.base + if keys_base is not cols_base: + keys = [keys] + else: + keys = [keys] if inplace: frame = self diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 380dcf1ef1c90..03102519d40c3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2583,9 +2583,12 @@ def test_set_index_empty_column(self): result = df.set_index(['a', 'x']) repr(result) - def test_set_index_with_col_label_index(self): - # GH10797: It should be possible to use an index of column labels as the - # `keys` parameter in set_index(). + def test_set_index_with_index(self): + # GH10797: It should be possible to use a slice of the column index as + # the `keys` parameter in set_index(). + + # Test that setting the last two columns as the index can be done + # either with a list of column labels or a slice of the column index. df = DataFrame({'col1': [1, 2, 3, 4, 5, 6], 'col2': ['a', 'b', 'c', 'a', 'b', 'c'], 'col3': [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}) @@ -2600,6 +2603,22 @@ def test_set_index_with_col_label_index(self): index_df = df.set_index(df.columns[1:]) assert_frame_equal(expected_df, index_df) + # Test that passing the entire index results in an empty dataframe (i.e. + # all columns become part of the index). 
+ empty_df = df.set_index(df.columns) + assert_equal(len(empty_df.columns), 0) + assert_equal(empty_df.index.nlevels, 3) + + # Test that an index that is created independently of the column index + # is used as a new index - not as a set of column labels. + new_index = Index(data=['col1', 'col1', 'col2', 'col2', 'col3', 'col3']) + expected_df2 = DataFrame({'col1': [1, 2, 3, 4, 5, 6], + 'col2': ['a', 'b', 'c', 'a', 'b', 'c'], + 'col3': [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}, + index=new_index) + col_name_index_df = df.set_index(new_index) + assert_frame_equal(expected_df2, col_name_index_df) + def test_set_columns(self): cols = Index(np.arange(len(self.mixed_frame.columns))) self.mixed_frame.columns = cols