From a4592769c4fb220e540d9743b40e2e72a2b3c8ee Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 11 Mar 2023 11:50:21 +1100 Subject: [PATCH 1/7] BUG/TST fix dtype preservation with multindex --- pandas/core/frame.py | 8 +++----- .../indexing/multiindex/test_multiindex.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4f13ead4005e7..5f9b819df503b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3792,11 +3792,8 @@ def _getitem_multilevel(self, key): result = self.reindex(columns=new_columns) result.columns = result_columns else: - new_values = self.values[:, loc] - result = self._constructor( - new_values, index=self.index, columns=result_columns - ) - result = result.__finalize__(self) + result = self.iloc[:, loc] + result.columns = result_columns # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its @@ -3810,6 +3807,7 @@ def _getitem_multilevel(self, key): top = top[0] if top == "": result = result[""] + # result = result.squeeze("columns") # TODO iloc? if isinstance(result, Series): result = self._constructor_sliced( result, index=self.index, name=key diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 8e507212976ec..a6330fa46097f 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -206,3 +206,22 @@ def test_multiindex_with_na_missing_key(self): ) with pytest.raises(KeyError, match="missing_key"): df[[("missing_key",)]] + + def test_multiindex_dtype_preservation(self): + # GH51261 + df = pd.DataFrame([('value')], columns=pd.MultiIndex.from_tuples([('A', 'B')], names=['lvl1', 'lvl2'])).astype( + 'category') + assert (df['A'].dtypes == 'category').all() + + # geopandas 1763 analogue + df = pd.DataFrame( + [[1, 0], [0, 1]], + columns=[ + ["foo", "foo"], + ["location", "location"], + ["x", "y"], + ], + ).assign(bools=pd.Series([True, False], dtype="boolean")) + assert df['bools'].dtype == "boolean" + + From a3fc02f65cbb9d016d3d8a20163ce4036d8d0981 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 11 Mar 2023 11:59:48 +1100 Subject: [PATCH 2/7] lint --- .../tests/indexing/multiindex/test_multiindex.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index a6330fa46097f..b5322c5b8b7f5 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -209,19 +209,19 @@ def test_multiindex_with_na_missing_key(self): def test_multiindex_dtype_preservation(self): # GH51261 - df = pd.DataFrame([('value')], columns=pd.MultiIndex.from_tuples([('A', 'B')], names=['lvl1', 'lvl2'])).astype( - 'category') - assert (df['A'].dtypes == 'category').all() + df = DataFrame( + ["value"], + columns=MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]), + ).astype("category") + assert (df["A"].dtypes == "category").all() # geopandas 1763 analogue - df = pd.DataFrame( + df = DataFrame( [[1, 0], [0, 1]], columns=[ ["foo", "foo"], ["location", "location"], ["x", "y"], ], - ).assign(bools=pd.Series([True, False], dtype="boolean")) - assert df['bools'].dtype == "boolean" - - + ).assign(bools=Series([True, False], dtype="boolean")) + assert df["bools"].dtype == "boolean" From 1be4d25c84f0744f104a99916fbb3afb1083a920 Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Sat, 11 Mar 2023 21:22:09 +1100 Subject: [PATCH 3/7] Update pandas/tests/indexing/multiindex/test_multiindex.py Co-authored-by: Joris Van den Bossche --- pandas/tests/indexing/multiindex/test_multiindex.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index b5322c5b8b7f5..b6267d958ddfd 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -209,10 +209,8 @@ def test_multiindex_with_na_missing_key(self): def test_multiindex_dtype_preservation(self): # GH51261 - df = DataFrame( - ["value"], - columns=MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]), - ).astype("category") + columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) + df = DataFrame(["value"], columns=columns).astype("category") assert (df["A"].dtypes == "category").all() # geopandas 1763 analogue From e8655487666e8376f75d69864614da6ba57dd174 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 11 Mar 2023 21:40:23 +1100 Subject: [PATCH 4/7] cleanups --- pandas/core/frame.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 213c9d4522e30..b153802d56808 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3827,12 +3827,8 @@ def _getitem_multilevel(self, key): if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) - if self._is_mixed_type: - result = self.reindex(columns=new_columns) - result.columns = result_columns - else: - result = self.iloc[:, loc] - result.columns = result_columns + result = self.reindex(columns=new_columns) + result.columns = result_columns # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its @@ -3848,7 +3844,6 @@ def _getitem_multilevel(self, key): top = top[0] if top == "": result = result[""] - # result = result.squeeze("columns") # TODO iloc? if isinstance(result, Series): result = self._constructor_sliced( result, index=self.index, name=key From 533e0d81f2a00bc95fed4b9c7aba9e03bb7fc57f Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 12 Mar 2023 15:14:44 +1100 Subject: [PATCH 5/7] switch to iloc, reindex fails in some cases --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b153802d56808..d0487a57bf3ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3827,7 +3827,7 @@ def _getitem_multilevel(self, key): if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) - result = self.reindex(columns=new_columns) + result = self.iloc[:, loc] result.columns = result_columns # If there is only one column being returned, and its name is From 1d221b1b8962e1c8851c66f0d96d2be9a5b9ece7 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 14 Mar 2023 21:13:32 +1100 Subject: [PATCH 6/7] suggestions from code review --- pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index b6267d958ddfd..2c975e3d5287c 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -4,6 +4,8 @@ import pandas._libs.index as _index from pandas.errors import PerformanceWarning +from pandas.core.dtypes.common import is_categorical_dtype + import pandas as pd from pandas import ( DataFrame, @@ -12,6 +14,7 @@ Series, ) import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype class TestMultiIndexBasic: @@ -211,7 +214,8 @@ def test_multiindex_dtype_preservation(self): # GH51261 columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) df = DataFrame(["value"], columns=columns).astype("category") - assert (df["A"].dtypes == "category").all() + df_no_multiindex = df["A"] + assert is_categorical_dtype(df_no_multiindex["B"]) # geopandas 1763 analogue df = DataFrame( @@ -222,4 +226,4 @@ def test_multiindex_dtype_preservation(self): ["x", "y"], ], ).assign(bools=Series([True, False], dtype="boolean")) - assert df["bools"].dtype == "boolean" + assert isinstance(df["bools"].dtype, BooleanDtype) From 6badc9e6b671bc59d6dadd1fa96f4fd0765f72cc Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 25 Apr 2023 12:26:43 +1000 Subject: [PATCH 7/7] address code review comments Co-Authored-By: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/indexing/multiindex/test_multiindex.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b0e9fa2cea0ee..742324c18428f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -347,8 +347,8 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`) -- I/O ^^^ diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 69c0b0d20ffe3..955c4acfd4c97 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -4,10 +4,9 @@ import pandas._libs.index as _index from pandas.errors import PerformanceWarning -from pandas.core.dtypes.common import is_categorical_dtype - import pandas as pd from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, @@ -215,7 +214,7 @@ def test_multiindex_dtype_preservation(self): columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) df = DataFrame(["value"], columns=columns).astype("category") df_no_multiindex = df["A"] - assert is_categorical_dtype(df_no_multiindex["B"]) + assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype) # geopandas 1763 analogue df = DataFrame(