From 4cb5f7daa7e5c04522b2328cd24b7464592bc307 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 20 Aug 2022 09:57:47 +0200 Subject: [PATCH 1/4] REGR: ensure DataFrame.select_dtypes() returns a copy --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/frame.py | 2 +- pandas/core/internals/array_manager.py | 7 +++++-- pandas/core/internals/managers.py | 4 ++-- pandas/tests/frame/methods/test_select_dtypes.py | 9 +++++++++ 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 2f290217bf985..ec6f7d593138a 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` not aligning index in some cases when setting a :class:`DataFrame` (:issue:`47578`) - Fixed regression in :meth:`DataFrame.loc` setting a length-1 array like value to a single value in the DataFrame (:issue:`46268`) - Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`) +- Fixed regression in :meth:`DataFrame.select_dtypes` returning a view on the original DataFrame (:issue:`48090`) - Fixed regression in :func:`merge` throwing an error when passing a :class:`Series` with a multi-level name (:issue:`47946`) - Fixed regression in :meth:`DataFrame.eval` creating a copy when updating inplace (:issue:`47449`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bcc7a2ae8f83f..29eb2f2a04f02 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4706,7 +4706,7 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate) + mgr = self._mgr._get_data_subset(predicate, copy=True) return type(self)(mgr).__finalize__(self) def insert( diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 88f81064b826f..c04c0744cf8ff 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -477,9 +477,12 @@ def is_view(self) -> bool: def is_single_block(self) -> bool: return len(self.arrays) == 1 - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self: T, predicate: Callable, copy: bool = False) -> T: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] - arrays = [self.arrays[i] for i in indices] + if copy: + arrays = [self.arrays[i].copy() for i in indices] + else: + arrays = [self.arrays[i] for i in indices] # TODO copy? # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq, # see test_describe_datetime_columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4e84b013b2a11..3b67b4f863e6c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -464,9 +464,9 @@ def is_view(self) -> bool: return False - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self: T, predicate: Callable, copy: bool = False) -> T: blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=False) + return self._combine(blocks, copy=copy) def get_bool_data(self: T, copy: bool = False) -> T: """ diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 6ff5a41b67ec2..9284e0c0cced6 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -456,3 +456,12 @@ def test_np_bool_ea_boolean_include_number(self): result = df.select_dtypes(include="number") expected = DataFrame({"a": [1, 2, 3]}) tm.assert_frame_equal(result, expected) + + def test_select_dtypes_no_view(self): + # https://github.com/pandas-dev/pandas/issues/48090 + # result of this method is not a view on the original dataframe + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + result = df.select_dtypes(include=["number"]) + result.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) From 8a8f6cfd90d413b15a3811a5c5c4cd6866515a60 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 20 Aug 2022 10:32:10 +0200 Subject: [PATCH 2/4] fix type issues --- pandas/core/internals/array_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index c04c0744cf8ff..a785a6dc76907 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1332,8 +1332,11 @@ def idelete(self, indexer) -> SingleArrayManager: self._axes = [self._axes[0][to_keep]] return self - def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: + def _get_data_subset( + self, predicate: Callable, copy: bool = False + ) -> SingleArrayManager: # used in get_numeric_data / get_bool_data + # copy keyword is being ignored, only added for signature compat with base class if predicate(self.array): return type(self)(self.arrays, self._axes, verify_integrity=False) else: From 49e86fa54426bf6946d31a000b35b773f0424ad8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Aug 2022 07:10:36 +0200 Subject: [PATCH 3/4] simplify copy --- pandas/core/frame.py | 2 +- pandas/core/internals/array_manager.py | 12 +++--------- pandas/core/internals/managers.py | 4 ++-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3be428d0d23af..dc8419e8f0543 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4740,7 +4740,7 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate, copy=True) + mgr = self._mgr._get_data_subset(predicate).copy() return type(self)(mgr).__finalize__(self) def insert( diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 25bdc8365872a..dcf69dfda1ae8 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -477,12 +477,9 @@ def is_view(self) -> bool: def is_single_block(self) -> bool: return len(self.arrays) == 1 - def _get_data_subset(self: T, predicate: Callable, copy: bool = False) -> T: + def _get_data_subset(self: T, predicate: Callable) -> T: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] - if copy: - arrays = [self.arrays[i].copy() for i in indices] - else: - arrays = [self.arrays[i] for i in indices] + arrays = [self.arrays[i] for i in indices] # TODO copy? # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq, # see test_describe_datetime_columns @@ -1342,11 +1339,8 @@ def idelete(self, indexer) -> SingleArrayManager: self._axes = [self._axes[0][to_keep]] return self - def _get_data_subset( - self, predicate: Callable, copy: bool = False - ) -> SingleArrayManager: + def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: # used in get_numeric_data / get_bool_data - # copy keyword is being ignored, only added for signature compat with base class if predicate(self.array): return type(self)(self.arrays, self._axes, verify_integrity=False) else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bc88abfee5798..3084bcea49f05 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -523,9 +523,9 @@ def is_view(self) -> bool: return False - def _get_data_subset(self: T, predicate: Callable, copy: bool = False) -> T: + def _get_data_subset(self: T, predicate: Callable) -> T: blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=copy) + return self._combine(blocks, copy=False) def get_bool_data(self: T, copy: bool = False) -> T: """ From f1d0502e8cfa10998e58a0c0a3034363aa90882e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Aug 2022 10:02:44 +0200 Subject: [PATCH 4/4] fixup tests --- pandas/core/frame.py | 2 +- pandas/tests/copy_view/test_methods.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dc8419e8f0543..4302b14da6418 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4740,7 +4740,7 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate).copy() + mgr = self._mgr._get_data_subset(predicate).copy(deep=None) return type(self)(mgr).__finalize__(self) def insert( diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index cc4c219e6c5d9..df723808ce06b 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -139,18 +139,16 @@ def test_select_dtypes(using_copy_on_write): df2 = df.select_dtypes("int64") df2._mgr._verify_integrity() - # currently this always returns a "view" - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - else: - # but currently select_dtypes() actually returns a view -> mutates parent - df_orig.iloc[0, 0] = 0 - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) def test_to_frame(using_copy_on_write):