From 020de9358f897069e624de385c127e34b128bb07 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 15:30:58 +0100 Subject: [PATCH 1/6] Use lazy copy for dropna --- pandas/core/frame.py | 2 +- pandas/tests/copy_view/test_methods.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21b3a0c033702..5ef6d2b8ad759 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6325,7 +6325,7 @@ def dropna( raise ValueError(f"invalid how option: {how}") if np.all(mask): - result = self.copy() + result = self.copy(deep=None) else: result = self.loc(axis=axis)[mask] diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f5c7b31e59bc5..aa4afd60b5a3a 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -292,6 +292,23 @@ def test_add_suffix(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("axis, val", [(0, 5.5), (1, np.nan)]) +def test_dropna(using_copy_on_write, axis, val): + df = DataFrame({"a": [1, 2, 3], "b": [4, val, 6], "c": "d"}) + df_orig = df.copy() + df2 = df.dropna(axis=axis) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize( "method", [ From 6816c5378cc2f60a5042ba48197d0ba5ff7b6dbb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 17:05:39 +0100 Subject: [PATCH 2/6] Add support for series --- pandas/core/internals/managers.py | 2 ++ pandas/core/series.py | 2 +- pandas/tests/copy_view/test_methods.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 53f347ec4d372..e65cdf512b289 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1950,6 +1950,8 @@ def _blklocs(self): def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockManager: # similar to get_slice, but not restricted to slice indexer blk = self._block + if _using_copy_on_write() and isinstance(indexer, np.ndarray) and indexer.all(): + return type(self)(blk, self.index, [weakref.ref(blk)], parent=self) array = blk._slice(indexer) if array.ndim > 1: # This will be caught by Series._get_values diff --git a/pandas/core/series.py b/pandas/core/series.py index 1bdf92e1dcf02..b1bb9a5e312fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5562,7 +5562,7 @@ def dropna( return result else: if not inplace: - return self.copy() + return self.copy(deep=None) return None # ---------------------------------------------------------------------- diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index aa4afd60b5a3a..f061bb827aef9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -309,6 +309,23 @@ def test_dropna(using_copy_on_write, axis, val): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("val", [5, 5.5]) +def test_dropna_series(using_copy_on_write, val): + ser = Series([1, val, 4]) + ser_orig = ser.copy() + ser2 = ser.dropna() + + if using_copy_on_write: + assert np.shares_memory(ser2.values, ser.values) + else: + assert not np.shares_memory(ser2.values, ser.values) + + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + @pytest.mark.parametrize( "method", [ From 0c2e9a33dfa8c8ab8aa2198c587803d1fa8b853d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 18:47:36 +0100 Subject: [PATCH 3/6] Fix tests --- pandas/core/internals/managers.py | 10 ++++++++-- pandas/tests/series/indexing/test_datetime.py | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e65cdf512b289..0993a4f35a9e6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1947,10 +1947,16 @@ def _blklocs(self): """compat with BlockManager""" return None - def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockManager: + def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager: # similar to get_slice, but not restricted to slice indexer blk = self._block - if _using_copy_on_write() and isinstance(indexer, np.ndarray) and indexer.all(): + if ( + _using_copy_on_write() + and isinstance(indexer, np.ndarray) + and len(indexer) > 0 + and com.is_bool_indexer(indexer) + and indexer.all() + ): return type(self)(blk, self.index, [weakref.ref(blk)], parent=self) array = blk._slice(indexer) if array.ndim > 1: diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 48cef368b387d..4fa07780822d1 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -364,6 +364,7 @@ def test_indexing_over_size_cutoff_period_index(monkeypatch): def test_indexing_unordered(): # GH 2437 + pd.options.mode.copy_on_write = True rng = date_range(start="2011-01-01", end="2011-01-15") ts = Series(np.random.rand(len(rng)), index=rng) ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) From 942f3f38ba813dc5e1609130360f8dc2f336faab Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 24 Dec 2022 18:48:00 +0100 Subject: [PATCH 4/6] Fix tests --- pandas/tests/series/indexing/test_datetime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 4fa07780822d1..48cef368b387d 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -364,7 +364,6 @@ def test_indexing_over_size_cutoff_period_index(monkeypatch): def test_indexing_unordered(): # GH 2437 - pd.options.mode.copy_on_write = True rng = date_range(start="2011-01-01", end="2011-01-15") ts = Series(np.random.rand(len(rng)), index=rng) ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) From 560a72cd2d43373223071ef7b0e8f3c8476ae362 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 23:13:10 +0100 Subject: [PATCH 5/6] Fix ref --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 56760852217ad..37fd858ea9769 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1961,7 +1961,7 @@ def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager: # similar to get_slice, but not restricted to slice indexer blk = self._block if ( - _using_copy_on_write() + using_copy_on_write() and isinstance(indexer, np.ndarray) and len(indexer) > 0 and com.is_bool_indexer(indexer) From d48f8301de9b6b83f88ac0d003b4b24c68249aa4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 13 Jan 2023 15:02:50 +0100 Subject: [PATCH 6/6] Fix type hint --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cb9f3371aaea3..7a1b03a3c3539 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -921,7 +921,7 @@ def _ixs(self, i: int, axis: AxisInt = 0) -> Any: """ return self._values[i] - def _slice(self, slobj: slice, axis: Axis = 0) -> Series: + def _slice(self, slobj: slice | np.ndarray, axis: Axis = 0) -> Series: # axis kwarg is retained for compat with NDFrame method # _slice is *always* positional return self._get_values(slobj)