From 64fae13815c015eebf3cb8d0708fef4f5ab6e418 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 11 Feb 2023 23:05:29 +0100 Subject: [PATCH 1/2] ENH: Add lazy copy to where --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/internals/blocks.py | 36 +++++++++++++------ pandas/core/internals/managers.py | 1 + pandas/tests/copy_view/test_indexing.py | 48 +++++++++++++++++++++++++ pandas/tests/copy_view/util.py | 5 ++- 5 files changed, 80 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 29f360e050548..36c54e71f7e89 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -227,6 +227,7 @@ Copy-on-Write improvements - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - :meth:`DataFrame.bfill` / :meth:`Series.bfill` + - :meth:`DataFrame.where` / :meth:`Series.where` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :meth:`DataFrame.astype` / :meth:`Series.astype` - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 60d022a0c7964..8000ee1aaac13 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1077,7 +1077,9 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: res_blocks.extend(rbs) return res_blocks - def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: + def where( + self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False + ) -> list[Block]: """ evaluate the block; return result block(s) from the result @@ -1108,6 +1110,8 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: icond, noop = validate_putmask(values, ~cond) if noop: # GH-39595: Always return a copy; short-circuit up/downcasting + if using_cow: + return [self.copy(deep=False)] return [self.copy()] if other is lib.no_default: @@ -1127,8 +1131,10 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: # no need to split columns block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond) - return self._maybe_downcast(blocks, downcast=_downcast) + blocks = block.where(orig_other, cond, using_cow=using_cow) + return self._maybe_downcast( + blocks, downcast=_downcast, using_cow=using_cow + ) else: # since _maybe_downcast would split blocks anyway, we @@ -1145,7 +1151,9 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: oth = other[:, i : i + 1] submask = cond[:, i : i + 1] - rbs = nb.where(oth, submask, _downcast=_downcast) + rbs = nb.where( + oth, submask, _downcast=_downcast, using_cow=using_cow + ) res_blocks.extend(rbs) return res_blocks @@ -1536,7 +1544,9 @@ def setitem(self, indexer, value, using_cow: bool = False): else: return self - def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: + def where( + self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False + ) -> list[Block]: # _downcast private bc we only specify it when calling from fillna arr = self.values.T @@ -1554,6 +1564,8 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: if noop: # GH#44181, GH#45135 # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast + if using_cow: + return [self.copy(deep=False)] return [self.copy()] try: @@ -1566,15 +1578,19 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: if is_interval_dtype(self.dtype): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, downcast=_downcast) + nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) + return self._maybe_downcast( + nbs, downcast=_downcast, using_cow=using_cow + ) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, downcast=_downcast) + nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) + return self._maybe_downcast( + nbs, downcast=_downcast, using_cow=using_cow + ) else: raise @@ -1592,7 +1608,7 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: n = orig_other[:, i : i + 1] submask = orig_cond[:, i : i + 1] - rbs = nb.where(n, submask) + rbs = nb.where(n, submask, using_cow=using_cow) res_blocks.extend(rbs) return res_blocks diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index eb4c2d642862b..01ddf103eb515 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -343,6 +343,7 @@ def where(self: T, other, cond, align: bool) -> T: align_keys=align_keys, other=other, cond=cond, + using_cow=using_copy_on_write(), ) def setitem(self: T, indexer, value) -> T: diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index a673d8b37a008..e859c5f275052 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -919,3 +919,51 @@ def test_set_value_copy_only_necessary_column( assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) else: assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_where_noop(using_copy_on_write, dtype): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = ser.where(ser > 0, 10) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_where(using_copy_on_write, dtype): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = ser.where(ser < 0, 10) + + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)]) +def test_where_noop_on_single_column(using_copy_on_write, dtype, val): + df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype) + df_orig = df.copy() + + result = df.where(df < 0, val) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + + result.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py index ba9b0ecdebb58..1c6b5b51fa265 100644 --- a/pandas/tests/copy_view/util.py +++ b/pandas/tests/copy_view/util.py @@ -11,7 +11,10 @@ def get_array(obj, col=None): this is done by some other operation). """ if isinstance(obj, Series) and (col is None or obj.name == col): - return obj._values + arr = obj._values + if isinstance(arr, BaseMaskedArray): + return arr._data + return arr assert col is not None icol = obj.columns.get_loc(col) assert isinstance(icol, int) From e152a7551e418256ee41e6f7a5e70becaa0056ba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Feb 2023 21:57:21 +0100 Subject: [PATCH 2/2] Move tests --- pandas/tests/copy_view/test_indexing.py | 48 ------------------------- pandas/tests/copy_view/test_methods.py | 48 +++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index e859c5f275052..a673d8b37a008 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -919,51 +919,3 @@ def test_set_value_copy_only_necessary_column( assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) else: assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) - - -@pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_where_noop(using_copy_on_write, dtype): - ser = Series([1, 2, 3], dtype=dtype) - ser_orig = ser.copy() - - result = ser.where(ser > 0, 10) - - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) - else: - assert not np.shares_memory(get_array(ser), get_array(result)) - - result.iloc[0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(ser), get_array(result)) - tm.assert_series_equal(ser, ser_orig) - - -@pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_where(using_copy_on_write, dtype): - ser = Series([1, 2, 3], dtype=dtype) - ser_orig = ser.copy() - - result = ser.where(ser < 0, 10) - - assert not np.shares_memory(get_array(ser), get_array(result)) - tm.assert_series_equal(ser, ser_orig) - - -@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)]) -def test_where_noop_on_single_column(using_copy_on_write, dtype, val): - df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype) - df_orig = df.copy() - - result = df.where(df < 0, val) - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) - - result.iloc[0, 1] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) - tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index d822cc03c499d..1243345a38e12 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1301,6 +1301,54 @@ def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp): assert view.iloc[0, 0] == 5 +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_where_noop(using_copy_on_write, dtype): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = ser.where(ser > 0, 10) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_where(using_copy_on_write, dtype): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = ser.where(ser < 0, 10) + + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)]) +def test_where_noop_on_single_column(using_copy_on_write, dtype, val): + df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype) + df_orig = df.copy() + + result = df.where(df < 0, val) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + + result.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + tm.assert_frame_equal(df, df_orig) + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]},