pandas-dev · lithomas1 · Feb 15, 2023 · Feb 11, 2023 · Feb 14, 2023 · Feb 14, 2023
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -227,6 +227,7 @@ Copy-on-Write improvements
   - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
   - :meth:`DataFrame.ffill` / :meth:`Series.ffill`
   - :meth:`DataFrame.bfill` / :meth:`Series.bfill`
+  - :meth:`DataFrame.where` / :meth:`Series.where`
   - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
   - :meth:`DataFrame.astype` / :meth:`Series.astype`
   - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes`

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1077,7 +1077,9 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
                     res_blocks.extend(rbs)
                 return res_blocks
 
-    def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
+    def where(
+        self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
+    ) -> list[Block]:
         """
         evaluate the block; return result block(s) from the result
 
@@ -1108,6 +1110,8 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
         icond, noop = validate_putmask(values, ~cond)
         if noop:
             # GH-39595: Always return a copy; short-circuit up/downcasting
+            if using_cow:
+                return [self.copy(deep=False)]
             return [self.copy()]
 
         if other is lib.no_default:
@@ -1127,8 +1131,10 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
                 # no need to split columns
 
                 block = self.coerce_to_target_dtype(other)
-                blocks = block.where(orig_other, cond)
-                return self._maybe_downcast(blocks, downcast=_downcast)
+                blocks = block.where(orig_other, cond, using_cow=using_cow)
+                return self._maybe_downcast(
+                    blocks, downcast=_downcast, using_cow=using_cow
+                )
 
             else:
                 # since _maybe_downcast would split blocks anyway, we
@@ -1145,7 +1151,9 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
                         oth = other[:, i : i + 1]
 
                     submask = cond[:, i : i + 1]
-                    rbs = nb.where(oth, submask, _downcast=_downcast)
+                    rbs = nb.where(
+                        oth, submask, _downcast=_downcast, using_cow=using_cow
+                    )
                     res_blocks.extend(rbs)
                 return res_blocks
 
@@ -1536,7 +1544,9 @@ def setitem(self, indexer, value, using_cow: bool = False):
         else:
             return self
 
-    def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
+    def where(
+        self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
+    ) -> list[Block]:
         # _downcast private bc we only specify it when calling from fillna
         arr = self.values.T
 
@@ -1554,6 +1564,8 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
         if noop:
             # GH#44181, GH#45135
             # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast
+            if using_cow:
+                return [self.copy(deep=False)]
             return [self.copy()]
 
         try:
@@ -1566,15 +1578,19 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
                 if is_interval_dtype(self.dtype):
                     # TestSetitemFloatIntervalWithIntIntervalValues
                     blk = self.coerce_to_target_dtype(orig_other)
-                    nbs = blk.where(orig_other, orig_cond)
-                    return self._maybe_downcast(nbs, downcast=_downcast)
+                    nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
+                    return self._maybe_downcast(
+                        nbs, downcast=_downcast, using_cow=using_cow
+                    )
 
                 elif isinstance(self, NDArrayBackedExtensionBlock):
                     # NB: not (yet) the same as
                     #  isinstance(values, NDArrayBackedExtensionArray)
                     blk = self.coerce_to_target_dtype(orig_other)
-                    nbs = blk.where(orig_other, orig_cond)
-                    return self._maybe_downcast(nbs, downcast=_downcast)
+                    nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
+                    return self._maybe_downcast(
+                        nbs, downcast=_downcast, using_cow=using_cow
+                    )
 
                 else:
                     raise
@@ -1592,7 +1608,7 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]:
                         n = orig_other[:, i : i + 1]
 
                     submask = orig_cond[:, i : i + 1]
-                    rbs = nb.where(n, submask)
+                    rbs = nb.where(n, submask, using_cow=using_cow)
                     res_blocks.extend(rbs)
                 return res_blocks
 

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -343,6 +343,7 @@ def where(self: T, other, cond, align: bool) -> T:
             align_keys=align_keys,
             other=other,
             cond=cond,
+            using_cow=using_copy_on_write(),
         )
 
     def setitem(self: T, indexer, value) -> T:

diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
@@ -919,3 +919,51 @@ def test_set_value_copy_only_necessary_column(
             assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
         else:
             assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+def test_where_noop(using_copy_on_write, dtype):
+    ser = Series([1, 2, 3], dtype=dtype)
+    ser_orig = ser.copy()
+
+    result = ser.where(ser > 0, 10)
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(ser), get_array(result))
+    else:
+        assert not np.shares_memory(get_array(ser), get_array(result))
+
+    result.iloc[0] = 10
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(ser), get_array(result))
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+def test_where(using_copy_on_write, dtype):
+    ser = Series([1, 2, 3], dtype=dtype)
+    ser_orig = ser.copy()
+
+    result = ser.where(ser < 0, 10)
+
+    assert not np.shares_memory(get_array(ser), get_array(result))
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)])
+def test_where_noop_on_single_column(using_copy_on_write, dtype, val):
+    df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype)
+    df_orig = df.copy()
+
+    result = df.where(df < 0, val)
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "b"), get_array(result, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    else:
+        assert not np.shares_memory(get_array(df, "b"), get_array(result, "b"))
+
+    result.iloc[0, 1] = 10
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "b"), get_array(result, "b"))
+    tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py
@@ -11,7 +11,10 @@ def get_array(obj, col=None):
     this is done by some other operation).
     """
     if isinstance(obj, Series) and (col is None or obj.name == col):
-        return obj._values
+        arr = obj._values
+        if isinstance(arr, BaseMaskedArray):
+            return arr._data
+        return arr
     assert col is not None
     icol = obj.columns.get_loc(col)
     assert isinstance(icol, int)