diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 53581420f920f..fc083433b9b53 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,7 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - :meth:`DataFrame.bfill` / :meth:`Series.bfill` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 47691a7f964f7..60d022a0c7964 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1014,6 +1014,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object + using_cow: bool, default False Returns ------- @@ -1188,7 +1189,12 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: return [self.make_block(result)] def fillna( - self, value, limit: int | None = None, inplace: bool = False, downcast=None + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to @@ -1207,20 +1213,22 @@ def fillna( if noop: # we can't process the value, but nothing to do if inplace: + if using_cow: + return [self.copy(deep=False)] # Arbitrarily imposing the convention that we ignore downcast # on no-op when inplace=True return [self] else: # GH#45423 consistent downcasting on no-ops. - nb = self.copy() - nbs = nb._maybe_downcast([nb], downcast=downcast) + nb = self.copy(deep=not using_cow) + nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow) return nbs if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: - nbs = self.putmask(mask.T, value) + nbs = self.putmask(mask.T, value, using_cow=using_cow) else: # without _downcast, we would break # test_fillna_dtype_conversion_equiv_replace @@ -1230,7 +1238,10 @@ def fillna( # makes a difference bc blk may have object dtype, which has # different behavior in _maybe_downcast. return extend_blocks( - [blk._maybe_downcast([blk], downcast=downcast) for blk in nbs] + [ + blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow) + for blk in nbs + ] ) def interpolate( @@ -1725,12 +1736,21 @@ class ExtensionBlock(libinternals.Block, EABackedBlock): values: ExtensionArray def fillna( - self, value, limit: int | None = None, inplace: bool = False, downcast=None + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, ) -> list[Block]: if is_interval_dtype(self.dtype): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_cow, ) new_values = self.values.fillna(value=value, method=None, limit=limit) nb = self.make_block_same_class(new_values) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7508b5cd1e8e7..eb4c2d642862b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -398,15 +398,14 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) - if inplace: - # TODO(CoW) can be optimized to only copy those blocks that have refs - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - self = self.copy() return self.apply( - "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_copy_on_write(), ) def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 1bfcf7d180e54..32a36b9690465 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,9 +3,11 @@ from pandas import ( DataFrame, + Interval, NaT, Series, Timestamp, + interval_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -162,3 +164,71 @@ def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): tm.assert_frame_equal(df_orig, view) else: tm.assert_frame_equal(df, view) + + +def test_fillna(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + + df2 = df.fillna(5.5) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + df2.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, df) + + +@pytest.mark.parametrize("downcast", [None, False]) +def test_fillna_inplace(using_copy_on_write, downcast): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + + df.fillna(5.5, inplace=True, downcast=downcast) + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(1) + + +def test_fillna_inplace_reference(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + view = df[:] + + df.fillna(5.5, inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + assert view._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + expected = DataFrame({"a": [1.5, 5.5], "b": 1}) + tm.assert_frame_equal(df, expected) + + +def test_fillna_interval_inplace_reference(using_copy_on_write): + ser = Series(interval_range(start=0, end=5), name="a") + ser.iloc[1] = np.nan + + ser_orig = ser.copy() + view = ser[:] + ser.fillna(value=Interval(left=0, right=5), inplace=True) + + if using_copy_on_write: + assert not np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + tm.assert_series_equal(view, ser_orig) + else: + assert np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b74372017f303..85c2febefb6ce 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -249,14 +249,17 @@ def test_fillna_copy_frame(self, data_missing): assert df.A.values is not result.A.values - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self, data_missing, no_op_with_cow: bool = False): arr = data_missing.take([1, 1]) ser = pd.Series(arr) filled_val = ser[0] result = ser.fillna(filled_val) - assert ser._values is not result._values + if no_op_with_cow: + assert ser._values is result._values + else: + assert ser._values is not result._values assert ser._values is arr def test_fillna_length_mismatch(self, data_missing): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 92796c604333d..0ad2de9e834fa 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,6 +116,11 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 0f916cea9d518..3cf24d848e453 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,6 +132,11 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to xfail all diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 923dc24240f61..fc7ca41399baf 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,6 +105,11 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestInterface(BasePeriodTests, base.BaseInterfaceTests):