From f3fce88b63bc3e2f6d05c4fa3353a9cdee010449 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 00:11:11 +0100 Subject: [PATCH 1/5] ENH: Add CoW optimization for fillna --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/internals/blocks.py | 54 ++++++++++--- pandas/core/internals/managers.py | 13 ++-- pandas/tests/copy_view/test_interp_fillna.py | 79 ++++++++++++++++++++ 4 files changed, 129 insertions(+), 18 deletions(-) create mode 100644 pandas/tests/copy_view/test_interp_fillna.py diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9fd9faf057a8a..d04153722dbab 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,7 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :meth:`DataFrame.astype` / :meth:`Series.astype` - :func:`concat` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e66011acb978b..1be15d8ee2702 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -421,7 +421,9 @@ def coerce_to_target_dtype(self, other) -> Block: return self.astype(new_dtype, copy=False) @final - def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: + def _maybe_downcast( + self, blocks: list[Block], downcast=None, using_cow: bool = False + ) -> list[Block]: if downcast is False: return blocks @@ -431,23 +433,30 @@ def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: # but ATM it breaks too much existing code. # split and convert the blocks - return extend_blocks([blk.convert() for blk in blocks]) + return extend_blocks( + [blk.convert(copy=not using_cow, using_cow=using_cow) for blk in blocks] + ) if downcast is None: return blocks - return extend_blocks([b._downcast_2d(downcast) for b in blocks]) + return extend_blocks( + [b._downcast_2d(downcast, using_cow=using_cow) for b in blocks] + ) @final @maybe_split - def _downcast_2d(self, dtype) -> list[Block]: + def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: """ downcast specialized to 2D case post-validation. Refactored to allow use of maybe_split. """ new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) - return [self.make_block(new_values)] + refs = None + if using_cow and new_values is self.values: + refs = self.refs + return [self.make_block(new_values, refs=refs)] def convert( self, @@ -1152,7 +1161,12 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: return [self.make_block(result)] def fillna( - self, value, limit: int | None = None, inplace: bool = False, downcast=None + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to @@ -1171,19 +1185,25 @@ def fillna( if noop: # we can't process the value, but nothing to do if inplace: + if using_cow: + return [self.copy(deep=False)] # Arbitrarily imposing the convention that we ignore downcast # on no-op when inplace=True return [self] else: # GH#45423 consistent downcasting on no-ops. - nb = self.copy() - nbs = nb._maybe_downcast([nb], downcast=downcast) + nb = self.copy(deep=not using_cow) + nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow) return nbs if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: + if using_cow and self.refs.has_reference(): + # TODO(CoW): If using_cow is implemented for putmask we can defer + # the copy + self = self.copy() nbs = self.putmask(mask.T, value) else: # without _downcast, we would break @@ -1194,7 +1214,10 @@ def fillna( # makes a difference bc blk may have object dtype, which has # different behavior in _maybe_downcast. return extend_blocks( - [blk._maybe_downcast([blk], downcast=downcast) for blk in nbs] + [ + blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow) + for blk in nbs + ] ) def interpolate( @@ -1662,12 +1685,21 @@ class ExtensionBlock(libinternals.Block, EABackedBlock): values: ExtensionArray def fillna( - self, value, limit: int | None = None, inplace: bool = False, downcast=None + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, ) -> list[Block]: if is_interval_dtype(self.dtype): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_cow, ) new_values = self.values.fillna(value=value, method=None, limit=limit) nb = self.make_block_same_class(new_values) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 517e6d7e48275..1154949bb44e9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -410,15 +410,14 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) - if inplace: - # TODO(CoW) can be optimized to only copy those blocks that have refs - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - self = self.copy() return self.apply( - "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_copy_on_write(), ) def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py new file mode 100644 index 0000000000000..233455a3238bf --- /dev/null +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -0,0 +1,79 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Interval, + Series, + interval_range, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_fillna(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + + df2 = df.fillna(5.5) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + df2.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, df) + + +@pytest.mark.parametrize("downcast", [None, False]) +def test_fillna_inplace(using_copy_on_write, downcast): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + + df.fillna(5.5, inplace=True, downcast=downcast) + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(1) + + +def test_fillna_inplace_reference(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + view = df[:] + + df.fillna(5.5, inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + assert view._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + expected = DataFrame({"a": [1.5, 5.5], "b": 1}) + tm.assert_frame_equal(df, expected) + + +def test_fillna_interval_inplace_reference(using_copy_on_write): + ser = Series(interval_range(start=0, end=5), name="a") + ser.iloc[1] = np.nan + + ser_orig = ser.copy() + view = ser[:] + ser.fillna(value=Interval(left=0, right=5), inplace=True) + + if using_copy_on_write: + assert not np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + tm.assert_series_equal(view, ser_orig) + else: + assert np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) From eae6ffcbcaf6ce3e43b8f9d029866b1119ea6dde Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 13:57:13 +0100 Subject: [PATCH 2/5] Fix merge --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/tests/copy_view/test_interp_fillna.py | 70 ++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d1bc322a6ec7a..2df21318aa237 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,7 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - :meth:`DataFrame.bfill` / :meth:`Series.bfill` diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 1bfcf7d180e54..32a36b9690465 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,9 +3,11 @@ from pandas import ( DataFrame, + Interval, NaT, Series, Timestamp, + interval_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -162,3 +164,71 @@ def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): tm.assert_frame_equal(df_orig, view) else: tm.assert_frame_equal(df, view) + + +def test_fillna(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + + df2 = df.fillna(5.5) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + df2.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, df) + + +@pytest.mark.parametrize("downcast", [None, False]) +def test_fillna_inplace(using_copy_on_write, downcast): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + + df.fillna(5.5, inplace=True, downcast=downcast) + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(1) + + +def test_fillna_inplace_reference(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + view = df[:] + + df.fillna(5.5, inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + assert view._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + expected = DataFrame({"a": [1.5, 5.5], "b": 1}) + tm.assert_frame_equal(df, expected) + + +def test_fillna_interval_inplace_reference(using_copy_on_write): + ser = Series(interval_range(start=0, end=5), name="a") + ser.iloc[1] = np.nan + + ser_orig = ser.copy() + view = ser[:] + ser.fillna(value=Interval(left=0, right=5), inplace=True) + + if using_copy_on_write: + assert not np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + tm.assert_series_equal(view, ser_orig) + else: + assert np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) From 7cfc2e2f87b3a5fa95cbc1398c0aac99eea6ccd4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 16:50:33 +0100 Subject: [PATCH 3/5] Fix tests --- pandas/tests/extension/base/methods.py | 7 +++++-- pandas/tests/extension/test_datetime.py | 5 +++++ pandas/tests/extension/test_interval.py | 5 +++++ pandas/tests/extension/test_period.py | 5 +++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b74372017f303..85c2febefb6ce 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -249,14 +249,17 @@ def test_fillna_copy_frame(self, data_missing): assert df.A.values is not result.A.values - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self, data_missing, no_op_with_cow: bool = False): arr = data_missing.take([1, 1]) ser = pd.Series(arr) filled_val = ser[0] result = ser.fillna(filled_val) - assert ser._values is not result._values + if no_op_with_cow: + assert ser._values is result._values + else: + assert ser._values is not result._values assert ser._values is arr def test_fillna_length_mismatch(self, data_missing): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 92796c604333d..0ad2de9e834fa 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,6 +116,11 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 0f916cea9d518..3cf24d848e453 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,6 +132,11 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to xfail all diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 923dc24240f61..fc7ca41399baf 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,6 +105,11 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + super().test_fillna_copy_series( + data_missing, no_op_with_cow=using_copy_on_write + ) + class TestInterface(BasePeriodTests, base.BaseInterfaceTests): From b27670b1b3ed10628f4fc6e798cb666c71c8a1a7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 16:52:05 +0100 Subject: [PATCH 4/5] Fix merge errors --- pandas/core/internals/blocks.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f829720e5c76a..e85d1a370ed2e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -437,15 +437,13 @@ def _maybe_downcast( # split and convert the blocks return extend_blocks( - [blk.convert(copy=not using_cow, using_cow=using_cow) for blk in blocks] + [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) if downcast is None: return blocks - return extend_blocks( - [b._downcast_2d(downcast, using_cow=using_cow) for b in blocks] - ) + return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) @final @maybe_split From 97d7b024815b2fdf4882c4cb17613841ee92af20 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 19:37:21 +0100 Subject: [PATCH 5/5] Use putmask optimization --- pandas/core/internals/blocks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7b6d3abd5451f..60d022a0c7964 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1014,6 +1014,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object + using_cow: bool, default False Returns ------- @@ -1227,11 +1228,7 @@ def fillna( mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: - if using_cow and self.refs.has_reference(): - # TODO(CoW): If using_cow is implemented for putmask we can defer - # the copy - self = self.copy() - nbs = self.putmask(mask.T, value) + nbs = self.putmask(mask.T, value, using_cow=using_cow) else: # without _downcast, we would break # test_fillna_dtype_conversion_equiv_replace