diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bea7dd0a1ed6d..6bc0cb2969734 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,9 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` + - :meth:`DataFrame.ffill` / :meth:`Series.ffill` + - :meth:`DataFrame.bfill` / :meth:`Series.bfill` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :meth:`DataFrame.astype` / :meth:`Series.astype` - :func:`concat` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e66011acb978b..5ccc0aeb03500 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -228,7 +228,10 @@ def make_block( @final def make_block_same_class( - self, values, placement: BlockPlacement | None = None + self, + values, + placement: BlockPlacement | None = None, + refs: BlockValuesRefs | None = None, ) -> Block: """Wrap given values in a block of same type as self.""" # Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet @@ -237,7 +240,7 @@ def make_block_same_class( placement = self._mgr_locs # We assume maybe_coerce_values has already been called - return type(self)(values, placement=placement, ndim=self.ndim) + return type(self)(values, placement=placement, ndim=self.ndim, refs=refs) @final def __repr__(self) -> str: @@ -421,7 +424,9 @@ def coerce_to_target_dtype(self, other) -> Block: return self.astype(new_dtype, copy=False) @final - def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: + def _maybe_downcast( + self, blocks: list[Block], downcast=None, using_cow: bool = False + ) -> list[Block]: if downcast is False: return blocks @@ -431,23 +436,26 @@ def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: # but ATM it breaks too much existing code. # split and convert the blocks - return extend_blocks([blk.convert() for blk in blocks]) + return extend_blocks( + [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + ) if downcast is None: return blocks - return extend_blocks([b._downcast_2d(downcast) for b in blocks]) + return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) @final @maybe_split - def _downcast_2d(self, dtype) -> list[Block]: + def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: """ downcast specialized to 2D case post-validation. Refactored to allow use of maybe_split. """ new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) - return [self.make_block(new_values)] + refs = self.refs if using_cow and new_values is self.values else None + return [self.make_block(new_values, refs=refs)] def convert( self, @@ -1209,6 +1217,7 @@ def interpolate( limit_area: str | None = None, fill_value: Any | None = None, downcast: str | None = None, + using_cow: bool = False, **kwargs, ) -> list[Block]: @@ -1216,6 +1225,8 @@ def interpolate( if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] try: @@ -1224,8 +1235,10 @@ def interpolate( m = None if m is None and self.dtype.kind != "f": # only deal with floats - # bc we already checked that can_hold_na, we dont have int dtype here + # bc we already checked that can_hold_na, we don't have int dtype here # test_interp_basic checks that we make a copy here + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0: @@ -1244,7 +1257,15 @@ def interpolate( **kwargs, ) - data = self.values if inplace else self.values.copy() + refs = None + if inplace: + if using_cow and self.refs.has_reference(): + data = self.values.copy() + else: + data = self.values + refs = self.refs + else: + data = self.values.copy() data = cast(np.ndarray, data) # bc overridden by ExtensionBlock missing.interpolate_array_2d( @@ -1259,8 +1280,8 @@ def interpolate( **kwargs, ) - nb = self.make_block_same_class(data) - return nb._maybe_downcast([nb], downcast) + nb = self.make_block_same_class(data, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: """return block for the diff of the values""" @@ -1632,6 +1653,7 @@ def interpolate( inplace: bool = False, limit: int | None = None, fill_value=None, + using_cow: bool = False, **kwargs, ): values = self.values @@ -2011,6 +2033,7 @@ def interpolate( inplace: bool = False, limit: int | None = None, fill_value=None, + using_cow: bool = False, **kwargs, ): values = self.values @@ -2020,12 +2043,20 @@ def interpolate( # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs - data_out = values._ndarray if inplace else values._ndarray.copy() + refs = None + if using_cow: + if inplace and not self.refs.has_reference(): + data_out = values._ndarray + refs = self.refs + else: + data_out = values._ndarray.copy() + else: + data_out = values._ndarray if inplace else values._ndarray.copy() missing.interpolate_array_2d( data_out, method=method, limit=limit, index=index, axis=axis ) new_values = type(values)._simple_new(data_out, dtype=values.dtype) - return self.make_block_same_class(new_values) + return self.make_block_same_class(new_values, refs=refs) elif values.ndim == 2 and axis == 0: # NDArrayBackedExtensionArray.fillna assumes axis=1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 517e6d7e48275..3601337b4cb9c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -389,14 +389,9 @@ def diff(self: T, n: int, axis: AxisInt) -> T: return self.apply("diff", n=n, axis=axis) def interpolate(self: T, inplace: bool, **kwargs) -> T: - if inplace: - # TODO(CoW) can be optimized to only copy those blocks that have refs - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - self = self.copy() - - return self.apply("interpolate", inplace=inplace, **kwargs) + return self.apply( + "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write() + ) def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T: axis = self._normalize_axis(axis) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py new file mode 100644 index 0000000000000..1bfcf7d180e54 --- /dev/null +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -0,0 +1,164 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.mark.parametrize("method", ["pad", "nearest", "linear"]) +def test_interpolate_no_op(using_copy_on_write, method): + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = df.interpolate(method=method) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_interp_fill_functions(using_copy_on_write, func): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = getattr(df, func)() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_triggers_copy(using_copy_on_write, vals, func): + df = DataFrame({"a": vals}) + result = getattr(df, func)() + + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert result._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals): + df = DataFrame({"a": vals}) + arr = get_array(df, "a") + df.interpolate(method="linear", inplace=True) + + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_with_refs(using_copy_on_write, vals): + df = DataFrame({"a": [1, np.nan, 2]}) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + df.interpolate(method="linear", inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_interpolate_cleaned_fill_method(using_copy_on_write): + # Check that "method is set to None" case works correctly + df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df_orig = df.copy() + + result = df.interpolate(method="asfreq") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = Timestamp("2021-12-31") + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_interpolate_object_convert_no_op(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True) + + # Now CoW makes a copy, it should not! + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_object_convert_copies(using_copy_on_write): + df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True, downcast="infer") + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + df.interpolate(method="pad", inplace=True, downcast="infer") + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + else: + tm.assert_frame_equal(df, view) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 20e4571e2ba05..d6da967106fe6 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -52,7 +52,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - def test_interp_basic(self): + def test_interp_basic(self, using_copy_on_write): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -75,8 +75,12 @@ def test_interp_basic(self): # check we didn't operate inplace GH#45791 cvalues = df["C"]._values dvalues = df["D"].values - assert not np.shares_memory(cvalues, result["C"]._values) - assert not np.shares_memory(dvalues, result["D"]._values) + if using_copy_on_write: + assert np.shares_memory(cvalues, result["C"]._values) + assert np.shares_memory(dvalues, result["D"]._values) + else: + assert not np.shares_memory(cvalues, result["C"]._values) + assert not np.shares_memory(dvalues, result["D"]._values) res = df.interpolate(inplace=True) assert res is None @@ -95,14 +99,6 @@ def test_interp_basic_with_non_range_index(self): "D": list("abcd"), } ) - expected = DataFrame( - { - "A": [1.0, 2.0, 3.0, 4.0], - "B": [1.0, 4.0, 9.0, 9.0], - "C": [1, 2, 3, 5], - "D": list("abcd"), - } - ) result = df.set_index("C").interpolate() expected = df.set_index("C") @@ -327,20 +323,24 @@ def test_interp_raise_on_all_object_dtype(self): df.interpolate() def test_interp_inplace(self, using_copy_on_write): - # TODO(CoW) inplace keyword (it is still mutating the parent) - if using_copy_on_write: - pytest.skip("CoW: inplace keyword not yet handled") df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) + expected_cow = df.copy() result = df.copy() return_value = result["a"].interpolate(inplace=True) assert return_value is None - tm.assert_frame_equal(result, expected) + if using_copy_on_write: + tm.assert_frame_equal(result, expected_cow) + else: + tm.assert_frame_equal(result, expected) result = df.copy() return_value = result["a"].interpolate(inplace=True, downcast="infer") assert return_value is None - tm.assert_frame_equal(result, expected.astype("int64")) + if using_copy_on_write: + tm.assert_frame_equal(result, expected_cow) + else: + tm.assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): # GH 10395