Skip to content

ENH: Add CoW optimization to interpolate #51249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 10, 2023
Merged
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ Copy-on-Write improvements
- :meth:`DataFrame.to_period` / :meth:`Series.to_period`
- :meth:`DataFrame.truncate`
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
- :meth:`DataFrame.ffill` / :meth:`Series.ffill`
- :meth:`DataFrame.bfill` / :meth:`Series.bfill`
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- :meth:`DataFrame.astype` / :meth:`Series.astype`
- :func:`concat`
Expand Down
57 changes: 44 additions & 13 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,10 @@ def make_block(

@final
def make_block_same_class(
self, values, placement: BlockPlacement | None = None
self,
values,
placement: BlockPlacement | None = None,
refs: BlockValuesRefs | None = None,
) -> Block:
"""Wrap given values in a block of same type as self."""
# Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet
Expand All @@ -237,7 +240,7 @@ def make_block_same_class(
placement = self._mgr_locs

# We assume maybe_coerce_values has already been called
return type(self)(values, placement=placement, ndim=self.ndim)
return type(self)(values, placement=placement, ndim=self.ndim, refs=refs)

@final
def __repr__(self) -> str:
Expand Down Expand Up @@ -421,7 +424,9 @@ def coerce_to_target_dtype(self, other) -> Block:
return self.astype(new_dtype, copy=False)

@final
def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]:
def _maybe_downcast(
self, blocks: list[Block], downcast=None, using_cow: bool = False
) -> list[Block]:
if downcast is False:
return blocks

Expand All @@ -431,23 +436,26 @@ def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]:
# but ATM it breaks too much existing code.
# split and convert the blocks

return extend_blocks([blk.convert() for blk in blocks])
return extend_blocks(
[blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
)

if downcast is None:
return blocks

return extend_blocks([b._downcast_2d(downcast) for b in blocks])
return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks])

@final
@maybe_split
def _downcast_2d(self, dtype) -> list[Block]:
def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]:
"""
downcast specialized to 2D case post-validation.

Refactored to allow use of maybe_split.
"""
new_values = maybe_downcast_to_dtype(self.values, dtype=dtype)
return [self.make_block(new_values)]
refs = self.refs if using_cow and new_values is self.values else None
return [self.make_block(new_values, refs=refs)]

def convert(
self,
Expand Down Expand Up @@ -1209,13 +1217,16 @@ def interpolate(
limit_area: str | None = None,
fill_value: Any | None = None,
downcast: str | None = None,
using_cow: bool = False,
**kwargs,
) -> list[Block]:

inplace = validate_bool_kwarg(inplace, "inplace")

if not self._can_hold_na:
# If there are no NAs, then interpolate is a no-op
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]

try:
Expand All @@ -1224,8 +1235,10 @@ def interpolate(
m = None
if m is None and self.dtype.kind != "f":
# only deal with floats
# bc we already checked that can_hold_na, we dont have int dtype here
# bc we already checked that can_hold_na, we don't have int dtype here
# test_interp_basic checks that we make a copy here
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]

if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0:
Expand All @@ -1244,7 +1257,15 @@ def interpolate(
**kwargs,
)

data = self.values if inplace else self.values.copy()
refs = None
if inplace:
if using_cow and self.refs.has_reference():
data = self.values.copy()
else:
data = self.values
refs = self.refs
else:
data = self.values.copy()
data = cast(np.ndarray, data) # bc overridden by ExtensionBlock

missing.interpolate_array_2d(
Expand All @@ -1259,8 +1280,8 @@ def interpolate(
**kwargs,
)

nb = self.make_block_same_class(data)
return nb._maybe_downcast([nb], downcast)
nb = self.make_block_same_class(data, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow)

def diff(self, n: int, axis: AxisInt = 1) -> list[Block]:
"""return block for the diff of the values"""
Expand Down Expand Up @@ -1632,6 +1653,7 @@ def interpolate(
inplace: bool = False,
limit: int | None = None,
fill_value=None,
using_cow: bool = False,
**kwargs,
):
values = self.values
Expand Down Expand Up @@ -2011,6 +2033,7 @@ def interpolate(
inplace: bool = False,
limit: int | None = None,
fill_value=None,
using_cow: bool = False,
**kwargs,
):
values = self.values
Expand All @@ -2020,12 +2043,20 @@ def interpolate(
# "Literal['linear']") [comparison-overlap]
if method == "linear": # type: ignore[comparison-overlap]
# TODO: GH#50950 implement for arbitrary EAs
data_out = values._ndarray if inplace else values._ndarray.copy()
refs = None
if using_cow:
if inplace and not self.refs.has_reference():
data_out = values._ndarray
refs = self.refs
else:
data_out = values._ndarray.copy()
else:
data_out = values._ndarray if inplace else values._ndarray.copy()
missing.interpolate_array_2d(
data_out, method=method, limit=limit, index=index, axis=axis
)
new_values = type(values)._simple_new(data_out, dtype=values.dtype)
return self.make_block_same_class(new_values)
return self.make_block_same_class(new_values, refs=refs)

elif values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
Expand Down
11 changes: 3 additions & 8 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,14 +389,9 @@ def diff(self: T, n: int, axis: AxisInt) -> T:
return self.apply("diff", n=n, axis=axis)

def interpolate(self: T, inplace: bool, **kwargs) -> T:
if inplace:
# TODO(CoW) can be optimized to only copy those blocks that have refs
if using_copy_on_write() and any(
not self._has_no_reference_block(i) for i in range(len(self.blocks))
):
self = self.copy()

return self.apply("interpolate", inplace=inplace, **kwargs)
return self.apply(
"interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write()
)

def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
axis = self._normalize_axis(axis)
Expand Down
164 changes: 164 additions & 0 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
NaT,
Series,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
def test_interpolate_no_op(using_copy_on_write, method):
df = DataFrame({"a": [1, 2]})
df_orig = df.copy()

result = df.interpolate(method=method)

if using_copy_on_write:
assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

result.iloc[0, 0] = 100

if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_interp_fill_functions(using_copy_on_write, func):
# Check that these takes the same code paths as interpolate
df = DataFrame({"a": [1, 2]})
df_orig = df.copy()

result = getattr(df, func)()

if using_copy_on_write:
assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

result.iloc[0, 0] = 100

if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize(
"vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_triggers_copy(using_copy_on_write, vals, func):
df = DataFrame({"a": vals})
result = getattr(df, func)()

assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
if using_copy_on_write:
# Check that we don't have references when triggering a copy
assert result._mgr._has_no_reference(0)


@pytest.mark.parametrize(
"vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals):
df = DataFrame({"a": vals})
arr = get_array(df, "a")
df.interpolate(method="linear", inplace=True)

assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
# Check that we don't have references when triggering a copy
assert df._mgr._has_no_reference(0)


@pytest.mark.parametrize(
"vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_with_refs(using_copy_on_write, vals):
df = DataFrame({"a": [1, np.nan, 2]})
df_orig = df.copy()
arr = get_array(df, "a")
view = df[:]
df.interpolate(method="linear", inplace=True)

if using_copy_on_write:
# Check that copy was triggered in interpolate and that we don't
# have any references left
assert not np.shares_memory(arr, get_array(df, "a"))
tm.assert_frame_equal(df_orig, view)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
else:
assert np.shares_memory(arr, get_array(df, "a"))


def test_interpolate_cleaned_fill_method(using_copy_on_write):
# Check that "method is set to None" case works correctly
df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
df_orig = df.copy()

result = df.interpolate(method="asfreq")

if using_copy_on_write:
assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

result.iloc[0, 0] = Timestamp("2021-12-31")

if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)


def test_interpolate_object_convert_no_op(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"], "b": 1})
arr_a = get_array(df, "a")
df.interpolate(method="pad", inplace=True)

# No CoW makes a copy, it should not!
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr_a, get_array(df, "a"))


def test_interpolate_object_convert_copies(using_copy_on_write):
df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1})
arr_a = get_array(df, "a")
df.interpolate(method="pad", inplace=True)

if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert not np.shares_memory(arr_a, get_array(df, "a"))


def test_interpolate_downcast(using_copy_on_write):
df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
arr_a = get_array(df, "a")
df.interpolate(method="pad", inplace=True, downcast="infer")

if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr_a, get_array(df, "a"))


def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write):
df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.interpolate(method="pad", inplace=True, downcast="infer")

if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert not np.shares_memory(arr_a, get_array(df, "a"))
tm.assert_frame_equal(df_orig, view)
else:
tm.assert_frame_equal(df, view)
Loading