Skip to content

ENH: EA.fillna copy=True #53728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,9 @@ def _fill_mask_inplace(
func(self._ndarray.T, limit=limit, mask=mask.T)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
Expand All @@ -313,7 +315,9 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
# TODO: check value is None
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T.copy()
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T

Expand All @@ -322,14 +326,20 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
new_values = self._from_backing_data(npvalues)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)

new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values

# ------------------------------------------------------------------------
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,18 +905,16 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
value, method = validate_fillna_kwargs(value, method)

if not self._hasna:
# TODO(CoW): Not necessary anymore when CoW is the default
return self.copy()

if limit is not None:
return super().fillna(value=value, method=method, limit=limit)

if method is not None:
return super().fillna(value=value, method=method, limit=limit)
if limit is not None or method is not None:
return super().fillna(value=value, method=method, limit=limit, copy=copy)

if isinstance(value, (np.ndarray, ExtensionArray)):
# Similar to check_value_size, but we do not mask here since we may
Expand Down Expand Up @@ -959,7 +957,7 @@ def convert_fill_value(value, pa_type, dtype):
# a kernel for duration types.
pass

return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

def isin(self, values) -> npt.NDArray[np.bool_]:
# short-circuit to return all False array.
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,7 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill NA/NaN values using the specified method.
Expand All @@ -896,6 +897,14 @@ def fillna(
maximum number of entries along the entire axis where NaNs will be
filled.

copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.
The base class implementation ignores the keyword in pad/backfill
cases.

Returns
-------
ExtensionArray
Expand Down Expand Up @@ -932,10 +941,16 @@ def fillna(
return self[::-1].take(indexer, allow_fill=True)
else:
# fill with value
new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
if not copy:
new_values = self[:]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a shallow copy in these cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i figured on principle to return a new object, doesnt make much difference

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this compare with other EA methods?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see a couple of return self and a couple of return self[:] in the base class methods. so not too consistent

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't care then, but we should make this consistent (not in this pr)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im leaning towards consistently returning self as in the Block method we may want to determine whether a copy was made and result is not values is an easy way to do that.

else:
new_values = self.copy()
return new_values

def dropna(self) -> Self:
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,9 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr
indexer = obj.argsort()[-1]
return obj[indexer]

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
"""
Fill NA/NaN values using the specified method.

Expand All @@ -911,11 +913,18 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.

Returns
-------
filled : IntervalArray with NA/NaN filled
"""
if copy is False:
raise NotImplementedError
if method is not None:
return super().fillna(value=value, method=method, limit=limit)

Expand Down
21 changes: 16 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any:
return self._simple_new(self._data[item], newmask)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask
Expand All @@ -195,16 +197,25 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
if mask.any():
if method is not None:
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._data.copy().T
new_mask = mask.copy().T
npvalues = self._data.T
new_mask = mask.T
if copy:
npvalues = npvalues.copy()
new_mask = new_mask.copy()
func(npvalues, limit=limit, mask=new_mask)
return self._simple_new(npvalues.T, new_mask.T)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
return new_values

@classmethod
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,16 +790,18 @@ def searchsorted(
m8arr = self._ndarray.view("M8[ns]")
return m8arr.searchsorted(npvalue, side=side, sorter=sorter)

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
if method is not None:
# view as dt64 so we get treated as timelike in core.missing,
# similar to dtl._period_dispatch
dta = self.view("M8[ns]")
result = dta.fillna(value=value, method=method, limit=limit)
result = dta.fillna(value=value, method=method, limit=limit, copy=copy)
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray[Any, Any]]", expected "PeriodArray")
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

# ------------------------------------------------------------------
# Arithmetic Methods
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,7 @@ def fillna(
value=None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill missing values with `value`.
Expand All @@ -734,6 +735,9 @@ def fillna(

limit : int, optional

copy: bool, default True
Ignored for SparseArray.

Returns
-------
SparseArray
Expand Down
53 changes: 48 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
cast,
final,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -41,6 +42,7 @@
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import (
Expand Down Expand Up @@ -1895,12 +1897,32 @@ def pad_or_backfill(
using_cow: bool = False,
) -> list[Block]:
values = self.values
copy, refs = self._get_refs_and_copy(using_cow, inplace)

if values.ndim == 2 and axis == 1:
# NDArrayBackedExtensionArray.fillna assumes axis=0
new_values = values.T.fillna(method=method, limit=limit).T
new_values = values.T.fillna(method=method, limit=limit, copy=copy).T
else:
new_values = values.fillna(method=method, limit=limit)
return [self.make_block_same_class(new_values)]
try:
new_values = values.fillna(method=method, limit=limit, copy=copy)
except TypeError:
# 3rd party EA that has not implemented copy keyword yet
refs = None
new_values = values.fillna(method=method, limit=limit)
# issue the warning *after* retrying, in case the TypeError
# was caused by an invalid fill_value
warnings.warn(
# GH#53278
"ExtensionArray.fillna added a 'copy' keyword in pandas "
"2.1.0. In a future version, ExtensionArray subclasses will "
"need to implement this keyword or an exception will be "
"raised. In the interim, the keyword is ignored by "
f"{type(self.values).__name__}.",
FutureWarning,
stacklevel=find_stack_level(),
)

return [self.make_block_same_class(new_values, refs=refs)]


class ExtensionBlock(libinternals.Block, EABackedBlock):
Expand Down Expand Up @@ -1938,8 +1960,29 @@ def fillna(
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
copy, refs = self._get_refs_and_copy(using_cow, inplace)

try:
new_values = self.values.fillna(
value=value, method=None, limit=limit, copy=copy
)
except TypeError:
# 3rd party EA that has not implemented copy keyword yet
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
# issue the warning *after* retrying, in case the TypeError
# was caused by an invalid fill_value
warnings.warn(
# GH#53278
"ExtensionArray.fillna added a 'copy' keyword in pandas "
"2.1.0. In a future version, ExtensionArray subclasses will "
"need to implement this keyword or an exception will be "
"raised. In the interim, the keyword is ignored by "
f"{type(self.values).__name__}.",
FutureWarning,
stacklevel=find_stack_level(),
)

nb = self.make_block_same_class(new_values, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow=using_cow)

Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,19 +331,23 @@ def test_fillna_inplace_ea_noop_shares_memory(
view = df[:]
df.fillna(100, inplace=True)

assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
# MaskedArray can actually respect inplace=True
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))

assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not df._mgr._has_no_reference(1)
assert not view._mgr._has_no_reference(1)
elif isinstance(df.dtypes.iloc[0], ArrowDtype):
# arrow is immutable, so no-ops do not need to copy underlying array
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
else:
assert not np.shares_memory(get_array(df, "b"), get_array(view, "b"))

df.iloc[0, 1] = 100
tm.assert_frame_equal(df_orig, view)
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
tm.assert_frame_equal(df_orig, view)
else:
# we actually have a view
tm.assert_frame_equal(df, view)


def test_fillna_chained_assignment(using_copy_on_write):
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,17 @@ def convert_values(param):
def value_counts(self, dropna: bool = True):
return value_counts(self.to_numpy(), dropna=dropna)

# Simulate a 3rd-party EA that has not yet updated to include a "copy"
# keyword in its fillna method.
# error: Signature of "fillna" incompatible with supertype "ExtensionArray"
def fillna( # type: ignore[override]
self,
value=None,
method=None,
limit: int | None = None,
):
return super().fillna(value=value, method=method, limit=limit, copy=True)


def to_decimal(values, context=None):
return DecimalArray([decimal.Decimal(x) for x in values], context=context)
Expand Down
Loading