Skip to content

ENH: EA.fillna copy=True #53728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ def _fill_mask_inplace(
func(self._ndarray.T, limit=limit, mask=mask.T)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
Expand All @@ -310,22 +312,30 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
# TODO: check value is None
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T.copy()
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T

# TODO: PandasArray didn't used to copy, need tests for this
new_values = self._from_backing_data(npvalues)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)

new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values

# ------------------------------------------------------------------------
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
value, method = validate_fillna_kwargs(value, method)

Expand All @@ -915,11 +916,11 @@ def fillna(
return self.copy()

if limit is not None:
return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

if method is not None:
fallback_performancewarning()
return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

if isinstance(value, (np.ndarray, ExtensionArray)):
# Similar to check_value_size, but we do not mask here since we may
Expand Down Expand Up @@ -962,7 +963,7 @@ def convert_fill_value(value, pa_type, dtype):
# a kernel for duration types.
pass

return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

def isin(self, values) -> npt.NDArray[np.bool_]:
# short-circuit to return all False array.
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill NA/NaN values using the specified method.
Expand All @@ -782,6 +783,12 @@ def fillna(
maximum number of entries along the entire axis where NaNs will be
filled.

copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.

Returns
-------
ExtensionArray
Expand All @@ -802,12 +809,20 @@ def fillna(
npvalues = self.astype(object)
func(npvalues, limit=limit, mask=mask)
new_values = self._from_sequence(npvalues, dtype=self.dtype)
if not copy:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this, this is actually slower than copy=True

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed this is not good. would you prefer to just ignore copy in this case?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep

self[:] = new_values
else:
# fill with value
new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
if not copy:
new_values = self[:]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a shallow copy in these cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i figured on principle to return a new object, doesnt make much difference

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this compare with other EA methods?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see a couple of return self and a couple of return self[:] in the base class methods. so not too consistent

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't care then, but we should make this consistent (not in this pr)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im leaning towards consistently returning self as in the Block method we may want to determine whether a copy was made and result is not values is an easy way to do that.

else:
new_values = self.copy()
return new_values

def dropna(self) -> Self:
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,9 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr
indexer = obj.argsort()[-1]
return obj[indexer]

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
"""
Fill NA/NaN values using the specified method.

Expand All @@ -908,6 +910,11 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.

Returns
-------
Expand All @@ -917,6 +924,8 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
raise TypeError("Filling by method is not supported for IntervalArray.")
if limit is not None:
raise TypeError("limit is not supported for IntervalArray.")
if copy is False:
raise NotImplementedError

value_left, value_right = self._validate_scalar(value)

Expand Down
21 changes: 16 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,9 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any:
return self._simple_new(self._data[item], newmask)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask
Expand All @@ -189,16 +191,25 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
if mask.any():
if method is not None:
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._data.copy().T
new_mask = mask.copy().T
npvalues = self._data.T
new_mask = mask.T
if copy:
npvalues = npvalues.copy()
new_mask = new_mask.copy()
func(npvalues, limit=limit, mask=new_mask)
return self._simple_new(npvalues.T, new_mask.T)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
return new_values

@classmethod
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,16 +781,18 @@ def searchsorted(
m8arr = self._ndarray.view("M8[ns]")
return m8arr.searchsorted(npvalue, side=side, sorter=sorter)

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
if method is not None:
# view as dt64 so we get treated as timelike in core.missing,
# similar to dtl._period_dispatch
dta = self.view("M8[ns]")
result = dta.fillna(value=value, method=method, limit=limit)
result = dta.fillna(value=value, method=method, limit=limit, copy=copy)
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray[Any, Any]]", expected "PeriodArray")
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

# ------------------------------------------------------------------
# Arithmetic Methods
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,7 @@ def fillna(
value=None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill missing values with `value`.
Expand All @@ -739,6 +740,9 @@ def fillna(

limit : int, optional

copy: bool, default True
Ignored for SparseArray.

Returns
-------
SparseArray
Expand Down
53 changes: 43 additions & 10 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1850,12 +1850,24 @@ def interpolate(
**kwargs,
):
values = self.values
refs = None
copy = not inplace
if using_cow:
if inplace and not self.refs.has_reference():
refs = self.refs
else:
copy = True

if values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
new_values = values.T.fillna(
value=fill_value, method=method, limit=limit, copy=copy
).T
else:
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)
new_values = values.fillna(
value=fill_value, method=method, limit=limit, copy=copy
)
return self.make_block_same_class(new_values, refs=refs)


class ExtensionBlock(libinternals.Block, EABackedBlock):
Expand Down Expand Up @@ -1898,7 +1910,16 @@ def fillna(
new_values = self.values
else:
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
copy = not inplace
if using_cow:
if inplace and not self.refs.has_reference():
refs = self.refs
else:
copy = True

new_values = self.values.fillna(
value=value, method=None, limit=limit, copy=copy
)
nb = self.make_block_same_class(new_values, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow=using_cow)

Expand Down Expand Up @@ -2285,13 +2306,25 @@ def interpolate(
**kwargs,
)
return self.make_block_same_class(new_values, refs=refs)

elif values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
else:
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)
copy = not inplace
refs = None
if using_cow:
if inplace and not self.refs.has_reference():
refs = self.refs
else:
copy = True

if values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
new_values = values.T.fillna(
value=fill_value, method=method, limit=limit, copy=copy
).T
else:
new_values = values.fillna(
value=fill_value, method=method, limit=limit, copy=copy
)
return self.make_block_same_class(new_values, refs=refs)


class DatetimeTZBlock(DatetimeLikeBlock):
Expand Down
15 changes: 12 additions & 3 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,11 @@ def test_fillna_inplace_ea_noop_shares_memory(
view = df[:]
df.fillna(100, inplace=True)

assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
# MaskedArray can actually respect inplace=True
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
Expand All @@ -336,6 +340,11 @@ def test_fillna_inplace_ea_noop_shares_memory(
# arrow is immutable, so no-ops do not need to copy underlying array
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
else:
assert not np.shares_memory(get_array(df, "b"), get_array(view, "b"))
# MaskedArray can actually respect inplace=True
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
df.iloc[0, 1] = 100
tm.assert_frame_equal(df_orig, view)
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
tm.assert_frame_equal(df_orig, view)
else:
# we actually have a view
tm.assert_frame_equal(df, view)