Skip to content

BUG: Implement fillna(..., limit=x) for EAs #58249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ Indexing

Missing
^^^^^^^
-
- Fixed bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`)
-

MultiIndex
Expand Down Expand Up @@ -444,7 +444,6 @@ Sparse
ExtensionArray
^^^^^^^^^^^^^^
- Fixed bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
-

Styler
^^^^^^
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,12 @@ def _pad_or_backfill(
@doc(ExtensionArray.fillna)
def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self:
mask = self.isna()
if limit is not None and limit < len(self):
modify = mask.cumsum() > limit
if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
Expand Down
15 changes: 7 additions & 8 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,19 +1045,12 @@ def fillna(
Alternatively, an array-like "value" can be given. It's expected
that the array-like have the same length as 'self'.
limit : int, default None
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill. In other words, if there is
a gap with more than this number of consecutive NaNs, it will only
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
The maximum number of entries where NA values will be filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.
The base class implementation ignores the keyword in pad/backfill
cases.

Returns
-------
Expand All @@ -1073,6 +1066,12 @@ def fillna(
Length: 6, dtype: Int64
"""
mask = self.isna()
if limit is not None and limit < len(self):
modify = mask.cumsum() > limit
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.isna() can return an ExtensionArray, which may not implement cumsum because we don't require it to implement _accumulate (but do require _reduce). Would it be okay to require _accumulate? Still not sure about the comparison > in that case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel - any thoughts here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’d be fine with that. Could add a comment to that effect

if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,12 +905,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel
value(s) passed should be either Interval objects or NA/NaN.
limit : int, default None
(Not implemented yet for IntervalArray)
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill. In other words, if there is
a gap with more than this number of consecutive NaNs, it will only
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
The maximum number of entries where NA values will be filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
Expand All @@ -923,6 +918,8 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel
"""
if copy is False:
raise NotImplementedError
if limit is not None:
raise ValueError("limit must be None")

value_left, value_right = self._validate_scalar(value)

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ def _pad_or_backfill(
@doc(ExtensionArray.fillna)
def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self:
mask = self._mask
if limit is not None and limit < len(self):
modify = mask.cumsum() > limit
if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False

value = missing.check_value_size(value, mask, len(self))

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,7 @@ def fillna(
----------
value : scalar
limit : int, optional
Not supported for SparseArray, must be None.
copy: bool, default True
Ignored for SparseArray.

Expand All @@ -738,6 +739,8 @@ def fillna(
"""
if value is None:
raise ValueError("Must specify 'value'.")
if limit is not None:
raise ValueError("limit must be None")
new_values = np.where(isna(self.sp_values), value, self.sp_values)

if self._null_fill_value:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1853,6 +1853,8 @@ def fillna(
) -> list[Block]:
if isinstance(self.dtype, IntervalDtype):
# Block.fillna handles coercion (test_fillna_interval)
if limit is not None:
raise ValueError("limit must be None")
return super().fillna(
value=value,
limit=limit,
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,20 @@ def test_factorize_empty(self, data):
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_extension_array_equal(uniques, expected_uniques)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])})
expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])})
result = df.fillna(value=data_missing[1], limit=1)
tm.assert_frame_equal(result, expected)

def test_fillna_limit_series(self, data_missing):
# GH#58001
ser = pd.Series(data_missing.take([0, 1, 0, 1]))
expected = pd.Series(data_missing.take([1, 1, 0, 1]))
result = ser.fillna(value=data_missing[1], limit=1)
tm.assert_series_equal(result, expected)

def test_fillna_copy_frame(self, data_missing):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr})
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,22 @@ def test_fillna_series(self, data_missing):
):
super().test_fillna_series(data_missing)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_limit_series(data_missing)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
all_data = all_data[:10]
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,16 @@ def test_fillna_frame(self):
"""We treat dictionaries as a mapping in fillna, not a scalar."""
super().test_fillna_frame()

@pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot use fillna with JSONArray because DataFrame.fillna has special logic for dict / Series argument.

def test_fillna_limit_frame(self, data_missing):
# GH#58001
super().test_fillna_limit_frame(data_missing)

@pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
def test_fillna_limit_series(self, data_missing):
# GH#58001
super().test_fillna_limit_frame(data_missing)

@pytest.mark.parametrize(
"limit_area, input_ilocs, expected_ilocs",
[
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests):
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max"]

def test_fillna_limit_frame(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

@pytest.mark.xfail(
reason="Raises with incorrect message bc it disallows *all* listlikes "
"instead of just wrong-length listlikes"
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ def test_shift_fill_value(self, data):
# np.array shape inference. Shift implementation fails.
super().test_shift_fill_value(data)

@skip_nested
def test_fillna_limit_frame(self, data_missing):
# GH#58001
# The "scalar" for this array isn't a scalar.
super().test_fillna_limit_frame(data_missing)

@skip_nested
def test_fillna_limit_series(self, data_missing):
# GH#58001
# The "scalar" for this array isn't a scalar.
super().test_fillna_limit_series(data_missing)

@skip_nested
def test_fillna_copy_frame(self, data_missing):
# The "scalar" for this array isn't a scalar.
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ def test_fillna_frame(self, data_missing):

tm.assert_frame_equal(result, expected)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

_combine_le_expected_dtype = "Sparse[bool]"

def test_fillna_copy_frame(self, data_missing):
Expand Down
Loading