Skip to content

BUG: Implement fillna(..., limit=x) for EAs #58249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ Indexing

Missing
^^^^^^^
-
- Fixed bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`)
-

MultiIndex
Expand Down Expand Up @@ -465,7 +465,6 @@ Sparse
ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
-

Styler
^^^^^^
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,13 @@ def _pad_or_backfill(
@doc(ExtensionArray.fillna)
def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
mask = self.isna()
if limit is not None and limit < len(self):
# mypy doesn't like that mask can be an EA which need not have `cumsum`
modify = mask.cumsum() > limit # type: ignore[union-attr]
if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
Expand Down
21 changes: 12 additions & 9 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,8 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll:
If returning an ExtensionArray, then

* ``na_values._is_boolean`` should be True
* `na_values` should implement :func:`ExtensionArray._reduce`
* ``na_values`` should implement :func:`ExtensionArray._reduce`
* ``na_values`` should implement :func:`ExtensionArray._accumulate`
* ``na_values.any`` and ``na_values.all`` should be implemented

Examples
Expand Down Expand Up @@ -1045,19 +1046,12 @@ def fillna(
Alternatively, an array-like "value" can be given. It's expected
that the array-like have the same length as 'self'.
limit : int, default None
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill. In other words, if there is
a gap with more than this number of consecutive NaNs, it will only
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
The maximum number of entries where NA values will be filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.
The base class implementation ignores the keyword in pad/backfill
cases.

Returns
-------
Expand All @@ -1073,6 +1067,15 @@ def fillna(
Length: 6, dtype: Int64
"""
mask = self.isna()
if limit is not None and limit < len(self):
# isna can return an ExtensionArray, we're assuming that comparisons
# are implemented.
# mypy doesn't like that mask can be an EA which need not have `cumsum`
modify = mask.cumsum() > limit # type: ignore[union-attr]
if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,12 +905,7 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
value(s) passed should be either Interval objects or NA/NaN.
limit : int, default None
(Not implemented yet for IntervalArray)
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill. In other words, if there is
a gap with more than this number of consecutive NaNs, it will only
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
The maximum number of entries where NA values will be filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
Expand All @@ -923,6 +918,8 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
"""
if copy is False:
raise NotImplementedError
if limit is not None:
raise ValueError("limit must be None")

value_left, value_right = self._validate_scalar(value)

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def _pad_or_backfill(
@doc(ExtensionArray.fillna)
def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
mask = self._mask
if limit is not None and limit < len(self):
modify = mask.cumsum() > limit
if modify.any():
# Only copy mask if necessary
mask = mask.copy()
mask[modify] = False

value = missing.check_value_size(value, mask, len(self))

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,7 @@ def fillna(
----------
value : scalar
limit : int, optional
Not supported for SparseArray, must be None.
copy: bool, default True
Ignored for SparseArray.

Expand All @@ -736,6 +737,8 @@ def fillna(
When ``self.fill_value`` is not NA, the result dtype will be
``self.dtype``. Again, this preserves the amount of memory used.
"""
if limit is not None:
raise ValueError("limit must be None")
new_values = np.where(isna(self.sp_values), value, self.sp_values)

if self._null_fill_value:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,6 +1863,8 @@ def fillna(
) -> list[Block]:
if isinstance(self.dtype, IntervalDtype):
# Block.fillna handles coercion (test_fillna_interval)
if limit is not None:
raise ValueError("limit must be None")
return super().fillna(
value=value,
limit=limit,
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,20 @@ def test_factorize_empty(self, data):
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_extension_array_equal(uniques, expected_uniques)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])})
expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])})
result = df.fillna(value=data_missing[1], limit=1)
tm.assert_frame_equal(result, expected)

def test_fillna_limit_series(self, data_missing):
# GH#58001
ser = pd.Series(data_missing.take([0, 1, 0, 1]))
expected = pd.Series(data_missing.take([1, 1, 0, 1]))
result = ser.fillna(value=data_missing[1], limit=1)
tm.assert_series_equal(result, expected)

def test_fillna_copy_frame(self, data_missing):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr})
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,22 @@ def test_fillna_with_none(self, data_missing):
with pytest.raises(TypeError, match=msg):
super().test_fillna_with_none(data_missing)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_limit_series(data_missing)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
all_data = all_data[:10]
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,16 @@ def test_fillna_with_none(self, data_missing):
with pytest.raises(AssertionError):
super().test_fillna_with_none(data_missing)

@pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot use fillna with JSONArray because DataFrame.fillna has special logic for dict / Series argument.

def test_fillna_limit_frame(self, data_missing):
# GH#58001
super().test_fillna_limit_frame(data_missing)

@pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
def test_fillna_limit_series(self, data_missing):
# GH#58001
super().test_fillna_limit_frame(data_missing)

@pytest.mark.parametrize(
"limit_area, input_ilocs, expected_ilocs",
[
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests):
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max"]

def test_fillna_limit_frame(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

@pytest.mark.xfail(
reason="Raises with incorrect message bc it disallows *all* listlikes "
"instead of just wrong-length listlikes"
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ def test_shift_fill_value(self, data):
# np.array shape inference. Shift implementation fails.
super().test_shift_fill_value(data)

@skip_nested
def test_fillna_limit_frame(self, data_missing):
# GH#58001
# The "scalar" for this array isn't a scalar.
super().test_fillna_limit_frame(data_missing)

@skip_nested
def test_fillna_limit_series(self, data_missing):
# GH#58001
# The "scalar" for this array isn't a scalar.
super().test_fillna_limit_series(data_missing)

@skip_nested
def test_fillna_copy_frame(self, data_missing):
# The "scalar" for this array isn't a scalar.
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ def test_fillna_frame(self, data_missing):

tm.assert_frame_equal(result, expected)

def test_fillna_limit_frame(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

def test_fillna_limit_series(self, data_missing):
# GH#58001
with pytest.raises(ValueError, match="limit must be None"):
super().test_fillna_limit_frame(data_missing)

_combine_le_expected_dtype = "Sparse[bool]"

def test_fillna_copy_frame(self, data_missing):
Expand Down