diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9e7349a061295..f20bef9f023ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -414,7 +414,7 @@ Indexing Missing ^^^^^^^ -- +- Bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`) - MultiIndex @@ -465,7 +465,6 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) -- Styler ^^^^^^ diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index cbd0221cc2082..7b941e7ea8338 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -330,6 +330,13 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() + if limit is not None and limit < len(self): + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8a2856d0a7e64..da08d6e9167c5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -742,7 +742,8 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: If returning an ExtensionArray, then * ``na_values._is_boolean`` should be True - * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented Examples @@ -1045,19 +1046,12 @@ def fillna( Alternatively, an array-like "value" can be given. It's expected that the array-like have the same length as 'self'. limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. For ExtensionArray subclasses that cannot do this, it is at the author's discretion whether to ignore "copy=False" or to raise. - The base class implementation ignores the keyword in pad/backfill - cases. Returns ------- @@ -1073,6 +1067,15 @@ def fillna( Length: 6, dtype: Int64 """ mask = self.isna() + if limit is not None and limit < len(self): + # isna can return an ExtensionArray, we're assuming that comparisons + # are implemented. + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 86f58b48ea3be..6149adc8bc1a9 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -905,12 +905,7 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: value(s) passed should be either Interval objects or NA/NaN. limit : int, default None (Not implemented yet for IntervalArray) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -923,6 +918,8 @@ def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ if copy is False: raise NotImplementedError + if limit is not None: + raise ValueError("limit must be None") value_left, value_right = self._validate_scalar(value) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index df794183f67d1..e77c998e796a9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -232,6 +232,12 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False value = missing.check_value_size(value, mask, len(self)) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 522d86fb165f6..42e1cdd337e62 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -717,6 +717,7 @@ def fillna( ---------- value : scalar limit : int, optional + Not supported for SparseArray, must be None. copy: bool, default True Ignored for SparseArray. @@ -736,6 +737,8 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ + if limit is not None: + raise ValueError("limit must be None") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 28d3292a1c65b..6107c009b5bf1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1863,6 +1863,8 @@ def fillna( ) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Block.fillna handles coercion (test_fillna_interval) + if limit is not None: + raise ValueError("limit must be None") return super().fillna( value=value, limit=limit, diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 225a3301b8b8c..b951d4c35d208 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -299,6 +299,20 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_extension_array_equal(uniques, expected_uniques) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])}) + expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])}) + result = df.fillna(value=data_missing[1], limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + ser = pd.Series(data_missing.take([0, 1, 0, 1])) + expected = pd.Series(data_missing.take([1, 1, 0, 1])) + result = ser.fillna(value=data_missing[1], limit=1) + tm.assert_series_equal(result, expected) + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 504bafc145108..6f18761f77138 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -152,6 +152,22 @@ def test_fillna_with_none(self, data_missing): with pytest.raises(TypeError, match=msg): super().test_fillna_with_none(data_missing) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_series(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 22ac9627f6cda..4bc9562f1895d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -156,6 +156,16 @@ def test_fillna_with_none(self, data_missing): with pytest.raises(AssertionError): super().test_fillna_with_none(data_missing) + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + super().test_fillna_limit_frame(data_missing) + + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_series(self, data_missing): + # GH#58001 + super().test_fillna_limit_frame(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6900d6d67f9d9..ec979ac6d22dc 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -84,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + @pytest.mark.xfail( reason="Raises with incorrect message bc it disallows *all* listlikes " "instead of just wrong-length listlikes" diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index ca79c13ed44e4..79cfb736941d6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -205,6 +205,18 @@ def test_shift_fill_value(self, data): # np.array shape inference. Shift implementation fails. super().test_shift_fill_value(data) + @skip_nested + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_frame(data_missing) + + @skip_nested + def test_fillna_limit_series(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_series(data_missing) + @skip_nested def test_fillna_copy_frame(self, data_missing): # The "scalar" for this array isn't a scalar. diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5595a9ca44d05..56c023d99bb1c 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -264,6 +264,16 @@ def test_fillna_frame(self, data_missing): tm.assert_frame_equal(result, expected) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + _combine_le_expected_dtype = "Sparse[bool]" def test_fillna_copy_frame(self, data_missing):