From 989c9e57c41481bdb3eb7d2ce84908624faa7fb5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Apr 2024 09:47:22 -0400 Subject: [PATCH 1/4] BUG: Implement fillna(..., limit=x) for EAs --- doc/source/whatsnew/v3.0.0.rst | 3 +-- pandas/core/arrays/_mixins.py | 6 ++++++ pandas/core/arrays/base.py | 15 +++++++-------- pandas/core/arrays/interval.py | 9 +++------ pandas/core/arrays/masked.py | 6 ++++++ pandas/core/arrays/sparse/array.py | 3 +++ pandas/core/internals/blocks.py | 2 ++ pandas/tests/extension/base/methods.py | 14 ++++++++++++++ pandas/tests/extension/decimal/test_decimal.py | 16 ++++++++++++++++ pandas/tests/extension/json/test_json.py | 10 ++++++++++ pandas/tests/extension/test_interval.py | 10 ++++++++++ pandas/tests/extension/test_numpy.py | 12 ++++++++++++ pandas/tests/extension/test_sparse.py | 10 ++++++++++ 13 files changed, 100 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 50643454bbcec..68ed089db25c9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -399,7 +399,7 @@ Indexing Missing ^^^^^^^ -- +- Fixed bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`) - MultiIndex @@ -444,7 +444,6 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) -- Styler ^^^^^^ diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 123dc679a83ea..eb614de27fef3 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -330,6 +330,12 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8a2856d0a7e64..428ded5e7b438 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1045,19 +1045,12 @@ def fillna( Alternatively, an array-like "value" can be given. It's expected that the array-like have the same length as 'self'. limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. For ExtensionArray subclasses that cannot do this, it is at the author's discretion whether to ignore "copy=False" or to raise. - The base class implementation ignores the keyword in pad/backfill - cases. Returns ------- @@ -1073,6 +1066,12 @@ def fillna( Length: 6, dtype: Int64 """ mask = self.isna() + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index af666a591b1bc..b739f26952ecc 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -905,12 +905,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel value(s) passed should be either Interval objects or NA/NaN. limit : int, default None (Not implemented yet for IntervalArray) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -923,6 +918,8 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel """ if copy is False: raise NotImplementedError + if limit is not None: + raise ValueError("limit must be None") value_left, value_right = self._validate_scalar(value) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d20d7f98b8aa8..59d629cc0242e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -238,6 +238,12 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False value = missing.check_value_size(value, mask, len(self)) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 134702099371d..dc65e90c26599 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -717,6 +717,7 @@ def fillna( ---------- value : scalar limit : int, optional + Not supported for SparseArray, must be None. copy: bool, default True Ignored for SparseArray. @@ -738,6 +739,8 @@ def fillna( """ if value is None: raise ValueError("Must specify 'value'.") + if limit is not None: + raise ValueError("limit must be None") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..11809adc792a9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1853,6 +1853,8 @@ def fillna( ) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Block.fillna handles coercion (test_fillna_interval) + if limit is not None: + raise ValueError("limit must be None") return super().fillna( value=value, limit=limit, diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 225a3301b8b8c..b951d4c35d208 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -299,6 +299,20 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_extension_array_equal(uniques, expected_uniques) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])}) + expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])}) + result = df.fillna(value=data_missing[1], limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + ser = pd.Series(data_missing.take([0, 1, 0, 1])) + expected = pd.Series(data_missing.take([1, 1, 0, 1])) + result = ser.fillna(value=data_missing[1], limit=1) + tm.assert_series_equal(result, expected) + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a2721908e858f..a669b941ea7a2 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -144,6 +144,22 @@ def test_fillna_series(self, data_missing): ): super().test_fillna_series(data_missing) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_series(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 6ecbf2063f203..bd46e7b15e3ca 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,16 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + super().test_fillna_limit_frame(data_missing) + + @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") + def test_fillna_limit_series(self, data_missing): + # GH#58001 + super().test_fillna_limit_frame(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6900d6d67f9d9..ec979ac6d22dc 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -84,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + @pytest.mark.xfail( reason="Raises with incorrect message bc it disallows *all* listlikes " "instead of just wrong-length listlikes" diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index ca79c13ed44e4..79cfb736941d6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -205,6 +205,18 @@ def test_shift_fill_value(self, data): # np.array shape inference. Shift implementation fails. super().test_shift_fill_value(data) + @skip_nested + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_frame(data_missing) + + @skip_nested + def test_fillna_limit_series(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_series(data_missing) + @skip_nested def test_fillna_copy_frame(self, data_missing): # The "scalar" for this array isn't a scalar. diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5595a9ca44d05..56c023d99bb1c 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -264,6 +264,16 @@ def test_fillna_frame(self, data_missing): tm.assert_frame_equal(result, expected) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + _combine_le_expected_dtype = "Sparse[bool]" def test_fillna_copy_frame(self, data_missing): From 1dea9dd562623d39a956d42d47ef4209eb69cd45 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 2 May 2024 18:49:06 -0400 Subject: [PATCH 2/4] Comments --- pandas/core/arrays/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 428ded5e7b438..2639e65b4b36d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -742,7 +742,8 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: If returning an ExtensionArray, then * ``na_values._is_boolean`` should be True - * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented Examples @@ -1067,6 +1068,8 @@ def fillna( """ mask = self.isna() if limit is not None and limit < len(self): + # isna can return an ExtensionArray, we're assuming that comparisons + # are implemented and the result can use `.any()` modify = mask.cumsum() > limit if modify.any(): # Only copy mask if necessary From f8d8904274fed6b7189a2a906b6292cd0cddc22d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 2 May 2024 18:52:44 -0400 Subject: [PATCH 3/4] type ignores --- pandas/core/arrays/_mixins.py | 3 ++- pandas/core/arrays/base.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 91f3aff4745ef..7b941e7ea8338 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -331,7 +331,8 @@ def _pad_or_backfill( def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() if limit is not None and limit < len(self): - modify = mask.cumsum() > limit + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] if modify.any(): # Only copy mask if necessary mask = mask.copy() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2639e65b4b36d..da08d6e9167c5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1069,8 +1069,9 @@ def fillna( mask = self.isna() if limit is not None and limit < len(self): # isna can return an ExtensionArray, we're assuming that comparisons - # are implemented and the result can use `.any()` - modify = mask.cumsum() > limit + # are implemented. + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] if modify.any(): # Only copy mask if necessary mask = mask.copy() From 8f4556e0cac6cce9f30af0b2e89ad81a3f0576fb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 3 May 2024 09:45:47 -0700 Subject: [PATCH 4/4] Update doc/source/whatsnew/v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6d5c700f0de74..f20bef9f023ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -414,7 +414,7 @@ Indexing Missing ^^^^^^^ -- Fixed bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`) +- Bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`) - MultiIndex