diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index bff5b2b70b518..eb2529716b55e 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -54,6 +54,7 @@ objects. api.extensions.ExtensionArray.interpolate api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna + api.extensions.ExtensionArray.pad_or_backfill api.extensions.ExtensionArray.ravel api.extensions.ExtensionArray.repeat api.extensions.ExtensionArray.searchsorted diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0fdec3175f635..70cb0c0223eee 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -417,6 +417,7 @@ Other Deprecations - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) - Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`) - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated the "method" and "limit" keywords in ``ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`) - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated the use of non-supported datetime64 and timedelta64 resolutions with :func:`pandas.array`. 
Supported resolutions are: "s", "ms", "us", "ns" resolutions (:issue:`53058`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 365f85a908099..359e0161e763c 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -18,6 +18,7 @@ AxisInt, Dtype, F, + FillnaOptions, PositionalIndexer2D, PositionalIndexerTuple, ScalarIndexer, @@ -295,6 +296,37 @@ def _fill_mask_inplace( func = missing.get_fill_func(method, ndim=self.ndim) func(self._ndarray.T, limit=limit, mask=mask.T) + def pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + mask = self.isna() + if mask.any(): + # (for now) when self.ndim == 2, we assume axis=0 + func = missing.get_fill_func(method, ndim=self.ndim) + + npvalues = self._ndarray.T + if copy: + npvalues = npvalues.copy() + func(npvalues, limit=limit, mask=mask.T) + npvalues = npvalues.T + + if copy: + new_values = self._from_backing_data(npvalues) + else: + new_values = self + + else: + if copy: + new_values = self.copy() + else: + new_values = self + return new_values + @doc(ExtensionArray.fillna) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True @@ -312,7 +344,6 @@ def fillna( if mask.any(): if method is not None: - # TODO: check value is None # (for now) when self.ndim == 2, we assume axis=0 func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._ndarray.T diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7976f97cb49aa..88695f11fba59 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -899,6 +899,24 @@ def dropna(self) -> Self: """ return type(self)(pc.drop_null(self._pa_array)) + def pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) 
-> Self: + if not self._hasna: + # TODO(CoW): Not necessary anymore when CoW is the default + return self.copy() + + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove + # this method entirely. + return super().pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + @doc(ExtensionArray.fillna) def fillna( self, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f69ddac4db6bf..325edba670fce 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -18,6 +18,7 @@ cast, overload, ) +import warnings import numpy as np @@ -33,6 +34,7 @@ Substitution, cache_readonly, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -130,6 +132,7 @@ class ExtensionArray: interpolate isin isna + pad_or_backfill ravel repeat searchsorted @@ -180,6 +183,7 @@ class ExtensionArray: methods: * fillna + * pad_or_backfill * dropna * unique * factorize / _values_for_factorize @@ -907,6 +911,93 @@ def interpolate( f"{type(self).__name__} does not implement interpolate" ) + def pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + """ + Pad or backfill values, used by Series/DataFrame ffill and bfill. + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill'} + Method to use for filling holes in reindexed Series: + + * pad / ffill: propagate last valid observation forward to next valid. + * backfill / bfill: use NEXT valid observation to fill gap. + + limit : int, default None + This is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. 
If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. + The base class implementation ignores the keyword if any NAs are + present. + + Returns + ------- + Same type as self + + Examples + -------- + >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) + >>> arr.pad_or_backfill(method="backfill", limit=1) + <IntegerArray> + [<NA>, 2, 2, 3, <NA>, <NA>] + Length: 6, dtype: Int64 + """ + + # If a 3rd-party EA has implemented this functionality in fillna, + # we warn that they need to implement pad_or_backfill instead. + if ( + type(self).fillna is not ExtensionArray.fillna + and type(self).pad_or_backfill is ExtensionArray.pad_or_backfill + ): + # Check for pad_or_backfill here allows us to call + # super().pad_or_backfill without getting this warning + warnings.warn( + "ExtensionArray.fillna 'method' keyword is deprecated. " + "In a future version. arr.pad_or_backfill will be called " + "instead. 3rd-party ExtensionArray authors need to implement " + "pad_or_backfill.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.fillna(method=method, limit=limit) + + mask = self.isna() + + if mask.any(): + # NB: the base class does not respect the "copy" keyword + meth = missing.clean_fill_method(method) + + npmask = np.asarray(mask) + if meth == "pad": + indexer = libalgos.get_fill_indexer(npmask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. 
meth == "backfill" + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) + + else: + if not copy: + return self + new_values = self.copy() + return new_values + def fillna( self, value: object | ArrayLike | None = None, @@ -921,7 +1012,7 @@ def fillna( ---------- value : scalar, array-like If a scalar value is passed it is used to fill all missing values. - Alternatively, an array-like 'value' can be given. It's expected + Alternatively, an array-like "value" can be given. It's expected that the array-like have the same length as 'self'. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series: @@ -929,6 +1020,8 @@ def fillna( * pad / ffill: propagate last valid observation forward to next valid. * backfill / bfill: use NEXT valid observation to fill gap. + .. deprecated:: 2.1.0 + limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -937,6 +1030,8 @@ def fillna( maximum number of entries along the entire axis where NaNs will be filled. + .. deprecated:: 2.1.0 + copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -958,6 +1053,15 @@ def fillna( [0, 0, 2, 3, 0, 0] Length: 6, dtype: Int64 """ + if method is not None: + warnings.warn( + f"The 'method' keyword in {type(self).__name__}.fillna is " + "deprecated and will be removed in a future version. 
" + "Use pad_or_backfill instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + value, method = validate_fillna_kwargs(value, method) mask = self.isna() diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 01f631b54a1d7..3263dd73fe4dc 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -29,6 +29,7 @@ ArrayLike, AxisInt, Dtype, + FillnaOptions, IntervalClosedType, NpDtype, PositionalIndexer, @@ -889,6 +890,20 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] + def pad_or_backfill( # pylint: disable=useless-parent-delegation + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove + # this method entirely. + return super().pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True ) -> Self: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1de3ae3b2428e..bec875f2bbfa1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -24,6 +24,7 @@ AstypeArg, AxisInt, DtypeObj, + FillnaOptions, NpDtype, PositionalIndexer, Scalar, @@ -189,6 +190,36 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) + def pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + mask = self._mask + + if mask.any(): + func = missing.get_fill_func(method, ndim=self.ndim) + + npvalues = self._data.T + new_mask = mask.T + if copy: + npvalues = npvalues.copy() + new_mask = new_mask.copy() + func(npvalues, limit=limit, 
mask=new_mask) + if copy: + return self._simple_new(npvalues.T, new_mask.T) + else: + return self + else: + if copy: + new_values = self.copy() + else: + new_values = self + return new_values + @doc(ExtensionArray.fillna) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6d01dfcf6d90b..79a9ffb5f8c0b 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -237,7 +237,7 @@ def pad_or_backfill( self, *, method: FillnaOptions, - limit: int | None, + limit: int | None = None, limit_area: Literal["inside", "outside"] | None = None, copy: bool = True, ) -> Self: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index af6402b9964e5..4df4375c5d701 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -78,6 +78,7 @@ from pandas._typing import ( AnyArrayLike, Dtype, + FillnaOptions, NpDtype, NumpySorter, NumpyValueArrayLike, @@ -790,6 +791,25 @@ def searchsorted( m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(npvalue, side=side, sorter=sorter) + def pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # view as dt64 so we get treated as timelike in core.missing, + # similar to dtl._period_dispatch + dta = self.view("M8[ns]") + result = dta.pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + if copy: + return cast("Self", result.view(self.dtype)) + else: + return self + def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True ) -> Self: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d832a9f772f45..d32c98535d7cb 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -712,6 +712,20 @@ def isna(self): 
mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) + def pad_or_backfill( # pylint: disable=useless-parent-delegation + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # TODO(3.0): We can remove this method once deprecation for fillna method + # keyword is enforced. + return super().pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + def fillna( self, value=None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3e988068dbc12..49f4d333c77b2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1917,28 +1917,10 @@ def pad_or_backfill( if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T.fillna(method=method, limit=limit, copy=copy).T + new_values = values.T.pad_or_backfill(method=method, limit=limit).T else: - try: - new_values = values.fillna(method=method, limit=limit, copy=copy) - except TypeError: - # 3rd party EA that has not implemented copy keyword yet - refs = None - new_values = values.fillna(method=method, limit=limit) - # issue the warning *after* retrying, in case the TypeError - # was caused by an invalid fill_value - warnings.warn( - # GH#53278 - "ExtensionArray.fillna added a 'copy' keyword in pandas " - "2.1.0. In a future version, ExtensionArray subclasses will " - "need to implement this keyword or an exception will be " - "raised. 
In the interim, the keyword is ignored by " - f"{type(self.values).__name__}.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return [self.make_block_same_class(new_values, refs=refs)] + new_values = values.pad_or_backfill(method=method, limit=limit) + return [self.make_block_same_class(new_values)] class ExtensionBlock(libinternals.Block, EABackedBlock): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 8f87749a4ed6e..711a1b5f2f26c 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -258,7 +258,7 @@ def test_fillna_method_doesnt_change_orig(self, method): fill_value = arr[3] if method == "pad" else arr[5] - result = arr.fillna(method=method) + result = arr.pad_or_backfill(method=method) assert result[4] == fill_value # check that the original was not changed diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 1fe1d4efbefd7..8e38a8c741b8d 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -497,7 +497,7 @@ def test_fillna_preserves_tz(self, method): dtype=DatetimeTZDtype(tz="US/Central"), ) - result = arr.fillna(method=method) + result = arr.pad_or_backfill(method=method) tm.assert_extension_array_equal(result, expected) # assert that arr and dti were not modified in-place @@ -510,12 +510,12 @@ def test_fillna_2d(self): dta[0, 1] = pd.NaT dta[1, 0] = pd.NaT - res1 = dta.fillna(method="pad") + res1 = dta.pad_or_backfill(method="pad") expected1 = dta.copy() expected1[1, 0] = dta[0, 0] tm.assert_extension_array_equal(res1, expected1) - res2 = dta.fillna(method="backfill") + res2 = dta.pad_or_backfill(method="backfill") expected2 = dta.copy() expected2 = dta.copy() expected2[1, 0] = dta[2, 0] @@ -529,10 +529,10 @@ def test_fillna_2d(self): assert not dta2._ndarray.flags["C_CONTIGUOUS"] tm.assert_extension_array_equal(dta, dta2) - res3 = dta2.fillna(method="pad") + 
res3 = dta2.pad_or_backfill(method="pad") tm.assert_extension_array_equal(res3, expected1) - res4 = dta2.fillna(method="backfill") + res4 = dta2.pad_or_backfill(method="backfill") tm.assert_extension_array_equal(res4, expected2) # test the DataFrame method while we're here diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index b9706f87ab7d3..9dcce28f47e52 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -155,16 +155,14 @@ def test_concat_2d(self, data): @pytest.mark.parametrize("method", ["backfill", "pad"]) def test_fillna_2d_method(self, data_missing, method): + # pad_or_backfill is always along axis=0 arr = data_missing.repeat(2).reshape(2, 2) assert arr[0].isna().all() assert not arr[1].isna().any() - try: - result = arr.pad_or_backfill(method=method, limit=None) - except AttributeError: - result = arr.fillna(method=method, limit=None) + result = arr.pad_or_backfill(method=method, limit=None) - expected = data_missing.fillna(method=method).repeat(2).reshape(2, 2) + expected = data_missing.pad_or_backfill(method=method).repeat(2).reshape(2, 2) self.assert_extension_array_equal(result, expected) # Reverse so that backfill is not a no-op. 
@@ -172,12 +170,11 @@ def test_fillna_2d_method(self, data_missing, method): assert not arr2[0].isna().any() assert arr2[1].isna().all() - try: - result2 = arr2.pad_or_backfill(method=method, limit=None) - except AttributeError: - result2 = arr2.fillna(method=method, limit=None) + result2 = arr2.pad_or_backfill(method=method, limit=None) - expected2 = data_missing[::-1].fillna(method=method).repeat(2).reshape(2, 2) + expected2 = ( + data_missing[::-1].pad_or_backfill(method=method).repeat(2).reshape(2, 2) + ) self.assert_extension_array_equal(result2, expected2) @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 43f37a020df3f..a839a9d327f95 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -95,7 +95,7 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - result = data.fillna(method="backfill") + result = data.pad_or_backfill(method="backfill") assert result is not data self.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 5762bc9ce485c..d24b70a884c45 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -281,6 +281,8 @@ def convert_values(param): def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) + # We override fillna here to simulate a 3rd party EA that has done so. This + # lets us test the deprecation telling authors to implement pad_or_backfill # Simulate a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. 
# error: Signature of "fillna" incompatible with supertype "ExtensionArray" diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 4f0ff427dd900..6feac7fb9d9dc 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -105,7 +105,7 @@ def test_fillna_frame(self, data_missing): super().test_fillna_frame(data_missing) def test_fillna_limit_pad(self, data_missing): - msg = "ExtensionArray.fillna added a 'copy' keyword" + msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( FutureWarning, match=msg, check_stacklevel=False ): @@ -123,6 +123,13 @@ def test_fillna_limit_backfill(self, data_missing): ): super().test_fillna_limit_backfill(data_missing) + def test_fillna_no_op_returns_copy(self, data): + msg = "ExtensionArray.fillna 'method' keyword is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_no_op_returns_copy(data) + def test_fillna_series(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( @@ -131,7 +138,7 @@ def test_fillna_series(self, data_missing): super().test_fillna_series(data_missing) def test_fillna_series_method(self, data_missing, fillna_method): - msg = "ExtensionArray.fillna added a 'copy' keyword" + msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( FutureWarning, match=msg, check_stacklevel=False ):