From 1ae8dccbff69f6b99c8127bb0b3ec10e1296c215 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 30 Mar 2024 10:14:16 -0400 Subject: [PATCH 1/4] ENH: Enable fillna(value=None) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/arrays/_mixins.py | 5 +- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/interval.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/sparse/array.py | 4 +- pandas/core/generic.py | 169 +++++++++--------- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/tests/extension/base/missing.py | 6 + .../tests/extension/decimal/test_decimal.py | 8 + pandas/tests/extension/json/test_json.py | 7 + pandas/tests/frame/methods/test_fillna.py | 18 +- 13 files changed, 128 insertions(+), 101 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 74a19472ec835..c0c0c88ef9648 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,7 +35,7 @@ Other enhancements - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) -- +- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 7f4e6f6666382..370fdb0deb23b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -335,7 +335,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" @@ -354,8 +354,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel new_values[mask] = value else: # We validate the fill_value even if there is nothing to fill - if value is not None: - self._validate_setitem_value(value) + self._validate_setitem_value(value) if not copy: new_values = self[:] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 84b62563605ac..8f148e2e91551 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1077,7 +1077,7 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna( self, - value: object | ArrayLike | None = None, + value: object | ArrayLike, limit: int | None = None, copy: bool = True, ) -> Self: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index af666a591b1bc..86f58b48ea3be 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -892,7 +892,7 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ Fill NA/NaN values using the specified method. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d20d7f98b8aa8..190888d281ea9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -236,7 +236,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask value = missing.check_value_size(value, mask, len(self)) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index bdcb3219a9875..398fd795aee8a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -716,7 +716,7 @@ def isna(self) -> Self: # type: ignore[override] def fillna( self, - value=None, + value, limit: int | None = None, copy: bool = True, ) -> Self: @@ -746,8 +746,6 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if value is None: - raise ValueError("Must specify 'value'.") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 858d2ba82a969..eed9adf7b59cd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6786,7 +6786,7 @@ def fillna( ) def fillna( self, - value: Hashable | Mapping | Series | DataFrame | None = None, + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = None, inplace: bool = False, @@ -6814,6 +6814,10 @@ def fillna( This is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. + Notes + ----- + For non-object dtype, ``value=None`` will use the NA value of the dtype. 
+ Returns ------- {klass} or None @@ -6909,101 +6913,92 @@ def fillna( axis = 0 axis = self._get_axis_number(axis) - if value is None: - raise ValueError("Must specify a fill 'value'.") - else: - if self.ndim == 1: - if isinstance(value, (dict, ABCSeries)): - if not len(value): - # test_fillna_nonscalar - if inplace: - return None - return self.copy(deep=False) - from pandas import Series + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + if not len(value): + # test_fillna_nonscalar + if inplace: + return None + return self.copy(deep=False) + from pandas import Series - value = Series(value) - value = value.reindex(self.index) - value = value._values - elif not is_list_like(value): - pass - else: - raise TypeError( - '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - f'"{type(value).__name__}"' - ) + value = Series(value) + value = value.reindex(self.index) + value = value._values + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' + ) - new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) - elif isinstance(value, (dict, ABCSeries)): - if axis == 1: - raise NotImplementedError( - "Currently only can fill " - "with dict/Series column " - "by column" - ) - result = self if inplace else self.copy(deep=False) - for k, v in value.items(): - if k not in result: - continue + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill " "with dict/Series column " "by column" + ) + result = self if inplace else self.copy(deep=False) + for k, v in value.items(): + if k not in result: + continue - res_k = result[k].fillna(v, limit=limit) + res_k = result[k].fillna(v, limit=limit) - if not inplace: - result[k] = res_k + if not inplace: + result[k] = res_k + else: + # We 
can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k else: - # We can write into our existing column(s) iff dtype - # was preserved. - if isinstance(res_k, ABCSeries): - # i.e. 'k' only shows up once in self.columns - if res_k.dtype == result[k].dtype: - result.loc[:, k] = res_k + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = np.arange(self.shape[1])[locs] + elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b": + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc else: - # Different dtype -> no way to do inplace. 
- result[k] = res_k - else: - # see test_fillna_dict_inplace_nonunique_columns - locs = result.columns.get_loc(k) - if isinstance(locs, slice): - locs = np.arange(self.shape[1])[locs] - elif ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "b" - ): - locs = locs.nonzero()[0] - elif not ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "i" - ): - # Should never be reached, but let's cover our bases - raise NotImplementedError( - "Unexpected get_loc result, please report a bug at " - "https://github.com/pandas-dev/pandas" - ) - - for i, loc in enumerate(locs): - res_loc = res_k.iloc[:, i] - target = self.iloc[:, loc] - - if res_loc.dtype == target.dtype: - result.iloc[:, loc] = res_loc - else: - result.isetitem(loc, res_loc) - if inplace: - return self._update_inplace(result) - else: - return result + result.isetitem(loc, res_loc) + if inplace: + return self._update_inplace(result) + else: + return result - elif not is_list_like(value): - if axis == 1: - result = self.T.fillna(value=value, limit=limit).T - new_data = result._mgr - else: - new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace - ) - elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value)._mgr + elif not is_list_like(value): + if axis == 1: + result = self.T.fillna(value=value, limit=limit).T + new_data = result._mgr else: - raise ValueError(f"invalid fill value with a {type(value)}") + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value)._mgr + else: + raise ValueError(f"invalid fill value with a {type(value)}") result = self._constructor_from_mgr(new_data, axes=new_data.axes) if inplace: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 30cf6f0b866ee..5d26bcaa63ecb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2543,7 +2543,7 @@ def notna(self) -> 
npt.NDArray[np.bool_]: notnull = notna - def fillna(self, value=None): + def fillna(self, value): """ Fill NA/NaN values with the specified value. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2e554bc848ffe..3be17d28d931c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1681,7 +1681,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value=None, downcast=None): + def fillna(self, value, downcast=None): """ fillna is not implemented for MultiIndex """ diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 328c6cd6164fb..4b9234a9904a2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -68,6 +68,12 @@ def test_fillna_scalar(self, data_missing): expected = data_missing.fillna(valid) tm.assert_extension_array_equal(result, expected) + def test_fillna_with_none(self, data_missing): + # GH#57723 + result = data_missing.fillna(None) + expected = data_missing + tm.assert_extension_array_equal(result, expected) + def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) result = pd.Series(arr).ffill(limit=2) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a2721908e858f..504bafc145108 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -144,6 +144,14 @@ def test_fillna_series(self, data_missing): ): super().test_fillna_series(data_missing) + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. 
+ msg = "conversion from NoneType to Decimal is not supported" + with pytest.raises(TypeError, match=msg): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 6ecbf2063f203..22ac9627f6cda 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,13 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. + with pytest.raises(AssertionError): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 81f66cfd48b0a..77df2ee67ce75 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -64,8 +64,8 @@ def test_fillna_datetime(self, datetime_frame): padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] ).all() - msg = "Must specify a fill 'value'" - with pytest.raises(ValueError, match=msg): + msg = r"NDFrame.fillna\(\) missing 1 required positional argument: 'value'" + with pytest.raises(TypeError, match=msg): datetime_frame.fillna() @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @@ -779,3 +779,17 @@ def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): expected = DataFrame(expected_data) result = getattr(df, method)(**kwargs) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_frame", [True, False]) +@pytest.mark.parametrize("dtype", 
["float", "object"]) +def test_fillna_with_none_object(test_frame, dtype): + # GH#57723 + obj = Series([1, np.nan, 3], dtype=dtype) + if test_frame: + obj = obj.to_frame() + result = obj.fillna(value=None) + expected = Series([1, None, 3], dtype=dtype) + if test_frame: + expected = expected.to_frame() + tm.assert_equal(result, expected) From 0f171fac73fe7de07192d3314c3cb250e4846955 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 31 Mar 2024 07:45:11 -0400 Subject: [PATCH 2/4] fixups --- pandas/core/generic.py | 14 +++++++------- pandas/tests/frame/methods/test_fillna.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eed9adf7b59cd..7030a966ba08c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6752,7 +6752,7 @@ def _pad_or_backfill( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[False] = ..., @@ -6762,7 +6762,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[True], @@ -6772,7 +6772,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: bool = ..., @@ -6814,10 +6814,6 @@ def fillna( This is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - Notes - ----- - For non-object dtype, ``value=None`` will use the NA value of the dtype. - Returns ------- {klass} or None @@ -6831,6 +6827,10 @@ def fillna( reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. + Notes + ----- + For non-object dtype, ``value=None`` will use the NA value of the dtype. 
+ Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 77df2ee67ce75..c89d0c8a0fc62 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -64,7 +64,7 @@ def test_fillna_datetime(self, datetime_frame): padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] ).all() - msg = r"NDFrame.fillna\(\) missing 1 required positional argument: 'value'" + msg = r"missing 1 required positional argument: 'value'" with pytest.raises(TypeError, match=msg): datetime_frame.fillna() From 80faa789e4579abfed51a79026a13452d85c99e4 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 31 Mar 2024 09:20:48 -0400 Subject: [PATCH 3/4] fixup --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7030a966ba08c..84bbc0961e846 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6939,7 +6939,7 @@ def fillna( elif isinstance(value, (dict, ABCSeries)): if axis == 1: raise NotImplementedError( - "Currently only can fill " "with dict/Series column " "by column" + "Currently only can fill with dict/Series column by column" ) result = self if inplace else self.copy(deep=False) for k, v in value.items(): From 9c8bdf9cfe721ade2fd2ea79cab58695087644f2 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 12 Apr 2024 20:51:49 -0400 Subject: [PATCH 4/4] Improve docs --- doc/source/user_guide/missing_data.rst | 21 +++++++++++++++++++++ pandas/core/generic.py | 2 ++ 2 files changed, 23 insertions(+) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e104ac06f9f4..5149bd30dbbef 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -386,6 +386,27 @@ Replace NA with a scalar value df df.fillna(0) +When the data has object dtype, you can control what type of NA 
values are present. + +.. ipython:: python + + df = pd.DataFrame({"a": [pd.NA, np.nan, None]}, dtype=object) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + +However, when the dtype is not object, these will all be replaced with the proper NA value for the dtype. + +.. ipython:: python + + data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")} + df = pd.DataFrame(data) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + Fill gaps forward or backward .. ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7227cfa7615d3..523ca9de201bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6830,6 +6830,8 @@ def fillna( Notes ----- For non-object dtype, ``value=None`` will use the NA value of the dtype. + See more details in the :ref:`Filling missing data<missing_data.fillna>` + section. Examples --------