ENH: Enable fillna(value=None) (pandas-dev#58085)

rhshadrach · pmhatre1 · commit 218c9e84dfbe · 2024-05-06T23:13:44.000-07:00
* ENH: Enable fillna(value=None)

* fixups

* fixup

* Improve docs
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -386,6 +386,27 @@ Replace NA with a scalar value
    df
    df.fillna(0)
 
+When the data has object dtype, you can control what type of NA values are present.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": [pd.NA, np.nan, None]}, dtype=object)
+   df
+   df.fillna(None)
+   df.fillna(np.nan)
+   df.fillna(pd.NA)
+
+However when the dtype is not object, these will all be replaced with the proper NA value for the dtype.
+
+.. ipython:: python
+
+   data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")}
+   df = pd.DataFrame(data)
+   df
+   df.fillna(None)
+   df.fillna(np.nan)
+   df.fillna(pd.NA)
+
 Fill gaps forward or backward
 
 .. ipython:: python
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
+- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -328,7 +328,7 @@ def _pad_or_backfill(
         return new_values
 
     @doc(ExtensionArray.fillna)
-    def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self:
+    def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
         mask = self.isna()
         # error: Argument 2 to "check_value_size" has incompatible type
         # "ExtensionArray"; expected "ndarray"
@@ -347,8 +347,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel
             new_values[mask] = value
         else:
             # We validate the fill_value even if there is nothing to fill
-            if value is not None:
-                self._validate_setitem_value(value)
+            self._validate_setitem_value(value)
 
             if not copy:
                 new_values = self[:]
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1077,7 +1077,7 @@ def _pad_or_backfill(
     @doc(ExtensionArray.fillna)
     def fillna(
         self,
-        value: object | ArrayLike | None = None,
+        value: object | ArrayLike,
         limit: int | None = None,
         copy: bool = True,
     ) -> Self:
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -892,7 +892,7 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr
         indexer = obj.argsort()[-1]
         return obj[indexer]
 
-    def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self:
+    def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
         """
         Fill NA/NaN values using the specified method.
 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -236,7 +236,7 @@ def _pad_or_backfill(
         return new_values
 
     @doc(ExtensionArray.fillna)
-    def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self:
+    def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
         mask = self._mask
 
         value = missing.check_value_size(value, mask, len(self))
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -706,7 +706,7 @@ def isna(self) -> Self:  # type: ignore[override]
 
     def fillna(
         self,
-        value=None,
+        value,
         limit: int | None = None,
         copy: bool = True,
     ) -> Self:
@@ -736,8 +736,6 @@ def fillna(
         When ``self.fill_value`` is not NA, the result dtype will be
         ``self.dtype``. Again, this preserves the amount of memory used.
         """
-        if value is None:
-            raise ValueError("Must specify 'value'.")
         new_values = np.where(isna(self.sp_values), value, self.sp_values)
 
         if self._null_fill_value:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6752,7 +6752,7 @@ def _pad_or_backfill(
     @overload
     def fillna(
         self,
-        value: Hashable | Mapping | Series | DataFrame = ...,
+        value: Hashable | Mapping | Series | DataFrame,
         *,
         axis: Axis | None = ...,
         inplace: Literal[False] = ...,
@@ -6762,7 +6762,7 @@ def fillna(
     @overload
     def fillna(
         self,
-        value: Hashable | Mapping | Series | DataFrame = ...,
+        value: Hashable | Mapping | Series | DataFrame,
         *,
         axis: Axis | None = ...,
         inplace: Literal[True],
@@ -6772,7 +6772,7 @@ def fillna(
     @overload
     def fillna(
         self,
-        value: Hashable | Mapping | Series | DataFrame = ...,
+        value: Hashable | Mapping | Series | DataFrame,
         *,
         axis: Axis | None = ...,
         inplace: bool = ...,
@@ -6786,7 +6786,7 @@ def fillna(
     )
     def fillna(
         self,
-        value: Hashable | Mapping | Series | DataFrame | None = None,
+        value: Hashable | Mapping | Series | DataFrame,
         *,
         axis: Axis | None = None,
         inplace: bool = False,
@@ -6827,6 +6827,12 @@ def fillna(
         reindex : Conform object to new index.
         asfreq : Convert TimeSeries to specified frequency.
 
+        Notes
+        -----
+        For non-object dtype, ``value=None`` will use the NA value of the dtype.
+        See more details in the :ref:`Filling missing data<missing_data.fillna>`
+        section.
+
         Examples
         --------
         >>> df = pd.DataFrame(
@@ -6909,101 +6915,92 @@ def fillna(
             axis = 0
         axis = self._get_axis_number(axis)
 
-        if value is None:
-            raise ValueError("Must specify a fill 'value'.")
-        else:
-            if self.ndim == 1:
-                if isinstance(value, (dict, ABCSeries)):
-                    if not len(value):
-                        # test_fillna_nonscalar
-                        if inplace:
-                            return None
-                        return self.copy(deep=False)
-                    from pandas import Series
-
-                    value = Series(value)
-                    value = value.reindex(self.index)
-                    value = value._values
-                elif not is_list_like(value):
-                    pass
-                else:
-                    raise TypeError(
-                        '"value" parameter must be a scalar, dict '
-                        "or Series, but you passed a "
-                        f'"{type(value).__name__}"'
-                    )
+        if self.ndim == 1:
+            if isinstance(value, (dict, ABCSeries)):
+                if not len(value):
+                    # test_fillna_nonscalar
+                    if inplace:
+                        return None
+                    return self.copy(deep=False)
+                from pandas import Series
+
+                value = Series(value)
+                value = value.reindex(self.index)
+                value = value._values
+            elif not is_list_like(value):
+                pass
+            else:
+                raise TypeError(
+                    '"value" parameter must be a scalar, dict '
+                    "or Series, but you passed a "
+                    f'"{type(value).__name__}"'
+                )
 
-                new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
+            new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
 
-            elif isinstance(value, (dict, ABCSeries)):
-                if axis == 1:
-                    raise NotImplementedError(
-                        "Currently only can fill "
-                        "with dict/Series column "
-                        "by column"
-                    )
-                result = self if inplace else self.copy(deep=False)
-                for k, v in value.items():
-                    if k not in result:
-                        continue
+        elif isinstance(value, (dict, ABCSeries)):
+            if axis == 1:
+                raise NotImplementedError(
+                    "Currently only can fill with dict/Series column by column"
+                )
+            result = self if inplace else self.copy(deep=False)
+            for k, v in value.items():
+                if k not in result:
+                    continue
 
-                    res_k = result[k].fillna(v, limit=limit)
+                res_k = result[k].fillna(v, limit=limit)
 
-                    if not inplace:
-                        result[k] = res_k
+                if not inplace:
+                    result[k] = res_k
+                else:
+                    # We can write into our existing column(s) iff dtype
+                    #  was preserved.
+                    if isinstance(res_k, ABCSeries):
+                        # i.e. 'k' only shows up once in self.columns
+                        if res_k.dtype == result[k].dtype:
+                            result.loc[:, k] = res_k
+                        else:
+                            # Different dtype -> no way to do inplace.
+                            result[k] = res_k
                     else:
-                        # We can write into our existing column(s) iff dtype
-                        #  was preserved.
-                        if isinstance(res_k, ABCSeries):
-                            # i.e. 'k' only shows up once in self.columns
-                            if res_k.dtype == result[k].dtype:
-                                result.loc[:, k] = res_k
+                        # see test_fillna_dict_inplace_nonunique_columns
+                        locs = result.columns.get_loc(k)
+                        if isinstance(locs, slice):
+                            locs = np.arange(self.shape[1])[locs]
+                        elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b":
+                            locs = locs.nonzero()[0]
+                        elif not (
+                            isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
+                        ):
+                            # Should never be reached, but let's cover our bases
+                            raise NotImplementedError(
+                                "Unexpected get_loc result, please report a bug at "
+                                "https://github.com/pandas-dev/pandas"
+                            )
+
+                        for i, loc in enumerate(locs):
+                            res_loc = res_k.iloc[:, i]
+                            target = self.iloc[:, loc]
+
+                            if res_loc.dtype == target.dtype:
+                                result.iloc[:, loc] = res_loc
                             else:
-                                # Different dtype -> no way to do inplace.
-                                result[k] = res_k
-                        else:
-                            # see test_fillna_dict_inplace_nonunique_columns
-                            locs = result.columns.get_loc(k)
-                            if isinstance(locs, slice):
-                                locs = np.arange(self.shape[1])[locs]
-                            elif (
-                                isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
-                            ):
-                                locs = locs.nonzero()[0]
-                            elif not (
-                                isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
-                            ):
-                                # Should never be reached, but let's cover our bases
-                                raise NotImplementedError(
-                                    "Unexpected get_loc result, please report a bug at "
-                                    "https://github.com/pandas-dev/pandas"
-                                )
-
-                            for i, loc in enumerate(locs):
-                                res_loc = res_k.iloc[:, i]
-                                target = self.iloc[:, loc]
-
-                                if res_loc.dtype == target.dtype:
-                                    result.iloc[:, loc] = res_loc
-                                else:
-                                    result.isetitem(loc, res_loc)
-                if inplace:
-                    return self._update_inplace(result)
-                else:
-                    return result
+                                result.isetitem(loc, res_loc)
+            if inplace:
+                return self._update_inplace(result)
+            else:
+                return result
 
-            elif not is_list_like(value):
-                if axis == 1:
-                    result = self.T.fillna(value=value, limit=limit).T
-                    new_data = result._mgr
-                else:
-                    new_data = self._mgr.fillna(
-                        value=value, limit=limit, inplace=inplace
-                    )
-            elif isinstance(value, ABCDataFrame) and self.ndim == 2:
-                new_data = self.where(self.notna(), value)._mgr
+        elif not is_list_like(value):
+            if axis == 1:
+                result = self.T.fillna(value=value, limit=limit).T
+                new_data = result._mgr
             else:
-                raise ValueError(f"invalid fill value with a {type(value)}")
+                new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
+        elif isinstance(value, ABCDataFrame) and self.ndim == 2:
+            new_data = self.where(self.notna(), value)._mgr
+        else:
+            raise ValueError(f"invalid fill value with a {type(value)}")
 
         result = self._constructor_from_mgr(new_data, axes=new_data.axes)
         if inplace:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2543,7 +2543,7 @@ def notna(self) -> npt.NDArray[np.bool_]:
 
     notnull = notna
 
-    def fillna(self, value=None):
+    def fillna(self, value):
         """
         Fill NA/NaN values with the specified value.
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1675,7 +1675,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
     # (previously declared in base class "IndexOpsMixin")
     _duplicated = duplicated  # type: ignore[misc]
 
-    def fillna(self, value=None, downcast=None):
+    def fillna(self, value, downcast=None):
         """
         fillna is not implemented for MultiIndex
         """
diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py
@@ -68,6 +68,12 @@ def test_fillna_scalar(self, data_missing):
         expected = data_missing.fillna(valid)
         tm.assert_extension_array_equal(result, expected)
 
+    def test_fillna_with_none(self, data_missing):
+        # GH#57723
+        result = data_missing.fillna(None)
+        expected = data_missing
+        tm.assert_extension_array_equal(result, expected)
+
     def test_fillna_limit_pad(self, data_missing):
         arr = data_missing.take([1, 0, 0, 0, 1])
         result = pd.Series(arr).ffill(limit=2)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -144,6 +144,14 @@ def test_fillna_series(self, data_missing):
         ):
             super().test_fillna_series(data_missing)
 
+    def test_fillna_with_none(self, data_missing):
+        # GH#57723
+        # EAs that don't have special logic for None will raise, unlike pandas'
+        # which interpret None as the NA value for the dtype.
+        msg = "conversion from NoneType to Decimal is not supported"
+        with pytest.raises(TypeError, match=msg):
+            super().test_fillna_with_none(data_missing)
+
     @pytest.mark.parametrize("dropna", [True, False])
     def test_value_counts(self, all_data, dropna):
         all_data = all_data[:10]
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -149,6 +149,13 @@ def test_fillna_frame(self):
         """We treat dictionaries as a mapping in fillna, not a scalar."""
         super().test_fillna_frame()
 
+    def test_fillna_with_none(self, data_missing):
+        # GH#57723
+        # EAs that don't have special logic for None will raise, unlike pandas'
+        # which interpret None as the NA value for the dtype.
+        with pytest.raises(AssertionError):
+            super().test_fillna_with_none(data_missing)
+
     @pytest.mark.parametrize(
         "limit_area, input_ilocs, expected_ilocs",
         [
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py