From 719b1946e76bbb796efc563be11faff8845563ec Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 4 Jan 2022 12:01:07 -0800 Subject: [PATCH] POC/API/DEPR: errors kwd for fillna --- pandas/core/frame.py | 12 ++++ pandas/core/generic.py | 35 +++++++-- pandas/core/internals/array_manager.py | 9 ++- pandas/core/internals/blocks.py | 82 ++++++++++++++++++---- pandas/core/internals/managers.py | 9 ++- pandas/core/series.py | 12 ++++ pandas/tests/frame/methods/test_fillna.py | 2 +- pandas/tests/series/methods/test_fillna.py | 12 ++-- 8 files changed, 142 insertions(+), 31 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a39c1b0bf43f2..0cff3bf595101 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5097,6 +5097,7 @@ def fillna( inplace: Literal[False] = ..., limit=..., downcast=..., + errors=..., ) -> DataFrame: ... @@ -5109,6 +5110,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5119,6 +5121,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5130,6 +5133,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5141,6 +5145,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5152,6 +5157,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5164,6 +5170,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5176,6 +5183,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5188,6 +5196,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -5200,6 +5209,7 @@ def fillna( inplace: bool = ..., limit=..., downcast=..., + errors=..., ) -> DataFrame | None: ... @@ -5213,6 +5223,7 @@ def fillna( inplace: bool = False, limit=None, downcast=None, + errors=lib.no_default, ) -> DataFrame | None: return super().fillna( value=value, @@ -5221,6 +5232,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, + errors=errors, ) def pop(self, item: Hashable) -> Series: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1e25b0f4eb176..021a041f2eecb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6278,6 +6278,7 @@ def fillna( inplace: bool_t = False, limit=None, downcast=None, + errors=lib.no_default, ) -> NDFrameT | None: """ Fill NA/NaN values using the specified method. @@ -6311,6 +6312,14 @@ def fillna( A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + errors : {{'raise', 'coerce'}} + If the given value cannot be filled into an array with this dtype, + do we raise or coerce to a common dtype? + Default depends on dtype for backward compatibility. For most dtypes, + the default is to coerce. In a future version, the default will be + to coerce for all dtypes. + + .. versionadded:: 1.4.0 Returns ------- @@ -6390,6 +6399,8 @@ def fillna( """ inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) + if errors not in ["raise", "coerce", lib.no_default]: + raise ValueError("'errors' must be either 'raise' or 'coerce'") self._consolidate_inplace() @@ -6403,7 +6414,7 @@ def fillna( if not self._mgr.is_single_block and axis == 1: if inplace: raise NotImplementedError() - result = self.T.fillna(method=method, limit=limit).T + result = self.T.fillna(method=method, limit=limit, errors=errors).T return result @@ -6413,7 +6424,7 @@ def fillna( limit=limit, inplace=inplace, coerce=True, - downcast=downcast, + downcast=downcast, # TODO: errors ) else: if self.ndim == 1: @@ -6438,7 +6449,11 @@ def fillna( ) new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + errors=errors, ) elif isinstance(value, (dict, ABCSeries)): @@ -6455,23 +6470,29 @@ def fillna( if k not in result: continue downcast_k = downcast if not is_dict else downcast.get(k) - result[k] = result[k].fillna(v, limit=limit, downcast=downcast_k) + result[k] = result[k].fillna( + v, limit=limit, downcast=downcast_k, errors=errors + ) return result if not inplace else None elif not is_list_like(value): if not self._mgr.is_single_block and axis == 1: - result = self.T.fillna(value=value, limit=limit).T + result = self.T.fillna(value=value, limit=limit, errors=errors).T new_data = result else: new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + errors=errors, ) elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value)._mgr + new_data = self.where(self.notna(), value)._mgr # TODO: errors else: raise ValueError(f"invalid fill value with a {type(value)}") diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index ec3a9e8b493e3..ab38097a215d8 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -382,9 +382,14 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: "shift", periods=periods, axis=axis, fill_value=fill_value ) - def fillna(self: T, value, limit, inplace: bool, downcast) -> T: + def fillna(self: T, value, limit, inplace: bool, downcast, errors) -> T: return self.apply_with_block( - "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + errors=errors, ) def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f242226739ec..9b71c04b855ba 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -434,7 +434,12 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] def fillna( - self, value, limit=None, inplace: bool = False, downcast=None + self, + value, + limit=None, + inplace: bool = False, + downcast=None, + errors=lib.no_default, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to @@ -455,7 +460,7 @@ def fillna( else: return [self.copy()] - if self._can_hold_element(value): + if self._can_hold_element(value) or errors == "raise": nb = self if inplace else self.copy() putmask_inplace(nb.values, mask, value) return nb._maybe_downcast([nb], downcast) @@ -1673,9 +1678,35 @@ def getitem_block_index(self, slicer: slice) -> ExtensionBlock: return type(self)(new_values, self._mgr_locs, ndim=self.ndim) def fillna( - self, value, limit=None, inplace: bool = False, downcast=None + self, + value, + limit=None, + inplace: bool = False, + downcast=None, + errors=lib.no_default, ) -> list[Block]: - values = self.values.fillna(value=value, limit=limit) + try: + values = self.values.fillna(value=value, limit=limit) + except (ValueError, TypeError): + if errors is lib.no_default: + warnings.warn( + f"The default behavior of fillna with {self.dtype} is " + "deprecated. In a future version, when the specified value " + "cannot be held in an array, the value and array will be " + "coerced to a common dtype. This is consistent with the " + "behavior with numpy dtypes. To retain the old behavior, " + "pass `errors='raise'` to obj.fillna. To get the future " + "behavior, pass `errors='coerce'`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + errors = "raise" + if errors == "coerce": + blk = self.coerce_to_target_dtype(value) + return blk.fillna(value, limit=limit, inplace=True, downcast=downcast) + else: + raise + return [self.make_block_same_class(values=values)] def diff(self, n: int, axis: int = 1) -> list[Block]: @@ -1814,18 +1845,43 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo return [self.make_block_same_class(new_values)] def fillna( - self, value, limit=None, inplace: bool = False, downcast=None + self, + value, + limit=None, + inplace: bool = False, + downcast=None, + errors=lib.no_default, ) -> list[Block]: - if not self._can_hold_element(value) and self.dtype.kind != "m": - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - # TODO: don't special-case td64 - return self.coerce_to_target_dtype(value).fillna( - value, limit, inplace, downcast - ) + try: + new_values = self.values.fillna(value=value, limit=limit) + except (ValueError, TypeError) as err: + if isinstance(err, ValueError) and "Timezones don't match" not in str(err): + # TODO(2.0): remove catching ValueError at all since + # DTA raising here is deprecated + raise + + if errors is lib.no_default and self.dtype.kind == "m": + # Deprecating special-casing of td64 + warnings.warn( + f"The default behavior of fillna with {self.dtype} is " + "deprecated. In a future version, when the specified value " + "cannot be held in an array, the value and array will be " + "coerced to a common dtype. This is consistent with the " + "behavior with numpy dtypes. To retain the old behavior, " + "pass `errors='raise'` to obj.fillna. To get the future " + "behavior, pass `errors='coerce'`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + raise + elif errors == "coerce" or errors is lib.no_default: + # for non-td64, the default is already to coerce. + blk = self.coerce_to_target_dtype(value) + return blk.fillna(value, limit, inplace, downcast) + else: + raise - new_values = self.values.fillna(value=value, limit=limit) return [self.make_block_same_class(values=new_values)] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3e0b62da64f42..9b07450df34c7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -410,9 +410,14 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) - def fillna(self: T, value, limit, inplace: bool, downcast) -> T: + def fillna(self: T, value, limit, inplace: bool, downcast, errors) -> T: return self.apply( - "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + errors=errors, ) def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: diff --git a/pandas/core/series.py b/pandas/core/series.py index 81b901b13a42b..8e45d35bd2cc0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4765,6 +4765,7 @@ def fillna( inplace: Literal[False] = ..., limit=..., downcast=..., + errors=..., ) -> Series: ... @@ -4777,6 +4778,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4787,6 +4789,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4798,6 +4801,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4809,6 +4813,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4820,6 +4825,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4832,6 +4838,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4844,6 +4851,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4856,6 +4864,7 @@ def fillna( inplace: Literal[True], limit=..., downcast=..., + errors=..., ) -> None: ... @@ -4868,6 +4877,7 @@ def fillna( inplace: bool = ..., limit=..., downcast=..., + errors=..., ) -> Series | None: ... @@ -4882,6 +4892,7 @@ def fillna( inplace=False, limit=None, downcast=None, + errors=lib.no_default, ) -> Series | None: return super().fillna( value=value, @@ -4890,6 +4901,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, + errors=errors, ) def pop(self, item: Hashable) -> Any: diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b5bdf6a70199c..a463efa0f3fa7 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -172,7 +172,7 @@ def test_na_actions_categorical(self): msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(TypeError, match=msg): - df.fillna(value={"cats": 4, "vals": "c"}) + df.fillna(value={"cats": 4, "vals": "c"}, errors="raise") res = df.fillna(method="pad") tm.assert_frame_equal(res, df_exp_fill) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 19f61a0a2d6fc..c5a67d9e82b73 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -254,7 +254,7 @@ def test_timedelta_fillna(self, frame_or_series): # interpreted as seconds, no longer supported msg = "value should be a 'Timedelta', 'NaT', or array of those. Got 'int'" with pytest.raises(TypeError, match=msg): - obj.fillna(1) + obj.fillna(1, errors="raise") result = obj.fillna(Timedelta(seconds=1)) expected = Series( @@ -723,29 +723,29 @@ def test_fillna_categorical_raises(self): msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(TypeError, match=msg): - ser.fillna("d") + ser.fillna("d", errors="raise") msg2 = "Length of 'value' does not match." with pytest.raises(ValueError, match=msg2): cat.fillna(Series("d")) with pytest.raises(TypeError, match=msg): - ser.fillna({1: "d", 3: "a"}) + ser.fillna({1: "d", 3: "a"}, errors="raise") msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): - ser.fillna(["a", "b"]) + ser.fillna(["a", "b"], errors="raise") msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): - ser.fillna(("a", "b")) + ser.fillna(("a", "b"), errors="raise") msg = ( '"value" parameter must be a scalar, dict ' 'or Series, but you passed a "DataFrame"' ) with pytest.raises(TypeError, match=msg): - ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) + ser.fillna(DataFrame({1: ["a"], 3: ["b"]}), errors="raise") @pytest.mark.parametrize("dtype", [float, "float32", "float64"]) @pytest.mark.parametrize("fill_type", tm.ALL_REAL_NUMPY_DTYPES)