diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index e0e752099b77a..4a2aa565dd15c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -401,7 +401,7 @@ Limit the number of NA values filled df.ffill(limit=1) -NA values can be replaced with corresponding value from a :class:`Series`` or :class:`DataFrame`` +NA values can be replaced with corresponding value from a :class:`Series` or :class:`DataFrame` where the index and column aligns between the original object and the filled object. .. ipython:: python @@ -660,7 +660,7 @@ Pass a list of regular expressions that will replace matches with a scalar. .. ipython:: python - df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -669,7 +669,7 @@ dictionary. .. ipython:: python - df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder") .. note:: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 54e855f61905a..dec6638a239a4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -162,7 +162,8 @@ Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`) +- Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`) +- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6399f85723ae5..57833c1d626ee 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -761,7 +761,23 @@ def replace( if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False, using_cow=using_cow) + if get_option("future.no_silent_downcasting") is True: + blocks = [blk] + else: + blocks = blk.convert(copy=False, using_cow=using_cow) + if len(blocks) > 1 or blocks[0].dtype != blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) else: blocks = [blk] return blocks @@ -836,7 +852,21 @@ def _replace_regex( replace_regex(block.values, rx, value, mask) - return block.convert(copy=False, using_cow=using_cow) + nbs = block.convert(copy=False, using_cow=using_cow) + opt = get_option("future.no_silent_downcasting") + if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call `result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return nbs @final def replace_list( @@ -902,6 +932,7 @@ def replace_list( else: rb = [self if inplace else self.copy()] + opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -939,14 +970,33 @@ def replace_list( b.refs.referenced_blocks.index(ref) ) - if convert and blk.is_object and not all(x is None for x in dest_list): + if ( + not opt + and convert + and blk.is_object + and not all(x is None for x in dest_list) + ): # GH#44498 avoid unwanted cast-back - result = extend_blocks( - [ - b.convert(copy=True and not using_cow, using_cow=using_cow) - for b in result - ] - ) + nbs = [] + for res_blk in result: + converted = res_blk.convert( + copy=True and not using_cow, using_cow=using_cow + ) + if len(converted) > 1 or converted[0].dtype != res_blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated " + "and will be removed in a future version. To " + "retain the old behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + nbs.extend(converted) + result = nbs new_rb.extend(result) rb = new_rb return rb diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 61e44b4e24c08..f07c53060a06b 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -289,7 +289,9 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) - result = df.replace({"Type": {"Q": 0, "T": 1}}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): @@ -301,16 +303,20 @@ def test_regex_replace_list_to_scalar(self, mix_abc): "c": [np.nan, np.nan, np.nan, "d"], } ) - res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - return_value = res2.replace( - [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) assert return_value is None - return_value = res3.replace( - regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -520,7 +526,9 @@ def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) m = {"foo": 1, "bar": 2, "bah": 3} - rep = df.replace(m) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes tm.assert_series_equal(expec, res) @@ -838,7 +846,12 @@ def test_replace_for_new_dtypes(self, datetime_frame): ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = frame.replace(to_replace, value) + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): @@ -927,7 +940,9 @@ def test_replace_dict_no_regex(self): "Strongly Disagree": 1, } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_series_no_regex(self): @@ -950,7 +965,9 @@ def test_replace_series_no_regex(self): } ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): @@ -1076,7 +1093,9 @@ def test_replace_period(self): expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) assert expected.dtypes.iloc[0] == "Period[M]" - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetime(self): @@ -1106,7 +1125,9 @@ def test_replace_datetime(self): ) assert set(df.fname.values) == set(d["fname"].keys()) expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetimetz(self): @@ -1307,10 +1328,12 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, request, replacer): + def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) - result = df.replace({"a": replacer, "b": replacer}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) @@ -1564,12 +1587,15 @@ def test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) expected_df1 = DataFrame({"A": [1], "B": [1]}) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) expected_df2 = DataFrame({"A": [1], "B": ["1"]}) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2c39729097487..82368c67dc6d4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -836,8 +836,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): # tested below return - result = obj.replace(replacer) - if (from_key == "float64" and to_key in ("int64")) or ( from_key == "complex128" and to_key in ("int64", "float64") ): @@ -851,6 +849,17 @@ def test_replace_series(self, how, to_key, from_key, replacer): exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning + if ( + exp.dtype == obj.dtype + or exp.dtype == object + or (exp.dtype.kind in "iufc" and obj.dtype.kind in "iufc") + ): + warn = None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -866,11 +875,14 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning if exp.dtype != object else None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -888,16 +900,22 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") + warn = FutureWarning if isinstance(obj.dtype, pd.DatetimeTZDtype) and isinstance( exp.dtype, pd.DatetimeTZDtype ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) + warn = None else: assert exp.dtype == to_key + if to_key == from_key: + warn = None + + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) tm.assert_series_equal(result, exp) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 9cbb29605a1ec..c54d06b6ab4ce 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1197,7 +1197,9 @@ def test_render_as_column_name(self, path): def test_true_and_false_value_options(self, path): # see gh-13347 df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - expected = df.replace({"foo": True, "bar": False}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.replace({"foo": True, "bar": False}) df.to_excel(path) read_frame = pd.read_excel( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ec9db8c3830d6..f08966c3816c0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -76,7 +76,9 @@ def test_replace(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -84,7 +86,8 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -92,11 +95,13 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() @@ -294,7 +299,9 @@ def test_replace2(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -302,7 +309,8 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -310,11 +318,13 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -373,7 +383,9 @@ def test_replace_unicode_with_number(self): def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) - result = s.replace([2, "4"], np.nan) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) @@ -387,7 +399,9 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) - result = ser.replace({"A": 1, "B": 2}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -710,7 +724,9 @@ def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype):