diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc039917aef87..f72d6a5dad877 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif using_string_dtype() and is_string_array(objects, skipna=True): + elif ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ae591a5d4ac8..5be83aa38011b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -563,7 +563,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -636,6 +641,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy @@ -648,7 +654,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -666,6 +675,7 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None if ( @@ -851,6 +861,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -915,7 +926,11 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype != _dtype_obj, + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -944,6 +959,7 @@ def replace( inplace=True, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: @@ -958,6 +974,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -970,6 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string: bool = True, already_warned=None, ) -> list[Block]: """ @@ -1029,7 +1047,9 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert( + copy=False, using_cow=using_cow, convert_string=convert_string + ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1068,6 +1088,8 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype != _dtype_obj + # Exclude anything that we know we won't contain pairs = [ (x, y) @@ -1152,6 +1174,7 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1174,7 +1197,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1204,6 +1229,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1233,6 +1259,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: @@ -1256,6 +1283,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..9844122dc4b2d 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string): [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8df9893e73766..2ee878893ce70 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character( - self, any_string_dtype, using_infer_string - ): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): @@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, - data, - to_replace, - expected, - frame_or_series, - any_string_dtype, - using_infer_string, - request, + self, data, to_replace, expected, frame_or_series, any_string_dtype ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "str" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ac3bfe3a13a44..4e1697eabf734 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") obj = obj.astype(from_key) @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") + if using_infer_string and exp.dtype == "string" and obj.dtype == object: + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) + msg = "Downcasting behavior in `replace`" warn = FutureWarning if ( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0b0cf57a70c3f..0c2e0fdc2616f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self): # GH#25797 df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)