diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 162f6a4d30f3f..82ff8517cb535 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1025,6 +1025,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1103,6 +1105,13 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 149bef6258bfa..dfb96162f0ac1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -512,7 +512,11 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if res_values is values: + if ( + res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 59779234b46d9..e7f6e5d625d3e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,21 +3,15 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm class TestConvertDtypes: - # TODO convert_dtypes should not use NaN variant of string dtype, but always NA - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 3f410a13c8f80..e71814bcb3991 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 4a8af259b4134..90c4056a39e84 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib import pandas as pd @@ -12,7 +10,6 @@ class TestSeriesConvertDtypes: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", [ @@ -223,9 +220,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype = pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected)