Skip to content

Commit db630fc

Browse files
String dtype: fix convert_dtypes() to convert NaN-string to NA-string (#59470)
* String dtype: fix convert_dtypes() to convert NaN-string to NA-string
* fix CoW tracking for conversion to python storage strings
* remove xfails
1 parent 9f5b041 commit db630fc

File tree

5 files changed

+18
-18
lines changed

5 files changed

+18
-18
lines changed

pandas/core/dtypes/cast.py

+9
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,8 @@ def convert_dtypes(
10231023
-------
10241024
np.dtype, or ExtensionDtype
10251025
"""
1026+
from pandas.core.arrays.string_ import StringDtype
1027+
10261028
inferred_dtype: str | DtypeObj
10271029

10281030
if (
@@ -1101,6 +1103,13 @@ def convert_dtypes(
11011103
# If we couldn't do anything else, then we retain the dtype
11021104
inferred_dtype = input_array.dtype
11031105

1106+
elif (
1107+
convert_string
1108+
and isinstance(input_array.dtype, StringDtype)
1109+
and input_array.dtype.na_value is np.nan
1110+
):
1111+
inferred_dtype = pandas_dtype_func("string")
1112+
11041113
else:
11051114
inferred_dtype = input_array.dtype
11061115

pandas/core/internals/blocks.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,11 @@ def convert(self) -> list[Block]:
512512
convert_non_numeric=True,
513513
)
514514
refs = None
515-
if res_values is values:
515+
if (
516+
res_values is values
517+
or isinstance(res_values, NumpyExtensionArray)
518+
and res_values._ndarray is values
519+
):
516520
refs = self.refs
517521

518522
res_values = ensure_block_shape(res_values, self.ndim)

pandas/tests/frame/methods/test_convert_dtypes.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,15 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
import pandas._testing as tm
108

119

1210
class TestConvertDtypes:
13-
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1511
@pytest.mark.parametrize(
1612
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1713
)
18-
def test_convert_dtypes(
19-
self, convert_integer, expected, string_storage, using_infer_string
20-
):
14+
def test_convert_dtypes(self, convert_integer, expected, string_storage):
2115
# Specific types are tested in tests/series/test_dtypes.py
2216
# Just check that it works for DataFrame here
2317
df = pd.DataFrame(
@@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self):
182176
result = expected.convert_dtypes(dtype_backend="pyarrow")
183177
tm.assert_series_equal(result, expected)
184178

185-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
186179
def test_convert_dtypes_avoid_block_splitting(self):
187180
# GH#55341
188181
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
@@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self):
197190
tm.assert_frame_equal(result, expected)
198191
assert result._mgr.nblocks == 2
199192

200-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
201193
def test_convert_dtypes_from_arrow(self):
202194
# GH#56581
203195
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

-2
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers):
463463
tm.assert_frame_equal(result, expected)
464464

465465

466-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
467466
def test_dtype_backend_string(all_parsers, string_storage):
468467
# GH#36712
469468
pa = pytest.importorskip("pyarrow")
@@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
507506
tm.assert_frame_equal(result, expected)
508507

509508

510-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
511509
def test_dtype_backend_pyarrow(all_parsers, request):
512510
# GH#36712
513511
pa = pytest.importorskip("pyarrow")

pandas/tests/series/methods/test_convert_dtypes.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,13 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas._libs import lib
97

108
import pandas as pd
119
import pandas._testing as tm
1210

1311

1412
class TestSeriesConvertDtypes:
15-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
1613
@pytest.mark.parametrize(
1714
"data, maindtype, expected_default, expected_other",
1815
[
@@ -223,9 +220,9 @@ def test_convert_dtypes(
223220
and params[0]
224221
and not params[1]
225222
):
226-
# If we would convert with convert strings then infer_objects converts
227-
# with the option
228-
expected_dtype = "string[pyarrow_numpy]"
223+
# If convert_string=False and infer_objects=True, we end up with the
224+
# default string dtype instead of preserving object for string data
225+
expected_dtype = pd.StringDtype(na_value=np.nan)
229226

230227
expected = pd.Series(data, dtype=expected_dtype)
231228
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)