Skip to content

Commit aa5d8f9

Browse files
jorisvandenbossche authored and WillAyd committed
String dtype: fix convert_dtypes() to convert NaN-string to NA-string (#59470)
* String dtype: fix convert_dtypes() to convert NaN-string to NA-string
* fix CoW tracking for conversion to python storage strings
* remove xfails
1 parent 224cb55 commit aa5d8f9

File tree

5 files changed

+19
-15
lines changed

5 files changed

+19
-15
lines changed

pandas/core/dtypes/cast.py

+9
Original file line number | Diff line number | Diff line change
@@ -1026,6 +1026,8 @@ def convert_dtypes(
10261026
-------
10271027
np.dtype, or ExtensionDtype
10281028
"""
1029+
from pandas.core.arrays.string_ import StringDtype
1030+
10291031
inferred_dtype: str | DtypeObj
10301032

10311033
if (
@@ -1104,6 +1106,13 @@ def convert_dtypes(
11041106
# If we couldn't do anything else, then we retain the dtype
11051107
inferred_dtype = input_array.dtype
11061108

1109+
elif (
1110+
convert_string
1111+
and isinstance(input_array.dtype, StringDtype)
1112+
and input_array.dtype.na_value is np.nan
1113+
):
1114+
inferred_dtype = pandas_dtype_func("string")
1115+
11071116
else:
11081117
inferred_dtype = input_array.dtype
11091118

pandas/core/internals/blocks.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -657,7 +657,12 @@ def convert(
657657
convert_non_numeric=True,
658658
)
659659
refs = None
660-
if copy and res_values is values:
660+
if (
661+
copy
662+
and res_values is values
663+
or isinstance(res_values, NumpyExtensionArray)
664+
and res_values._ndarray is values
665+
):
661666
res_values = values.copy()
662667
elif res_values is values:
663668
refs = self.refs

pandas/tests/frame/methods/test_convert_dtypes.py

+1-9
Original file line number | Diff line number | Diff line change
@@ -3,21 +3,15 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
import pandas._testing as tm
108

119

1210
class TestConvertDtypes:
13-
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1511
@pytest.mark.parametrize(
1612
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1713
)
18-
def test_convert_dtypes(
19-
self, convert_integer, expected, string_storage, using_infer_string
20-
):
14+
def test_convert_dtypes(self, convert_integer, expected, string_storage):
2115
# Specific types are tested in tests/series/test_dtypes.py
2216
# Just check that it works for DataFrame here
2317
df = pd.DataFrame(
@@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self):
182176
result = expected.convert_dtypes(dtype_backend="pyarrow")
183177
tm.assert_series_equal(result, expected)
184178

185-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
186179
def test_convert_dtypes_avoid_block_splitting(self):
187180
# GH#55341
188181
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
@@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self):
197190
tm.assert_frame_equal(result, expected)
198191
assert result._mgr.nblocks == 2
199192

200-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
201193
def test_convert_dtypes_from_arrow(self):
202194
# GH#56581
203195
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

-2
Original file line number | Diff line number | Diff line change
@@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers):
463463
tm.assert_frame_equal(result, expected)
464464

465465

466-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
467466
def test_dtype_backend_string(all_parsers, string_storage):
468467
# GH#36712
469468
pa = pytest.importorskip("pyarrow")
@@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
507506
tm.assert_frame_equal(result, expected)
508507

509508

510-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
511509
def test_dtype_backend_pyarrow(all_parsers, request):
512510
# GH#36712
513511
pa = pytest.importorskip("pyarrow")

pandas/tests/series/methods/test_convert_dtypes.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -230,9 +230,9 @@ def test_convert_dtypes(
230230
and params[0]
231231
and not params[1]
232232
):
233-
# If we would convert with convert strings then infer_objects converts
234-
# with the option
235-
expected_dtype = "string[pyarrow_numpy]"
233+
# If convert_string=False and infer_objects=True, we end up with the
234+
# default string dtype instead of preserving object for string data
235+
expected_dtype = pd.StringDtype(na_value=np.nan)
236236

237237
expected = pd.Series(data, dtype=expected_dtype)
238238
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)