Skip to content

Commit 29eb61b

Browse files
String dtype: still return nullable NA-variant in object inference (maybe_convert_objects) if requested (pandas-dev#59487)
* String dtype: maybe_convert_objects gives precedence to nullable dtype
* update datetimelike input validation
* update tests and remove xfails
* explicitly test pd.array() behaviour (remove xfail)
* fixup allow_2d
* undo changes related to datetimelike input validation
* fix test for str on current main

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent d4a1069 commit 29eb61b

File tree

6 files changed

+46
-15
lines changed

6 files changed

+46
-15
lines changed

pandas/tests/arrays/string_/test_string_arrow.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string):
3636
result = pd.array(["a", "b"])
3737
assert result.dtype.storage == string_storage
3838

39-
dtype = StringDtype(
40-
string_storage, na_value=np.nan if using_infer_string else pd.NA
41-
)
39+
# pd.array(..) by default always returns the NA-variant
40+
dtype = StringDtype(string_storage, na_value=pd.NA)
4241
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4342
tm.assert_equal(result, expected)
4443

pandas/tests/arrays/test_array.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -218,13 +218,45 @@ def test_dt64_array(dtype_unit):
218218
.construct_array_type()
219219
._from_sequence(["a", None], dtype=pd.StringDtype()),
220220
),
221+
(
222+
["a", None],
223+
"str",
224+
pd.StringDtype(na_value=np.nan)
225+
.construct_array_type()
226+
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
227+
if using_string_dtype()
228+
else NumpyExtensionArray(np.array(["a", "None"])),
229+
),
221230
(
222231
["a", None],
223232
pd.StringDtype(),
224233
pd.StringDtype()
225234
.construct_array_type()
226235
._from_sequence(["a", None], dtype=pd.StringDtype()),
227236
),
237+
(
238+
["a", None],
239+
pd.StringDtype(na_value=np.nan),
240+
pd.StringDtype(na_value=np.nan)
241+
.construct_array_type()
242+
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
243+
),
244+
(
245+
# numpy array with string dtype
246+
np.array(["a", "b"], dtype=str),
247+
pd.StringDtype(),
248+
pd.StringDtype()
249+
.construct_array_type()
250+
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
251+
),
252+
(
253+
# numpy array with string dtype
254+
np.array(["a", "b"], dtype=str),
255+
pd.StringDtype(na_value=np.nan),
256+
pd.StringDtype(na_value=np.nan)
257+
.construct_array_type()
258+
._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
259+
),
228260
# Boolean
229261
(
230262
[True, None],
@@ -277,7 +309,6 @@ def test_array_copy():
277309
cet = pytz.timezone("CET")
278310

279311

280-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
281312
@pytest.mark.parametrize(
282313
"data, expected",
283314
[
@@ -370,6 +401,13 @@ def test_array_copy():
370401
.construct_array_type()
371402
._from_sequence(["a", None], dtype=pd.StringDtype()),
372403
),
404+
(
405+
# numpy array with string dtype
406+
np.array(["a", "b"], dtype=str),
407+
pd.StringDtype()
408+
.construct_array_type()
409+
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
410+
),
373411
# Boolean
374412
([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
375413
([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),

pandas/tests/arrays/test_datetimelike.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -295,9 +295,7 @@ def test_searchsorted(self):
295295
assert result == 10
296296

297297
@pytest.mark.parametrize("box", [None, "index", "series"])
298-
def test_searchsorted_castable_strings(
299-
self, arr1d, box, string_storage, using_infer_string
300-
):
298+
def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
301299
arr = arr1d
302300
if box is None:
303301
pass
@@ -333,8 +331,7 @@ def test_searchsorted_castable_strings(
333331
TypeError,
334332
match=re.escape(
335333
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
336-
"or array of those. Got "
337-
f"{'str' if using_infer_string else 'string'} array instead."
334+
"or array of those. Got string array instead."
338335
),
339336
):
340337
arr.searchsorted([str(arr[1]), "baz"])

pandas/tests/base/test_value_counts.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
127127
else:
128128
exp = np.unique(np.array(s_values, dtype=np.object_))
129129
if using_infer_string:
130-
exp = array(exp)
130+
exp = array(exp, dtype="str")
131131
tm.assert_equal(s.unique(), exp)
132132

133133
assert s.nunique() == 4
@@ -205,7 +205,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
205205
else:
206206
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
207207
if using_infer_string:
208-
exp = array(exp)
208+
exp = array(exp, dtype="str")
209209
tm.assert_equal(s.unique(), exp)
210210
assert s.nunique() == 3
211211

pandas/tests/dtypes/cast/test_construct_ndarray.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
2121
):
2222
result = sanitize_array(values, index=None, dtype=dtype)
2323
if using_infer_string and expected.dtype == object and dtype is None:
24-
tm.assert_extension_array_equal(result, pd.array(expected))
24+
tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
2525
else:
2626
tm.assert_numpy_array_equal(result, expected)
2727

pandas/tests/io/parser/usecols/test_usecols_basic.py

-3
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.errors import ParserError
1311

1412
from pandas import (
@@ -547,7 +545,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
547545
tm.assert_frame_equal(result, expected)
548546

549547

550-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
551548
def test_usecols_dtype(all_parsers):
552549
parser = all_parsers
553550
data = """

0 commit comments

Comments
 (0)