Skip to content

String dtype: still return nullable NA-variant in object inference (maybe_convert_objects) if requested #59487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
elif using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
dtype = StringDtype(na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
Comment on lines -2702 to 2712
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially, this diff just switches the order of the if/elif blocks: it first checks whether a nullable dtype was requested (convert_to_nullable_dtype), and only then checks whether the future default string dtype is enabled (using_string_dtype()).


seen.object_ = True
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string):
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

dtype = StringDtype(
string_storage, na_value=np.nan if using_infer_string else pd.NA
)
# pd.array(..) by default always returns the NA-variant
dtype = StringDtype(string_storage, na_value=pd.NA)
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)

Expand Down
34 changes: 32 additions & 2 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,21 +215,45 @@ def test_dt64_array(dtype_unit):
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
["a", None],
"str",
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
if using_string_dtype()
else NumpyExtensionArray(np.array(["a", "None"])),
),
(
["a", None],
pd.StringDtype(),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
["a", None],
pd.StringDtype(na_value=np.nan),
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
None,
pd.StringDtype(),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
pd.StringDtype(na_value=np.nan),
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
),
# Boolean
(
[True, None],
Expand Down Expand Up @@ -287,7 +311,6 @@ def test_array_copy():
assert tm.shares_memory(a, b)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data, expected",
[
Expand Down Expand Up @@ -387,6 +410,13 @@ def test_array_copy():
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,7 @@ def test_searchsorted(self):
assert result == 10

@pytest.mark.parametrize("box", [None, "index", "series"])
def test_searchsorted_castable_strings(
self, arr1d, box, string_storage, using_infer_string
):
def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
arr = arr1d
if box is None:
pass
Expand Down Expand Up @@ -335,8 +333,7 @@ def test_searchsorted_castable_strings(
TypeError,
match=re.escape(
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
"or array of those. Got "
f"{'str' if using_infer_string else 'string'} array instead."
"or array of those. Got string array instead."
),
):
arr.searchsorted([str(arr[1]), "baz"])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)

assert s.nunique() == 4
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
else:
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 3

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/dtypes/cast/test_construct_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
):
result = sanitize_array(values, index=None, dtype=dtype)
if using_infer_string and expected.dtype == object and dtype is None:
tm.assert_extension_array_equal(result, pd.array(expected))
tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
else:
tm.assert_numpy_array_equal(result, expected)

Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserError

from pandas import (
Expand Down Expand Up @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtype(all_parsers):
parser = all_parsers
data = """
Expand Down
Loading