diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..e1a2a0142c52e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index cfba32c62f206..b042cf632288b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string): result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype( - string_storage, na_value=np.nan if using_infer_string else pd.NA - ) + # pd.array(..) by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 76b8928f28b65..4070a2844846f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -215,6 +215,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -222,14 +231,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), ( # numpy array with string dtype np.array(["a", "b"], dtype=str), - None, + pd.StringDtype(), pd.StringDtype() .construct_array_type() ._from_sequence(["a", "b"], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -287,7 +311,6 @@ def test_array_copy(): assert tm.shares_memory(a, b) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -387,6 +410,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 59ff4f3122e8f..6dd1ef9d59ab4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -297,9 +297,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings( - self, arr1d, box, string_storage, using_infer_string - ): + def test_searchsorted_castable_strings(self, arr1d, box, string_storage): arr = arr1d if box is None: pass @@ -335,8 +333,7 @@ def test_searchsorted_castable_strings( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got " - f"{'str' if using_infer_string else 'string'} array instead." + "or array of those. Got string array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index c72abfeb9f3e7..bcb31829a201f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d02364a77df90..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError from pandas import ( @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """