From 03d4943fb9ff325afb9c4e43a0655ec586214a45 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 12 Aug 2024 10:44:30 +0200
Subject: [PATCH 1/7] String dtype: maybe_convert_objects gives precedence to
 nullable dtype

---
 pandas/_libs/lib.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 489d4fa111d40..e1a2a0142c52e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects,
         seen.object_ = True
 
     elif seen.str_:
-        if using_string_dtype() and is_string_array(objects, skipna=True):
+        if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(na_value=np.nan)
+            dtype = StringDtype()
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
-        elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
+        elif using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype()
+            dtype = StringDtype(na_value=np.nan)
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True

From c00577816012807ed191673436162dcbf8ab508d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 12 Aug 2024 15:07:07 +0200
Subject: [PATCH 2/7] update datetimelike input validation

---
 pandas/core/arrays/datetimelike.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index ad0bde3abbdd4..32a90cf83321d 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -130,9 +130,9 @@
 from pandas.core.arrays.integer import IntegerArray
 import pandas.core.common as com
 from pandas.core.construction import (
-    array as pd_array,
     ensure_wrapped_if_datetimelike,
     extract_array,
+    sanitize_array,
 )
 from pandas.core.indexers import (
     check_array_indexer,
@@ -667,12 +667,10 @@ def _validate_listlike(self, value, allow_object: bool = False):
                     msg = self._validation_error_message(value, True)
                     raise TypeError(msg) from err
 
-        # Do type inference if necessary up front (after unpacking
-        # NumpyExtensionArray)
+        # Do type inference if necessary up front
         # e.g. we passed PeriodIndex.values and got an ndarray of Periods
-        value = extract_array(value, extract_numpy=True)
-        value = pd_array(value)
-        value = extract_array(value, extract_numpy=True)
+        value = sanitize_array(value, index=None)
+        value = ensure_wrapped_if_datetimelike(value)
 
         if is_all_strings(value):
             # We got a StringArray

From 0bee1ac482232897761f760f93df003436d96025 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 12 Aug 2024 15:16:10 +0200
Subject: [PATCH 3/7] update tests and remove xfails

---
 pandas/tests/arrays/string_/test_string_arrow.py    | 13 +++----------
 pandas/tests/base/test_value_counts.py              |  4 ++--
 pandas/tests/dtypes/cast/test_construct_ndarray.py  |  2 +-
 .../tests/io/parser/usecols/test_usecols_basic.py   |  3 ---
 4 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 7d4aae0f7bb4e..ef882a81f882f 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -27,20 +26,14 @@ def test_eq_all_na():
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_config(string_storage, request, using_infer_string):
-    if using_infer_string and string_storage == "python" and HAS_PYARROW:
-        # string storage with na_value=NaN always uses pyarrow if available
-        # -> does not yet honor the option
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
+def test_config(string_storage):
     with pd.option_context("string_storage", string_storage):
         assert StringDtype().storage == string_storage
         result = pd.array(["a", "b"])
         assert result.dtype.storage == string_storage
 
-    dtype = StringDtype(
-        string_storage, na_value=np.nan if using_infer_string else pd.NA
-    )
+    # pd.array(..) by default always returns the NA-variant
+    dtype = StringDtype(string_storage, na_value=pd.NA)
     expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
     tm.assert_equal(result, expected)
 
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index c72abfeb9f3e7..bcb31829a201f 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
     else:
         exp = np.unique(np.array(s_values, dtype=np.object_))
         if using_infer_string:
-            exp = array(exp)
+            exp = array(exp, dtype="str")
         tm.assert_equal(s.unique(), exp)
 
     assert s.nunique() == 4
@@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
     else:
         exp = np.array(["a", "b", np.nan, "d"], dtype=object)
         if using_infer_string:
-            exp = array(exp)
+            exp = array(exp, dtype="str")
         tm.assert_equal(s.unique(), exp)
     assert s.nunique() == 3
 
diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py
index ab468c81124bc..6b9b2dfda6e8b 100644
--- a/pandas/tests/dtypes/cast/test_construct_ndarray.py
+++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py
@@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
 ):
     result = sanitize_array(values, index=None, dtype=dtype)
     if using_infer_string and expected.dtype == object and dtype is None:
-        tm.assert_extension_array_equal(result, pd.array(expected))
+        tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
     else:
         tm.assert_numpy_array_equal(result, expected)
 
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
index d02364a77df90..82b42beb38ae0 100644
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import ParserError
 
 from pandas import (
@@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_usecols_dtype(all_parsers):
     parser = all_parsers
     data = """

From 0057158bedc675850607c4bd970dbc586e7550c2 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 12 Aug 2024 15:30:29 +0200
Subject: [PATCH 4/7] explicitly test pd.array() behaviour (remove xfail)

---
 pandas/tests/arrays/test_array.py | 34 +++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 76b8928f28b65..0bb023e6b3b92 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 import pandas._testing as tm
 from pandas.api.extensions import register_extension_dtype
@@ -215,6 +213,13 @@ def test_dt64_array(dtype_unit):
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            ["a", None],
+            "str",
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
         (
             ["a", None],
             pd.StringDtype(),
@@ -222,14 +227,29 @@ def test_dt64_array(dtype_unit):
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            ["a", None],
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
         (
             # numpy array with string dtype
             np.array(["a", "b"], dtype=str),
-            None,
+            pd.StringDtype(),
             pd.StringDtype()
             .construct_array_type()
             ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
         ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
         # Boolean
         (
             [True, None],
@@ -287,7 +307,6 @@ def test_array_copy():
     assert tm.shares_memory(a, b)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "data, expected",
     [
@@ -387,6 +406,13 @@ def test_array_copy():
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
+        ),
         # Boolean
         ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
         ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),

From 8af4cdac2c792a0c0ebabe378d85c71273d7c1ab Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 13 Aug 2024 08:51:29 +0200
Subject: [PATCH 5/7] fixup allow_2d

---
 pandas/core/arrays/datetimelike.py       |  2 +-
 pandas/core/construction.py              | 24 +++++-------------------
 pandas/tests/arrays/test_datetimelike.py |  2 +-
 3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index e4995a3bb4898..3c69a5a450dd1 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -669,7 +669,7 @@ def _validate_listlike(self, value, allow_object: bool = False):
 
         # Do type inference if necessary up front
         # e.g. we passed PeriodIndex.values and got an ndarray of Periods
-        value = sanitize_array(value, index=None)
+        value = sanitize_array(value, index=None, allow_2d=True)
         value = ensure_wrapped_if_datetimelike(value)
 
         if is_all_strings(value):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 665eb75953078..8035a550d175b 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -37,7 +37,6 @@
 from pandas.core.dtypes.common import (
     ensure_object,
     is_list_like,
-    is_object_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import NumpyEADtype
@@ -49,8 +48,6 @@
 )
 from pandas.core.dtypes.missing import isna
 
-import pandas.core.common as com
-
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
@@ -706,23 +703,12 @@ def _sanitize_ndim(
         result = _maybe_repeat(result, index)
 
     elif result.ndim > 1:
-        if isinstance(data, np.ndarray):
-            if allow_2d:
-                return result
-            raise ValueError(
-                f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
-            )
-        if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
-            # i.e. NumpyEADtype("O")
+        if allow_2d:
+            return result
+        raise ValueError(
+            f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
+        )
 
-            result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
-            cls = dtype.construct_array_type()
-            result = cls._from_sequence(result, dtype=dtype)
-        else:
-            # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
-            # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
-            # dtype[Any], None]"
-            result = com.asarray_tuplesafe(data, dtype=dtype)  # type: ignore[arg-type]
     return result
 
 
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 5834b268be2be..cb9e14aaeb691 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -336,7 +336,7 @@ def test_searchsorted_castable_strings(
                 match=re.escape(
                     f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
                     "or array of those. Got "
-                    f"{'str' if using_infer_string else 'string'} array instead."
+                    f"{'str' if using_infer_string else 'object'} array instead."
                 ),
             ):
                 arr.searchsorted([str(arr[1]), "baz"])

From 8b92517ceeb9cd2fc7908e29a908025cb3a013df Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 14 Aug 2024 23:29:51 +0200
Subject: [PATCH 6/7] undo changes related to datetimelike input validation

---
 pandas/core/arrays/datetimelike.py       | 10 ++++++----
 pandas/core/construction.py              | 24 +++++++++++++++++++-----
 pandas/tests/arrays/test_datetimelike.py |  7 ++-----
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 3c69a5a450dd1..c5a08c1834a3e 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -130,9 +130,9 @@
 from pandas.core.arrays.integer import IntegerArray
 import pandas.core.common as com
 from pandas.core.construction import (
+    array as pd_array,
     ensure_wrapped_if_datetimelike,
     extract_array,
-    sanitize_array,
 )
 from pandas.core.indexers import (
     check_array_indexer,
@@ -667,10 +667,12 @@ def _validate_listlike(self, value, allow_object: bool = False):
                     msg = self._validation_error_message(value, True)
                     raise TypeError(msg) from err
 
-        # Do type inference if necessary up front
+        # Do type inference if necessary up front (after unpacking
+        # NumpyExtensionArray)
         # e.g. we passed PeriodIndex.values and got an ndarray of Periods
-        value = sanitize_array(value, index=None, allow_2d=True)
-        value = ensure_wrapped_if_datetimelike(value)
+        value = extract_array(value, extract_numpy=True)
+        value = pd_array(value)
+        value = extract_array(value, extract_numpy=True)
 
         if is_all_strings(value):
             # We got a StringArray
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 8035a550d175b..665eb75953078 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -37,6 +37,7 @@
 from pandas.core.dtypes.common import (
     ensure_object,
     is_list_like,
+    is_object_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import NumpyEADtype
@@ -48,6 +49,8 @@
 )
 from pandas.core.dtypes.missing import isna
 
+import pandas.core.common as com
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
@@ -703,12 +706,23 @@ def _sanitize_ndim(
         result = _maybe_repeat(result, index)
 
     elif result.ndim > 1:
-        if allow_2d:
-            return result
-        raise ValueError(
-            f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
-        )
+        if isinstance(data, np.ndarray):
+            if allow_2d:
+                return result
+            raise ValueError(
+                f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
+            )
+        if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
+            # i.e. NumpyEADtype("O")
 
+            result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
+            cls = dtype.construct_array_type()
+            result = cls._from_sequence(result, dtype=dtype)
+        else:
+            # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
+            # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
+            # dtype[Any], None]"
+            result = com.asarray_tuplesafe(data, dtype=dtype)  # type: ignore[arg-type]
     return result
 
 
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index cb9e14aaeb691..3d8f8d791b763 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -297,9 +297,7 @@ def test_searchsorted(self):
         assert result == 10
 
     @pytest.mark.parametrize("box", [None, "index", "series"])
-    def test_searchsorted_castable_strings(
-        self, arr1d, box, string_storage, using_infer_string
-    ):
+    def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
         arr = arr1d
         if box is None:
             pass
@@ -335,8 +333,7 @@ def test_searchsorted_castable_strings(
                 TypeError,
                 match=re.escape(
                     f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
-                    "or array of those. Got "
-                    f"{'str' if using_infer_string else 'object'} array instead."
+                    "or array of those. Got string array instead."
                 ),
             ):
                 arr.searchsorted([str(arr[1]), "baz"])

From 548b501e6162b73dea34abc52bd199c85ca16dc4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 20 Aug 2024 21:11:22 +0200
Subject: [PATCH 7/7] fix test for str on current main

---
 pandas/tests/arrays/test_array.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 0bb023e6b3b92..4070a2844846f 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.api.extensions import register_extension_dtype
@@ -218,7 +220,9 @@ def test_dt64_array(dtype_unit):
             "str",
             pd.StringDtype(na_value=np.nan)
             .construct_array_type()
-            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
+            if using_string_dtype()
+            else NumpyExtensionArray(np.array(["a", "None"])),
         ),
         (
             ["a", None],