From f4441058c72cf455daf4a42fe45696f9d7e7ef09 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 13:04:03 +0200
Subject: [PATCH 01/21] String dtype: map builtin str alias to StringDtype

---
 pandas/_testing/__init__.py                 |  2 +-
 pandas/core/arrays/datetimelike.py          | 10 ++++++++--
 pandas/core/dtypes/base.py                  |  6 ++++++
 pandas/tests/arrays/floating/test_astype.py |  6 ++----
 pandas/tests/arrays/integer/test_dtypes.py  |  6 ++----
 pandas/tests/dtypes/test_common.py          | 12 ++++++++++++
 pandas/tests/frame/test_constructors.py     | 12 +++++++++---
 pandas/tests/series/test_constructors.py    |  2 +-
 8 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index 5fa1a984b8aea..0be01da1816a2 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -108,7 +108,7 @@
 
 COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
 if using_string_dtype():
-    STRING_DTYPES: list[Dtype] = [str, "U"]
+    STRING_DTYPES: list[Dtype] = ["U"]
 else:
     STRING_DTYPES: list[Dtype] = [str, "str", "U"]  # type: ignore[no-redef]
 COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index fbe1677b95b33..94a57d30020f3 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True):
 
             return self._box_values(self.asi8.ravel()).reshape(self.shape)
 
+        elif is_string_dtype(dtype):
+            arr_object = self._format_native_types()
+            if isinstance(dtype, ExtensionDtype):
+                cls = dtype.construct_array_type()
+                return cls._from_sequence(arr_object, dtype=dtype, copy=False)
+            else:
+                return arr_object
+
         elif isinstance(dtype, ExtensionDtype):
             return super().astype(dtype, copy=copy)
-        elif is_string_dtype(dtype):
-            return self._format_native_types()
         elif dtype.kind in "iu":
             # we deliberately ignore int32 vs. int64 here.
             # See https://github.com/pandas-dev/pandas/issues/24381 for more.
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index d8a42d83b6c54..a23d1ac847b82 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -562,6 +562,12 @@ def find(
         return the first matching dtype, otherwise return None
         """
         if not isinstance(dtype, str):
+            # builtin aliases
+            if dtype is str:
+                from pandas.core.arrays.string_ import StringDtype
+
+                return StringDtype(na_value=np.nan)
+
             dtype_type: type_t
             if not isinstance(dtype, type):
                 dtype_type = type(dtype)
diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py
index ccf644b34051d..752ebe194ffcf 100644
--- a/pandas/tests/arrays/floating/test_astype.py
+++ b/pandas/tests/arrays/floating/test_astype.py
@@ -68,11 +68,9 @@ def test_astype_str(using_infer_string):
 
     if using_infer_string:
         expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan))
-        tm.assert_extension_array_equal(a.astype("str"), expected)
 
-        # TODO(infer_string) this should also be a string array like above
-        expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
-        tm.assert_numpy_array_equal(a.astype(str), expected)
+        tm.assert_extension_array_equal(a.astype(str), expected)
+        tm.assert_extension_array_equal(a.astype("str"), expected)
     else:
         expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
 
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index fadd7ac67b58d..7972ba7b9fb0f 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -281,11 +281,9 @@ def test_astype_str(using_infer_string):
 
     if using_infer_string:
         expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan))
-        tm.assert_extension_array_equal(a.astype("str"), expected)
 
-        # TODO(infer_string) this should also be a string array like above
-        expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
-        tm.assert_numpy_array_equal(a.astype(str), expected)
+        tm.assert_extension_array_equal(a.astype(str), expected)
+        tm.assert_extension_array_equal(a.astype("str"), expected)
     else:
         expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
 
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 2c2dff7a957fe..e338fb1331734 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage):
         "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
     )
 
+    with pd.option_context("future.infer_string", True):
+        # with the default string_storage setting
+        result = pandas_dtype(str)
+    assert result == pd.StringDtype(
+        "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+    )
+
     with pd.option_context("future.infer_string", True):
         with pd.option_context("string_storage", string_storage):
             result = pandas_dtype("str")
     assert result == pd.StringDtype(string_storage, na_value=np.nan)
 
+    with pd.option_context("future.infer_string", True):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype(str)
+    assert result == pd.StringDtype(string_storage, na_value=np.nan)
+
     with pd.option_context("future.infer_string", False):
         with pd.option_context("string_storage", string_storage):
             result = pandas_dtype("str")
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 0176a36fe78d7..ad78e75c6a400 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -82,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self):
         #  with an array of strings each of which is e.g. "[0 1 2]"
         arr = np.arange(12).reshape(4, 3)
         df = DataFrame(arr, dtype=str)
-        expected = DataFrame(arr.astype(str), dtype=object)
+        expected = DataFrame(arr.astype(str), dtype="str")
         tm.assert_frame_equal(df, expected)
 
     def test_constructor_from_2d_datetimearray(self):
@@ -1766,12 +1766,18 @@ def test_constructor_column_duplicates(self):
 
         tm.assert_frame_equal(idf, edf)
 
-    def test_constructor_empty_with_string_dtype(self):
+    def test_constructor_empty_with_string_dtype(self, using_infer_string):
         # GH 9428
         expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+        expected_str = DataFrame(
+            index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan)
+        )
 
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
-        tm.assert_frame_equal(df, expected)
+        if using_infer_string:
+            tm.assert_frame_equal(df, expected_str)
+        else:
+            tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
         tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 1771a4dfdb71f..69f42b5e42878 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string):
             # GH 19853 : with empty string, index and dtype str
             empty = Series("", dtype=str, index=range(3))
             if using_infer_string:
-                empty2 = Series("", index=range(3), dtype=object)
+                empty2 = Series("", index=range(3), dtype="str")
             else:
                 empty2 = Series("", index=range(3))
             tm.assert_series_equal(empty, empty2)

From 630d41c2f8420ec33323f3ef7328411cbe10a130 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 15:55:30 +0200
Subject: [PATCH 02/21] Gate str alias on using_string_dtype(); fix tests

---
 pandas/core/arrays/categorical.py             | 10 ++++++-
 pandas/core/dtypes/base.py                    |  4 ++-
 pandas/core/indexes/base.py                   |  8 +++--
 pandas/core/indexes/interval.py               |  3 +-
 pandas/core/strings/object_array.py           |  1 +
 pandas/tests/frame/methods/test_astype.py     |  8 ++---
 .../indexes/datetimes/methods/test_astype.py  | 10 +++----
 .../tests/indexes/interval/test_indexing.py   |  4 +--
 pandas/tests/indexes/object/test_astype.py    |  4 +--
 .../indexes/period/methods/test_astype.py     |  2 +-
 .../indexes/timedeltas/methods/test_astype.py |  2 +-
 pandas/tests/interchange/test_impl.py         |  1 +
 pandas/tests/io/excel/test_readers.py         |  6 ++--
 .../io/parser/dtypes/test_dtypes_basic.py     | 15 ++++++----
 pandas/tests/series/methods/test_astype.py    | 30 +++++++++++--------
 15 files changed, 67 insertions(+), 41 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index c613a345686cc..a4a7900342f5d 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -13,7 +13,10 @@
 
 import numpy as np
 
-from pandas._config import get_option
+from pandas._config import (
+    get_option,
+    using_string_dtype,
+)
 
 from pandas._libs import (
     NaT,
@@ -2685,6 +2688,11 @@ def _str_get_dummies(self, sep: str = "|"):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray
 
+        if using_string_dtype():
+            return NumpyExtensionArray(self.astype(str).to_numpy())._str_get_dummies(
+                sep
+            )
+
         return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
 
     # ------------------------------------------------------------------------
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index a23d1ac847b82..42b51b11ad41f 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -14,6 +14,8 @@
 
 import numpy as np
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import missing as libmissing
 from pandas._libs.hashtable import object_hash
 from pandas._libs.properties import cache_readonly
@@ -563,7 +565,7 @@ def find(
         """
         if not isinstance(dtype, str):
             # builtin aliases
-            if dtype is str:
+            if dtype is str and using_string_dtype():
                 from pandas.core.arrays.string_ import StringDtype
 
                 return StringDtype(na_value=np.nan)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 582e1f96fa562..0a52a3691bc82 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3658,7 +3658,6 @@ def get_indexer(
         method = clean_reindex_fill_method(method)
         orig_target = target
         target = self._maybe_cast_listlike_indexer(target)
-
         self._check_indexing_method(method, limit, tolerance)
 
         if not self._index_as_unique:
@@ -6261,7 +6260,11 @@ def _should_compare(self, other: Index) -> bool:
             return False
 
         dtype = _unpack_nested_dtype(other)
-        return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
+        return (
+            self._is_comparable_dtype(dtype)
+            or is_object_dtype(dtype)
+            or is_string_dtype(dtype)
+        )
 
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         """
@@ -7727,6 +7730,7 @@ def get_values_for_csv(
         values = cast("IntervalArray", values)
         mask = values.isna()
         if not quoting:
+            # TODO
             result = np.asarray(values).astype(str)
         else:
             result = np.array(values, dtype=object, copy=True)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 359cdf880937b..8feac890883eb 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -51,6 +51,7 @@
     is_number,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
@@ -712,7 +713,7 @@ def _get_indexer(
             # left/right get_indexer, compare elementwise, equality -> match
             indexer = self._get_indexer_unique_sides(target)
 
-        elif not is_object_dtype(target.dtype):
+        elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)):
             # homogeneous scalar index: use IntervalTree
             # we should always have self._should_partial_index(target) here
             target = self._maybe_convert_i8(target)
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index c6b18d7049c57..28bd943688b79 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -405,6 +405,7 @@ def _str_get_dummies(self, sep: str = "|"):
         try:
             arr = sep + arr + sep
         except (TypeError, NotImplementedError):
+            # TODO
             arr = sep + arr.astype(str) + sep
 
         tags: set[str] = set()
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 8647df0e8ad96..65a53ccb95141 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -168,7 +168,7 @@ def test_astype_str(self):
                 "d": list(map(str, d._values)),
                 "e": list(map(str, e._values)),
             },
-            dtype="object",
+            dtype="str",
         )
 
         tm.assert_frame_equal(result, expected)
@@ -176,13 +176,13 @@ def test_astype_str(self):
     def test_astype_str_float(self):
         # see GH#11302
         result = DataFrame([np.nan]).astype(str)
-        expected = DataFrame(["nan"], dtype="object")
+        expected = DataFrame(["nan"], dtype="str")
 
         tm.assert_frame_equal(result, expected)
         result = DataFrame([1.12345678901234567890]).astype(str)
 
         val = "1.1234567890123457"
-        expected = DataFrame([val], dtype="object")
+        expected = DataFrame([val], dtype="str")
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self):
         result = df.astype(dtypes)
         expected = DataFrame(
             {
-                0: Series(vals[:, 0].astype(str), dtype=object),
+                0: Series(vals[:, 0].astype(str), dtype="str"),
                 1: vals[:, 1],
                 2: pd.array(vals[:, 2], dtype="Float64"),
                 3: vals[:, 3],
diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py
index 81dc3b3ecc45e..03957053199a8 100644
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py
+++ b/pandas/tests/indexes/datetimes/methods/test_astype.py
@@ -107,7 +107,7 @@ def test_astype_str_nat(self):
 
         idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan])
         result = idx.astype(str)
-        expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
+        expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype="str")
         tm.assert_index_equal(result, expected)
 
     def test_astype_str(self):
@@ -117,7 +117,7 @@ def test_astype_str(self):
         expected = Index(
             ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)
 
@@ -132,7 +132,7 @@ def test_astype_str_tz_and_name(self):
                 "2012-01-03 00:00:00-05:00",
             ],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)
 
@@ -143,7 +143,7 @@ def test_astype_str_freq_and_name(self):
         expected = Index(
             ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)
 
@@ -155,7 +155,7 @@ def test_astype_str_freq_and_tz(self):
         result = dti.astype(str)
         expected = Index(
             ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"],
-            dtype=object,
+            dtype="str",
             name="test_name",
         )
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
index 787461b944bd0..00d01c47251e3 100644
--- a/pandas/tests/indexes/interval/test_indexing.py
+++ b/pandas/tests/indexes/interval/test_indexing.py
@@ -365,9 +365,9 @@ def test_get_indexer_datetime(self):
         # TODO: with mismatched resolution get_indexer currently raises;
         #  this should probably coerce?
         target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]")
-        result = ii.get_indexer(target)
+        # result = ii.get_indexer(target)
         expected = np.array([0], dtype=np.intp)
-        tm.assert_numpy_array_equal(result, expected)
+        # tm.assert_numpy_array_equal(result, expected)
 
         result = ii.get_indexer(target.astype(str))
         tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index 9c1ef302c5b51..ce05b5e9f2238 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -15,12 +15,12 @@ def test_astype_str_from_bytes():
     #  ensure_string_array which does f"{val}"
     idx = Index(["あ", b"a"], dtype="object")
     result = idx.astype(str)
-    expected = Index(["あ", "a"], dtype="object")
+    expected = Index(["あ", "a"], dtype="str")
     tm.assert_index_equal(result, expected)
 
     # while we're here, check that Series.astype behaves the same
     result = Series(idx).astype(str)
-    expected = Series(expected, dtype=object)
+    expected = Series(expected, dtype="str")
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py
index d545bfd2fae0f..1f9dad7c972d4 100644
--- a/pandas/tests/indexes/period/methods/test_astype.py
+++ b/pandas/tests/indexes/period/methods/test_astype.py
@@ -41,7 +41,7 @@ def test_astype_conversion(self):
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index([str(x) for x in idx], name="idx", dtype=object)
+        expected = Index([str(x) for x in idx], name="idx", dtype="str")
         tm.assert_index_equal(result, expected)
 
         idx = period_range("1990", "2009", freq="Y", name="idx")
diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py
index 311f2b5c9aa59..1a855e728b0e7 100644
--- a/pandas/tests/indexes/timedeltas/methods/test_astype.py
+++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py
@@ -61,7 +61,7 @@ def test_astype(self):
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index([str(x) for x in idx], name="idx", dtype=object)
+        expected = Index([str(x) for x in idx], name="idx", dtype="str")
         tm.assert_index_equal(result, expected)
 
         rng = timedelta_range("1 days", periods=10)
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 76910db941d36..6885f2d6d0ee8 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     pd.api.interchange.from_dataframe(df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_empty_string_column():
     # https://github.com/pandas-dev/pandas/issues/56703
     df = pd.DataFrame({"a": []}, dtype=str)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index b831ec3bb2c6a..3989e022dbbd2 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext):
 
         expected["a"] = expected["a"].astype("float64")
         expected["b"] = expected["b"].astype("float32")
-        expected["c"] = Series(["001", "002", "003", "004"], dtype=object)
+        expected["c"] = Series(["001", "002", "003", "004"], dtype="str")
         tm.assert_frame_equal(actual, expected)
 
         msg = "Unable to convert column d to type int64"
@@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext):
                 {
                     "a": Series([1, 2, 3, 4], dtype="float64"),
                     "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
-                    "c": Series(["001", "002", "003", "004"], dtype=object),
-                    "d": Series(["1", "2", np.nan, "4"], dtype=object),
+                    "c": Series(["001", "002", "003", "004"], dtype="str"),
+                    "d": Series(["1", "2", np.nan, "4"], dtype="str"),
                 },
             ),
         ],
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 07f29518b7881..1e35a62e08c96 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -31,7 +31,7 @@
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
 @pytest.mark.usefixtures("pyarrow_xfail")
-def test_dtype_all_columns(all_parsers, dtype, check_orig):
+def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
     # see gh-3795, gh-6607
     parser = all_parsers
 
@@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
         if check_orig:
             expected = df.copy()
             result = result.astype(float)
-        else:
+        elif using_infer_string and dtype is str:
             expected = df.astype(str)
+        else:
+            expected = df.astype(str).astype(object)
 
         tm.assert_frame_equal(result, expected)
 
@@ -566,7 +568,7 @@ def test_string_inference(all_parsers):
 
 
 @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
-def test_string_inference_object_dtype(all_parsers, dtype):
+def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     # GH#56047
     pytest.importorskip("pyarrow")
 
@@ -578,10 +580,11 @@ def test_string_inference_object_dtype(all_parsers, dtype):
     with pd.option_context("future.infer_string", True):
         result = parser.read_csv(StringIO(data), dtype=dtype)
 
+    expected_dtype = "str" if dtype is str and using_infer_string else object
     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
-            "b": pd.Series(["a", "a", "a"], dtype=object),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+            "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
         },
         columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
     )
@@ -592,7 +595,7 @@ def test_string_inference_object_dtype(all_parsers, dtype):
 
     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
             "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
         },
         columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 579d41f964df0..4a7e204ee4161 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class):
 
         dt1 = dtype_class({"abc": str})
         result = ser.astype(dt1)
-        expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
+        expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str")
         tm.assert_series_equal(result, expected)
 
         dt2 = dtype_class({"abc": "float64"})
@@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype):
     def test_astype_str_map(self, dtype, data, using_infer_string):
         # see GH#4405
         series = Series(data)
+        using_string_dtype = using_infer_string and dtype is str
         result = series.astype(dtype)
-        expected = series.map(str)
-        if using_infer_string:
-            expected = expected.astype(object)
+        if using_string_dtype:
+            expected = series.map(lambda val: str(val) if val is not np.nan else np.nan)
+        else:
+            expected = series.map(str)
+            if using_infer_string:
+                expected = expected.astype(object)
         tm.assert_series_equal(result, expected)
 
     def test_astype_float_to_period(self):
@@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self):
         # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
         dti = date_range("2012-01-01", periods=3)
         result = Series(dti).astype(str)
-        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
+        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str")
         tm.assert_series_equal(result, expected)
 
     def test_astype_dt64tz_to_str(self):
@@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self):
                 "2012-01-02 00:00:00-05:00",
                 "2012-01-03 00:00:00-05:00",
             ],
-            dtype=object,
+            dtype="str",
         )
         tm.assert_series_equal(result, expected)
 
@@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self):
         ts = Series([Timestamp("2010-01-04 00:00:00")])
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04"], dtype=object)
+        expected = Series(["2010-01-04"], dtype="str")
         tm.assert_series_equal(res, expected)
 
         ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
+        expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str")
         tm.assert_series_equal(res, expected)
 
     def test_astype_str_cast_td64(self):
@@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self):
         td = Series([Timedelta(1, unit="D")])
         ser = td.astype(str)
 
-        expected = Series(["1 days"], dtype=object)
+        expected = Series(["1 days"], dtype="str")
         tm.assert_series_equal(ser, expected)
 
     def test_dt64_series_astype_object(self):
@@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype):
         # https://github.com/pandas-dev/pandas/issues/36451
         ser = Series([0.1], dtype=any_float_dtype)
         result = ser.astype(str)
-        expected = Series(["0.1"], dtype=object)
+        expected = Series(["0.1"], dtype="str")
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, any_float_dtype):
             (NA, "<NA>"),
         ],
     )
-    def test_astype_to_str_preserves_na(self, value, string_value):
+    def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/36904
         ser = Series(["a", "b", value], dtype=object)
         result = ser.astype(str)
-        expected = Series(["a", "b", string_value], dtype=object)
+        expected = Series(
+            ["a", "b", None if using_infer_string else string_value], dtype="str"
+        )
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])

From d127770052be4544468d3e50cb465b8a089dff30 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 16:59:46 +0200
Subject: [PATCH 03/21] fix datetimelike astype and more tests

---
 pandas/_libs/lib.pyx                          |  6 +++++-
 pandas/core/arrays/datetimelike.py            |  4 ++--
 pandas/core/arrays/string_.py                 |  2 +-
 pandas/core/arrays/string_arrow.py            |  2 +-
 pandas/tests/extension/test_arrow.py          | 20 -------------------
 pandas/tests/frame/methods/test_astype.py     | 11 +++++-----
 .../tests/indexes/interval/test_indexing.py   |  4 ++--
 .../io/parser/dtypes/test_dtypes_basic.py     |  2 +-
 8 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e1a2a0142c52e..0a51dcf117331 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -750,7 +750,11 @@ cpdef ndarray[object] ensure_string_array(
 
     if hasattr(arr, "to_numpy"):
 
-        if hasattr(arr, "dtype") and arr.dtype.kind in "mM":
+        if (
+            hasattr(arr, "dtype")
+            and arr.dtype.kind in "mM"
+            and not hasattr(arr, "_pa_array")
+        ):
             # dtype check to exclude DataFrame
             # GH#41409 TODO: not a great place for this
             out = arr.astype(str).astype(object)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 94a57d30020f3..88aad1565f540 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -472,12 +472,12 @@ def astype(self, dtype, copy: bool = True):
             return self._box_values(self.asi8.ravel()).reshape(self.shape)
 
         elif is_string_dtype(dtype):
-            arr_object = self._format_native_types()
             if isinstance(dtype, ExtensionDtype):
+                arr_object = self._format_native_types(na_rep=dtype.na_value)
                 cls = dtype.construct_array_type()
                 return cls._from_sequence(arr_object, dtype=dtype, copy=False)
             else:
-                return arr_object
+                return self._format_native_types()
 
         elif isinstance(dtype, ExtensionDtype):
             return super().astype(dtype, copy=copy)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 143a13c54dbbb..28d315a28919e 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -625,7 +625,7 @@ def _from_sequence(
                 #  zero_copy_only to True which caused problems see GH#52076
                 scalars = np.array(scalars)
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
-            result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy)
+            result = lib.ensure_string_array(scalars, na_value=na_value, copy=True)
 
         # Manually creating new array avoids the validation step in the __init__, so is
         # faster. Refactor need for validation?
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e8e74b0ba1215..68d7606cc3b3f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -200,7 +200,7 @@ def _from_sequence(
             return cls(pc.cast(scalars, pa.large_string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
+        result = lib.ensure_string_array(scalars, copy=True)
         return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
 
     @classmethod
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 3dbdda388d035..028409e720129 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -45,7 +45,6 @@
     pa_version_under13p0,
     pa_version_under14p0,
 )
-import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
@@ -312,25 +311,6 @@ def test_astype_str(self, data, request):
             )
         super().test_astype_str(data)
 
-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype, request):
-        pa_dtype = data.dtype.pyarrow_dtype
-        if (
-            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
-        ) or pa.types.is_duration(pa_dtype):
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
-                )
-            )
-        super().test_astype_string(data, nullable_string_dtype)
-
     def test_from_dtype(self, data, request):
         pa_dtype = data.dtype.pyarrow_dtype
         if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype):
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 65a53ccb95141..ab3743283ea13 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -173,10 +173,10 @@ def test_astype_str(self):
 
         tm.assert_frame_equal(result, expected)
 
-    def test_astype_str_float(self):
+    def test_astype_str_float(self, using_infer_string):
         # see GH#11302
         result = DataFrame([np.nan]).astype(str)
-        expected = DataFrame(["nan"], dtype="str")
+        expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str")
 
         tm.assert_frame_equal(result, expected)
         result = DataFrame([1.12345678901234567890]).astype(str)
@@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame):
             # dt64tz->dt64 deprecated
             timezone_frame.astype("datetime64[ns]")
 
-    def test_astype_dt64tz_to_str(self, timezone_frame):
+    def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string):
         # str formatting
         result = timezone_frame.astype(str)
+        na_value = np.nan if using_infer_string else "NaT"
         expected = DataFrame(
             [
                 [
@@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
                     "2013-01-01 00:00:00-05:00",
                     "2013-01-01 00:00:00+01:00",
                 ],
-                ["2013-01-02", "NaT", "NaT"],
+                ["2013-01-02", na_value, na_value],
                 [
                     "2013-01-03",
                     "2013-01-03 00:00:00-05:00",
@@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
                 ],
             ],
             columns=timezone_frame.columns,
-            dtype="object",
+            dtype="str",
         )
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
index 00d01c47251e3..787461b944bd0 100644
--- a/pandas/tests/indexes/interval/test_indexing.py
+++ b/pandas/tests/indexes/interval/test_indexing.py
@@ -365,9 +365,9 @@ def test_get_indexer_datetime(self):
         # TODO: with mismatched resolution get_indexer currently raises;
         #  this should probably coerce?
         target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]")
-        # result = ii.get_indexer(target)
+        result = ii.get_indexer(target)
         expected = np.array([0], dtype=np.intp)
-        # tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result, expected)
 
         result = ii.get_indexer(target.astype(str))
         tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 1e35a62e08c96..256b09c62ec76 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -580,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     with pd.option_context("future.infer_string", True):
         result = parser.read_csv(StringIO(data), dtype=dtype)
 
-    expected_dtype = "str" if dtype is str and using_infer_string else object
+    expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
     expected = DataFrame(
         {
             "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),

From 38d011a7463629b84b18d84d6597e4fe4352f60b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 17:15:01 +0200
Subject: [PATCH 04/21] remove xfails

---
 pandas/tests/extension/json/array.py               | 3 +--
 pandas/tests/frame/methods/test_select_dtypes.py   | 5 ++++-
 pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 --
 pandas/tests/io/parser/test_na_values.py           | 2 --
 pandas/tests/io/parser/test_python_parser_only.py  | 6 ++----
 5 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 3a4391edc99ef..4fa48023fbc95 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -208,9 +208,8 @@ def astype(self, dtype, copy=True):
                 return self.copy()
             return self
         elif isinstance(dtype, StringDtype):
-            value = self.astype(str)  # numpy doesn't like nested dicts
             arr_cls = dtype.construct_array_type()
-            return arr_cls._from_sequence(value, dtype=dtype, copy=False)
+            return arr_cls._from_sequence(self, dtype=dtype, copy=False)
         elif not copy:
             return np.asarray([dict(x) for x in self], dtype=dtype)
         else:
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
index 875dca321635f..0354e9df3d168 100644
--- a/pandas/tests/frame/methods/test_select_dtypes.py
+++ b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
             ei = df[["a"]]
             tm.assert_frame_equal(ri, ei)
 
+            ri = df.select_dtypes(include=[str])
+            tm.assert_frame_equal(ri, ei)
+
     def test_select_dtypes_exclude_using_list_like(self):
         df = DataFrame(
             {
@@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self):
     @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
     @pytest.mark.parametrize("arg", ["include", "exclude"])
     def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string):
-        if using_infer_string and dtype == "str":
+        if using_infer_string and (dtype == "str" or dtype is str):
             # this is tested below
             pytest.skip("Selecting string columns works with future strings")
         df = DataFrame(
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 256b09c62ec76..a9c3ae8f0fba4 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -302,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
@@ -318,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 360a5feebe073..f0a04a46c6f54 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers):
     tm.assert_frame_equal(out, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # mismatched shape
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
@@ -719,7 +718,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
 # TODO: this test isn't about the na_values keyword, it is about the empty entries
 #  being returned with NaN entries, whereas the pyarrow engine returns "nan"
 @xfail_pyarrow  # mismatched shapes
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_str_nan_dropped(all_parsers):
     # see gh-21131
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 26480010fc687..a5bb151e84f47 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -18,8 +18,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     ParserError,
     ParserWarning,
@@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
 )
@@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d
             "c": [0, 4000, 131],
         }
     )
+    if dtype["a"] == object:
+        expected["a"] = expected["a"].astype(object)
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype,expected",
     [

From c4ed9d360d902cc985e00a1f611d52e3d0a09238 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 17:22:51 +0200
Subject: [PATCH 05/21] try to fix typing

---
 pandas/core/arrays/categorical.py  | 2 +-
 pandas/core/arrays/datetimelike.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index a4a7900342f5d..0a45cdb116e8f 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -544,7 +544,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
         return res
 
     @overload
-    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ...
+    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> ArrayLike: ...
 
     @overload
     def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ...
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 88aad1565f540..7be8daa09c758 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -473,7 +473,7 @@ def astype(self, dtype, copy: bool = True):
 
         elif is_string_dtype(dtype):
             if isinstance(dtype, ExtensionDtype):
-                arr_object = self._format_native_types(na_rep=dtype.na_value)
+                arr_object = self._format_native_types(na_rep=dtype.na_value)  # type: ignore[arg-type]
                 cls = dtype.construct_array_type()
                 return cls._from_sequence(arr_object, dtype=dtype, copy=False)
             else:

From cad7d8f97be875ba987583e03e20148d9f2c5b12 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 17:31:26 +0200
Subject: [PATCH 06/21] fix copy_view tests

---
 pandas/tests/copy_view/test_astype.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index de56d5e4a07ee..d6046f86be79f 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -92,7 +92,12 @@ def test_astype_string_and_object(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
     df_orig = df.copy()
     df2 = df.astype(new_dtype)
-    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    if new_dtype == "string":
+        # cast to string has to copy to avoid mutating the original during
+        # the call to ensure_string_array -> never a delayed copy
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
     df2.iloc[0, 0] = "x"
     tm.assert_frame_equal(df, df_orig)
@@ -105,7 +110,12 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
     df2 = df.astype(new_dtype)
     df_orig = df2.copy()
-    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    if new_dtype == "string":
+        # cast to string has to copy to avoid mutating the original during
+        # the call to ensure_string_array -> never a delayed copy
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
     df.iloc[0, 0] = "x"
     tm.assert_frame_equal(df2, df_orig)
@@ -220,7 +230,7 @@ def test_convert_dtypes():
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
-    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
     assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
     assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
     assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

From 189e26d77799e9f3082d4b7f7604653b4346ea41 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 19:24:26 +0200
Subject: [PATCH 07/21] fix remaining tests with infer_string enabled

---
 pandas/core/arrays/categorical.py                      | 6 +++---
 pandas/tests/arrays/sparse/test_astype.py              | 4 ++--
 pandas/tests/arrays/sparse/test_dtype.py               | 2 +-
 pandas/tests/extension/base/casting.py                 | 4 ++--
 pandas/tests/indexes/datetimes/methods/test_astype.py  | 7 +++++--
 pandas/tests/indexes/period/methods/test_astype.py     | 9 +++++++--
 pandas/tests/indexes/test_base.py                      | 3 ---
 pandas/tests/indexes/timedeltas/methods/test_astype.py | 9 +++++++--
 pandas/tests/series/methods/test_map.py                | 4 +---
 pandas/tests/test_algos.py                             | 7 +++++--
 10 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 0a45cdb116e8f..cddb70dad6dba 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2689,9 +2689,9 @@ def _str_get_dummies(self, sep: str = "|"):
         from pandas.core.arrays import NumpyExtensionArray
 
         if using_string_dtype():
-            return NumpyExtensionArray(self.astype(str).to_numpy())._str_get_dummies(
-                sep
-            )
+            return NumpyExtensionArray(
+                self.astype(str).to_numpy(na_value="NaN")
+            )._str_get_dummies(sep)
 
         return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
 
diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py
index 83a507e679d46..e6e4a11a0f5ab 100644
--- a/pandas/tests/arrays/sparse/test_astype.py
+++ b/pandas/tests/arrays/sparse/test_astype.py
@@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype):
             ),
             (
                 SparseArray([0, 1, 10]),
-                str,
-                SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
+                np.str_,
+                SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")),
             ),
             (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
             (
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index 1819744d9a9ae..6143163735ab8 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string):
     [
         (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
         (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
-        (SparseDtype(int, 1), str, SparseDtype(object, "1")),
+        (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")),
         (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
     ],
 )
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index e924e38ee5030..8e3f21e1a4f56 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -44,8 +44,8 @@ def test_tolist(self, data):
         assert result == expected
 
     def test_astype_str(self, data):
-        result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series([str(x) for x in data[:5]], dtype=str)
+        result = pd.Series(data[:2]).astype(str)
+        expected = pd.Series([str(x) for x in data[:2]], dtype=str)
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py
index 03957053199a8..62be8903da206 100644
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py
+++ b/pandas/tests/indexes/datetimes/methods/test_astype.py
@@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self):
             # dt64->dt64tz deprecated
             idx._data.astype("datetime64[ns, US/Eastern]")
 
-    def test_astype_str_nat(self):
+    def test_astype_str_nat(self, using_infer_string):
         # GH 13149, GH 13209
         # verify that we are returning NaT as a string (and not unicode)
 
         idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan])
         result = idx.astype(str)
-        expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype="str")
+        if using_infer_string:
+            expected = Index(["2016-05-16", None, None, None], dtype="str")
+        else:
+            expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
         tm.assert_index_equal(result, expected)
 
     def test_astype_str(self):
diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py
index 1f9dad7c972d4..af3c2667f51b4 100644
--- a/pandas/tests/indexes/period/methods/test_astype.py
+++ b/pandas/tests/indexes/period/methods/test_astype.py
@@ -22,7 +22,7 @@ def test_astype_raises(self, dtype):
         with pytest.raises(TypeError, match=msg):
             idx.astype(dtype)
 
-    def test_astype_conversion(self):
+    def test_astype_conversion(self, using_infer_string):
         # GH#13149, GH#13209
         idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx")
 
@@ -41,7 +41,12 @@ def test_astype_conversion(self):
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index([str(x) for x in idx], name="idx", dtype="str")
+        if using_infer_string:
+            expected = Index(
+                [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str"
+            )
+        else:
+            expected = Index([str(x) for x in idx], name="idx", dtype=object)
         tm.assert_index_equal(result, expected)
 
         idx = period_range("1990", "2009", freq="Y", name="idx")
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 7ec66100b7291..cb4e922977d5e 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -76,9 +76,6 @@ def test_constructor_casting(self, index):
         tm.assert_contains_all(arr, new_index)
         tm.assert_index_equal(index, new_index)
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_constructor_copy(self, using_infer_string):
         index = Index(list("abc"), name="name")
         arr = np.array(index)
diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py
index 1a855e728b0e7..5166cadae499e 100644
--- a/pandas/tests/indexes/timedeltas/methods/test_astype.py
+++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py
@@ -44,7 +44,7 @@ def test_astype_object_with_nat(self):
         tm.assert_index_equal(result, expected)
         assert idx.tolist() == expected_list
 
-    def test_astype(self):
+    def test_astype(self, using_infer_string):
         # GH 13149, GH 13209
         idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx")
 
@@ -61,7 +61,12 @@ def test_astype(self):
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index([str(x) for x in idx], name="idx", dtype="str")
+        if using_infer_string:
+            expected = Index(
+                [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str"
+            )
+        else:
+            expected = Index([str(x) for x in idx], name="idx", dtype=object)
         tm.assert_index_equal(result, expected)
 
         rng = timedelta_range("1 days", periods=10)
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index fe84ffafa70b4..7fa8686fcc6c8 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -549,13 +549,11 @@ def f(x):
         (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
     ],
 )
-def test_map_missing_mixed(vals, mapping, exp, using_infer_string):
+def test_map_missing_mixed(vals, mapping, exp):
     # GH20495
     s = Series(vals + [np.nan])
     result = s.map(mapping)
     exp = Series(exp)
-    if using_infer_string and mapping == {np.nan: "not NaN"}:
-        exp.iloc[-1] = np.nan
     tm.assert_series_equal(result, exp)
 
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 06fd81ed722d9..696ea75cd8995 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1877,13 +1877,16 @@ def test_strobj_mode(self):
         tm.assert_series_equal(ser.mode(), exp)
 
     @pytest.mark.parametrize("dt", [str, object])
-    def test_strobj_multi_char(self, dt):
+    def test_strobj_multi_char(self, dt, using_infer_string):
         exp = ["bar"]
         data = ["foo"] * 2 + ["bar"] * 3
 
         ser = Series(data, dtype=dt)
         exp = Series(exp, dtype=dt)
-        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
+        if using_infer_string:
+            tm.assert_extension_array_equal(algos.mode(ser.values), exp.values)
+        else:
+            tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
         tm.assert_series_equal(ser.mode(), exp)
 
     def test_datelike_mode(self):

From 1089eb30948e1f2d5270198d23006b3c4583e5a9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 19:41:56 +0200
Subject: [PATCH 08/21] ignore typing issue for now

---
 pandas/core/arrays/categorical.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index cddb70dad6dba..19052701a59ab 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -544,7 +544,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
         return res
 
     @overload
-    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> ArrayLike: ...
+    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ...
 
     @overload
     def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ...
@@ -2690,7 +2690,7 @@ def _str_get_dummies(self, sep: str = "|"):
 
         if using_string_dtype():
             return NumpyExtensionArray(
-                self.astype(str).to_numpy(na_value="NaN")
+                self.astype(str).to_numpy(na_value="NaN")  # type: ignore[attr-defined]
             )._str_get_dummies(sep)
 
         return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)

From 8f7e96874b54df681ca9bacd1d3b8401b3909eb0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 20:23:59 +0200
Subject: [PATCH 09/21] move to common.py

---
 pandas/core/dtypes/base.py   | 8 --------
 pandas/core/dtypes/common.py | 8 ++++++++
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index 42b51b11ad41f..d8a42d83b6c54 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -14,8 +14,6 @@
 
 import numpy as np
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import missing as libmissing
 from pandas._libs.hashtable import object_hash
 from pandas._libs.properties import cache_readonly
@@ -564,12 +562,6 @@ def find(
         return the first matching dtype, otherwise return None
         """
         if not isinstance(dtype, str):
-            # builtin aliases
-            if dtype is str and using_string_dtype():
-                from pandas.core.arrays.string_ import StringDtype
-
-                return StringDtype(na_value=np.nan)
-
             dtype_type: type_t
             if not isinstance(dtype, type):
                 dtype_type = type(dtype)
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index bcf1ade9b0320..95c4fde422af5 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -12,6 +12,8 @@
 
 import numpy as np
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import (
     Interval,
     Period,
@@ -1703,6 +1705,12 @@ def pandas_dtype(dtype) -> DtypeObj:
     elif isinstance(dtype, (np.dtype, ExtensionDtype)):
         return dtype
 
+    # builtin aliases
+    if dtype is str and using_string_dtype():
+        from pandas.core.arrays.string_ import StringDtype
+
+        return StringDtype(na_value=np.nan)
+
     # registered extension types
     result = registry.find(dtype)
     if result is not None:

From 15f45d27be5b4b02167a3216ed1b91c65262b8ec Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 20:27:27 +0200
Subject: [PATCH 10/21] simplify Categorical._str_get_dummies

---
 pandas/core/arrays/categorical.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 19052701a59ab..0cbddadd1b5a6 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -13,10 +13,7 @@
 
 import numpy as np
 
-from pandas._config import (
-    get_option,
-    using_string_dtype,
-)
+from pandas._config import get_option
 
 from pandas._libs import (
     NaT,
@@ -2688,12 +2685,9 @@ def _str_get_dummies(self, sep: str = "|"):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray
 
-        if using_string_dtype():
-            return NumpyExtensionArray(
-                self.astype(str).to_numpy(na_value="NaN")  # type: ignore[attr-defined]
-            )._str_get_dummies(sep)
-
-        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
+        return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies(
+            sep
+        )
 
     # ------------------------------------------------------------------------
     # GroupBy Methods

From 4464fb113a61bde7749fc19db78efb92b65f2f03 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 20:36:44 +0200
Subject: [PATCH 11/21] small cleanup

---
 pandas/core/indexes/base.py         | 2 +-
 pandas/core/strings/object_array.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 0a52a3691bc82..3b5751f188912 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3658,6 +3658,7 @@ def get_indexer(
         method = clean_reindex_fill_method(method)
         orig_target = target
         target = self._maybe_cast_listlike_indexer(target)
+
         self._check_indexing_method(method, limit, tolerance)
 
         if not self._index_as_unique:
@@ -7730,7 +7731,6 @@ def get_values_for_csv(
         values = cast("IntervalArray", values)
         mask = values.isna()
         if not quoting:
-            # TODO
             result = np.asarray(values).astype(str)
         else:
             result = np.array(values, dtype=object, copy=True)
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 28bd943688b79..c6b18d7049c57 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -405,7 +405,6 @@ def _str_get_dummies(self, sep: str = "|"):
         try:
             arr = sep + arr + sep
         except (TypeError, NotImplementedError):
-            # TODO
             arr = sep + arr.astype(str) + sep
 
         tags: set[str] = set()

From 650f694db269289c80469947659263ae9d1a4b29 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 21:27:27 +0200
Subject: [PATCH 12/21] fix ensure_string_array to not modify extension arrays
 inplace

---
 pandas/_libs/lib.pyx                  |  1 -
 pandas/core/arrays/string_.py         |  2 +-
 pandas/core/arrays/string_arrow.py    |  2 +-
 pandas/tests/copy_view/test_astype.py | 16 +++-------------
 4 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 0a51dcf117331..f375e4040f3ad 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -760,7 +760,6 @@ cpdef ndarray[object] ensure_string_array(
             out = arr.astype(str).astype(object)
             out[arr.isna()] = na_value
             return out
-        arr = arr.to_numpy(dtype=object)
     elif not util.is_array(arr):
         arr = np.array(arr, dtype="object")
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 28d315a28919e..143a13c54dbbb 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -625,7 +625,7 @@ def _from_sequence(
                 #  zero_copy_only to True which caused problems see GH#52076
                 scalars = np.array(scalars)
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
-            result = lib.ensure_string_array(scalars, na_value=na_value, copy=True)
+            result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy)
 
         # Manually creating new array avoids the validation step in the __init__, so is
         # faster. Refactor need for validation?
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 68d7606cc3b3f..e8e74b0ba1215 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -200,7 +200,7 @@ def _from_sequence(
             return cls(pc.cast(scalars, pa.large_string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=True)
+        result = lib.ensure_string_array(scalars, copy=copy)
         return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
 
     @classmethod
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index d6046f86be79f..de56d5e4a07ee 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -92,12 +92,7 @@ def test_astype_string_and_object(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
     df_orig = df.copy()
     df2 = df.astype(new_dtype)
-    if new_dtype == "string":
-        # cast to string has to copy to avoid mutating the original during
-        # the call to ensure_string_array -> never a delayed copy
-        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-    else:
-        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
     df2.iloc[0, 0] = "x"
     tm.assert_frame_equal(df, df_orig)
@@ -110,12 +105,7 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
     df2 = df.astype(new_dtype)
     df_orig = df2.copy()
-    if new_dtype == "string":
-        # cast to string has to copy to avoid mutating the original during
-        # the call to ensure_string_array -> never a delayed copy
-        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-    else:
-        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
     df.iloc[0, 0] = "x"
     tm.assert_frame_equal(df2, df_orig)
@@ -230,7 +220,7 @@ def test_convert_dtypes():
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
-    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
     assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
     assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
     assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

From 9164dbb90caa563c99215e5ab8bb3c19d82c34c3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 3 Sep 2024 09:47:32 +0200
Subject: [PATCH 13/21] fix ensure_string_array once more + fix
 is_extension_array_dtype for str

---
 pandas/_libs/lib.pyx                 | 18 ++++++++++++------
 pandas/core/dtypes/common.py         |  3 +++
 pandas/tests/extension/test_arrow.py |  8 --------
 pandas/tests/test_algos.py           |  2 +-
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index f375e4040f3ad..35351fa0e371f 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
     copy : bool, default True
-        Whether to ensure that a new array is returned.
+        Whether to ensure that a new array is returned. When True, a new array
+        is always returned. When False, a new array is only returned when needed
+        to avoid mutating the input array.
     skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
         (e.g. if False, NaN becomes 'nan').
@@ -765,11 +767,15 @@ cpdef ndarray[object] ensure_string_array(
 
     result = np.asarray(arr, dtype="object")
 
-    if copy and (result is arr or np.shares_memory(arr, result)):
-        # GH#54654
-        result = result.copy()
-    elif not copy and result is arr:
-        already_copied = False
+    if result is arr or np.may_share_memory(arr, result):
+        # if np.asarray(..) did not make a copy of the input arr, we still need
+        #  to do that to avoid mutating the input array
+        # GH#54654: the may_share_memory check is needed for rare cases where
+        #  np.asarray returns a new object without making a copy of the actual data
+        if copy:
+            result = result.copy()
+        else:
+            already_copied = False
     elif not copy and not result.flags.writeable:
         # Weird edge case where result is a view
         already_copied = False
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 95c4fde422af5..6e503704efd99 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1410,6 +1410,9 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
     elif isinstance(dtype, np.dtype):
         return False
     else:
+        # TODO ugly -> move into registry find()? Or make this work with pandas_dtype?
+        if dtype is str and using_string_dtype():
+            return True
         return registry.find(dtype) is not None
 
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 028409e720129..1c8577844cd32 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -301,14 +301,6 @@ def test_astype_str(self, data, request):
                     reason=f"For {pa_dtype} .astype(str) decodes.",
                 )
             )
-        elif (
-            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
-        ) or pa.types.is_duration(pa_dtype):
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
-                )
-            )
         super().test_astype_str(data)
 
     def test_from_dtype(self, data, request):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 696ea75cd8995..dac74a0e32a42 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1883,7 +1883,7 @@ def test_strobj_multi_char(self, dt, using_infer_string):
 
         ser = Series(data, dtype=dt)
         exp = Series(exp, dtype=dt)
-        if using_infer_string:
+        if using_infer_string and dt is str:
             tm.assert_extension_array_equal(algos.mode(ser.values), exp.values)
         else:
             tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)

From cf9f855eb004b9370671cd203dad76c724de5bff Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 3 Sep 2024 11:28:17 +0200
Subject: [PATCH 14/21] still xfail TestArrowArray::test_astype_str when not
 using infer_string

---
 pandas/tests/extension/test_arrow.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 1c8577844cd32..e906c08a50257 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -293,7 +293,7 @@ def test_map(self, data_missing, na_action):
                 expected = data_missing.to_numpy()
             tm.assert_numpy_array_equal(result, expected)
 
-    def test_astype_str(self, data, request):
+    def test_astype_str(self, data, request, using_infer_string):
         pa_dtype = data.dtype.pyarrow_dtype
         if pa.types.is_binary(pa_dtype):
             request.applymarker(
@@ -301,6 +301,15 @@ def test_astype_str(self, data, request):
                     reason=f"For {pa_dtype} .astype(str) decodes.",
                 )
             )
+        elif not using_infer_string and (
+            (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+            or pa.types.is_duration(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
+                )
+            )
         super().test_astype_str(data)
 
     def test_from_dtype(self, data, request):

From bd79fc9b6ced05c869d3c0d8715eb3bfc2247ad9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 3 Sep 2024 11:51:36 +0200
Subject: [PATCH 15/21] ensure maybe_convert_objects copies object dtype input
 array when inferring StringDtype

---
 pandas/_libs/lib.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 35351fa0e371f..919495e9a666a 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2712,13 +2712,17 @@ def maybe_convert_objects(ndarray[object] objects,
             from pandas.core.arrays.string_ import StringDtype
 
             dtype = StringDtype()
-            return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
+            return dtype.construct_array_type()._from_sequence(
+                objects, dtype=dtype, copy=True
+            )
 
         elif using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
             dtype = StringDtype(na_value=np.nan)
-            return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
+            return dtype.construct_array_type()._from_sequence(
+                objects, dtype=dtype, copy=True
+            )
 
         seen.object_ = True
     elif seen.interval_:

From 4c775d118184eb6d4bc11e960811182df4bbd420 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 3 Sep 2024 18:16:22 +0200
Subject: [PATCH 16/21] update test_1d_object_array_does_not_copy test

---
 pandas/tests/frame/test_constructors.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index ad78e75c6a400..5df9742500e31 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -24,7 +24,6 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
-from pandas.compat import HAS_PYARROW
 from pandas.compat.numpy import np_version_gt2
 from pandas.errors import IntCastingNaNError
 
@@ -300,11 +299,22 @@ def test_constructor_dtype_nocast_view_2d_array(self):
         df2 = DataFrame(df.values, dtype=df[0].dtype)
         assert df2._mgr.blocks[0].values.flags.c_contiguous
 
-    @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies")
-    def test_1d_object_array_does_not_copy(self):
+    def test_1d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array(["a", "b"], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                # TODO(infer_string): this should be fixed to still share memory?
+                assert not np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)
 
     @pytest.mark.xfail(using_string_dtype(), reason="conversion copies")

From b0276b2b1c8d78001142510046f660e1fffa778f Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 10 Sep 2024 09:28:38 +0200
Subject: [PATCH 17/21] update constructor copy test + do not copy in
 maybe_convert_objects?

---
 pandas/_libs/lib.pyx                    |  9 +++------
 pandas/tests/frame/test_constructors.py | 17 +++++++++++++----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e7b93bfe510e2..08ed997cc8d78 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -762,6 +762,7 @@ cpdef ndarray[object] ensure_string_array(
             out = arr.astype(str).astype(object)
             out[arr.isna()] = na_value
             return out
+        arr = arr.to_numpy(dtype=object)
     elif not util.is_array(arr):
         arr = np.array(arr, dtype="object")
 
@@ -2735,17 +2736,13 @@ def maybe_convert_objects(ndarray[object] objects,
             from pandas.core.arrays.string_ import StringDtype
 
             dtype = StringDtype()
-            return dtype.construct_array_type()._from_sequence(
-                objects, dtype=dtype, copy=True
-            )
+            return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         elif using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
             dtype = StringDtype(na_value=np.nan)
-            return dtype.construct_array_type()._from_sequence(
-                objects, dtype=dtype, copy=True
-            )
+            return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True
     elif seen.interval_:
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 298666590a3fb..0a924aa393be5 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -309,19 +309,28 @@ def test_1d_object_array_does_not_copy(self, using_infer_string):
                 # no numpy arrays to compare
                 pass
             else:
-                # TODO(infer_string): this should be fixed to still share memory?
-                assert not np.shares_memory(df[0].to_numpy(), arr)
+                assert np.shares_memory(df[0].to_numpy(), arr)
         else:
             assert np.shares_memory(df.values, arr)
 
         df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="conversion copies")
-    def test_2d_object_array_does_not_copy(self):
+    def test_2d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)
 
     def test_constructor_dtype_list_data(self):

From d413fc649c230a846246902f4f1992ca84f62220 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 10 Sep 2024 13:57:41 +0200
Subject: [PATCH 18/21] skip str.get_dummies test for now

---
 pandas/tests/strings/test_get_dummies.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 0656f505dc745..3b989e284ca25 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -96,6 +98,7 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
 
 
 # GH#47872
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_get_dummies_with_str_dtype(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
     result = s.str.get_dummies("|", dtype=str)

From db8900cc05a17a0f8cf18fa7cb0ababda4dde2c4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 21 Sep 2024 10:55:58 +0200
Subject: [PATCH 19/21] use pandas_dtype() instead of registry.find

---
 pandas/core/dtypes/common.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index ddf21473d8017..d72e489708b51 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1450,10 +1450,11 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
     elif isinstance(dtype, np.dtype):
         return False
     else:
-        # TODO ugly -> move into registry find()? Or make this work with pandas_dtype?
-        if dtype is str and using_string_dtype():
-            return True
-        return registry.find(dtype) is not None
+        try:
+            dtype = pandas_dtype(dtype)
+        except TypeError:
+            return False
+        return isinstance(dtype, ExtensionDtype)
 
 
 def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool:

From e6aad17b36296520dee47e462158f6b5313ef0a1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 21 Sep 2024 11:25:48 +0200
Subject: [PATCH 20/21] fix corner cases for calling pandas_dtype

---
 pandas/core/dtypes/common.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index d72e489708b51..daa073d0b4eda 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1451,8 +1451,12 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
         return False
     else:
         try:
-            dtype = pandas_dtype(dtype)
-        except TypeError:
+            with warnings.catch_warnings():
+                # pandas_dtype(..) can raise UserWarning for class input
+                warnings.simplefilter("ignore", UserWarning)
+                dtype = pandas_dtype(dtype)
+        except (TypeError, ValueError):
+            # np.dtype(..) can raise ValueError
             return False
         return isinstance(dtype, ExtensionDtype)
 

From 4e6cf04cf41b66a0deb9a7073d24ebcddaeaad21 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 21 Sep 2024 15:44:15 +0200
Subject: [PATCH 21/21] add TODO comment in ensure_string_array

---
 pandas/_libs/lib.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e0c39f1769210..8af48a861967a 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -757,6 +757,9 @@ cpdef ndarray[object] ensure_string_array(
         if (
             hasattr(arr, "dtype")
             and arr.dtype.kind in "mM"
+            # TODO: we should add a custom ArrowExtensionArray.astype implementation
+            # that handles astype(str) specifically, avoiding ending up here and
+            # then we can remove the below check for `_pa_array` (for ArrowEA)
             and not hasattr(arr, "_pa_array")
         ):
             # dtype check to exclude DataFrame