From f5fb4744450485c1e4fd322c99b1ab6f38d29a8e Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 30 Jan 2023 21:46:43 +0100
Subject: [PATCH 1/4] BUG: astype to string modifying input array inplace

---
 doc/source/whatsnew/v2.0.0.rst            |  1 +
 pandas/core/arrays/string_.py             |  6 ++++++
 pandas/core/arrays/string_arrow.py        |  7 +++++++
 pandas/tests/frame/methods/test_astype.py | 10 ++++++++++
 4 files changed, 24 insertions(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index e90717efc10b5..3402c6a7d5502 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1021,6 +1021,7 @@ Conversion
 - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
 - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
 - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`)
+- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:``)
 - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`)
 - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`)
 - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 9b26db07fc28f..60755f3a42cf5 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -357,6 +357,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
+            if is_object_dtype(scalars):
+                # copy if we get object dtype with non-string values to avoid
+                # modifying input inplace
+                inferred = lib.infer_dtype(scalars, skipna=False)
+                if inferred != "string":
+                    copy = True
             result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
 
         # Manually creating new array avoids the validation step in the __init__, so is
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4aebe61412866..11a797b32cb58 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -135,6 +135,13 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
+        if is_object_dtype(scalars):
+            # copy if we get object dtype with non-string values to avoid
+            # modifying input inplace
+            inferred = lib.infer_dtype(scalars, skipna=False)
+            if inferred != "string":
+                copy = True
+
         # convert non-na-likes to str
         result = lib.ensure_string_array(scalars, copy=copy)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 1f43b51d4808f..7840d20196160 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -879,3 +879,13 @@ def test_astype_copies(dtype):
     df.iloc[0, 0] = 100
     expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
+def test_astype_to_string_not_modifying_input(string_storage, val):
+    # GH#
+    df = DataFrame({"a": ["a", "b", val]})
+    expected = df.copy()
+    with option_context("mode.string_storage", string_storage):
+        df.astype("string", copy=False)
+    tm.assert_frame_equal(df, expected)

From c338f94d4676081f37f460df1f81bddc798ff536 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 30 Jan 2023 21:48:04 +0100
Subject: [PATCH 2/4] Add gh ref

---
 doc/source/whatsnew/v2.0.0.rst            | 2 +-
 pandas/tests/frame/methods/test_astype.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 3402c6a7d5502..c667e82e5c3d7 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1021,7 +1021,7 @@ Conversion
 - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
 - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
 - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`)
-- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:``)
+- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:`51073`)
 - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`)
 - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`)
 - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 7840d20196160..8a1a2783b5dc6 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -883,7 +883,7 @@ def test_astype_copies(dtype):
 
 @pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
 def test_astype_to_string_not_modifying_input(string_storage, val):
-    # GH#
+    # GH#51073
     df = DataFrame({"a": ["a", "b", val]})
     expected = df.copy()
     with option_context("mode.string_storage", string_storage):

From f296e37efa3b0b8e8b91bd0bdbd00a35bc31ea83 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 31 Jan 2023 21:48:57 +0100
Subject: [PATCH 3/4] Move to cython

---
 pandas/_libs/lib.pyx                       |  6 ++++++
 pandas/core/arrays/string_.py              |  6 ------
 pandas/core/arrays/string_arrow.py         |  7 -------
 pandas/tests/arrays/string_/test_string.py | 10 ++--------
 4 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 16d5bbaad9de9..ca81acb6c009a 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -739,6 +739,7 @@ cpdef ndarray[object] ensure_string_array(
     """
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        bint already_copied = True
 
     if hasattr(arr, "to_numpy"):
 
@@ -757,6 +758,8 @@ cpdef ndarray[object] ensure_string_array(
 
     if copy and result is arr:
         result = result.copy()
+    elif not copy and result is arr:
+        already_copied = False
 
     if issubclass(arr.dtype.type, np.str_):
         # short-circuit, all elements are str
@@ -768,6 +771,9 @@ cpdef ndarray[object] ensure_string_array(
         if isinstance(val, str):
             continue
 
+        elif not already_copied:
+            result = result.copy()
+
         if not checknull(val):
             if not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 60755f3a42cf5..9b26db07fc28f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -357,12 +357,6 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
-            if is_object_dtype(scalars):
-                # copy if we get object dtype with non-string values to avoid
-                # modifying input inplace
-                inferred = lib.infer_dtype(scalars, skipna=False)
-                if inferred != "string":
-                    copy = True
             result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
 
         # Manually creating new array avoids the validation step in the __init__, so is
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 11a797b32cb58..4aebe61412866 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -135,13 +135,6 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
-        if is_object_dtype(scalars):
-            # copy if we get object dtype with non-string values to avoid
-            # modifying input inplace
-            inferred = lib.infer_dtype(scalars, skipna=False)
-            if inferred != "string":
-                copy = True
-
         # convert non-na-likes to str
         result = lib.ensure_string_array(scalars, copy=copy)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index e1ea001819b1c..32017cdc1c4b7 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -290,13 +290,9 @@ def test_constructor_nan_like(na):
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy, cls, request):
-    if cls is ArrowStringArray and copy is False:
-        mark = pytest.mark.xfail(
-            raises=AssertionError, reason="numpy array are different"
-        )
-        request.node.add_marker(mark)
 
     nan_arr = np.array(["a", np.nan], dtype=object)
+    expected_input = nan_arr.copy()
     na_arr = np.array(["a", pd.NA], dtype=object)
 
     result = cls._from_sequence(nan_arr, copy=copy)
@@ -309,9 +305,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
         expected = cls(na_arr)
 
     tm.assert_extension_array_equal(result, expected)
-
-    expected = nan_arr if copy else na_arr
-    tm.assert_numpy_array_equal(nan_arr, expected)
+    tm.assert_numpy_array_equal(nan_arr, expected_input)
 
 
 def test_astype_int(dtype):

From c720aada79851776e03e618e7c352389a39176fa Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 31 Jan 2023 21:51:19 +0100
Subject: [PATCH 4/4] Move to cython

---
 pandas/_libs/lib.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index ca81acb6c009a..9eeea2410eee7 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -773,6 +773,7 @@ cpdef ndarray[object] ensure_string_array(
 
         elif not already_copied:
             result = result.copy()
+            already_copied = True
 
         if not checknull(val):
             if not util.is_float_object(val):