pandas-dev · jorisvandenbossche · Feb 1, 2023 · Jan 30, 2023 · Jan 30, 2023 · Jan 31, 2023
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1021,6 +1021,7 @@ Conversion
 - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
 - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
 - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`)
+- Bug in :meth:`DataFrame.astype` modifying the input array inplace when converting to ``string`` dtype with ``copy=False`` (:issue:`51073`)
 - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`)
 - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`)
 - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -357,6 +357,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
+            if is_object_dtype(scalars):
+                # copy if we get object dtype with non-string values to avoid
+                # modifying input inplace
+                inferred = lib.infer_dtype(scalars, skipna=False)
+                if inferred != "string":
+                    copy = True
             result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
 
         # Manually creating new array avoids the validation step in the __init__, so is

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -135,6 +135,13 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
+        if is_object_dtype(scalars):
+            # copy if we get object dtype with non-string values to avoid
+            # modifying input inplace
+            inferred = lib.infer_dtype(scalars, skipna=False)
+            if inferred != "string":
+                copy = True
+
         # convert non-na-likes to str
         result = lib.ensure_string_array(scalars, copy=copy)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))

diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
@@ -879,3 +879,13 @@ def test_astype_copies(dtype):
     df.iloc[0, 0] = 100
     expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
+def test_astype_to_string_not_modifying_input(string_storage, val):
+    # GH#51073: astype("string", copy=False) must leave the input frame unmodified
+    df = DataFrame({"a": ["a", "b", val]})
+    expected = df.copy()
+    with option_context("mode.string_storage", string_storage):
+        df.astype("string", copy=False)
+    tm.assert_frame_equal(df, expected)