diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a23efc802c74e..0bc9edd1046bc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1023,6 +1023,7 @@ Conversion - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) +- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:`51073`) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2931f178a7c1c..2a7c793c0096c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -739,6 +739,7 @@ cpdef ndarray[object] ensure_string_array( """ cdef: Py_ssize_t i = 0, n = len(arr) + bint already_copied = True if hasattr(arr, "to_numpy"): @@ -757,6 +758,8 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + elif not copy and result is arr: + already_copied = False if issubclass(arr.dtype.type, np.str_): # short-circuit, all elements are str @@ -768,6 +771,10 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue + elif not already_copied: + result = result.copy() + already_copied = True + if not checknull(val): if not util.is_float_object(val): # f"{val}" is faster than str(val) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e1ea001819b1c..32017cdc1c4b7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -290,13 +290,9 @@ def test_constructor_nan_like(na): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): - if cls is ArrowStringArray and copy is False: - mark = pytest.mark.xfail( - raises=AssertionError, reason="numpy array are different" - ) - request.node.add_marker(mark) nan_arr = np.array(["a", np.nan], dtype=object) + expected_input = nan_arr.copy() na_arr = np.array(["a", pd.NA], dtype=object) result = cls._from_sequence(nan_arr, copy=copy) @@ -309,9 +305,7 @@ def test_from_sequence_no_mutate(copy, cls, request): expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) - - expected = nan_arr if copy else na_arr - tm.assert_numpy_array_equal(nan_arr, expected) + tm.assert_numpy_array_equal(nan_arr, expected_input) def test_astype_int(dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1f43b51d4808f..8a1a2783b5dc6 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -879,3 +879,13 @@ def test_astype_copies(dtype): df.iloc[0, 0] = 100 expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_not_modifying_input(string_storage, val): + # GH#51073 + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + with option_context("mode.string_storage", string_storage): + df.astype("string", copy=False) + tm.assert_frame_equal(df, expected)