Skip to content

Commit 337faf3

Browse files
authored
BUG: astype to string modifying input array inplace (#51073)
1 parent 0105aa2 commit 337faf3

File tree

4 files changed

+20
-8
lines changed

4 files changed

+20
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,7 @@ Conversion
10571057
- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
10581058
- Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
10591059
- Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`)
1060+
- Bug in :meth:`DataFrame.astype` modifying the input array in place when converting to ``string`` with ``copy=False`` (:issue:`51073`)
10601061
- Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`)
10611062
- Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`)
10621063
- Bug in :func:`to_datetime` not respecting the ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)

pandas/_libs/lib.pyx

+7
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,7 @@ cpdef ndarray[object] ensure_string_array(
739739
"""
740740
cdef:
741741
Py_ssize_t i = 0, n = len(arr)
742+
bint already_copied = True
742743

743744
if hasattr(arr, "to_numpy"):
744745

@@ -757,6 +758,8 @@ cpdef ndarray[object] ensure_string_array(
757758

758759
if copy and result is arr:
759760
result = result.copy()
761+
elif not copy and result is arr:
762+
already_copied = False
760763

761764
if issubclass(arr.dtype.type, np.str_):
762765
# short-circuit, all elements are str
@@ -768,6 +771,10 @@ cpdef ndarray[object] ensure_string_array(
768771
if isinstance(val, str):
769772
continue
770773

774+
elif not already_copied:
775+
result = result.copy()
776+
already_copied = True
777+
771778
if not checknull(val):
772779
if not util.is_float_object(val):
773780
# f"{val}" is faster than str(val)

pandas/tests/arrays/string_/test_string.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -290,13 +290,9 @@ def test_constructor_nan_like(na):
290290

291291
@pytest.mark.parametrize("copy", [True, False])
292292
def test_from_sequence_no_mutate(copy, cls, request):
293-
if cls is ArrowStringArray and copy is False:
294-
mark = pytest.mark.xfail(
295-
raises=AssertionError, reason="numpy array are different"
296-
)
297-
request.node.add_marker(mark)
298293

299294
nan_arr = np.array(["a", np.nan], dtype=object)
295+
expected_input = nan_arr.copy()
300296
na_arr = np.array(["a", pd.NA], dtype=object)
301297

302298
result = cls._from_sequence(nan_arr, copy=copy)
@@ -309,9 +305,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
309305
expected = cls(na_arr)
310306

311307
tm.assert_extension_array_equal(result, expected)
312-
313-
expected = nan_arr if copy else na_arr
314-
tm.assert_numpy_array_equal(nan_arr, expected)
308+
tm.assert_numpy_array_equal(nan_arr, expected_input)
315309

316310

317311
def test_astype_int(dtype):

pandas/tests/frame/methods/test_astype.py

+10
Original file line numberDiff line numberDiff line change
@@ -879,3 +879,13 @@ def test_astype_copies(dtype):
879879
df.iloc[0, 0] = 100
880880
expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
881881
tm.assert_frame_equal(result, expected)
882+
883+
884+
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
885+
def test_astype_to_string_not_modifying_input(string_storage, val):
886+
# GH#51073
887+
df = DataFrame({"a": ["a", "b", val]})
888+
expected = df.copy()
889+
with option_context("mode.string_storage", string_storage):
890+
df.astype("string", copy=False)
891+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)