From f5fb4744450485c1e4fd322c99b1ab6f38d29a8e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 30 Jan 2023 21:46:43 +0100 Subject: [PATCH 1/4] BUG: astype to string modifying input array inplace --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/string_.py | 6 ++++++ pandas/core/arrays/string_arrow.py | 7 +++++++ pandas/tests/frame/methods/test_astype.py | 10 ++++++++++ 4 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e90717efc10b5..3402c6a7d5502 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1021,6 +1021,7 @@ Conversion - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) +- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:``) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9b26db07fc28f..60755f3a42cf5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -357,6 +357,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal else: # convert non-na-likes to str, and nan-likes to StringDtype().na_value + if is_object_dtype(scalars): + # copy if we get object dtype with non-string values to avoid + # modifying input inplace + inferred = lib.infer_dtype(scalars, skipna=False) + if inferred != "string": + copy = True result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4aebe61412866..11a797b32cb58 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -135,6 +135,13 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) return cls(pa.array(result, mask=na_values, type=pa.string())) + if is_object_dtype(scalars): + # copy if we get object dtype with non-string values to avoid + # modifying input inplace + inferred = lib.infer_dtype(scalars, skipna=False) + if inferred != "string": + copy = True + # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) return cls(pa.array(result, type=pa.string(), from_pandas=True)) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1f43b51d4808f..7840d20196160 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -879,3 +879,13 @@ def test_astype_copies(dtype): df.iloc[0, 0] = 100 expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_not_modifying_input(string_storage, val): + # GH# + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + with option_context("mode.string_storage", string_storage): + df.astype("string", copy=False) + tm.assert_frame_equal(df, expected) From c338f94d4676081f37f460df1f81bddc798ff536 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 30 Jan 2023 21:48:04 +0100 Subject: [PATCH 2/4] Add gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/frame/methods/test_astype.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3402c6a7d5502..c667e82e5c3d7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1021,7 +1021,7 @@ Conversion - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) -- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:``) +- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:`51073`) - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 7840d20196160..8a1a2783b5dc6 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -883,7 +883,7 @@ def test_astype_copies(dtype): @pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) def test_astype_to_string_not_modifying_input(string_storage, val): - # GH# + # GH#51073 df = DataFrame({"a": ["a", "b", val]}) expected = df.copy() with option_context("mode.string_storage", string_storage): From f296e37efa3b0b8e8b91bd0bdbd00a35bc31ea83 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 31 Jan 2023 21:48:57 +0100 Subject: [PATCH 3/4] Move to cython --- pandas/_libs/lib.pyx | 6 ++++++ pandas/core/arrays/string_.py | 6 ------ pandas/core/arrays/string_arrow.py | 7 ------- pandas/tests/arrays/string_/test_string.py | 10 ++-------- 4 files changed, 8 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 16d5bbaad9de9..ca81acb6c009a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -739,6 +739,7 @@ cpdef ndarray[object] ensure_string_array( """ cdef: Py_ssize_t i = 0, n = len(arr) + bint already_copied = True if hasattr(arr, "to_numpy"): @@ -757,6 +758,8 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + elif not copy and result is arr: + already_copied = False if issubclass(arr.dtype.type, np.str_): # short-circuit, all elements are str @@ -768,6 +771,9 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue + elif not already_copied: + result = result.copy() + if not checknull(val): if not util.is_float_object(val): # f"{val}" is faster than str(val) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 60755f3a42cf5..9b26db07fc28f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -357,12 +357,6 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal else: # convert non-na-likes to str, and nan-likes to StringDtype().na_value - if is_object_dtype(scalars): - # copy if we get object dtype with non-string values to avoid - # modifying input inplace - inferred = lib.infer_dtype(scalars, skipna=False) - if inferred != "string": - copy = True result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 11a797b32cb58..4aebe61412866 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -135,13 +135,6 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) return cls(pa.array(result, mask=na_values, type=pa.string())) - if is_object_dtype(scalars): - # copy if we get object dtype with non-string values to avoid - # modifying input inplace - inferred = lib.infer_dtype(scalars, skipna=False) - if inferred != "string": - copy = True - # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) return cls(pa.array(result, type=pa.string(), from_pandas=True)) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e1ea001819b1c..32017cdc1c4b7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -290,13 +290,9 @@ def test_constructor_nan_like(na): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): - if cls is ArrowStringArray and copy is False: - mark = pytest.mark.xfail( - raises=AssertionError, reason="numpy array are different" - ) - request.node.add_marker(mark) nan_arr = np.array(["a", np.nan], dtype=object) + expected_input = nan_arr.copy() na_arr = np.array(["a", pd.NA], dtype=object) result = cls._from_sequence(nan_arr, copy=copy) @@ -309,9 +305,7 @@ def test_from_sequence_no_mutate(copy, cls, request): expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) - - expected = nan_arr if copy else na_arr - tm.assert_numpy_array_equal(nan_arr, expected) + tm.assert_numpy_array_equal(nan_arr, expected_input) def test_astype_int(dtype): From c720aada79851776e03e618e7c352389a39176fa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 31 Jan 2023 21:51:19 +0100 Subject: [PATCH 4/4] Move to cython --- pandas/_libs/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ca81acb6c009a..9eeea2410eee7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -773,6 +773,7 @@ cpdef ndarray[object] ensure_string_array( elif not already_copied: result = result.copy() + already_copied = True if not checknull(val): if not util.is_float_object(val):