diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 47a31954b9d6c..75f58f565dd6f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) 
did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: the may_share_memory check is needed for rare cases where + # np.asarray returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 039d868bccd16..379e557211cad 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -124,11 +124,25 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: new_mask = np.zeros(arr.shape, dtype=np.bool_) new_mask[arr_mask] = arr[arr_mask] == x else: - new_mask = arr == x + # GH#47101 + # Avoid AttributeError ('bool' has no attribute 'to_numpy') by first + # comparing elementwise with np.equal (which broadcasts), and then + # explicitly handling a plain bool result before calling to_numpy(). + # If np.equal raises TypeError for an uncastable type, we fall back + # to the original equality check. 
+ try: + # In case of an uncastable type, this will emit TypeError + new_mask = np.equal(arr, x) # type: ignore[arg-type] + except TypeError: + # Old behaviour for uncastable types + new_mask = arr == x if not isinstance(new_mask, np.ndarray): # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + if isinstance(new_mask, bool): + new_mask = np.array([new_mask], dtype=bool) + else: + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) mask |= new_mask if na_mask.any(): diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..80c30f2d0c26e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -7,7 +7,6 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) -@td.skip_if_no("pyarrow") -def test_astype_string_read_only_on_pickle_roundrip(): +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def 
test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) base_copy = pickle.loads(pickle.dumps(base)) base_copy._values.flags.writeable = False - base_copy.astype("string[pyarrow]") + base_copy.astype(any_string_dtype) tm.assert_series_equal(base, base_copy) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 6b872bf48d550..cfd9a21a79c03 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1518,3 +1518,10 @@ def test_replace_object_splitting(self, using_infer_string): assert len(df._mgr.blocks) == 2 else: assert len(df._mgr.blocks) == 1 + + def test_replace_bool_to_numpy_attributeerror(self): + # GH#47101 + pass_pre_patch = DataFrame({"d": [None]}) + tm.assert_frame_equal(pass_pre_patch, pass_pre_patch.replace("", pd.NA)) + fail_pre_patch = DataFrame({"d": [pd.NA]}) + tm.assert_frame_equal(fail_pre_patch, fail_pre_patch.replace("", pd.NA)) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan