Skip to content

Fix issue #47101 #59766

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
Expand Down Expand Up @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
already_copied = False
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,25 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
new_mask = np.zeros(arr.shape, dtype=np.bool_)
new_mask[arr_mask] = arr[arr_mask] == x
else:
new_mask = arr == x
# GH#47101
# Fix where type bool has no attribute to_numpy() by first
# attempting to broadcast with np.equal for some cases, and then
# an explicit type check when checking the mask for any straggling
# cases. Where a literal comparison would fail np.equal we fall back
# to the original equality check.
try:
# In case of an uncastable type, this will emit TypeError
new_mask = np.equal(arr, x) # type: ignore[arg-type]
except TypeError:
# Old behaviour for uncastable types
new_mask = arr == x

if not isinstance(new_mask, np.ndarray):
# usually BooleanArray
new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
if isinstance(new_mask, bool):
new_mask = np.array([new_mask], dtype=bool)
else:
new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
mask |= new_mask

if na_mask.any():
Expand Down
18 changes: 13 additions & 5 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Expand Down Expand Up @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
tm.assert_frame_equal(df2, df_orig)


def test_astype_string_copy_on_pickle_roundrip():
def test_astype_str_copy_on_pickle_roundrip():
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
Expand All @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
tm.assert_series_equal(base, base_copy)


@td.skip_if_no("pyarrow")
def test_astype_string_read_only_on_pickle_roundrip():
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter read-only array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy._values.flags.writeable = False
base_copy.astype("string[pyarrow]")
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1518,3 +1518,10 @@ def test_replace_object_splitting(self, using_infer_string):
assert len(df._mgr.blocks) == 2
else:
assert len(df._mgr.blocks) == 1

def test_replace_bool_to_numpy_attributeerror(self):
# GH#47101
pass_pre_patch = DataFrame({"d": [None]})
tm.assert_frame_equal(pass_pre_patch, pass_pre_patch.replace("", pd.NA))
fail_pre_patch = DataFrame({"d": [pd.NA]})
tm.assert_frame_equal(fail_pre_patch, fail_pre_patch.replace("", pd.NA))
14 changes: 14 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pickle

import numpy as np
import pytest

Expand Down Expand Up @@ -283,3 +285,15 @@ def test_no_default_pickle():
# GH#40397
obj = tm.round_trip_pickle(lib.no_default)
assert obj is lib.no_default


def test_ensure_string_array_copy():
# ensure the original array is not modified in case of copy=False with
# pickle-roundtripped object dtype array
# https://github.com/pandas-dev/pandas/issues/54654
arr = np.array(["a", None], dtype=object)
arr = pickle.loads(pickle.dumps(arr))
result = lib.ensure_string_array(arr, copy=False)
assert not np.shares_memory(arr, result)
assert arr[1] is None
assert result[1] is np.nan