Skip to content

BUG: fix IntegerArray astype with copy=True/False #34931

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 10, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,18 +445,18 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.masked import BaseMaskedDtype
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, BooleanDtype):
result = self._data.astype("bool", copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
# if we are astyping to another nullable masked dtype, we can fastpath
if isinstance(dtype, BaseMaskedDtype):
data = self._data.astype(dtype.numpy_dtype, copy=copy)
# mask is copied depending on whether the data was copied, and
# not directly depending on the `copy` keyword
mask = self._mask if data is self._data else self._mask.copy()
return dtype.construct_array_type()(data, mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

Expand Down
11 changes: 11 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ class BaseMaskedDtype(ExtensionDtype):
def numpy_dtype(self) -> np.dtype:
    """
    Abstract accessor for the NumPy dtype; this base implementation only raises.
    """
    # NOTE(review): the exception *class* is raised here without being
    # instantiated -- confirm AbstractMethodError can be constructed with no
    # arguments, otherwise this surfaces as a TypeError instead.
    raise AbstractMethodError

@classmethod
def construct_array_type(cls) -> Type["BaseMaskedArray"]:
    """
    Return the array type associated with this dtype.

    This base implementation does not return anything; it always raises.
    Concrete masked dtypes are expected to provide their own implementation.

    Returns
    -------
    type

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should raise AbstractMethodError instead of NotImplementedError.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We actually use NotImplementedError for this in the base dtype class, so this is consistent with that. (I'm not fully sure why we do that, though; there might be a reason.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, yeah, we should actually change the base dtype class as well, since AbstractMethodError gives a better error message (but this is fine for now).



class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
"""
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,42 @@ def test_astype(all_data):
tm.assert_series_equal(result, expected)


def test_astype_copy():
    """
    Check how ``astype`` treats the ``copy`` keyword for data and mask.

    Three scenarios are exercised on an ``Int64`` array containing a missing
    value: same dtype with ``copy=True``, same dtype with ``copy=False``, and
    a different dtype (``Int32``) with ``copy=False``.
    """
    arr = pd.array([1, 2, 3, None], dtype="Int64")
    # pristine reference used to detect unintended mutation of ``arr``
    orig = pd.array([1, 2, 3, None], dtype="Int64")

    # copy=True -> ensure both data and mask are actual copies
    result = arr.astype("Int64", copy=True)
    assert not np.shares_memory(result._data, arr._data)
    assert not np.shares_memory(result._mask, arr._mask)
    # writing a value and then a missing value into the result must leave
    # the source array untouched
    result[0] = 10
    tm.assert_extension_array_equal(arr, orig)
    result[0] = pd.NA
    tm.assert_extension_array_equal(arr, orig)

    # copy=False -> same dtype, so data and mask are shared and writes to
    # the result are visible through the source array
    result = arr.astype("Int64", copy=False)
    assert np.shares_memory(result._data, arr._data)
    assert np.shares_memory(result._mask, arr._mask)
    result[0] = 10
    assert arr[0] == 10
    result[0] = pd.NA
    assert arr[0] is pd.NA

    # astype to different dtype -> always needs a copy -> even with copy=False
    # we need to ensure that also the mask is actually copied
    # (fresh arrays are created because the block above mutated ``arr``)
    arr = pd.array([1, 2, 3, None], dtype="Int64")
    orig = pd.array([1, 2, 3, None], dtype="Int64")

    result = arr.astype("Int32", copy=False)
    assert not np.shares_memory(result._data, arr._data)
    assert not np.shares_memory(result._mask, arr._mask)
    result[0] = 10
    tm.assert_extension_array_equal(arr, orig)
    result[0] = pd.NA
    tm.assert_extension_array_equal(arr, orig)


def test_astype_to_larger_numpy():
a = pd.array([1, 2], dtype="Int32")
result = a.astype("int64")
Expand Down