diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5473b7c1523f3..5dff6d729479a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1143,6 +1143,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). Other ^^^^^ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7be7ef3637ee5..b5cb681812939 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -448,18 +448,22 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.masked import BaseMaskedDtype from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) - # if we are astyping to an existing IntegerDtype we can fastpath - if isinstance(dtype, _IntegerDtype): - result = self._data.astype(dtype.numpy_dtype, copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) - elif isinstance(dtype, BooleanDtype): - result = self._data.astype("bool", copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) + # if the dtype is exactly the same, we can fastpath + if self.dtype == dtype: + # return the same object for copy=False + return self.copy() if copy else self + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else self._mask.copy() + return dtype.construct_array_type()(data, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 28add129825d1..235840d6d201e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -40,6 +40,17 @@ class BaseMaskedDtype(ExtensionDtype): def numpy_dtype(self) -> np.dtype: raise AbstractMethodError + @classmethod + def construct_array_type(cls) -> Type["BaseMaskedArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise NotImplementedError + class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index cafe9e47a18f4..67efa4cb2ce4a 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -144,6 +144,44 @@ def test_astype(all_data): tm.assert_series_equal(result, expected) +def test_astype_copy(): + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Int64", copy=True) + assert result is not arr + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Int64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + result = arr.astype("Int32", copy=False) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + def test_astype_to_larger_numpy(): a = pd.array([1, 2], dtype="Int32") result = a.astype("int64")