diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 330510c2c883c..297315d57427d 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -387,8 +387,37 @@ As a reminder, you can specify the ``dtype`` to disable all inference. .. ipython:: python a = pd.array([1, 2, None], dtype="Int64") + a a[2] +This has a few API-breaking consequences. + +**Converting to a NumPy ndarray** + +When converting to a NumPy array missing values will be ``pd.NA``, which cannot +be converted to a float. So calling ``np.asarray(integer_array, dtype="float")`` +will now raise. + +*pandas 0.25.x* + +.. code-block:: python + + >>> np.asarray(a, dtype="float") + array([ 1., 2., nan]) + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + np.asarray(a, dtype="float") + +Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. + +.. ipython:: python + + a.to_numpy(dtype="float", na_value=np.nan) + See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` and :attr:`numpy.nan`. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 3ba632660a325..c2ce799c64aac 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -19,7 +19,9 @@ is_integer_dtype, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype @@ -382,9 +384,14 @@ def to_numpy( if dtype is None: dtype = object if self._hasna: - if is_bool_dtype(dtype) and na_value is libmissing.NA: + if ( + not (is_object_dtype(dtype) or is_string_dtype(dtype)) + and na_value is libmissing.NA + ): raise ValueError( - "cannot convert to bool numpy array in presence of missing values" + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." 
) # don't pass copy to astype -> always need a copy since we are mutating data = self._data.astype(dtype) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 337ff7f448586..d63692c5ba972 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -5,6 +5,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas._typing import Scalar from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -19,6 +20,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna, notna @@ -376,30 +378,35 @@ def __getitem__(self, item): return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, dtype=None, na_value=lib.no_default): - """ - coerce to an ndarary of object dtype - """ + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default, + ): + if na_value is lib.no_default: + na_value = libmissing.NA if dtype is None: dtype = object - - if na_value is lib.no_default and is_float_dtype(dtype): - na_value = np.nan - elif na_value is lib.no_default: - na_value = libmissing.NA - - if is_integer_dtype(dtype): - # Specifically, a NumPy integer dtype, not a pandas integer dtype, - # since we're coercing to a numpy dtype by definition in this function. 
- if not self.isna().any(): - return self._data.astype(dtype) - else: + if self._hasna: + if ( + not (is_object_dtype(dtype) or is_string_dtype(dtype)) + and na_value is libmissing.NA + ): raise ValueError( - "cannot convert to integer NumPy array with missing values" + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." ) - - data = self._data.astype(dtype) - data[self._mask] = na_value + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -409,7 +416,7 @@ def __array__(self, dtype=None): the array interface, return my values We return an object array here to preserve our scalar values """ - return self._coerce_to_ndarray(dtype=dtype) + return self.to_numpy(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -564,7 +571,13 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray(dtype=dtype) + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) @property @@ -630,7 +643,7 @@ def value_counts(self, dropna=True): def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. - return self._coerce_to_ndarray(na_value=np.nan), np.nan + return self.to_numpy(na_value=np.nan), np.nan def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a716bc8e0a337..5a007f28d63cb 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -171,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: try: return arr.astype("uint64", copy=copy, casting="safe") # type: ignore except TypeError: + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) return arr.astype("float64", copy=copy) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 089cda7f434e9..b89aece3f982c 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -265,6 +265,11 @@ def test_to_numpy(box): expected = np.array([True, False, pd.NA], dtype="object") tm.assert_numpy_array_equal(result, expected) + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype="<U5") + tm.assert_numpy_array_equal(result, expected) + # no missing values -> can convert to bool, otherwise raises arr = con([True, False, True], dtype="boolean") result = arr.to_numpy(dtype="bool") @@ -272,7 +277,7 @@ def test_to_numpy(box): tm.assert_numpy_array_equal(result, expected) arr = con([True, False, None], dtype="boolean") - with pytest.raises(ValueError, match="cannot convert to bool numpy"): + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): result = arr.to_numpy(dtype="bool") # specify dtype and na_value @@ -294,9 +299,9 @@ def test_to_numpy(box): tm.assert_numpy_array_equal(result, expected) # converting to int or float without specifying na_value raises - with pytest.raises(TypeError): + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(TypeError): + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): arr.to_numpy(dtype="float64") @@ -329,6 +334,10 @@ def test_astype(): expected = np.array([1, 0, np.nan], 
dtype="float64") tm.assert_numpy_array_equal(result, expected) + result = arr.astype("str") + expected = np.array(["True", "False", "NA"], dtype="object") + tm.assert_numpy_array_equal(result, expected) + # no missing values arr = pd.array([True, False, True], dtype="boolean") result = arr.astype("int64") diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index f172280202e64..6a3ef75157d5d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -118,7 +118,9 @@ def test_from_dtype_from_float(self, data): # from float expected = pd.Series(data) - result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype)) + result = pd.Series( + data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) + ) tm.assert_series_equal(result, expected) # from int / list @@ -634,10 +636,47 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) - def test_coerce_to_ndarray_float_NA_rasies(self): - a = pd.array([0, 1, 2], dtype="Int64") - with pytest.raises(TypeError, match="NAType"): - a._coerce_to_ndarray(dtype="float", na_value=pd.NA) + @pytest.mark.parametrize("in_series", [True, False]) + def test_to_numpy_na_nan(self, in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("in_series", [True, False]) + @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) + def test_to_numpy_dtype(self, dtype, 
in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) + def test_to_numpy_na_raises(self, dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + def test_astype_str(self): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", "NA"], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_frame_repr(data_missing): @@ -887,7 +926,7 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert to integer NumPy array with missing values" + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." with pytest.raises(ValueError, match=msg): arr.astype("uint32")