IntegerArray.to_numpy

TomAugspurger · TomAugspurger · commit 5156db08e484 · 2020-01-07T13:19:17.000-06:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -387,8 +387,37 @@ As a reminder, you can specify the ``dtype`` to disable all inference.
 .. ipython:: python
 
    a = pd.array([1, 2, None], dtype="Int64")
+   a
    a[2]
 
+This has a few API-breaking consequences.
+
+**Converting to a NumPy ndarray**
+
+When converting to a NumPy array, missing values will be ``pd.NA``, which
+cannot be converted to a float. So calling
+``np.asarray(integer_array, dtype="float")`` will now raise a ``ValueError``.
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+    >>> np.asarray(a, dtype="float")
+    array([ 1.,  2., nan])
+
+*pandas 1.0.0*
+
+.. ipython:: python
+   :okexcept:
+
+   np.asarray(a, dtype="float")
+
+Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
+
+.. ipython:: python
+
+   a.to_numpy(dtype="float", na_value=np.nan)
+
 See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA`
 and :attr:`numpy.nan`.
 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -19,7 +19,9 @@
     is_integer_dtype,
     is_list_like,
     is_numeric_dtype,
+    is_object_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
@@ -382,9 +384,14 @@ def to_numpy(
         if dtype is None:
             dtype = object
         if self._hasna:
-            if is_bool_dtype(dtype) and na_value is libmissing.NA:
+            if (
+                not (is_object_dtype(dtype) or is_string_dtype(dtype))
+                and na_value is libmissing.NA
+            ):
                 raise ValueError(
-                    "cannot convert to bool numpy array in presence of missing values"
+                    f"cannot convert to '{dtype}'-dtype NumPy array "
+                    "with missing values. Specify an appropriate 'na_value' "
+                    "for this dtype."
                 )
             # don't pass copy to astype -> always need a copy since we are mutating
             data = self._data.astype(dtype)
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
+from pandas._typing import Scalar
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly
 
@@ -19,6 +20,7 @@
     is_list_like,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna, notna
@@ -376,30 +378,35 @@ def __getitem__(self, item):
 
         return type(self)(self._data[item], self._mask[item])
 
-    def _coerce_to_ndarray(self, dtype=None, na_value=lib.no_default):
-        """
-        coerce to an ndarary of object dtype
-        """
+    @property
+    def _hasna(self) -> bool:
+        # Note: this is expensive right now! The hope is that we can
+        # make this faster by having an optional mask, without having to
+        # change the source code that uses it.
+        return self._mask.any()
+
+    def to_numpy(
+        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
+    ):
+        if na_value is lib.no_default:
+            na_value = libmissing.NA
         if dtype is None:
             dtype = object
-
-        if na_value is lib.no_default and is_float_dtype(dtype):
-            na_value = np.nan
-        elif na_value is lib.no_default:
-            na_value = libmissing.NA
-
-        if is_integer_dtype(dtype):
-            # Specifically, a NumPy integer dtype, not a pandas integer dtype,
-            # since we're coercing to a numpy dtype by definition in this function.
-            if not self.isna().any():
-                return self._data.astype(dtype)
-            else:
+        if self._hasna:
+            if (
+                not (is_object_dtype(dtype) or is_string_dtype(dtype))
+                and na_value is libmissing.NA
+            ):
                 raise ValueError(
-                    "cannot convert to integer NumPy array with missing values"
+                    f"cannot convert to '{dtype}'-dtype NumPy array "
+                    "with missing values. Specify an appropriate 'na_value' "
+                    "for this dtype."
                 )
-
-        data = self._data.astype(dtype)
-        data[self._mask] = na_value
+            # don't pass copy to astype -> always need a copy since we are mutating
+            data = self._data.astype(dtype)
+            data[self._mask] = na_value
+        else:
+            data = self._data.astype(dtype, copy=copy)
         return data
 
     __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
@@ -409,7 +416,7 @@ def __array__(self, dtype=None):
         the array interface, return my values
         We return an object array here to preserve our scalar values
         """
-        return self._coerce_to_ndarray(dtype=dtype)
+        return self.to_numpy(dtype=dtype)
 
     def __arrow_array__(self, type=None):
         """
@@ -564,7 +571,13 @@ def astype(self, dtype, copy=True):
             return type(self)(result, mask=self._mask, copy=False)
 
         # coerce
-        data = self._coerce_to_ndarray(dtype=dtype)
+        if is_float_dtype(dtype):
+            # In astype, we consider dtype=float to also mean na_value=np.nan
+            kwargs = dict(na_value=np.nan)
+        else:
+            kwargs = {}
+
+        data = self.to_numpy(dtype=dtype, **kwargs)
         return astype_nansafe(data, dtype, copy=False)
 
     @property
@@ -630,7 +643,7 @@ def value_counts(self, dropna=True):
     def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
         # TODO: https://github.com/pandas-dev/pandas/issues/30037
         # use masked algorithms, rather than object-dtype / np.nan.
-        return self._coerce_to_ndarray(na_value=np.nan), np.nan
+        return self.to_numpy(na_value=np.nan), np.nan
 
     def _values_for_argsort(self) -> np.ndarray:
         """Return values for sorting.
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -171,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array:
     try:
         return arr.astype("uint64", copy=copy, casting="safe")  # type: ignore
     except TypeError:
+        if is_extension_array_dtype(arr.dtype):
+            return arr.to_numpy(dtype="float64", na_value=np.nan)
         return arr.astype("float64", copy=copy)
 
 
diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -265,14 +265,19 @@ def test_to_numpy(box):
     expected = np.array([True, False, pd.NA], dtype="object")
     tm.assert_numpy_array_equal(result, expected)
 
+    arr = con([True, False, None], dtype="boolean")
+    result = arr.to_numpy(dtype="str")
+    expected = np.array([True, False, pd.NA], dtype="<U5")
+    tm.assert_numpy_array_equal(result, expected)
+
     # no missing values -> can convert to bool, otherwise raises
     arr = con([True, False, True], dtype="boolean")
     result = arr.to_numpy(dtype="bool")
     expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([True, False, None], dtype="boolean")
-    with pytest.raises(ValueError, match="cannot convert to bool numpy"):
+    with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
         result = arr.to_numpy(dtype="bool")
 
     # specify dtype and na_value
@@ -294,9 +299,9 @@ def test_to_numpy(box):
     tm.assert_numpy_array_equal(result, expected)
 
     # converting to int or float without specifying na_value raises
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
         arr.to_numpy(dtype="int64")
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
         arr.to_numpy(dtype="float64")
 
 
@@ -329,6 +334,10 @@ def test_astype():
     expected = np.array([1, 0, np.nan], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
+    result = arr.astype("str")
+    expected = np.array(["True", "False", "NA"], dtype="object")
+    tm.assert_numpy_array_equal(result, expected)
+
     # no missing values
     arr = pd.array([True, False, True], dtype="boolean")
     result = arr.astype("int64")
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
@@ -118,7 +118,9 @@ def test_from_dtype_from_float(self, data):
 
         # from float
         expected = pd.Series(data)
-        result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype))
+        result = pd.Series(
+            data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)
+        )
         tm.assert_series_equal(result, expected)
 
         # from int / list
@@ -634,10 +636,47 @@ def test_construct_cast_invalid(self, dtype):
         with pytest.raises(TypeError, match=msg):
             pd.Series(arr).astype(dtype)
 
-    def test_coerce_to_ndarray_float_NA_rasies(self):
-        a = pd.array([0, 1, 2], dtype="Int64")
-        with pytest.raises(TypeError, match="NAType"):
-            a._coerce_to_ndarray(dtype="float", na_value=pd.NA)
+    @pytest.mark.parametrize("in_series", [True, False])
+    def test_to_numpy_na_nan(self, in_series):
+        a = pd.array([0, 1, None], dtype="Int64")
+        if in_series:
+            a = pd.Series(a)
+
+        result = a.to_numpy(dtype="float64", na_value=np.nan)
+        expected = np.array([0.0, 1.0, np.nan], dtype="float64")
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = a.to_numpy(dtype="int64", na_value=-1)
+        expected = np.array([0, 1, -1], dtype="int64")
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = a.to_numpy(dtype="bool", na_value=False)
+        expected = np.array([False, True, False], dtype="bool")
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("in_series", [True, False])
+    @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
+    def test_to_numpy_dtype(self, dtype, in_series):
+        a = pd.array([0, 1], dtype="Int64")
+        if in_series:
+            a = pd.Series(a)
+
+        result = a.to_numpy(dtype=dtype)
+        expected = np.array([0, 1], dtype=dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
+    def test_to_numpy_na_raises(self, dtype):
+        a = pd.array([0, 1, None], dtype="Int64")
+        with pytest.raises(ValueError, match=dtype):
+            a.to_numpy(dtype=dtype)
+
+    def test_astype_str(self):
+        a = pd.array([1, 2, None], dtype="Int64")
+        expected = np.array(["1", "2", "NA"], dtype=object)
+
+        tm.assert_numpy_array_equal(a.astype(str), expected)
+        tm.assert_numpy_array_equal(a.astype("str"), expected)
 
 
 def test_frame_repr(data_missing):
@@ -887,7 +926,7 @@ def test_reduce_to_float(op):
 def test_astype_nansafe():
     # see gh-22343
     arr = integer_array([np.nan, 1, 2], dtype="Int8")
-    msg = "cannot convert to integer NumPy array with missing values"
+    msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."
 
     with pytest.raises(ValueError, match=msg):
         arr.astype("uint32")