to_numpy defaults to dtype.type

MarcoGorelli · MarcoGorelli · commit a3b702fd6575 · 2022-10-04T19:19:07.000+01:00
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -111,6 +111,40 @@ Optional libraries below the lowest tested version may still work, but are not c
 
 See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
 
+Nullable types get converted to their respective NumPy types in ``to_numpy``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, ``to_numpy`` on nullable types would convert to ``object`` dtype by default:
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [1]: pd.Series([1, 2, 3], dtype="Float64").to_numpy()
+    Out[1]: array([1.0, 2.0, 3.0], dtype=object)
+
+Now, the above ``Series`` gets converted to ``float64``:
+
+*New Behavior*
+
+.. ipython:: python
+
+    pd.Series([1, 2, 3], dtype="Float64").to_numpy()
+
+If a ``Series`` contains missing values (``pd.NA``), then when converting to ``float64``,
+they will be converted to ``np.nan``:
+
+.. ipython:: python
+
+    pd.Series([1, 2, pd.NA], dtype="Float64").to_numpy()
+
+If the data contains missing values and you convert to a dtype other than a float or
+``object`` dtype, you need to specify an ``na_value`` compatible with that ``dtype``, for example:
+
+.. ipython:: python
+
+    pd.Series([1, 2, pd.NA], dtype="Float64").to_numpy("int64", na_value=-1)
+
 .. _whatsnew_160.api_breaking.other:
 
 Other API changes
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -1051,15 +1051,15 @@ def assert_series_equal(
                 left_values,
                 right_values,
                 check_dtype=check_dtype,
-                index_values=np.asarray(left.index),
+                index_values=np.asarray(left.index, dtype=object),
             )
         else:
             assert_numpy_array_equal(
                 left_values,
                 right_values,
                 check_dtype=check_dtype,
                 obj=str(obj),
-                index_values=np.asarray(left.index),
+                index_values=np.asarray(left.index, dtype=object),
             )
     elif check_datetimelike_compat and (
         needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
@@ -1088,7 +1088,7 @@ def assert_series_equal(
             atol=atol,
             check_dtype=bool(check_dtype),
             obj=str(obj),
-            index_values=np.asarray(left.index),
+            index_values=np.asarray(left.index, dtype=object),
         )
     elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
         assert_extension_array_equal(
@@ -1097,7 +1097,7 @@ def assert_series_equal(
             rtol=rtol,
             atol=atol,
             check_dtype=check_dtype,
-            index_values=np.asarray(left.index),
+            index_values=np.asarray(left.index, dtype=object),
         )
     elif is_extension_array_dtype_and_needs_i8_conversion(
         left.dtype, right.dtype
@@ -1106,15 +1106,15 @@ def assert_series_equal(
             left._values,
             right._values,
             check_dtype=check_dtype,
-            index_values=np.asarray(left.index),
+            index_values=np.asarray(left.index, dtype=object),
         )
     elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
         # DatetimeArray or TimedeltaArray
         assert_extension_array_equal(
             left._values,
             right._values,
             check_dtype=check_dtype,
-            index_values=np.asarray(left.index),
+            index_values=np.asarray(left.index, dtype=object),
         )
     else:
         _testing.assert_almost_equal(
@@ -1124,7 +1124,7 @@ def assert_series_equal(
             atol=atol,
             check_dtype=bool(check_dtype),
             obj=str(obj),
-            index_values=np.asarray(left.index),
+            index_values=np.asarray(left.index, dtype=object),
         )
 
     # metadata comparison
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -401,10 +401,14 @@ def to_numpy(
         >>> a.to_numpy(dtype="bool", na_value=False)
         array([ True, False, False])
         """
-        if na_value is lib.no_default:
-            na_value = libmissing.NA
         if dtype is None:
-            dtype = object
+            dtype = self.dtype.type
+
+        if na_value is lib.no_default and is_float_dtype(dtype):
+            na_value = np.nan
+        elif na_value is lib.no_default:
+            na_value = libmissing.NA
+
         if self._hasna:
             if (
                 not is_object_dtype(dtype)
@@ -413,8 +417,12 @@ def to_numpy(
             ):
                 raise ValueError(
                     f"cannot convert to '{dtype}'-dtype NumPy array "
-                    "with missing values. Specify an appropriate 'na_value' "
-                    "for this dtype."
+                    "with missing values.\n"
+                    "Please either:\n"
+                    "- convert to 'float'\n"
+                    "- convert to 'object'\n"
+                    "- specify an appropriate 'na_value' for this dtype\n"
+                    "for this dtype.\n"
                 )
             # don't pass copy to astype -> always need a copy since we are mutating
             data = self._data.astype(dtype)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1657,7 +1657,7 @@ def _format_strings(self) -> list[str]:
             # Categorical is special for now, so that we can preserve tzinfo
             array = values._internal_get_values()
         else:
-            array = np.asarray(values)
+            array = np.asarray(values, dtype=object)
 
         fmt_values = format_array(
             array,
diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py
@@ -214,16 +214,17 @@ def test_coerce_to_array_from_boolean_array():
 
 
 def test_coerce_to_numpy_array():
-    # with missing values -> object dtype
+    # with missing values -> tries but fails to convert
     arr = pd.array([True, False, None], dtype="boolean")
-    result = np.array(arr)
-    expected = np.array([True, False, pd.NA], dtype="object")
-    tm.assert_numpy_array_equal(result, expected)
+    with pytest.raises(
+        ValueError, match=r"specify an appropriate 'na_value' for this dtype"
+    ):
+        result = np.array(arr)
 
-    # also with no missing values -> object dtype
+    # also with no missing values -> successfully converts to bool
     arr = pd.array([True, False, True], dtype="boolean")
     result = np.array(arr)
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     # force bool dtype
@@ -233,8 +234,12 @@ def test_coerce_to_numpy_array():
     # with missing values will raise error
     arr = pd.array([True, False, None], dtype="boolean")
     msg = (
-        "cannot convert to 'bool'-dtype NumPy array with missing values. "
-        "Specify an appropriate 'na_value' for this dtype."
+        "^cannot convert to 'bool'-dtype NumPy array with missing values.\n"
+        "Please either:\n"
+        "- convert to 'float'\n"
+        "- convert to 'object'\n"
+        "- specify an appropriate 'na_value' for this dtype\n"
+        "for this dtype.\n$"
     )
     with pytest.raises(ValueError, match=msg):
         np.array(arr, dtype="bool")
@@ -260,16 +265,17 @@ def test_to_boolean_array_from_strings_invalid_string():
 @pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
 def test_to_numpy(box):
     con = pd.Series if box else pd.array
-    # default (with or without missing values) -> object dtype
+    # default without missing values -> bool dtype (with missing values it raises)
     arr = con([True, False, True], dtype="boolean")
     result = arr.to_numpy()
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([True, False, None], dtype="boolean")
-    result = arr.to_numpy()
-    expected = np.array([True, False, pd.NA], dtype="object")
-    tm.assert_numpy_array_equal(result, expected)
+    with pytest.raises(
+        ValueError, match="specify an appropriate 'na_value' for this dtype"
+    ):
+        arr.to_numpy()
 
     arr = con([True, False, None], dtype="boolean")
     result = arr.to_numpy(dtype="str")
@@ -304,11 +310,13 @@ def test_to_numpy(box):
     expected = np.array([1, 0, np.nan], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
-    # converting to int or float without specifying na_value raises
+    # converting to int without specifying na_value raises
     with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
         arr.to_numpy(dtype="int64")
-    with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
-        arr.to_numpy(dtype="float64")
+    # converting to float without specifying na_value converts NA to nan
+    result = arr.to_numpy(dtype="float64")
+    expected = np.array([1, 0, np.nan], dtype="float64")
+    tm.assert_numpy_array_equal(result, expected)
 
 
 def test_to_numpy_copy():
diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py
@@ -10,15 +10,15 @@
 def test_to_numpy(box):
     con = pd.Series if box else pd.array
 
-    # default (with or without missing values) -> object dtype
+    # default (with or without missing values) -> float64 dtype
     arr = con([0.1, 0.2, 0.3], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, 0.3], dtype="object")
+    expected = np.array([0.1, 0.2, 0.3], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([0.1, 0.2, None], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, pd.NA], dtype="object")
+    expected = np.array([0.1, 0.2, np.nan], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
 
@@ -33,8 +33,9 @@ def test_to_numpy_float(box):
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([0.1, 0.2, None], dtype="Float64")
-    with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
-        result = arr.to_numpy(dtype="float64")
+    result = arr.to_numpy(dtype="float64")
+    expected = np.array([0.1, 0.2, np.nan], dtype="float64")
+    tm.assert_numpy_array_equal(result, expected)
 
     # need to explicitly specify na_value
     result = arr.to_numpy(dtype="float64", na_value=np.nan)
@@ -100,7 +101,18 @@ def test_to_numpy_dtype(box, dtype):
     tm.assert_numpy_array_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
+@pytest.mark.parametrize("dtype", ["float64", "float32"])
+@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
+def test_to_numpy_na_doesnt_raise(box, dtype):
+    # https://github.com/pandas-dev/pandas/issues/48891
+    con = pd.Series if box else pd.array
+    arr = con([0.0, 1.0, None], dtype="Float64")
+    result = arr.to_numpy(dtype=dtype)
+    expected = np.array([0.0, 1.0, np.nan], dtype=dtype)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
 @pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
 def test_to_numpy_na_raises(box, dtype):
     con = pd.Series if box else pd.array
diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py
@@ -37,7 +37,7 @@ def test_from_dtype_from_float(data):
 
     # from int / list
     expected = pd.Series(data)
-    result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
+    result = pd.Series(np.array(data, dtype=object).tolist(), dtype=str(dtype))
     tm.assert_series_equal(result, expected)
 
     # from int / array
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -89,7 +89,7 @@ def test_astype_index(all_data, dropna):
         other = all_data
 
     dtype = all_data.dtype
-    idx = pd.Index._with_infer(np.array(other))
+    idx = pd.Index._with_infer(np.array(other, dtype=object))
     assert isinstance(idx, ABCIndex)
 
     result = idx.astype(dtype)
@@ -143,7 +143,7 @@ def test_astype(all_data):
     # coerce to object
     s = pd.Series(mixed)
     result = s.astype("object")
-    expected = pd.Series(np.asarray(mixed))
+    expected = pd.Series(np.asarray(mixed, dtype=object))
     tm.assert_series_equal(result, expected)
 
 
@@ -274,13 +274,22 @@ def test_to_numpy_dtype(dtype, in_series):
     tm.assert_numpy_array_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
+@pytest.mark.parametrize("dtype", ["int64", "bool"])
 def test_to_numpy_na_raises(dtype):
     a = pd.array([0, 1, None], dtype="Int64")
     with pytest.raises(ValueError, match=dtype):
         a.to_numpy(dtype=dtype)
 
 
+@pytest.mark.parametrize("dtype", ["float64"])
+def test_to_numpy_na_doesnt_raise(dtype):
+    # https://github.com/pandas-dev/pandas/issues/48891
+    a = pd.array([0, 1, None], dtype="Int64")
+    result = a.to_numpy(dtype=dtype)
+    expected = np.array([0.0, 1.0, np.nan])
+    tm.assert_numpy_array_equal(result, expected)
+
+
 def test_astype_str():
     a = pd.array([1, 2, None], dtype="Int64")
     expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py
@@ -130,7 +130,7 @@ def test_ufunc_with_out(self, dtype):
         # result |= mask worked because mask could be cast losslessly to
         #  boolean ndarray. mask2 can't, so this raises
         result = np.zeros(3, dtype=bool)
-        msg = "Specify an appropriate 'na_value' for this dtype"
+        msg = "specify an appropriate 'na_value' for this dtype"
         with pytest.raises(ValueError, match=msg):
             result |= mask2