Skip to content

Commit 5156db0

Browse files
committed
IntegerArray.to_numpy
1 parent be1556c commit 5156db0

File tree

6 files changed

+133
-34
lines changed

6 files changed

+133
-34
lines changed

doc/source/whatsnew/v1.0.0.rst

+29
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,37 @@ As a reminder, you can specify the ``dtype`` to disable all inference.
387387
.. ipython:: python
388388
389389
a = pd.array([1, 2, None], dtype="Int64")
390+
a
390391
a[2]
391392
393+
This has a few API-breaking consequences.
394+
395+
**Converting to a NumPy ndarray**
396+
397+
When converting to a NumPy array, missing values will be ``pd.NA``, which cannot
398+
be converted to a float. So calling ``np.asarray(integer_array, dtype="float")``
399+
will now raise.
400+
401+
*pandas 0.25.x*
402+
403+
.. code-block:: python
404+
405+
>>> np.asarray(a, dtype="float")
406+
array([ 1., 2., nan])
407+
408+
*pandas 1.0.0*
409+
410+
.. ipython:: python
411+
:okexcept:
412+
413+
np.asarray(a, dtype="float")
414+
415+
Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
416+
417+
.. ipython:: python
418+
419+
a.to_numpy(dtype="float", na_value=np.nan)
420+
392421
See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA`
393422
and :attr:`numpy.nan`.
394423

pandas/core/arrays/boolean.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
is_integer_dtype,
2020
is_list_like,
2121
is_numeric_dtype,
22+
is_object_dtype,
2223
is_scalar,
24+
is_string_dtype,
2325
pandas_dtype,
2426
)
2527
from pandas.core.dtypes.dtypes import register_extension_dtype
@@ -382,9 +384,14 @@ def to_numpy(
382384
if dtype is None:
383385
dtype = object
384386
if self._hasna:
385-
if is_bool_dtype(dtype) and na_value is libmissing.NA:
387+
if (
388+
not (is_object_dtype(dtype) or is_string_dtype(dtype))
389+
and na_value is libmissing.NA
390+
):
386391
raise ValueError(
387-
"cannot convert to bool numpy array in presence of missing values"
392+
f"cannot convert to '{dtype}'-dtype NumPy array "
393+
"with missing values. Specify an appropriate 'na_value' "
394+
"for this dtype."
388395
)
389396
# don't pass copy to astype -> always need a copy since we are mutating
390397
data = self._data.astype(dtype)

pandas/core/arrays/integer.py

+36-23
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66

77
from pandas._libs import lib, missing as libmissing
8+
from pandas._typing import Scalar
89
from pandas.compat import set_function_name
910
from pandas.util._decorators import cache_readonly
1011

@@ -19,6 +20,7 @@
1920
is_list_like,
2021
is_object_dtype,
2122
is_scalar,
23+
is_string_dtype,
2224
)
2325
from pandas.core.dtypes.dtypes import register_extension_dtype
2426
from pandas.core.dtypes.missing import isna, notna
@@ -376,30 +378,35 @@ def __getitem__(self, item):
376378

377379
return type(self)(self._data[item], self._mask[item])
378380

379-
def _coerce_to_ndarray(self, dtype=None, na_value=lib.no_default):
380-
"""
381-
coerce to an ndarray of object dtype
382-
"""
381+
@property
382+
def _hasna(self) -> bool:
383+
# Note: this is expensive right now! The hope is that we can
384+
# make this faster by having an optional mask, but not have to change
385+
# source code using it.
386+
return self._mask.any()
387+
388+
def to_numpy(
389+
self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
390+
):
391+
if na_value is lib.no_default:
392+
na_value = libmissing.NA
383393
if dtype is None:
384394
dtype = object
385-
386-
if na_value is lib.no_default and is_float_dtype(dtype):
387-
na_value = np.nan
388-
elif na_value is lib.no_default:
389-
na_value = libmissing.NA
390-
391-
if is_integer_dtype(dtype):
392-
# Specifically, a NumPy integer dtype, not a pandas integer dtype,
393-
# since we're coercing to a numpy dtype by definition in this function.
394-
if not self.isna().any():
395-
return self._data.astype(dtype)
396-
else:
395+
if self._hasna:
396+
if (
397+
not (is_object_dtype(dtype) or is_string_dtype(dtype))
398+
and na_value is libmissing.NA
399+
):
397400
raise ValueError(
398-
"cannot convert to integer NumPy array with missing values"
401+
f"cannot convert to '{dtype}'-dtype NumPy array "
402+
"with missing values. Specify an appropriate 'na_value' "
403+
"for this dtype."
399404
)
400-
401-
data = self._data.astype(dtype)
402-
data[self._mask] = na_value
405+
# don't pass copy to astype -> always need a copy since we are mutating
406+
data = self._data.astype(dtype)
407+
data[self._mask] = na_value
408+
else:
409+
data = self._data.astype(dtype, copy=copy)
403410
return data
404411

405412
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
@@ -409,7 +416,7 @@ def __array__(self, dtype=None):
409416
the array interface, return my values
410417
We return an object array here to preserve our scalar values
411418
"""
412-
return self._coerce_to_ndarray(dtype=dtype)
419+
return self.to_numpy(dtype=dtype)
413420

414421
def __arrow_array__(self, type=None):
415422
"""
@@ -564,7 +571,13 @@ def astype(self, dtype, copy=True):
564571
return type(self)(result, mask=self._mask, copy=False)
565572

566573
# coerce
567-
data = self._coerce_to_ndarray(dtype=dtype)
574+
if is_float_dtype(dtype):
575+
# In astype, we consider dtype=float to also mean na_value=np.nan
576+
kwargs = dict(na_value=np.nan)
577+
else:
578+
kwargs = {}
579+
580+
data = self.to_numpy(dtype=dtype, **kwargs)
568581
return astype_nansafe(data, dtype, copy=False)
569582

570583
@property
@@ -630,7 +643,7 @@ def value_counts(self, dropna=True):
630643
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
631644
# TODO: https://github.com/pandas-dev/pandas/issues/30037
632645
# use masked algorithms, rather than object-dtype / np.nan.
633-
return self._coerce_to_ndarray(na_value=np.nan), np.nan
646+
return self.to_numpy(na_value=np.nan), np.nan
634647

635648
def _values_for_argsort(self) -> np.ndarray:
636649
"""Return values for sorting.

pandas/core/dtypes/common.py

+2
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array:
171171
try:
172172
return arr.astype("uint64", copy=copy, casting="safe") # type: ignore
173173
except TypeError:
174+
if is_extension_array_dtype(arr.dtype):
175+
return arr.to_numpy(dtype="float64", na_value=np.nan)
174176
return arr.astype("float64", copy=copy)
175177

176178

pandas/tests/arrays/test_boolean.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,19 @@ def test_to_numpy(box):
265265
expected = np.array([True, False, pd.NA], dtype="object")
266266
tm.assert_numpy_array_equal(result, expected)
267267

268+
arr = con([True, False, None], dtype="boolean")
269+
result = arr.to_numpy(dtype="str")
270+
expected = np.array([True, False, pd.NA], dtype="<U5")
271+
tm.assert_numpy_array_equal(result, expected)
272+
268273
# no missing values -> can convert to bool, otherwise raises
269274
arr = con([True, False, True], dtype="boolean")
270275
result = arr.to_numpy(dtype="bool")
271276
expected = np.array([True, False, True], dtype="bool")
272277
tm.assert_numpy_array_equal(result, expected)
273278

274279
arr = con([True, False, None], dtype="boolean")
275-
with pytest.raises(ValueError, match="cannot convert to bool numpy"):
280+
with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
276281
result = arr.to_numpy(dtype="bool")
277282

278283
# specify dtype and na_value
@@ -294,9 +299,9 @@ def test_to_numpy(box):
294299
tm.assert_numpy_array_equal(result, expected)
295300

296301
# converting to int or float without specifying na_value raises
297-
with pytest.raises(TypeError):
302+
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
298303
arr.to_numpy(dtype="int64")
299-
with pytest.raises(TypeError):
304+
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
300305
arr.to_numpy(dtype="float64")
301306

302307

@@ -329,6 +334,10 @@ def test_astype():
329334
expected = np.array([1, 0, np.nan], dtype="float64")
330335
tm.assert_numpy_array_equal(result, expected)
331336

337+
result = arr.astype("str")
338+
expected = np.array(["True", "False", "NA"], dtype="object")
339+
tm.assert_numpy_array_equal(result, expected)
340+
332341
# no missing values
333342
arr = pd.array([True, False, True], dtype="boolean")
334343
result = arr.astype("int64")

pandas/tests/arrays/test_integer.py

+45-6
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@ def test_from_dtype_from_float(self, data):
118118

119119
# from float
120120
expected = pd.Series(data)
121-
result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype))
121+
result = pd.Series(
122+
data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)
123+
)
122124
tm.assert_series_equal(result, expected)
123125

124126
# from int / list
@@ -634,10 +636,47 @@ def test_construct_cast_invalid(self, dtype):
634636
with pytest.raises(TypeError, match=msg):
635637
pd.Series(arr).astype(dtype)
636638

637-
def test_coerce_to_ndarray_float_NA_rasies(self):
638-
a = pd.array([0, 1, 2], dtype="Int64")
639-
with pytest.raises(TypeError, match="NAType"):
640-
a._coerce_to_ndarray(dtype="float", na_value=pd.NA)
639+
@pytest.mark.parametrize("in_series", [True, False])
640+
def test_to_numpy_na_nan(self, in_series):
641+
a = pd.array([0, 1, None], dtype="Int64")
642+
if in_series:
643+
a = pd.Series(a)
644+
645+
result = a.to_numpy(dtype="float64", na_value=np.nan)
646+
expected = np.array([0.0, 1.0, np.nan], dtype="float64")
647+
tm.assert_numpy_array_equal(result, expected)
648+
649+
result = a.to_numpy(dtype="int64", na_value=-1)
650+
expected = np.array([0, 1, -1], dtype="int64")
651+
tm.assert_numpy_array_equal(result, expected)
652+
653+
result = a.to_numpy(dtype="bool", na_value=False)
654+
expected = np.array([False, True, False], dtype="bool")
655+
tm.assert_numpy_array_equal(result, expected)
656+
657+
@pytest.mark.parametrize("in_series", [True, False])
658+
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
659+
def test_to_numpy_dtype(self, dtype, in_series):
660+
a = pd.array([0, 1], dtype="Int64")
661+
if in_series:
662+
a = pd.Series(a)
663+
664+
result = a.to_numpy(dtype=dtype)
665+
expected = np.array([0, 1], dtype=dtype)
666+
tm.assert_numpy_array_equal(result, expected)
667+
668+
@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
669+
def test_to_numpy_na_raises(self, dtype):
670+
a = pd.array([0, 1, None], dtype="Int64")
671+
with pytest.raises(ValueError, match=dtype):
672+
a.to_numpy(dtype=dtype)
673+
674+
def test_astype_str(self):
675+
a = pd.array([1, 2, None], dtype="Int64")
676+
expected = np.array(["1", "2", "NA"], dtype=object)
677+
678+
tm.assert_numpy_array_equal(a.astype(str), expected)
679+
tm.assert_numpy_array_equal(a.astype("str"), expected)
641680

642681

643682
def test_frame_repr(data_missing):
@@ -887,7 +926,7 @@ def test_reduce_to_float(op):
887926
def test_astype_nansafe():
888927
# see gh-22343
889928
arr = integer_array([np.nan, 1, 2], dtype="Int8")
890-
msg = "cannot convert to integer NumPy array with missing values"
929+
msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."
891930

892931
with pytest.raises(ValueError, match=msg):
893932
arr.astype("uint32")

0 commit comments

Comments
 (0)