Skip to content

IntegerArray.to_numpy #30792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,37 @@ As a reminder, you can specify the ``dtype`` to disable all inference.
.. ipython:: python

a = pd.array([1, 2, None], dtype="Int64")
a
a[2]

This has a few API-breaking consequences.

**Converting to a NumPy ndarray**

When converting to a NumPy array, missing values will be ``pd.NA``, which cannot
be converted to a float. So calling ``np.asarray(integer_array, dtype="float")``
will now raise.

*pandas 0.25.x*

.. code-block:: python

>>> np.asarray(a, dtype="float")
array([ 1., 2., nan])

*pandas 1.0.0*

.. ipython:: python
:okexcept:

np.asarray(a, dtype="float")

Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.

.. ipython:: python

a.to_numpy(dtype="float", na_value=np.nan)

See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA`
and :attr:`numpy.nan`.

Expand Down
11 changes: 9 additions & 2 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
Expand Down Expand Up @@ -382,9 +384,14 @@ def to_numpy(
if dtype is None:
dtype = object
if self._hasna:
if is_bool_dtype(dtype) and na_value is libmissing.NA:
if (
not (is_object_dtype(dtype) or is_string_dtype(dtype))
and na_value is libmissing.NA
):
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
f"cannot convert to '{dtype}'-dtype NumPy array "
"with missing values. Specify an appropriate 'na_value' "
"for this dtype."
)
# don't pass copy to astype -> always need a copy since we are mutating
data = self._data.astype(dtype)
Expand Down
59 changes: 36 additions & 23 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas.compat import set_function_name
from pandas.util._decorators import cache_readonly

Expand All @@ -19,6 +20,7 @@
is_list_like,
is_object_dtype,
is_scalar,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna, notna
Expand Down Expand Up @@ -376,30 +378,35 @@ def __getitem__(self, item):

return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value=lib.no_default):
"""
coerce to an ndarary of object dtype
"""
@property
def _hasna(self) -> bool:
# Note: this is expensive right now! The hope is that we can
# make this faster by having an optional mask, without having to change
# the source code that uses it.
return self._mask.any()

def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
):
if na_value is lib.no_default:
na_value = libmissing.NA
if dtype is None:
dtype = object

if na_value is lib.no_default and is_float_dtype(dtype):
na_value = np.nan
elif na_value is lib.no_default:
na_value = libmissing.NA

if is_integer_dtype(dtype):
# Specifically, a NumPy integer dtype, not a pandas integer dtype,
# since we're coercing to a numpy dtype by definition in this function.
if not self.isna().any():
return self._data.astype(dtype)
else:
if self._hasna:
if (
not (is_object_dtype(dtype) or is_string_dtype(dtype))
and na_value is libmissing.NA
):
raise ValueError(
"cannot convert to integer NumPy array with missing values"
f"cannot convert to '{dtype}'-dtype NumPy array "
"with missing values. Specify an appropriate 'na_value' "
"for this dtype."
)

data = self._data.astype(dtype)
data[self._mask] = na_value
# don't pass copy to astype -> always need a copy since we are mutating
data = self._data.astype(dtype)
data[self._mask] = na_value
else:
data = self._data.astype(dtype, copy=copy)
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -409,7 +416,7 @@ def __array__(self, dtype=None):
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self._coerce_to_ndarray(dtype=dtype)
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -564,7 +571,13 @@ def astype(self, dtype, copy=True):
return type(self)(result, mask=self._mask, copy=False)

# coerce
data = self._coerce_to_ndarray(dtype=dtype)
if is_float_dtype(dtype):
# In astype, we consider dtype=float to also mean na_value=np.nan
kwargs = dict(na_value=np.nan)
else:
kwargs = {}

data = self.to_numpy(dtype=dtype, **kwargs)
return astype_nansafe(data, dtype, copy=False)

@property
Expand Down Expand Up @@ -630,7 +643,7 @@ def value_counts(self, dropna=True):
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
# TODO: https://github.com/pandas-dev/pandas/issues/30037
# use masked algorithms, rather than object-dtype / np.nan.
return self._coerce_to_ndarray(na_value=np.nan), np.nan
return self.to_numpy(na_value=np.nan), np.nan

def _values_for_argsort(self) -> np.ndarray:
"""Return values for sorting.
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array:
try:
return arr.astype("uint64", copy=copy, casting="safe") # type: ignore
except TypeError:
if is_extension_array_dtype(arr.dtype):
return arr.to_numpy(dtype="float64", na_value=np.nan)
return arr.astype("float64", copy=copy)


Expand Down
15 changes: 12 additions & 3 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,14 +265,19 @@ def test_to_numpy(box):
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype="str")
expected = np.array([True, False, pd.NA], dtype="<U5")
tm.assert_numpy_array_equal(result, expected)

# no missing values -> can convert to bool, otherwise raises
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert to bool numpy"):
with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
result = arr.to_numpy(dtype="bool")

# specify dtype and na_value
Expand All @@ -294,9 +299,9 @@ def test_to_numpy(box):
tm.assert_numpy_array_equal(result, expected)

# converting to int or float without specifying na_value raises
with pytest.raises(TypeError):
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
arr.to_numpy(dtype="int64")
with pytest.raises(TypeError):
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
arr.to_numpy(dtype="float64")


Expand Down Expand Up @@ -329,6 +334,10 @@ def test_astype():
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

result = arr.astype("str")
expected = np.array(["True", "False", "NA"], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.astype("int64")
Expand Down
51 changes: 45 additions & 6 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ def test_from_dtype_from_float(self, data):

# from float
expected = pd.Series(data)
result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype))
result = pd.Series(
data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)
)
tm.assert_series_equal(result, expected)

# from int / list
Expand Down Expand Up @@ -634,10 +636,47 @@ def test_construct_cast_invalid(self, dtype):
with pytest.raises(TypeError, match=msg):
pd.Series(arr).astype(dtype)

def test_coerce_to_ndarray_float_NA_rasies(self):
a = pd.array([0, 1, 2], dtype="Int64")
with pytest.raises(TypeError, match="NAType"):
a._coerce_to_ndarray(dtype="float", na_value=pd.NA)
@pytest.mark.parametrize("in_series", [True, False])
def test_to_numpy_na_nan(self, in_series):
a = pd.array([0, 1, None], dtype="Int64")
if in_series:
a = pd.Series(a)

result = a.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.0, 1.0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

result = a.to_numpy(dtype="int64", na_value=-1)
expected = np.array([0, 1, -1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)

result = a.to_numpy(dtype="bool", na_value=False)
expected = np.array([False, True, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("in_series", [True, False])
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
def test_to_numpy_dtype(self, dtype, in_series):
a = pd.array([0, 1], dtype="Int64")
if in_series:
a = pd.Series(a)

result = a.to_numpy(dtype=dtype)
expected = np.array([0, 1], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
def test_to_numpy_na_raises(self, dtype):
a = pd.array([0, 1, None], dtype="Int64")
with pytest.raises(ValueError, match=dtype):
a.to_numpy(dtype=dtype)

def test_astype_str(self):
a = pd.array([1, 2, None], dtype="Int64")
expected = np.array(["1", "2", "NA"], dtype=object)

tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_numpy_array_equal(a.astype("str"), expected)


def test_frame_repr(data_missing):
Expand Down Expand Up @@ -887,7 +926,7 @@ def test_reduce_to_float(op):
def test_astype_nansafe():
# see gh-22343
arr = integer_array([np.nan, 1, 2], dtype="Int8")
msg = "cannot convert to integer NumPy array with missing values"
msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."

with pytest.raises(ValueError, match=msg):
arr.astype("uint32")
Expand Down