-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
API: Uses pd.NA in IntegerArray #29964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 51 commits
1eec965
f5f61ea
c569562
a8261a4
c8ff04f
cddc9df
9488d34
0d5aab8
fa61a6d
de2c6c6
60d7663
a4c4618
0a500be
1c716f3
67c8d51
22a2bc7
34de18e
78944d1
ffbe299
7abf40e
36d403d
f6b4062
945e8cd
04546f3
a493965
8fc8b3a
a49aa65
8ad166d
dd745c3
88fa412
0902eef
721a1ea
c658307
4f9d775
1244ef4
4a34b45
5293d87
39f225a
ea19b2d
fe2d98e
68fe155
f27a5c2
b97450b
5d62af8
2bf57d6
2f4e1cd
021dc7b
197f18b
259b779
c0cfef9
3183d53
4986d84
76806e9
64b4ccc
b39dc60
800158d
e5d6832
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,10 @@ | ||
import numbers | ||
from typing import Type | ||
from typing import Any, Tuple, Type | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._libs import lib, missing as libmissing | ||
from pandas.compat import set_function_name | ||
from pandas.util._decorators import cache_readonly | ||
|
||
|
@@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype): | |
name: str | ||
base = None | ||
type: Type | ||
na_value = np.nan | ||
na_value = libmissing.NA | ||
|
||
def __repr__(self) -> str: | ||
sign = "U" if self.is_unsigned_integer else "" | ||
|
@@ -263,6 +263,11 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): | |
|
||
.. versionadded:: 0.24.0 | ||
|
||
.. versionchanged:: 1.0.0 | ||
|
||
Now uses :attr:`pandas.NA` as its missing value, rather | ||
than :attr:`numpy.nan`. | ||
|
||
.. warning:: | ||
|
||
IntegerArray is currently experimental, and its API or internal | ||
|
@@ -358,29 +363,37 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): | |
def _from_factorized(cls, values, original): | ||
return integer_array(values, dtype=original.dtype) | ||
|
||
def _formatter(self, boxed=False): | ||
def fmt(x): | ||
if isna(x): | ||
return "NaN" | ||
return str(x) | ||
|
||
return fmt | ||
|
||
def __getitem__(self, item): | ||
if is_integer(item): | ||
if self._mask[item]: | ||
return self.dtype.na_value | ||
return self._data[item] | ||
return type(self)(self._data[item], self._mask[item]) | ||
|
||
def _coerce_to_ndarray(self): | ||
def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): | ||
""" | ||
coerce to an ndarray of object dtype | ||
""" | ||
if dtype is None: | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
dtype = object | ||
|
||
if na_value is lib._no_default and is_float_dtype(dtype): | ||
na_value = np.nan | ||
elif na_value is lib._no_default: | ||
na_value = libmissing.NA | ||
|
||
if is_integer_dtype(dtype): | ||
# Specifically, a NumPy integer dtype, not a pandas integer dtype, | ||
# since we're coercing to a numpy dtype by definition in this function. | ||
if not self.isna().any(): | ||
return self._data.astype(dtype) | ||
else: | ||
raise ValueError( | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"cannot convert to integer NumPy array with missing values" | ||
) | ||
|
||
# TODO(jreback) make this better | ||
data = self._data.astype(object) | ||
data[self._mask] = self._na_value | ||
data = self._data.astype(dtype) | ||
data[self._mask] = na_value | ||
return data | ||
|
||
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us | ||
|
@@ -390,7 +403,7 @@ def __array__(self, dtype=None): | |
the array interface, return my values | ||
We return an object array here to preserve our scalar values | ||
""" | ||
return self._coerce_to_ndarray() | ||
return self._coerce_to_ndarray(dtype=dtype) | ||
|
||
def __arrow_array__(self, type=None): | ||
""" | ||
|
@@ -506,7 +519,7 @@ def isna(self): | |
|
||
@property | ||
def _na_value(self): | ||
return np.nan | ||
return self.dtype.na_value | ||
|
||
@classmethod | ||
def _concat_same_type(cls, to_concat): | ||
|
@@ -545,7 +558,7 @@ def astype(self, dtype, copy=True): | |
return type(self)(result, mask=self._mask, copy=False) | ||
|
||
# coerce | ||
data = self._coerce_to_ndarray() | ||
data = self._coerce_to_ndarray(dtype=dtype) | ||
return astype_nansafe(data, dtype, copy=None) | ||
|
||
@property | ||
|
@@ -600,12 +613,19 @@ def value_counts(self, dropna=True): | |
# w/o passing the dtype | ||
array = np.append(array, [self._mask.sum()]) | ||
index = Index( | ||
np.concatenate([index.values, np.array([np.nan], dtype=object)]), | ||
np.concatenate( | ||
[index.values, np.array([self.dtype.na_value], dtype=object)] | ||
), | ||
dtype=object, | ||
) | ||
|
||
return Series(array, index=index) | ||
|
||
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: | ||
# TODO: https://github.com/pandas-dev/pandas/issues/30037 | ||
# use masked algorithms, rather than object-dtype / np.nan. | ||
return self._coerce_to_ndarray(na_value=np.nan), np.nan | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _values_for_argsort(self) -> np.ndarray: | ||
"""Return values for sorting. | ||
|
||
|
@@ -629,9 +649,11 @@ def _create_comparison_method(cls, op): | |
|
||
@unpack_zerodim_and_defer(op.__name__) | ||
def cmp_method(self, other): | ||
from pandas.arrays import BooleanArray | ||
|
||
mask = None | ||
|
||
if isinstance(other, IntegerArray): | ||
if isinstance(other, (BooleanArray, IntegerArray)): | ||
other, mask = other._data, other._mask | ||
|
||
elif is_list_like(other): | ||
|
@@ -643,25 +665,30 @@ def cmp_method(self, other): | |
if len(self) != len(other): | ||
raise ValueError("Lengths must match to compare") | ||
|
||
# numpy will show a DeprecationWarning on invalid elementwise | ||
# comparisons, this will raise in the future | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See previous question about this. Is this comment no longer relevant or correct? Or why was it removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure, do you know how this is actually hit? If NumPy is going to raise in the future, shouldn't they be seeing that warning? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is about the warning you get with comparisons with objects / non-broadcastable arrays. Eg:
(it seems IntegerArray already handles this fine, not sure there is an explicit test for that) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Gotcha. It's silencing the same warning from NumPy, and falling back to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually... the comment is incorrect. NumPy will perform elementwise comparison in the future, not raise. If they were to raise on that in the future the implementation would be incorrect. Though I'm still a bit confused, as the NumPy op is returning NotImplemented since we're calling it directly. Will that continue to return NotImplemented? Or will the elementwise result be different? |
||
with warnings.catch_warnings(): | ||
warnings.filterwarnings("ignore", "elementwise", FutureWarning) | ||
with np.errstate(all="ignore"): | ||
method = getattr(self._data, f"__{op_name}__") | ||
result = method(other) | ||
if other is libmissing.NA: | ||
# numpy does not handle pd.NA well as "other" scalar (it returns | ||
# a scalar False instead of an array) | ||
# This may be fixed by NA.__array_ufunc__. Revisit this check | ||
# once that's implemented. | ||
result = np.zeros(self._data.shape, dtype="bool") | ||
mask = np.ones(self._data.shape, dtype="bool") | ||
else: | ||
with warnings.catch_warnings(): | ||
warnings.filterwarnings("ignore", "elementwise", FutureWarning) | ||
with np.errstate(all="ignore"): | ||
method = getattr(self._data, f"__{op_name}__") | ||
result = method(other) | ||
|
||
if result is NotImplemented: | ||
result = invalid_comparison(self._data, other, op) | ||
|
||
# nans propagate | ||
if mask is None: | ||
mask = self._mask | ||
mask = self._mask.copy() | ||
else: | ||
mask = self._mask | mask | ||
|
||
result[mask] = op_name == "ne" | ||
return result | ||
return BooleanArray(result, mask) | ||
|
||
name = f"__{op.__name__}__" | ||
return set_function_name(cmp_method, name, cls) | ||
|
@@ -673,7 +700,8 @@ def _reduce(self, name, skipna=True, **kwargs): | |
# coerce to a nan-aware float if needed | ||
if mask.any(): | ||
data = self._data.astype("float64") | ||
data[mask] = self._na_value | ||
# We explicitly use NaN within reductions. | ||
data[mask] = np.nan | ||
|
||
op = getattr(nanops, "nan" + name) | ||
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) | ||
|
@@ -739,12 +767,13 @@ def integer_arithmetic_method(self, other): | |
raise TypeError("can only perform ops with numeric values") | ||
|
||
else: | ||
if not (is_float(other) or is_integer(other)): | ||
if not (is_float(other) or is_integer(other) or other is libmissing.NA): | ||
raise TypeError("can only perform ops with numeric values") | ||
|
||
# nans propagate | ||
if omask is None: | ||
mask = self._mask.copy() | ||
if other is libmissing.NA: | ||
mask |= True | ||
else: | ||
mask = self._mask | omask | ||
|
||
|
@@ -754,20 +783,23 @@ def integer_arithmetic_method(self, other): | |
# x ** 0 is 1. | ||
if omask is not None: | ||
mask = np.where((other == 0) & ~omask, False, mask) | ||
else: | ||
elif other is not libmissing.NA: | ||
mask = np.where(other == 0, False, mask) | ||
|
||
elif op_name == "rpow": | ||
# 1 ** x is 1. | ||
if omask is not None: | ||
mask = np.where((other == 1) & ~omask, False, mask) | ||
else: | ||
elif other is not libmissing.NA: | ||
mask = np.where(other == 1, False, mask) | ||
# x ** 0 is 1. | ||
mask = np.where((self._data == 0) & ~self._mask, False, mask) | ||
|
||
with np.errstate(all="ignore"): | ||
result = op(self._data, other) | ||
if other is libmissing.NA: | ||
result = np.ones_like(self._data) | ||
else: | ||
with np.errstate(all="ignore"): | ||
result = op(self._data, other) | ||
|
||
# divmod returns a tuple | ||
if op_name == "divmod": | ||
|
@@ -790,6 +822,11 @@ def integer_arithmetic_method(self, other): | |
_dtype_docstring = """ | ||
An ExtensionDtype for {dtype} integer data. | ||
|
||
.. versionchanged:: 1.0.0 | ||
|
||
Now uses :attr:`pandas.NA` as its missing value, | ||
rather than :attr:`numpy.nan`. | ||
|
||
Attributes | ||
---------- | ||
None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
may want to add a versionchanged tag here (and below)