-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
DOC/TST: Indexing with NA raises #30308
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
492f904
6444aa0
53f4f63
3bbf868
a5ac457
0dfe761
dac111d
d1f08d9
3dd59ca
151bdfe
d57b0ac
36be0f6
7bd6c2f
c5f3afb
76bb6ce
505112e
c73ae8e
3efe359
f94483f
953938d
8b1e567
f317c64
c656292
d4f0adc
37ea95e
816a47c
21fd589
3637070
61599f2
6a0eda6
e622826
5004d91
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,3 +11,5 @@ | |
) | ||
from pandas.core.algorithms import take # noqa: F401 | ||
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 | ||
from pandas.core.common import is_bool_indexer # noqa: F401 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jbrockmendel ought to move is_bool_indexer to a more usual location, pandas.core.dtypes.common is better There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think core.indexers |
||
from pandas.core.indexing import check_bool_array_indexer # noqa: F401 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
from pandas.core import nanops, ops | ||
from pandas.core.algorithms import take | ||
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin | ||
from pandas.core.common import is_bool_indexer | ||
|
||
if TYPE_CHECKING: | ||
from pandas._typing import Scalar | ||
|
@@ -307,11 +308,25 @@ def _from_factorized(cls, values, original: "BooleanArray"): | |
def _formatter(self, boxed=False): | ||
return str | ||
|
||
@property | ||
def _hasna(self) -> bool: | ||
# Note: this is expensive right now! The hope is that we can | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# make this faster by having an optional mask, but not have to change | ||
# source code using it.. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this could easily be cached (and then updated on setitem / other mutation) |
||
return self._mask.any() | ||
|
||
def __getitem__(self, item): | ||
# import here to avoid circular import. Probably need to restructure | ||
from pandas.core.indexing import check_bool_array_indexer | ||
|
||
if is_integer(item): | ||
if self._mask[item]: | ||
return self.dtype.na_value | ||
return self._data[item] | ||
|
||
elif is_bool_indexer(item): | ||
item = check_bool_array_indexer(self, item) | ||
|
||
return type(self)(self._data[item], self._mask[item]) | ||
|
||
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): | ||
|
@@ -329,7 +344,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): | |
if dtype is None: | ||
dtype = object | ||
if is_bool_dtype(dtype): | ||
if not self.isna().any(): | ||
if not self._hasna: | ||
return self._data | ||
else: | ||
raise ValueError( | ||
|
@@ -503,7 +518,7 @@ def astype(self, dtype, copy=True): | |
|
||
if is_bool_dtype(dtype): | ||
# astype_nansafe converts np.nan to True | ||
if self.isna().any(): | ||
if self._hasna: | ||
raise ValueError("cannot convert float NaN to bool") | ||
else: | ||
return self._data.astype(dtype, copy=copy) | ||
|
@@ -515,7 +530,7 @@ def astype(self, dtype, copy=True): | |
) | ||
# for integer, error if there are missing values | ||
if is_integer_dtype(dtype): | ||
if self.isna().any(): | ||
if self._hasna: | ||
raise ValueError("cannot convert NA to integer") | ||
# for float dtype, ensure we use np.nan before casting (numpy cannot | ||
# deal with pd.NA) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -738,6 +738,9 @@ def value_counts(self, dropna=True): | |
# -------- | ||
|
||
def __getitem__(self, key): | ||
# avoid mypy issues when importing at the top-level | ||
from pandas.core.indexing import check_bool_indexer | ||
|
||
if isinstance(key, tuple): | ||
if len(key) > 1: | ||
raise IndexError("too many indices for array.") | ||
|
@@ -766,7 +769,9 @@ def __getitem__(self, key): | |
else: | ||
key = np.asarray(key) | ||
|
||
if com.is_bool_indexer(key) and len(self) == len(key): | ||
if com.is_bool_indexer(key): | ||
key = check_bool_indexer(self, key) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this check_bool_indexer when in all the others its check_bool_array_indexer? found this bc mypy is complaining about that the first arg should be an Index |
||
|
||
return self.take(np.arange(len(key), dtype=np.int32)[key]) | ||
elif hasattr(key, "__len__"): | ||
return self.take(key) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries | ||
from pandas.core.dtypes.missing import _infer_fill_value, isna | ||
|
||
from pandas._typing import AnyArrayLike | ||
import pandas.core.common as com | ||
from pandas.core.indexers import is_list_like_indexer, length_of_indexer | ||
from pandas.core.indexes.api import Index, InvalidIndexError | ||
|
@@ -2268,6 +2269,69 @@ def convert_to_index_sliceable(obj, key): | |
return None | ||
|
||
|
||
def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i would like to see these utilities split out of here (pandas.core.indexing is huge). but can certainly be done later. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is also simple enough it could go into core.indexers |
||
""" | ||
Check if `mask` is a valid boolean indexer for `array`. | ||
|
||
`array` and `mask` are checked to have the same length, and the | ||
dtype is validated. | ||
|
||
Parameters | ||
---------- | ||
array : array | ||
The array that's being masked. | ||
mask : array | ||
The boolean array that's masking. | ||
|
||
Returns | ||
------- | ||
numpy.ndarray | ||
The validated boolean mask. | ||
|
||
Raises | ||
------ | ||
IndexError | ||
When the lengths don't match. | ||
ValueError | ||
When `mask` cannot be converted to a bool-dtype ndarray. | ||
|
||
See Also | ||
-------- | ||
api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. | ||
|
||
Examples | ||
-------- | ||
A boolean ndarray is returned when the arguments are all valid. | ||
|
||
>>> mask = pd.array([True, False]) | ||
>>> arr = pd.Series([1, 2]) | ||
>>> pd.api.extensions.check_bool_array_indexer(arr, mask) | ||
array([ True, False]) | ||
|
||
An IndexError is raised when the lengths don't match. | ||
|
||
>>> mask = pd.array([True, False, True]) | ||
>>> pd.api.extensions.check_bool_array_indexer(arr, mask) | ||
Traceback (most recent call last): | ||
... | ||
IndexError: Item wrong length 3 instead of 2. | ||
|
||
A ValueError is raised when the mask cannot be converted to | ||
a bool-dtype ndarray. | ||
|
||
>>> mask = pd.array([True, pd.NA]) | ||
>>> pd.api.extensions.check_bool_array_indexer(arr, mask) | ||
Traceback (most recent call last): | ||
... | ||
ValueError: cannot convert to bool numpy array in presence of missing values | ||
""" | ||
result = np.asarray(mask, dtype=bool) | ||
# GH26658 | ||
if len(result) != len(array): | ||
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we worry about shape here? i.e. either handle or disallow ndim>1? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so. This is just an extraction from |
||
return result | ||
|
||
|
||
def check_bool_indexer(index: Index, key) -> np.ndarray: | ||
""" | ||
Check if key is a valid boolean indexer for an object with such index and | ||
|
@@ -2308,13 +2372,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: | |
else: | ||
if is_sparse(result): | ||
result = result.to_dense() | ||
result = np.asarray(result, dtype=bool) | ||
|
||
# GH26658 | ||
if len(result) != len(index): | ||
raise IndexError( | ||
f"Item wrong length {len(result)} instead of {len(index)}." | ||
) | ||
result = check_bool_array_indexer(index, result) | ||
|
||
return result | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a real reason to expose check_book_array_indexer now?
is_bool_indexer also shouldn't be in api.extension, api.types would be ok, though it was never meant to be public.
can we defer both of these as exposing to the public until we actually see use?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Happy to defer exposing those utilities. Will push an update this evening.