Skip to content

Commit 22f9d9f

Browse files
API: generalized check_array_indexer for validating array-like getitem indexers (pandas-dev#31150)
1 parent a08c2f9 commit 22f9d9f

File tree

20 files changed

+321
-96
lines changed

20 files changed

+321
-96
lines changed

doc/source/reference/extensions.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ behaves correctly.
6666
.. autosummary::
6767
:toctree: api/
6868

69-
api.indexers.check_bool_array_indexer
69+
api.indexers.check_array_indexer
7070

7171

7272
The sentinel ``pandas.api.extensions.no_default`` is used as the default

pandas/api/indexers/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Public API for Rolling Window Indexers.
33
"""
44

5-
from pandas.core.indexers import check_bool_array_indexer
5+
from pandas.core.indexers import check_array_indexer
66
from pandas.core.window.indexers import BaseIndexer
77

8-
__all__ = ["check_bool_array_indexer", "BaseIndexer"]
8+
__all__ = ["check_array_indexer", "BaseIndexer"]

pandas/core/arrays/categorical.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
)
4040
from pandas.core.dtypes.dtypes import CategoricalDtype
4141
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
42-
from pandas.core.dtypes.inference import is_array_like, is_hashable
42+
from pandas.core.dtypes.inference import is_hashable
4343
from pandas.core.dtypes.missing import isna, notna
4444

4545
from pandas.core import ops
@@ -54,7 +54,7 @@
5454
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
5555
import pandas.core.common as com
5656
from pandas.core.construction import array, extract_array, sanitize_array
57-
from pandas.core.indexers import check_bool_array_indexer
57+
from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
5858
from pandas.core.missing import interpolate_2d
5959
from pandas.core.ops.common import unpack_zerodim_and_defer
6060
from pandas.core.sorting import nargsort
@@ -2001,14 +2001,11 @@ def __getitem__(self, key):
20012001
else:
20022002
return self.categories[i]
20032003

2004-
if is_list_like(key) and not is_array_like(key):
2005-
key = np.asarray(key)
2006-
2007-
if com.is_bool_indexer(key):
2008-
key = check_bool_array_indexer(self, key)
2004+
key = check_array_indexer(self, key)
20092005

20102006
result = self._codes[key]
20112007
if result.ndim > 1:
2008+
deprecate_ndim_indexing(result)
20122009
return result
20132010
return self._constructor(result, dtype=self.dtype, fastpath=True)
20142011

pandas/core/arrays/datetimelike.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
4343
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
4444
import pandas.core.common as com
45-
from pandas.core.indexers import check_bool_array_indexer
45+
from pandas.core.indexers import check_array_indexer
4646
from pandas.core.ops.common import unpack_zerodim_and_defer
4747
from pandas.core.ops.invalid import invalid_comparison, make_invalid_op
4848

@@ -518,11 +518,20 @@ def __getitem__(self, key):
518518
return type(self)(val, dtype=self.dtype)
519519

520520
if com.is_bool_indexer(key):
521-
key = check_bool_array_indexer(self, key)
521+
# first convert to boolean, because check_array_indexer doesn't
522+
# allow object dtype
523+
key = np.asarray(key, dtype=bool)
524+
key = check_array_indexer(self, key)
522525
if key.all():
523526
key = slice(0, None, None)
524527
else:
525528
key = lib.maybe_booleans_to_slice(key.view(np.uint8))
529+
elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice):
530+
# see https://github.com/pandas-dev/pandas/issues/31299, need to allow
531+
# this for now (would otherwise raise in check_array_indexer)
532+
pass
533+
else:
534+
key = check_array_indexer(self, key)
526535

527536
is_period = is_period_dtype(self)
528537
if is_period:

pandas/core/arrays/interval.py

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from pandas.core.arrays.categorical import Categorical
4141
import pandas.core.common as com
4242
from pandas.core.construction import array
43+
from pandas.core.indexers import check_array_indexer
4344
from pandas.core.indexes.base import ensure_index
4445

4546
_VALID_CLOSED = {"left", "right", "both", "neither"}
@@ -495,6 +496,7 @@ def __len__(self) -> int:
495496
return len(self.left)
496497

497498
def __getitem__(self, value):
499+
value = check_array_indexer(self, value)
498500
left = self.left[value]
499501
right = self.right[value]
500502

pandas/core/arrays/masked.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99

1010
from pandas.core.algorithms import take
1111
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
12-
import pandas.core.common as com
13-
from pandas.core.indexers import check_bool_array_indexer
12+
from pandas.core.indexers import check_array_indexer
1413

1514
if TYPE_CHECKING:
1615
from pandas._typing import Scalar
@@ -35,8 +34,7 @@ def __getitem__(self, item):
3534
return self.dtype.na_value
3635
return self._data[item]
3736

38-
elif com.is_bool_indexer(item):
39-
item = check_bool_array_indexer(self, item)
37+
item = check_array_indexer(self, item)
4038

4139
return type(self)(self._data[item], self._mask[item])
4240

pandas/core/arrays/numpy_.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@
1818
from pandas.core import nanops
1919
from pandas.core.algorithms import searchsorted, take, unique
2020
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
21-
import pandas.core.common as com
2221
from pandas.core.construction import extract_array
23-
from pandas.core.indexers import check_bool_array_indexer
22+
from pandas.core.indexers import check_array_indexer
2423
from pandas.core.missing import backfill_1d, pad_1d
2524

2625

@@ -234,8 +233,7 @@ def __getitem__(self, item):
234233
if isinstance(item, type(self)):
235234
item = item._ndarray
236235

237-
elif com.is_bool_indexer(item):
238-
item = check_bool_array_indexer(self, item)
236+
item = check_array_indexer(self, item)
239237

240238
result = self._ndarray[item]
241239
if not lib.is_scalar(item):

pandas/core/arrays/sparse/array.py

+3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from pandas.core.base import PandasObject
4444
import pandas.core.common as com
4545
from pandas.core.construction import sanitize_array
46+
from pandas.core.indexers import check_array_indexer
4647
from pandas.core.missing import interpolate_2d
4748
import pandas.core.ops as ops
4849
from pandas.core.ops.common import unpack_zerodim_and_defer
@@ -768,6 +769,8 @@ def __getitem__(self, key):
768769
else:
769770
key = np.asarray(key)
770771

772+
key = check_array_indexer(self, key)
773+
771774
if com.is_bool_indexer(key):
772775
key = check_bool_indexer(self, key)
773776

pandas/core/common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,8 @@ def is_bool_indexer(key: Any) -> bool:
121121
122122
See Also
123123
--------
124-
check_bool_array_indexer : Check that `key`
125-
is a valid mask for an array, and convert to an ndarray.
124+
check_array_indexer : Check that `key` is a valid array to index,
125+
and convert to an ndarray.
126126
"""
127127
na_msg = "cannot mask with array containing NA / NaN values"
128128
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (

pandas/core/indexers.py

+130-23
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
11
"""
22
Low-dependency indexing utilities.
33
"""
4+
import warnings
5+
46
import numpy as np
57

6-
from pandas._typing import AnyArrayLike
8+
from pandas._typing import Any, AnyArrayLike
79

8-
from pandas.core.dtypes.common import is_list_like
10+
from pandas.core.dtypes.common import (
11+
is_array_like,
12+
is_bool_dtype,
13+
is_integer_dtype,
14+
is_list_like,
15+
)
916
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
1017

1118
# -----------------------------------------------------------
@@ -244,66 +251,166 @@ def length_of_indexer(indexer, target=None) -> int:
244251
raise AssertionError("cannot find the length of the indexer")
245252

246253

247-
def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
254+
def deprecate_ndim_indexing(result):
255+
"""
256+
Helper function to raise the deprecation warning for multi-dimensional
257+
indexing on 1D Series/Index.
258+
259+
GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
260+
and keep an index, so we currently return ndarray, which is deprecated
261+
(Deprecation GH#30588).
248262
"""
249-
Check if `mask` is a valid boolean indexer for `array`.
263+
if np.ndim(result) > 1:
264+
warnings.warn(
265+
"Support for multi-dimensional indexing (e.g. `index[:, None]`) "
266+
"on an Index is deprecated and will be removed in a future "
267+
"version. Convert to a numpy array before indexing instead.",
268+
DeprecationWarning,
269+
stacklevel=3,
270+
)
271+
272+
273+
# -----------------------------------------------------------
274+
# Public indexer validation
250275

251-
`array` and `mask` are checked to have the same length, and the
252-
dtype is validated.
276+
277+
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
278+
"""
279+
Check if `indexer` is a valid array indexer for `array`.
280+
281+
For a boolean mask, `array` and `indexer` are checked to have the same
282+
length. The dtype is validated, and if it is an integer or boolean
283+
ExtensionArray, it is checked if there are missing values present, and
284+
it is converted to the appropriate numpy array. Other dtypes will raise
285+
an error.
286+
287+
Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
288+
through as is.
253289
254290
.. versionadded:: 1.0.0
255291
256292
Parameters
257293
----------
258-
array : array
259-
The array that's being masked.
260-
mask : array
261-
The boolean array that's masking.
294+
array : array-like
295+
The array that is being indexed (only used for the length).
296+
indexer : array-like or list-like
297+
The array-like that's used to index. List-like input that is not yet
298+
a numpy array or an ExtensionArray is converted to one. Other input
299+
types are passed through as is.
262300
263301
Returns
264302
-------
265303
numpy.ndarray
266-
The validated boolean mask.
304+
The validated indexer as a numpy array that can be used to index.
267305
268306
Raises
269307
------
270308
IndexError
271309
When the lengths don't match.
272310
ValueError
273-
When `mask` cannot be converted to a bool-dtype ndarray.
311+
When `indexer` cannot be converted to a numpy ndarray to index
312+
(e.g. presence of missing values).
274313
275314
See Also
276315
--------
277316
api.types.is_bool_dtype : Check if `key` is of boolean dtype.
278317
279318
Examples
280319
--------
281-
A boolean ndarray is returned when the arguments are all valid.
320+
When checking a boolean mask, a boolean ndarray is returned when the
321+
arguments are all valid.
282322
283323
>>> mask = pd.array([True, False])
284324
>>> arr = pd.array([1, 2])
285-
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
325+
>>> pd.api.indexers.check_array_indexer(arr, mask)
286326
array([ True, False])
287327
288328
An IndexError is raised when the lengths don't match.
289329
290330
>>> mask = pd.array([True, False, True])
291-
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
331+
>>> pd.api.indexers.check_array_indexer(arr, mask)
292332
Traceback (most recent call last):
293333
...
294-
IndexError: Item wrong length 3 instead of 2.
334+
IndexError: Boolean index has wrong length: 3 instead of 2.
295335
296336
A ValueError is raised when the mask cannot be converted to
297337
a bool-dtype ndarray.
298338
299339
>>> mask = pd.array([True, pd.NA])
300-
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
340+
>>> pd.api.indexers.check_array_indexer(arr, mask)
341+
Traceback (most recent call last):
342+
...
343+
ValueError: Cannot mask with a boolean indexer containing NA values
344+
345+
A numpy boolean mask will get passed through (if the length is correct):
346+
347+
>>> mask = np.array([True, False])
348+
>>> pd.api.indexers.check_array_indexer(arr, mask)
349+
array([ True, False])
350+
351+
Similarly for integer indexers, an integer ndarray is returned when it is
352+
a valid indexer, otherwise an error is raised (for integer indexers, a matching
353+
length is not required):
354+
355+
>>> indexer = pd.array([0, 2], dtype="Int64")
356+
>>> arr = pd.array([1, 2, 3])
357+
>>> pd.api.indexers.check_array_indexer(arr, indexer)
358+
array([0, 2])
359+
360+
>>> indexer = pd.array([0, pd.NA], dtype="Int64")
361+
>>> pd.api.indexers.check_array_indexer(arr, indexer)
362+
Traceback (most recent call last):
363+
...
364+
ValueError: Cannot index with an integer indexer containing NA values
365+
366+
For non-integer/boolean dtypes, an appropriate error is raised:
367+
368+
>>> indexer = np.array([0., 2.], dtype="float64")
369+
>>> pd.api.indexers.check_array_indexer(arr, indexer)
301370
Traceback (most recent call last):
302371
...
303-
ValueError: cannot convert to bool numpy array in presence of missing values
372+
IndexError: arrays used as indices must be of integer or boolean type
304373
"""
305-
result = np.asarray(mask, dtype=bool)
306-
# GH26658
307-
if len(result) != len(array):
308-
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
309-
return result
374+
from pandas.core.construction import array as pd_array
375+
376+
# whatever is not an array-like is returned as-is (possible valid array
377+
# indexers that are not array-like: integer, slice, Ellipsis, None)
378+
# In this context, tuples are not considered as array-like, as they have
379+
# a specific meaning in indexing (multi-dimensional indexing)
380+
if is_list_like(indexer):
381+
if isinstance(indexer, tuple):
382+
return indexer
383+
else:
384+
return indexer
385+
386+
# convert list-likes to array
387+
if not is_array_like(indexer):
388+
indexer = pd_array(indexer)
389+
if len(indexer) == 0:
390+
# empty list is converted to float array by pd.array
391+
indexer = np.array([], dtype=np.intp)
392+
393+
dtype = indexer.dtype
394+
if is_bool_dtype(dtype):
395+
try:
396+
indexer = np.asarray(indexer, dtype=bool)
397+
except ValueError:
398+
raise ValueError("Cannot mask with a boolean indexer containing NA values")
399+
400+
# GH26658
401+
if len(indexer) != len(array):
402+
raise IndexError(
403+
f"Boolean index has wrong length: "
404+
f"{len(indexer)} instead of {len(array)}"
405+
)
406+
elif is_integer_dtype(dtype):
407+
try:
408+
indexer = np.asarray(indexer, dtype=np.intp)
409+
except ValueError:
410+
raise ValueError(
411+
"Cannot index with an integer indexer containing NA values"
412+
)
413+
else:
414+
raise IndexError("arrays used as indices must be of integer or boolean type")
415+
416+
return indexer

0 commit comments

Comments
 (0)