Skip to content

API: generalized check_array_indexer for validating array-like getitem indexers #31150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Jan 29, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e8f539a
API: generalized check_array_indexer for validating array-like indexers
jorisvandenbossche Jan 20, 2020
4fa9f5a
test boolean message as well
jorisvandenbossche Jan 20, 2020
b55dfd2
fixes for failing tests
jorisvandenbossche Jan 20, 2020
095b741
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 22, 2020
58bfe78
remove previous check_bool_array_indexer
jorisvandenbossche Jan 22, 2020
5ce8d85
don't convert tuples to avoid warning from numpy
jorisvandenbossche Jan 22, 2020
ebc2150
ensure check_bool_indexer returns numpy array
jorisvandenbossche Jan 22, 2020
4a51d97
raise warning for categorical
jorisvandenbossche Jan 22, 2020
50490aa
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 24, 2020
c979df8
move deprecate_ndim_indexing
jorisvandenbossche Jan 24, 2020
ce2e042
cleanup; ensure output of check_array_indexer is always an ndarray
jorisvandenbossche Jan 24, 2020
4d447bf
clean-up black reformatting
jorisvandenbossche Jan 24, 2020
d930e84
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 27, 2020
9ed8fe9
fix check_bool_indexer
jorisvandenbossche Jan 28, 2020
2f8cd27
add comment to check_bool_indexer
jorisvandenbossche Jan 28, 2020
4d9a201
fix empty list case
jorisvandenbossche Jan 28, 2020
097d221
add specific tests for check_array_indexer
jorisvandenbossche Jan 28, 2020
3c5e4c6
allow list-length-1-with-slice corner case
jorisvandenbossche Jan 28, 2020
1ca35d1
move list-like check inside
jorisvandenbossche Jan 28, 2020
e5ea9b4
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
TomAugspurger Jan 28, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/api/indexers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Public API for Rolling Window Indexers.
"""

from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer, check_bool_array_indexer
from pandas.core.window.indexers import BaseIndexer

__all__ = ["check_bool_array_indexer", "BaseIndexer"]
__all__ = ["check_array_indexer", "check_bool_array_indexer", "BaseIndexer"]
13 changes: 5 additions & 8 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like, is_hashable
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna

from pandas.core import ops
Expand All @@ -54,7 +54,7 @@
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
Expand Down Expand Up @@ -2001,15 +2001,12 @@ def __getitem__(self, key):
else:
return self.categories[i]

if is_list_like(key) and not is_array_like(key):
key = np.asarray(key)

if com.is_bool_indexer(key):
key = check_bool_array_indexer(self, key)
if is_list_like(key) and not isinstance(key, tuple):
key = check_array_indexer(self, key)

result = self._codes[key]
if result.ndim > 1:
return result
raise IndexError("Cannot user indexer with multiple dimensions")
return self._constructor(result, dtype=self.dtype, fastpath=True)

def __setitem__(self, key, value):
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.invalid import invalid_comparison, make_invalid_op

Expand Down Expand Up @@ -517,8 +517,12 @@ def __getitem__(self, key):
return self._box_func(val)
return type(self)(val, dtype=self.dtype)

if is_list_like(key):
key = check_array_indexer(self, key)

if com.is_bool_indexer(key):
key = check_bool_array_indexer(self, key)
# can still have object dtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there any uses of is_bool_indexer left? can you just get rid of them.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we still use it in several places. As long as we haven't deprecated and removed boolean indexing with object dtype (again, see the non-inline discussion in this PR, #31150 (comment)), we will need this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you create an issue to rename / refactor / remove is_bool_indexer then,
as its purpose is now different from before - it is no longer the one true way

and now we have 2 ways of checking boolean indexers

either check_array_indexer should completely subsume it or it should be renamed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and now we have 2 ways of checking boolean indexers

Yes, because there are places we allow object dtype (for backwards compatibility), and there are places where we are more strict.

Why would it need to be renamed? Or what name do you suggest?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i would just incorporate it in check_array_indexer to be honest
it’s more complicated the way you have it now and just plain confusing

i have to know that is_bool_indexer is something that doesn’t check indexing except in object arrays

key = np.asarray(key, dtype=bool)
if key.all():
key = slice(0, None, None)
else:
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pandas.core.arrays.categorical import Categorical
import pandas.core.common as com
from pandas.core.construction import array
from pandas.core.indexers import check_array_indexer
from pandas.core.indexes.base import ensure_index

_VALID_CLOSED = {"left", "right", "both", "neither"}
Expand Down Expand Up @@ -495,6 +496,8 @@ def __len__(self) -> int:
return len(self.left)

def __getitem__(self, value):
if is_list_like(value):
value = check_array_indexer(self, value)
left = self.left[value]
right = self.right[value]

Expand Down
14 changes: 9 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@

from pandas._libs import lib, missing as libmissing

from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
from pandas.core.dtypes.common import (
is_integer,
is_list_like,
is_object_dtype,
is_string_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
from pandas._typing import Scalar
Expand All @@ -35,8 +39,8 @@ def __getitem__(self, item):
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)
elif is_list_like(item):
item = check_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas.util._decorators import Appender
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like
Expand All @@ -18,9 +19,8 @@
from pandas.core import nanops
from pandas.core.algorithms import searchsorted, take, unique
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import backfill_1d, pad_1d


Expand Down Expand Up @@ -235,8 +235,8 @@ def __getitem__(self, item):
if isinstance(item, type(self)):
item = item._ndarray

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)
elif is_list_like(item):
item = check_array_indexer(self, item)

result = self._ndarray[item]
if not lib.is_scalar(item):
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
is_datetime64_any_dtype,
is_dtype_equal,
is_integer,
is_list_like,
is_object_dtype,
is_scalar,
is_string_dtype,
Expand All @@ -43,6 +44,7 @@
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import sanitize_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import interpolate_2d
import pandas.core.ops as ops
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -768,6 +770,9 @@ def __getitem__(self, key):
else:
key = np.asarray(key)

if is_list_like(key):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this repeated on purpose?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

repeated from where?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the next check is_bool_indexer is duplicative

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not fully duplicative, see my long explanation at #31150 (comment). It's mainly for dealing with object dtype.

key = check_array_indexer(self, key)

if com.is_bool_indexer(key):
key = check_bool_indexer(self, key)

Expand Down
66 changes: 65 additions & 1 deletion pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@

from pandas._typing import AnyArrayLike

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer_dtype,
is_list_like,
)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries

# -----------------------------------------------------------
Expand Down Expand Up @@ -307,3 +312,62 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar
if len(result) != len(array):
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
return result


def check_array_indexer(array, indexer) -> np.ndarray:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you type these at all, shouldn't indexer -> key and be Label (or maybe something more sophisticated); not looking to solve this in this PR necessarily

"""
Check if `indexer` is a valid array indexer for `array`.

`array` and `indexer` are checked to have the same length, and the
dtype is validated. If it is an integer or boolean ExtensionArray, it is
checked if there are missing values present, and it is converted to
the appropriate numpy array.

.. versionadded:: 1.0.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.0 or 1.1?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.0 if we're planning to subsume check_bool_array_indexer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.0 if we're planning to subsume check_bool_array_indexer.

Yes, this is replacing check_bool_array_indexer which is already in 1.0.0, so we should do the replacement also for 1.0.0


Parameters
----------
array : array
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this be made more specific, e.g. "np.ndarray or EA"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's only used to get the length, so made it "array-like" (can in principle also be a Series)

The array that's being indexed (only used for the length).
indexer : array-like
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a few places above, you've done is_list_like, but here we require an array (with a dtype).

Thoughts on what we want? Requiring an array is certainly easier, so that we don't have to infer the types. But users may be passing arbitrary objects to __getitem__.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We actually don't require an array with a dtype. The first thing that this function does is:

    if not is_array_like(indexer):
        indexer = pd.array(indexer)

to deal with eg lists.

So I probably meant to update the array into "list-like" instead of "array-like"

The array-like that's used to index.

Returns
-------
numpy.ndarray
The validated indexer.

Raises
------
IndexError
When the lengths don't match.
ValueError
When `indexer` cannot be converted to a numpy ndarray.

"""
import pandas as pd

if not is_array_like(indexer):
indexer = pd.array(indexer)
dtype = indexer.dtype
if is_bool_dtype(dtype):
try:
indexer = np.asarray(indexer, dtype=bool)
except ValueError:
raise ValueError("Cannot mask with a boolean indexer containing NA values")

# GH26658
if len(indexer) != len(array):
raise IndexError(
f"Item wrong length {len(indexer)} instead of {len(array)}."
)

elif is_integer_dtype(dtype):
try:
indexer = np.asarray(indexer, dtype=int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does int vs np.int64 vs np.intp matter here? are there failure modes other than the presence of NAs?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this does matter; indexers are intp

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that was on my todo to fix up. Need to figure out the easiest way to convert to numpy array preserving the bit-ness of the dtype (or can we always convert to intp?)

Will update tomorrow

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, went with np.intp. From a quick test, when you pass non-intp integers to index with numpy, it's not slower to do the conversion to intp yourself beforehand (although while writing this, what happens if you try to index with a too large int64 that doesn't fit into int32 on a 32-bit platform?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ensure_platform_int is a well established pattern

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Jan 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you prefer to update ensure_platform_int to handle extension arrays so I can use it here? (it's basically the same as np.asarray(.., dtype=np.intp), not really sure why the code in ensure_platform_int takes more hoops, performance I suppose)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

either way - but should be consistent and use only 1 pattern; ensure_platform_int is used extensively already

except ValueError:
raise ValueError(
"Cannot index with an integer indexer containing NA values"
)

return indexer
36 changes: 35 additions & 1 deletion pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,48 @@ def test_getitem_boolean_array_mask(self, data):
def test_getitem_boolean_array_mask_raises(self, data):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:2] = pd.NA
with pytest.raises(ValueError):

msg = "Cannot mask with a boolean indexer containing NA values"
with pytest.raises(ValueError, match=msg):
data[mask]

s = pd.Series(data)

with pytest.raises(ValueError):
s[mask]

@pytest.mark.parametrize(
"idx",
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
ids=["list", "integer-array", "numpy-array"],
)
def test_getitem_integer_array(self, data, idx):
result = data[idx]
assert len(result) == 3
assert isinstance(result, type(data))
expected = data.take([0, 1, 2])
self.assert_extension_array_equal(result, expected)

expected = pd.Series(expected)
result = pd.Series(data)[idx]
self.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"idx",
[[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
ids=["list", "integer-array"],
)
def test_getitem_integer_with_missing_raises(self, data, idx):
msg = "Cannot index with an integer indexer containing NA values"
with pytest.raises(ValueError, match=msg):
data[idx]

# TODO this raises KeyError about labels not found (it tries label-based)
# import pandas._testing as tm
# s = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
# with pytest.raises(ValueError, match=msg):
# s[idx]

def test_getitem_slice(self, data):
# getitem[slice] should return an array
result = data[slice(0)] # empty
Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,7 @@ def __getitem__(self, item):
else:
# array, slice.
if pd.api.types.is_list_like(item):
if not pd.api.types.is_array_like(item):
item = pd.array(item)
dtype = item.dtype
if pd.api.types.is_bool_dtype(dtype):
item = pd.api.indexers.check_bool_array_indexer(self, item)
elif pd.api.types.is_integer_dtype(dtype):
item = np.asarray(item, dtype="int")
item = pd.api.indexers.check_array_indexer(self, item)
return type(self)(self._data[item])

def take(self, indexer, allow_fill=False, fill_value=None):
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,8 @@ def __getitem__(self, item):
# slice
return type(self)(self.data[item])
else:
if not pd.api.types.is_array_like(item):
item = pd.array(item)
dtype = item.dtype
if pd.api.types.is_bool_dtype(dtype):
item = pd.api.indexers.check_bool_array_indexer(self, item)
item = pd.api.indexers.check_array_indexer(self, item)
if pd.api.types.is_bool_dtype(item.dtype):
return self._from_sequence([x for x, m in zip(self, item) if m])
# integer
return type(self)([self.data[i] for i in item])
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/indexes/categorical/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,8 +976,9 @@ def test_engine_type(self, dtype, engine_type):
assert np.issubdtype(ci.codes.dtype, dtype)
assert isinstance(ci._engine, engine_type)

def test_getitem_2d_deprecated(self):
def test_getitem_raise_2d(self):
# GH#30588 multi-dim indexing is deprecated, but raising is also acceptable
idx = self.create_index()
with pytest.raises(ValueError, match="cannot mask with array containing NA"):
msg = "Cannot user indexer with multiple dimensions"
with pytest.raises(IndexError, match=msg):
idx[:, None]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was removed because the base class version (which checks for the deprecation) now passes (since I added the deprecation warning)