From 492f904772ba452db3b60003e8cf935c9bbeec39 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Dec 2019 10:31:36 -0600 Subject: [PATCH 01/24] DOC/TST: Indexing with NA raises --- asv_bench/benchmarks/indexing.py | 4 ++ doc/source/user_guide/boolean.rst | 23 +++++++ pandas/core/arrays/boolean.py | 13 +++- pandas/tests/indexing/test_loc.py | 3 + pandas/tests/indexing/test_na_indexing.py | 79 +++++++++++++++++++++++ 5 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/indexing/test_na_indexing.py diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c78c2fa92827e..f057ab5d6da31 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -131,6 +131,7 @@ def setup(self): self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 self.bool_obj_indexer = self.bool_indexer.astype(object) + self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean") def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ -144,6 +145,9 @@ def time_boolean_rows(self): def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] + def time_boolean_rows_boolean(self): + self.df[self.bool_obj_indexer] + class DataFrameNumericIndexing: def setup(self): diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index e0f676d3072fc..46695917bf15b 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -14,6 +14,29 @@ Nullable Boolean Data Type .. versionadded:: 1.0.0 + +.. _boolean.indexing: + +Indexing with NA values +----------------------- + +pandas does not allow indexing with NA values. Attempting to do so +will raise a ``ValueError``. + +.. ipython:: python + :okexcept: + + s = pd.Series([1, 2, 3]) + mask = pd.array([True, False, None]) + s[mask] + +The missing values will need to be explicitly filled with True or False prior +to using the array as a mask. + +.. ipython:: python + + s[mask.fillna(False)] + .. _boolean.kleene: Kleene Logical Operations diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 743d45e1fa400..2ed26be01b978 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -289,6 +289,13 @@ def _from_factorized(cls, values, original: "BooleanArray"): def _formatter(self, boxed=False): return str + @property + def _hasnans(self): + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -311,7 +318,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): if dtype is None: dtype = object if is_bool_dtype(dtype): - if not self.isna().any(): + if not self._hasnans: return self._data else: raise ValueError( @@ -485,7 +492,7 @@ def astype(self, dtype, copy=True): if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True - if self.isna().any(): + if self._hasnans: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) @@ -497,7 +504,7 @@ def astype(self, dtype, copy=True): ) # for integer, error if there are missing values if is_integer_dtype(dtype): - if self.isna().any(): + if self._hasnans: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6f20ec649b200..16a093872cf62 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -373,6 +373,9 @@ def test_loc_index(self): result = df.loc[mask.values] tm.assert_frame_equal(result, expected) + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..550082ca8f53c --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot index with vector containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] From 53f4f634cba2fcd6deff46b81238aa629b617918 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 09:06:11 -0600 Subject: [PATCH 02/24] Handle BooleanArray in all EAs --- pandas/api/extensions/__init__.py | 2 ++ pandas/core/arrays/boolean.py | 8 ++++++ pandas/core/arrays/categorical.py | 13 ++++++--- pandas/core/arrays/integer.py | 6 +++++ pandas/core/arrays/numpy_.py | 5 ++++ pandas/core/arrays/sparse/array.py | 5 +++- pandas/core/indexing.py | 35 ++++++++++++++++++++----- pandas/tests/extension/base/getitem.py | 31 ++++++++++++++++++++++ pandas/tests/extension/decimal/array.py | 8 +++++- pandas/tests/extension/json/array.py | 12 ++++++--- 10 files changed, 108 insertions(+), 17 deletions(-) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..d76e4903c4e78 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -11,3 +11,5 @@ ) from pandas.core.algorithms import take # noqa: F401 from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 +from pandas.core.common import is_bool_indexer # noqa: F401 +from pandas.core.indexing import check_bool_array_indexer # noqa: F401 diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 2ed26be01b978..b48024da2bcaf 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,6 +29,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.common import is_bool_indexer if TYPE_CHECKING: from pandas._typing import Scalar @@ -297,10 +298,17 @@ def _hasnans(self): return self._mask.any() def __getitem__(self, item): + # import here to avoid circular import. Probably need to restructure + from pandas.core.indexing import check_bool_array_indexer + if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] + + elif is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6b422adef2d68..3581c70305013 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1990,16 +1990,21 @@ def __getitem__(self, key): """ Return an item. """ + from pandas.core.indexing import check_bool_array_indexer + if isinstance(key, (int, np.integer)): i = self._codes[key] if i == -1: return np.nan else: return self.categories[i] - else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + + elif com.is_bool_indexer(key): + key = check_bool_array_indexer(self, key) + + return self._constructor( + values=self._codes[key], dtype=self.dtype, fastpath=True + ) def __setitem__(self, key, value): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3f5a4ca49702f..a3f821b0fff1d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -26,6 +26,8 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.common import is_bool_indexer +from pandas.core.indexing import check_bool_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -371,6 +373,10 @@ def __getitem__(self, item): if self._mask[item]: return self.dtype.na_value return self._data[item] + + elif is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index deec30dfe34ff..4e50bdae636d8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,7 +17,9 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.common import is_bool_indexer from pandas.core.construction import extract_array +from pandas.core.indexing import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -234,6 +236,9 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray + elif is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + result = self._ndarray[item] if not lib.is_scalar(item): result = type(self)(result) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 548f2bf702e60..fbfcdd3591a1b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -42,6 +42,7 @@ from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array +from pandas.core.indexing import check_bool_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops from pandas.core.ops.common import unpack_zerodim_and_defer @@ -766,7 +767,9 @@ def __getitem__(self, key): else: key = np.asarray(key) - if com.is_bool_indexer(key) and len(self) == len(key): + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b31973de5bca0..f7f37cbae27b0 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -21,6 +21,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas._typing import AnyArrayLike import pandas.core.common as com from pandas.core.indexers import is_list_like_indexer, length_of_indexer from pandas.core.indexes.api import Index, InvalidIndexError @@ -2282,6 +2283,32 @@ def convert_to_index_sliceable(obj, key): return None +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check wither `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length. + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result + + def check_bool_indexer(index: Index, key) -> np.ndarray: """ Check if key is a valid boolean indexer for an object with such index and @@ -2322,13 +2349,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = np.asarray(result, dtype=bool) - - # GH26658 - if len(result) != len(index): - raise IndexError( - "Item wrong length {} instead of {}.".format(len(result), len(index)) - ) + result = check_bool_array_indexer(index, result) return result diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 0e4a86ed44529..007b4e768f608 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,37 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_boolenarray_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolenarray_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + match = "" + with pytest.raises(ValueError, match=match): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError, match=match): + s[mask] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 74f1e3cfbaf20..f81aae7db0f9e 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,7 +8,11 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd -from pandas.api.extensions import register_extension_dtype +from pandas.api.extensions import ( + check_bool_array_indexer, + is_bool_indexer, + register_extension_dtype, +) from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin @@ -109,6 +113,8 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: + if is_bool_indexer(item): + item = check_bool_array_indexer(self, item) return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 46ca7bd8f760a..9cb5b07c9b910 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,12 @@ import numpy as np -from pandas.core.dtypes.base import ExtensionDtype - -from pandas.core.arrays import ExtensionArray +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, + check_bool_array_indexer, + is_bool_indexer, +) class JSONDtype(ExtensionDtype): @@ -76,7 +79,8 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == "bool": + elif is_bool_indexer(item): + item = check_bool_array_indexer(self, item) return self._from_sequence([x for x, m in zip(self, item) if m]) elif isinstance(item, abc.Iterable): # fancy indexing From 3bbf868eb1033b9aace90921cef4d8a460ecfa85 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 09:09:48 -0600 Subject: [PATCH 03/24] update --- doc/source/reference/extensions.rst | 3 +++ pandas/core/common.py | 6 ++++++ pandas/core/indexing.py | 3 +++ 3 files changed, 12 insertions(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..117049ff621eb 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -18,6 +18,8 @@ objects. api.extensions.register_series_accessor api.extensions.register_index_accessor api.extensions.ExtensionDtype + api.extensions.is_bool_indexer + api.extensions.check_bool_array_indexer .. autosummary:: :toctree: api/ @@ -26,6 +28,7 @@ objects. api.extensions.ExtensionArray arrays.PandasArray + .. We need this autosummary so that methods and attributes are generated. .. Separate block, since they aren't classes. diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..186bfb573a6b2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -110,12 +110,18 @@ def is_bool_indexer(key: Any) -> bool: Returns ------- bool + Whether `key` is a valid boolean indexer. Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. + + See Also + -------- + api.extensions.check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarary. """ na_msg = "cannot index with vector containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f7f37cbae27b0..f8c698bee98c1 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2301,6 +2301,9 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar numpy.ndarray The validated boolean mask. + See Also + -------- + api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. """ result = np.asarray(mask, dtype=bool) # GH26658 From a5ac45714cb86b4e639b350913424579119bf518 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 09:13:33 -0600 Subject: [PATCH 04/24] fixups --- doc/source/user_guide/boolean.rst | 2 +- pandas/core/arrays/boolean.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 46695917bf15b..6af49695962c6 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -27,7 +27,7 @@ will raise a ``ValueError``. :okexcept: s = pd.Series([1, 2, 3]) - mask = pd.array([True, False, None]) + mask = pd.array([True, False, pd.NA], dtyep="boolean") s[mask] The missing values will need to be explicitly filled with True or False prior diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b48024da2bcaf..c716c6dd43c65 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -291,7 +291,7 @@ def _formatter(self, boxed=False): return str @property - def _hasnans(self): + def _hasna(self): # Note: this is expensive right now! The hope is that we can # make this faster by having an optional mask, but not have to change # source code using it.. @@ -326,7 +326,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): if dtype is None: dtype = object if is_bool_dtype(dtype): - if not self._hasnans: + if not self._hasna: return self._data else: raise ValueError( @@ -500,7 +500,7 @@ def astype(self, dtype, copy=True): if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True - if self._hasnans: + if self._hasna: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) @@ -512,7 +512,7 @@ def astype(self, dtype, copy=True): ) # for integer, error if there are missing values if is_integer_dtype(dtype): - if self._hasnans: + if self._hasna: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) From 0dfe761ee2255e07470a2cb69df1a0b6ceacf4a4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 09:56:00 -0600 Subject: [PATCH 05/24] type --- pandas/core/arrays/boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c716c6dd43c65..76d93d7a9b486 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -291,7 +291,7 @@ def _formatter(self, boxed=False): return str @property - def _hasna(self): + def _hasna(self) -> bool: # Note: this is expensive right now! The hope is that we can # make this faster by having an optional mask, but not have to change # source code using it.. From dac111d0b871d71f42490696b5b2365bbb99f5ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 11:25:46 -0600 Subject: [PATCH 06/24] fix benchmark --- asv_bench/benchmarks/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index f057ab5d6da31..6453649b91270 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -146,7 +146,7 @@ def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] def time_boolean_rows_boolean(self): - self.df[self.bool_obj_indexer] + self.df[self.boolean_indexer] class DataFrameNumericIndexing: From d1f08d98bc19d45389afe5c394a4949ffeda853b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 11:40:08 -0600 Subject: [PATCH 07/24] fixup --- pandas/core/arrays/integer.py | 4 +++- pandas/core/arrays/numpy_.py | 4 +++- pandas/core/arrays/sparse/array.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index a3f821b0fff1d..67427d9aa9a28 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,7 +27,6 @@ from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.common import is_bool_indexer -from pandas.core.indexing import check_bool_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -369,6 +368,9 @@ def fmt(x): return fmt def __getitem__(self, item): + # Importing this at the top-level causes many unrelated(?) mypy failures + from pandas.core.indexing import check_bool_array_indexer + if is_integer(item): if self._mask[item]: return self.dtype.na_value diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4e50bdae636d8..cc9825f9cbbd0 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -19,7 +19,6 @@ from pandas.core.algorithms import searchsorted, take, unique from pandas.core.common import is_bool_indexer from pandas.core.construction import extract_array -from pandas.core.indexing import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -233,6 +232,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # Pandas ExtensionArray Interface def __getitem__(self, item): + # Avoid mypy failures when importing at the top-level + from pandas.core.indexing import check_bool_array_indexer + if isinstance(item, type(self)): item = item._ndarray diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index fbfcdd3591a1b..adf10642f337a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -42,7 +42,6 @@ from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array -from pandas.core.indexing import check_bool_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops from pandas.core.ops.common import unpack_zerodim_and_defer @@ -739,6 +738,9 @@ def value_counts(self, dropna=True): # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") From 3dd59ca020c6c1d168baa0afa792a385b8da9ae7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 11:53:28 -0600 Subject: [PATCH 08/24] typo --- doc/source/user_guide/boolean.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 6af49695962c6..5276bc6142206 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -27,7 +27,7 @@ will raise a ``ValueError``. :okexcept: s = pd.Series([1, 2, 3]) - mask = pd.array([True, False, pd.NA], dtyep="boolean") + mask = pd.array([True, False, pd.NA], dtype="boolean") s[mask] The missing values will need to be explicitly filled with True or False prior From 151bdfecb5dfcf3994b43e427e6fae1915eee938 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Dec 2019 10:15:46 -0600 Subject: [PATCH 09/24] updates --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/common.py | 19 ++++++------------- pandas/core/frame.py | 4 ---- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/datetimes.py | 2 ++ pandas/core/indexes/multi.py | 3 ++- pandas/core/indexing.py | 3 +++ pandas/core/series.py | 1 + pandas/tests/indexes/common.py | 5 +++++ 10 files changed, 26 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e44fec112c5c1..06f2e95316c62 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -706,6 +706,7 @@ Datetimelike - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) +- Bug in datetimelike indexes and arrays not validating that the length of a boolean mask matches the array (:issue:`30308`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f5d1e62f44fd0..90e56930bf13b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -416,7 +416,9 @@ def __getitem__(self, key): return self._box_func(val) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + from pandas.core.indexing import check_bool_indexer + + key = check_bool_indexer(self, key) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/core/common.py b/pandas/core/common.py index 186bfb573a6b2..dbf0339c5b009 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -112,18 +112,18 @@ def is_bool_indexer(key: Any) -> bool: bool Whether `key` is a valid boolean indexer. - Raises - ------ - ValueError - When the array is an object-dtype ndarray or ExtensionArray - and contains missing values. + Notes + ----- + This function is inexpensive for `bool` and `BooleanDtype`. + It is expensive for object-dtype backed arrays. In this case + a scan of the data to check that all the values are bool is + needed. See Also -------- api.extensions.check_bool_array_indexer : Check that `key` is a valid mask for an array, and convert to an ndarary. """ - na_msg = "cannot index with vector containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -131,16 +131,9 @@ def is_bool_indexer(key: Any) -> bool: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): - if isna(key).any(): - raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): - # an ndarray with bool-dtype by definition has no missing values. - # So we only need to check for NAs in ExtensionArrays - if is_extension_array_dtype(key.dtype): - if np.any(key.isna()): - raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b699961cf07e8..fd3a67fa73620 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2799,10 +2799,6 @@ def _setitem_slice(self, key, value): def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): - if len(key) != len(self.index): - raise ValueError( - f"Item wrong length {len(key)} instead of {len(self.index)}!" - ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5abd049b9564c..e370417c8a0fe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3998,6 +3998,8 @@ def __getitem__(self, key): corresponding `Index` subclass. """ + from pandas.core.indexing import check_bool_indexer + # There's no custom logic to be implemented in __getslice__, so it's # not overloaded intentionally. getitem = self._data.__getitem__ @@ -4013,7 +4015,7 @@ def __getitem__(self, key): return promote(getitem(key)) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + key = check_bool_indexer(self, key) key = com.values_from_object(key) result = getitem(key) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 523c434cb7377..1fb2ec181605b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1115,6 +1115,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) def __getitem__(self, key): + # if com.is_bool_indexer(key): + # breakpoint() result = self._data.__getitem__(key) if is_scalar(result): return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9e434d0f5f704..58e227f7364fc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -42,6 +42,7 @@ ensure_index, ) from pandas.core.indexes.frozen import FrozenList +from pandas.core.indexing import check_bool_indexer import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -1934,7 +1935,7 @@ def __getitem__(self, key): return tuple(retval) else: if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + key = check_bool_indexer(self, key) sortorder = self.sortorder else: # cannot be sure whether the result will be sorted diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f8c698bee98c1..c0edd5e9dffaa 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1595,6 +1595,7 @@ def _validate_key(self, key, axis: int): return if com.is_bool_indexer(key): + # XXX: do we need to verify no NA here? return if not is_list_like_indexer(key): @@ -1681,6 +1682,7 @@ def _getitem_axis(self, key, axis: int): self._validate_key(key, axis) return self._get_slice_axis(key, axis=axis) elif com.is_bool_indexer(key): + # check_bool_indexer is called in getbool_axis return self._getbool_axis(key, axis=axis) elif is_list_like_indexer(key): @@ -2030,6 +2032,7 @@ def _getitem_axis(self, key, axis: int): key = np.asarray(key) if com.is_bool_indexer(key): + # check_bool_indexer is called in _getbool_axis self._validate_key(key, axis) return self._getbool_axis(key, axis=axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index 54c163330e6ee..c634fc52bee29 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -879,6 +879,7 @@ def __getitem__(self, key): elif key is Ellipsis: return self elif com.is_bool_indexer(key): + # We check later on. pass else: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 102949fe3f05e..a6103cff1f016 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -220,6 +220,11 @@ def test_get_indexer_consistency(self, indices): assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp + def test_getitem_mask_wrong_length(self, indices): + mask = np.array([True]) + with pytest.raises(IndexError, match="Item wrong length 1"): + indices[mask] + def test_ndarray_compat_properties(self): idx = self.create_index() assert idx.T.equals(idx) From d57b0ac41f2b84e00f51a4cf1b0ab48cb91c741d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Dec 2019 12:01:31 -0600 Subject: [PATCH 10/24] Revert "updates" This reverts commit 151bdfecb5dfcf3994b43e427e6fae1915eee938. --- doc/source/whatsnew/v1.0.0.rst | 1 - pandas/core/arrays/datetimelike.py | 4 +--- pandas/core/common.py | 19 +++++++++++++------ pandas/core/frame.py | 4 ++++ pandas/core/indexes/base.py | 4 +--- pandas/core/indexes/datetimes.py | 2 -- pandas/core/indexes/multi.py | 3 +-- pandas/core/indexing.py | 3 --- pandas/core/series.py | 1 - pandas/tests/indexes/common.py | 5 ----- 10 files changed, 20 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 06f2e95316c62..e44fec112c5c1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -706,7 +706,6 @@ Datetimelike - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) -- Bug in datetimelike indexes and arrays not validating that the length of a boolean mask matches the array (:issue:`30308`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 90e56930bf13b..f5d1e62f44fd0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -416,9 +416,7 @@ def __getitem__(self, key): return self._box_func(val) if com.is_bool_indexer(key): - from pandas.core.indexing import check_bool_indexer - - key = check_bool_indexer(self, key) + key = np.asarray(key, dtype=bool) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/core/common.py b/pandas/core/common.py index dbf0339c5b009..186bfb573a6b2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -112,18 +112,18 @@ def is_bool_indexer(key: Any) -> bool: bool Whether `key` is a valid boolean indexer. - Notes - ----- - This function is inexpensive for `bool` and `BooleanDtype`. - It is expensive for object-dtype backed arrays. In this case - a scan of the data to check that all the values are bool is - needed. + Raises + ------ + ValueError + When the array is an object-dtype ndarray or ExtensionArray + and contains missing values. See Also -------- api.extensions.check_bool_array_indexer : Check that `key` is a valid mask for an array, and convert to an ndarary. """ + na_msg = "cannot index with vector containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -131,9 +131,16 @@ def is_bool_indexer(key: Any) -> bool: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): + if isna(key).any(): + raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): + # an ndarray with bool-dtype by definition has no missing values. + # So we only need to check for NAs in ExtensionArrays + if is_extension_array_dtype(key.dtype): + if np.any(key.isna()): + raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd3a67fa73620..b699961cf07e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2799,6 +2799,10 @@ def _setitem_slice(self, key, value): def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + if len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}!" + ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e370417c8a0fe..5abd049b9564c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3998,8 +3998,6 @@ def __getitem__(self, key): corresponding `Index` subclass. """ - from pandas.core.indexing import check_bool_indexer - # There's no custom logic to be implemented in __getslice__, so it's # not overloaded intentionally. getitem = self._data.__getitem__ @@ -4015,7 +4013,7 @@ def __getitem__(self, key): return promote(getitem(key)) if com.is_bool_indexer(key): - key = check_bool_indexer(self, key) + key = np.asarray(key, dtype=bool) key = com.values_from_object(key) result = getitem(key) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1fb2ec181605b..523c434cb7377 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1115,8 +1115,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) def __getitem__(self, key): - # if com.is_bool_indexer(key): - # breakpoint() result = self._data.__getitem__(key) if is_scalar(result): return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 58e227f7364fc..9e434d0f5f704 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -42,7 +42,6 @@ ensure_index, ) from pandas.core.indexes.frozen import FrozenList -from pandas.core.indexing import check_bool_indexer import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -1935,7 +1934,7 @@ def __getitem__(self, key): return tuple(retval) else: if com.is_bool_indexer(key): - key = check_bool_indexer(self, key) + key = np.asarray(key, dtype=bool) sortorder = self.sortorder else: # cannot be sure whether the result will be sorted diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c0edd5e9dffaa..f8c698bee98c1 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1595,7 +1595,6 @@ def _validate_key(self, key, axis: int): return if com.is_bool_indexer(key): - # XXX: do we need to verify no NA here? return if not is_list_like_indexer(key): @@ -1682,7 +1681,6 @@ def _getitem_axis(self, key, axis: int): self._validate_key(key, axis) return self._get_slice_axis(key, axis=axis) elif com.is_bool_indexer(key): - # check_bool_indexer is called in getbool_axis return self._getbool_axis(key, axis=axis) elif is_list_like_indexer(key): @@ -2032,7 +2030,6 @@ def _getitem_axis(self, key, axis: int): key = np.asarray(key) if com.is_bool_indexer(key): - # check_bool_indexer is called in _getbool_axis self._validate_key(key, axis) return self._getbool_axis(key, axis=axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index c634fc52bee29..54c163330e6ee 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -879,7 +879,6 @@ def __getitem__(self, key): elif key is Ellipsis: return self elif com.is_bool_indexer(key): - # We check later on. pass else: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index a6103cff1f016..102949fe3f05e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -220,11 +220,6 @@ def test_get_indexer_consistency(self, indices): assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp - def test_getitem_mask_wrong_length(self, indices): - mask = np.array([True]) - with pytest.raises(IndexError, match="Item wrong length 1"): - indices[mask] - def test_ndarray_compat_properties(self): idx = self.create_index() assert idx.T.equals(idx) From 36be0f6ec7777c6901d3741e99cae3c8dbb87afd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Dec 2019 08:52:34 -0600 Subject: [PATCH 11/24] examples --- pandas/core/indexing.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f8c698bee98c1..8ee3800901023 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2285,9 +2285,10 @@ def convert_to_index_sliceable(obj, key): def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: """ - Check wither `mask` is a valid boolean indexer for `array`. + Check if `mask` is a valid boolean indexer for `array`. - `array` and `mask` are checked to have the same length. + `array` and `mask` are checked to have the same length, and the + dtype is validated. Parameters ---------- @@ -2301,9 +2302,42 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar numpy.ndarray The validated boolean mask. + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + See Also -------- api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.Series([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values """ result = np.asarray(mask, dtype=bool) # GH26658 From 7bd6c2fe8971e5a72934b8b8efee5615807d277f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Dec 2019 09:00:46 -0600 Subject: [PATCH 12/24] restore datetime fix --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/datetimelike.py | 4 +++- pandas/tests/extension/base/getitem.py | 14 +++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e44fec112c5c1..61a6f50ef3ecb 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -703,6 +703,7 @@ Datetimelike - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) +- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f5d1e62f44fd0..6fc8040455a1e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -416,7 +416,9 @@ def __getitem__(self, key): return self._box_func(val) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + from pandas.core.indexing import check_bool_array_indexer + + key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 007b4e768f608..a5b2bfee8287b 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,15 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + def test_getitem_boolenarray_mask(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") result = data[mask] @@ -143,13 +152,12 @@ def test_getitem_boolenarray_mask(self, data): def test_getitem_boolenarray_mask_raises(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA - match = "" - with pytest.raises(ValueError, match=match): + with pytest.raises(ValueError): data[mask] s = pd.Series(data) - with pytest.raises(ValueError, match=match): + with pytest.raises(ValueError): s[mask] def test_getitem_slice(self, data): From 505112eb3d3b45e3fd19b178022ff2e94537ed3d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 28 Dec 2019 10:47:27 -0600 Subject: [PATCH 13/24] update error message --- pandas/core/common.py | 2 +- pandas/tests/extension/base/getitem.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 186bfb573a6b2..2dabc098517f7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -123,7 +123,7 @@ def is_bool_indexer(key: Any) -> bool: api.extensions.check_bool_array_indexer : Check that `key` is a valid mask for an array, and convert to an ndarary. """ - na_msg = "cannot index with vector containing NA / NaN values" + na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 639489ae591e9..dc1f62c4c97c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -130,7 +130,7 @@ def test_getitem_mask_raises(self, data): with pytest.raises(IndexError): data[mask] - def test_getitem_boolenarray_mask(self, data): + def test_getitem_boolean_array_mask(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") result = data[mask] assert len(result) == 0 @@ -149,7 +149,7 @@ def test_getitem_boolenarray_mask(self, data): result = pd.Series(data)[mask] self.assert_series_equal(result, expected) - def test_getitem_boolenarray_mask_raises(self, data): + def test_getitem_boolean_array_mask_raises(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA with pytest.raises(ValueError): From c73ae8e2048260e05615599b780b3c0aecaca83b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 28 Dec 2019 10:53:06 -0600 Subject: [PATCH 14/24] checks --- ci/code_checks.sh | 2 +- pandas/core/arrays/boolean.py | 4 ++-- pandas/core/arrays/integer.py | 4 ++-- pandas/core/arrays/numpy_.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 46ace2dd9d70e..4a4887124885c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # Check for imports from pandas.core.common instead of `import pandas.core.common as com` # Check for imports from collections.abc instead of `from collections import abc` MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas + invgrep -R --include="*.py*" --exclude="__init__.py" -E "from pandas.core.common import" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from pandas.core import common" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 66f85a9c934b2..d998b64690670 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,7 +29,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.common import is_bool_indexer +import pandas.core.common as com if TYPE_CHECKING: from pandas._typing import Scalar @@ -324,7 +324,7 @@ def __getitem__(self, item): return self.dtype.na_value return self._data[item] - elif is_bool_indexer(item): + elif com.is_bool_indexer(item): item = check_bool_array_indexer(self, item) return type(self)(self._data[item], self._mask[item]) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 67427d9aa9a28..7fdd31979d0a5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -26,7 +26,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.common import is_bool_indexer +import pandas.core.common as com from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -376,7 +376,7 @@ def __getitem__(self, item): return self.dtype.na_value return self._data[item] - elif is_bool_indexer(item): + elif com.is_bool_indexer(item): item = check_bool_array_indexer(self, item) return type(self)(self._data[item], self._mask[item]) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index cc9825f9cbbd0..79c0a1a7f8df5 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,7 +17,7 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique -from pandas.core.common import is_bool_indexer +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.missing import backfill_1d, pad_1d @@ -238,7 +238,7 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray - elif is_bool_indexer(item): + elif com.is_bool_indexer(item): item = check_bool_array_indexer(self, item) result = self._ndarray[item] From f94483fe037ae56c983f7238fe24d034e117221a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 06:32:21 -0600 Subject: [PATCH 15/24] update for error message --- pandas/tests/indexing/test_na_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index 550082ca8f53c..4b92df581d164 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -68,7 +68,7 @@ def test_indexing_with_na_raises(frame): if frame: s = s.to_frame() mask = pd.array([True, False, None], dtype="boolean") - match = "cannot index with vector containing NA / NaN values" + match = "cannot mask with array containing NA / NaN values" with pytest.raises(ValueError, match=match): s[mask] From 8b1e56788b5a8832d7e41c123e7d66626df38d28 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 09:05:04 -0600 Subject: [PATCH 16/24] update isort --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b34f5dfdd1a83..88548f6c2f678 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: language: python_venv additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 + rev: v4.3.21 hooks: - id: isort language: python_venv From f317c64d1afa9eda64222073ef2376b0acb199ba Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 09:06:17 -0600 Subject: [PATCH 17/24] isort --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 62fc1e3397c11..6fc04b7025e7f 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,6 +4,7 @@ from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim +from pandas._typing import AnyArrayLike from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender @@ -21,7 +22,6 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna -from pandas._typing import AnyArrayLike import pandas.core.common as com from pandas.core.indexers import is_list_like_indexer, length_of_indexer from pandas.core.indexes.api import Index, InvalidIndexError From c6562921aaeffb4f03622906925cddb937bb9907 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 09:58:04 -0600 Subject: [PATCH 18/24] fixup --- pandas/tests/series/indexing/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index c2912cf3ce53f..925d657d7dd04 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -75,7 +75,7 @@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot index with vector containing NA / NaN values" + msg = "cannot mask with array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg): From 37ea95e569ff43c4e5d9e3c579df6797832efd23 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 31 Dec 2019 14:12:00 -0600 Subject: [PATCH 19/24] fixup --- ci/code_checks.sh | 2 +- doc/source/reference/extensions.rst | 3 -- pandas/api/extensions/__init__.py | 2 - pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/common.py | 4 +- pandas/core/indexers.py | 65 +++++++++++++++++++++++ pandas/core/indexing.py | 70 ++----------------------- pandas/tests/extension/decimal/array.py | 8 ++- pandas/tests/extension/json/array.py | 9 ++-- 13 files changed, 84 insertions(+), 89 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4a4887124885c..46ace2dd9d70e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # Check for imports from pandas.core.common instead of `import pandas.core.common as com` # Check for imports from collections.abc instead of `from collections import abc` MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" --exclude="__init__.py" -E "from pandas.core.common import" pandas + invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from pandas.core import common" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 117049ff621eb..4b1a99da7cd4c 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -18,8 +18,6 @@ objects. api.extensions.register_series_accessor api.extensions.register_index_accessor api.extensions.ExtensionDtype - api.extensions.is_bool_indexer - api.extensions.check_bool_array_indexer .. autosummary:: :toctree: api/ @@ -28,7 +26,6 @@ objects. api.extensions.ExtensionArray arrays.PandasArray - .. We need this autosummary so that methods and attributes are generated. .. Separate block, since they aren't classes. diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index d76e4903c4e78..573d700dac43d 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -11,5 +11,3 @@ ) from pandas.core.algorithms import take # noqa: F401 from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 -from pandas.core.common import is_bool_indexer # noqa: F401 -from pandas.core.indexing import check_bool_array_indexer # noqa: F401 diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index cc3a954b0489f..807fb4547a261 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -317,7 +317,7 @@ def _hasna(self) -> bool: def __getitem__(self, item): # import here to avoid circular import. Probably need to restructure - from pandas.core.indexing import check_bool_array_indexer + from pandas.core.indexers import check_bool_array_indexer if is_integer(item): if self._mask[item]: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 016257340b47f..c0b04e4edb163 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1990,7 +1990,7 @@ def __getitem__(self, key): """ Return an item. """ - from pandas.core.indexing import check_bool_array_indexer + from pandas.core.indexers import check_bool_array_indexer if isinstance(key, (int, np.integer)): i = self._codes[key] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ff230fc1d79c4..3c7d90084d633 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -437,7 +437,7 @@ def __getitem__(self, key): return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - from pandas.core.indexing import check_bool_array_indexer + from pandas.core.indexers import check_bool_array_indexer key = check_bool_array_indexer(self, key) if key.all(): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 8a0caa79fd576..49447a5099706 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -366,7 +366,7 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): # Importing this at the top-level causes many unrelated(?) mypy failures - from pandas.core.indexing import check_bool_array_indexer + from pandas.core.indexers import check_bool_array_indexer if is_integer(item): if self._mask[item]: diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 79c0a1a7f8df5..b75d47515cf78 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -233,7 +233,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __getitem__(self, item): # Avoid mypy failures when importing at the top-level - from pandas.core.indexing import check_bool_array_indexer + from pandas.core.indexers import check_bool_array_indexer if isinstance(item, type(self)): item = item._ndarray diff --git a/pandas/core/common.py b/pandas/core/common.py index a2fb399e7ff43..f0fcb736586d6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -121,8 +121,8 @@ def is_bool_indexer(key: Any) -> bool: See Also -------- - api.extensions.check_bool_array_indexer : Check that `key` - is a valid mask for an array, and convert to an ndarary. + check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarray. """ na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index f75087ca3b505..b6d4f5b04e86c 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,6 +3,8 @@ """ import numpy as np +from pandas._typing import AnyArrayLike + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -240,3 +242,66 @@ def length_of_indexer(indexer, target=None) -> int: elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") + + +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check if `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length, and the + dtype is validated. + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + + See Also + -------- + api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.Series([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 32bffb8e31cfa..b15d91240e7bb 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,7 +4,6 @@ from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim -from pandas._typing import AnyArrayLike from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender @@ -23,7 +22,11 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.indexers import is_list_like_indexer, length_of_indexer +from pandas.core.indexers import ( + check_bool_array_indexer, + is_list_like_indexer, + length_of_indexer, +) from pandas.core.indexes.api import Index, InvalidIndexError @@ -2270,69 +2273,6 @@ def convert_to_index_sliceable(obj, key): return None -def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: - """ - Check if `mask` is a valid boolean indexer for `array`. - - `array` and `mask` are checked to have the same length, and the - dtype is validated. - - Parameters - ---------- - array : array - The array that's being masked. - mask : array - The boolean array that's masking. - - Returns - ------- - numpy.ndarray - The validated boolean mask. - - Raises - ------ - IndexError - When the lengths don't match. - ValueError - When `mask` cannot be converted to a bool-dtype ndarray. - - See Also - -------- - api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. - - Examples - -------- - A boolean ndarray is returned when the arguments are all valid. - - >>> mask = pd.array([True, False]) - >>> arr = pd.Series([1, 2]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) - array([ True, False]) - - An IndexError is raised when the lengths don't match. - - >>> mask = pd.array([True, False, True]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) - Traceback (most recent call last): - ... - IndexError: Item wrong length 3 instead of 2. - - A ValueError is raised when the mask cannot be converted to - a bool-dtype ndarray. - - >>> mask = pd.array([True, pd.NA]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) - Traceback (most recent call last): - ... - ValueError: cannot convert to bool numpy array in presence of missing values - """ - result = np.asarray(mask, dtype=bool) - # GH26658 - if len(result) != len(array): - raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") - return result - - def check_bool_indexer(index: Index, key) -> np.ndarray: """ Check if key is a valid boolean indexer for an object with such index and diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f81aae7db0f9e..94695bb3d306d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,12 +8,10 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd -from pandas.api.extensions import ( - check_bool_array_indexer, - is_bool_indexer, - register_extension_dtype, -) +from pandas.api.extensions import register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin +from pandas.core.common import is_bool_indexer +from pandas.core.indexers import check_bool_array_indexer @register_extension_dtype diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a37f28c114fca..dabcd02a76c76 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,12 +19,9 @@ import numpy as np -from pandas.api.extensions import ( - ExtensionArray, - ExtensionDtype, - check_bool_array_indexer, - is_bool_indexer, -) +from pandas.api.extensions import ExtensionArray, ExtensionDtype +from pandas.core.common import is_bool_indexer +from pandas.core.indexers import check_bool_array_indexer class JSONDtype(ExtensionDtype): From 21fd589d5003578340314e884b4cdcbbb0d6b71d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jan 2020 07:42:38 -0600 Subject: [PATCH 20/24] update arrayo --- pandas/tests/extension/decimal/array.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 94695bb3d306d..770b703def87d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -10,7 +10,6 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin -from pandas.core.common import is_bool_indexer from pandas.core.indexers import check_bool_array_indexer @@ -111,8 +110,13 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: - if is_bool_indexer(item): - item = check_bool_array_indexer(self, item) + # array, slice. + if pd.api.types.is_list_like(item): + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = check_bool_array_indexer(self, item) return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): From 36370705735430601892bd67145ca22072233c96 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jan 2020 07:55:03 -0600 Subject: [PATCH 21/24] doc --- doc/source/reference/extensions.rst | 9 +++++++++ pandas/api/indexers/__init__.py | 1 + pandas/core/indexers.py | 2 ++ pandas/tests/extension/decimal/array.py | 3 +-- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..6d4eff8c97052 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -59,3 +59,12 @@ objects. api.extensions.ExtensionArray.nbytes api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape + +Additionally, we have some utility methods for ensuring your object +behaves correctly. + +.. autosummary:: + :toctree: api/ + + api.indexers.check_bool_array_indexer + diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index a5d6bc07da3eb..64383d8ecbd24 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -1,2 +1,3 @@ """Public API for Rolling Window Indexers""" +from pandas.core.indexers import check_bool_array_indexer # noqa: F401 from pandas.core.window.indexers import BaseIndexer # noqa: F401 diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index b6d4f5b04e86c..0f932f7b849e3 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -251,6 +251,8 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar `array` and `mask` are checked to have the same length, and the dtype is validated. + .. versionadded:: 1.0.0 + Parameters ---------- array : array diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 770b703def87d..af3775c8b174d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -10,7 +10,6 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin -from pandas.core.indexers import check_bool_array_indexer @register_extension_dtype @@ -116,7 +115,7 @@ def __getitem__(self, item): item = pd.array(item) dtype = item.dtype if pd.api.types.is_bool_dtype(dtype): - item = check_bool_array_indexer(self, item) + item = pd.api.indexers.check_bool_array_indexer(self, item) return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): From 61599f2aa762bcaf83095f2277a96d44d36e7a8c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jan 2020 07:56:14 -0600 Subject: [PATCH 22/24] integer --- pandas/tests/extension/decimal/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index af3775c8b174d..570cdf5f29d00 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -116,6 +116,8 @@ def __getitem__(self, item): dtype = item.dtype if pd.api.types.is_bool_dtype(dtype): item = pd.api.indexers.check_bool_array_indexer(self, item) + elif pd.api.types.is_integer_dtype(dtype): + item = np.asarray(item, dtype="int") return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): From e62282645b9794730ef1281006cf3debbe526f0c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jan 2020 08:07:38 -0600 Subject: [PATCH 23/24] fixup --- doc/source/reference/extensions.rst | 1 - pandas/core/arrays/boolean.py | 4 +--- pandas/core/arrays/categorical.py | 3 +-- pandas/core/arrays/datetimelike.py | 3 +-- pandas/core/arrays/integer.py | 4 +--- pandas/core/arrays/numpy_.py | 4 +--- 6 files changed, 5 insertions(+), 14 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 6d4eff8c97052..16a84b5d2ecaf 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -67,4 +67,3 @@ behaves correctly. :toctree: api/ api.indexers.check_bool_array_indexer - diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 807fb4547a261..102150b1cbce1 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -30,6 +30,7 @@ from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer if TYPE_CHECKING: from pandas._typing import Scalar @@ -316,9 +317,6 @@ def _hasna(self) -> bool: return self._mask.any() def __getitem__(self, item): - # import here to avoid circular import. Probably need to restructure - from pandas.core.indexers import check_bool_array_indexer - if is_integer(item): if self._mask[item]: return self.dtype.na_value diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e00ddc8589b84..8c78676834786 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -49,6 +49,7 @@ from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -1990,8 +1991,6 @@ def __getitem__(self, key): """ Return an item. """ - from pandas.core.indexers import check_bool_array_indexer - if isinstance(key, (int, np.integer)): i = self._codes[key] if i == -1: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 156fa006bf351..2bdd9acaeb70f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,6 +40,7 @@ from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import make_invalid_op @@ -436,8 +437,6 @@ def __getitem__(self, key): return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - from pandas.core.indexers import check_bool_array_indexer - key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 49447a5099706..0922f4ac6f71d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,6 +27,7 @@ from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -365,9 +366,6 @@ def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) def __getitem__(self, item): - # Importing this at the top-level causes many unrelated(?) mypy failures - from pandas.core.indexers import check_bool_array_indexer - if is_integer(item): if self._mask[item]: return self.dtype.na_value diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index b75d47515cf78..a114be9a21c6c 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -19,6 +19,7 @@ from pandas.core.algorithms import searchsorted, take, unique import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -232,9 +233,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # Pandas ExtensionArray Interface def __getitem__(self, item): - # Avoid mypy failures when importing at the top-level - from pandas.core.indexers import check_bool_array_indexer - if isinstance(item, type(self)): item = item._ndarray From 5004d91cfa3c897e0748de6bca8f2b9e7d92b329 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jan 2020 09:02:57 -0600 Subject: [PATCH 24/24] fixup --- pandas/tests/extension/json/array.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index dabcd02a76c76..17bc2773aad19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,8 @@ import numpy as np +import pandas as pd from pandas.api.extensions import ExtensionArray, ExtensionDtype -from pandas.core.common import is_bool_indexer -from pandas.core.indexers import check_bool_array_indexer class JSONDtype(ExtensionDtype): @@ -76,18 +75,21 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif is_bool_indexer(item): - item = check_bool_array_indexer(self, item) - return self._from_sequence([x for x, m in zip(self, item) if m]) - elif isinstance(item, abc.Iterable): - # fancy indexing - return type(self)([self.data[i] for i in item]) elif isinstance(item, slice) and item == slice(None): # Make sure we get a view return type(self)(self.data) - else: + elif isinstance(item, slice): # slice return type(self)(self.data[item]) + else: + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) def __setitem__(self, key, value): if isinstance(key, numbers.Integral):