diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b34f5dfdd1a83..88548f6c2f678 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: language: python_venv additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 + rev: v4.3.21 hooks: - id: isort language: python_venv diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c78c2fa92827e..6453649b91270 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -131,6 +131,7 @@ def setup(self): self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 self.bool_obj_indexer = self.bool_indexer.astype(object) + self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean") def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ -144,6 +145,9 @@ def time_boolean_rows(self): def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] + def time_boolean_rows_boolean(self): + self.df[self.boolean_indexer] + class DataFrameNumericIndexing: def setup(self): diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..16a84b5d2ecaf 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -59,3 +59,11 @@ objects. api.extensions.ExtensionArray.nbytes api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape + +Additionally, we have some utility methods for ensuring your object +behaves correctly. + +.. autosummary:: + :toctree: api/ + + api.indexers.check_bool_array_indexer diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index e0f676d3072fc..5276bc6142206 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -14,6 +14,29 @@ Nullable Boolean Data Type .. versionadded:: 1.0.0 + +.. _boolean.indexing: + +Indexing with NA values +----------------------- + +pandas does not allow indexing with NA values. Attempting to do so +will raise a ``ValueError``. + +.. ipython:: python + :okexcept: + + s = pd.Series([1, 2, 3]) + mask = pd.array([True, False, pd.NA], dtype="boolean") + s[mask] + +The missing values will need to be explicitly filled with True or False prior +to using the array as a mask. + +.. ipython:: python + + s[mask.fillna(False)] + .. _boolean.kleene: Kleene Logical Operations diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 59644440149ff..f70791433d5d1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -793,6 +793,7 @@ Datetimelike - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) +- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index a5d6bc07da3eb..64383d8ecbd24 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -1,2 +1,3 @@ """Public API for Rolling Window Indexers""" +from pandas.core.indexers import check_bool_array_indexer # noqa: F401 from pandas.core.window.indexers import BaseIndexer # noqa: F401 diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 7301c0ab434a0..102150b1cbce1 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,6 +29,8 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer if TYPE_CHECKING: from pandas._typing import Scalar @@ -307,11 +309,22 @@ def _from_factorized(cls, values, original: "BooleanArray"): def _formatter(self, boxed=False): return str + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] + + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): @@ -329,7 +342,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): if dtype is None: dtype = object if is_bool_dtype(dtype): - if not self.isna().any(): + if not self._hasna: return self._data else: raise ValueError( @@ -503,7 +516,7 @@ def astype(self, dtype, copy=True): if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) @@ -515,7 +528,7 @@ def astype(self, dtype, copy=True): ) # for integer, error if there are missing values if is_integer_dtype(dtype): - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5f4bd801429a4..8c78676834786 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -49,6 +49,7 @@ from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -1996,10 +1997,13 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] - else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + + elif com.is_bool_indexer(key): + key = check_bool_array_indexer(self, key) + + return self._constructor( + values=self._codes[key], dtype=self.dtype, fastpath=True + ) def __setitem__(self, key, value): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 763a6fe560283..2bdd9acaeb70f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,6 +40,7 @@ from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import make_invalid_op @@ -436,7 +437,7 @@ def __getitem__(self, key): return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 62f31addedc0b..0922f4ac6f71d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -26,6 +26,8 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -368,6 +370,10 @@ def __getitem__(self, item): if self._mask[item]: return self.dtype.na_value return self._data[item] + + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index deec30dfe34ff..a114be9a21c6c 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,7 +17,9 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -234,6 +236,9 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + result = self._ndarray[item] if not lib.is_scalar(item): result = type(self)(result) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 548f2bf702e60..adf10642f337a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -738,6 +738,9 @@ def value_counts(self, dropna=True): # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") @@ -766,7 +769,9 @@ def __getitem__(self, key): else: key = np.asarray(key) - if com.is_bool_indexer(key) and len(self) == len(key): + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) diff --git a/pandas/core/common.py b/pandas/core/common.py index 8a430a4aa7d11..f0fcb736586d6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -111,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool: Returns ------- bool + Whether `key` is a valid boolean indexer. Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. + + See Also + -------- + check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarray. """ - na_msg = "cannot index with vector containing NA / NaN values" + na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index f75087ca3b505..0f932f7b849e3 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,6 +3,8 @@ """ import numpy as np +from pandas._typing import AnyArrayLike + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int: elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") + + +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check if `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length, and the + dtype is validated. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + + See Also + -------- + api.extensions.is_bool_indexer : Check if `key` is a boolean indexer. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.Series([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ebecb02e20e1a..b15d91240e7bb 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,7 +22,11 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.indexers import is_list_like_indexer, length_of_indexer +from pandas.core.indexers import ( + check_bool_array_indexer, + is_list_like_indexer, + length_of_indexer, +) from pandas.core.indexes.api import Index, InvalidIndexError @@ -2309,13 +2313,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = np.asarray(result, dtype=bool) - - # GH26658 - if len(result) != len(index): - raise IndexError( - f"Item wrong length {len(result)} instead of {len(index)}." - ) + result = check_bool_array_indexer(index, result) return result diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 71c7fbb986267..dc1f62c4c97c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,45 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolean_array_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + with pytest.raises(ValueError): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError): + s[mask] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 74f1e3cfbaf20..570cdf5f29d00 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -109,6 +109,15 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: + # array, slice. + if pd.api.types.is_list_like(item): + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + elif pd.api.types.is_integer_dtype(dtype): + item = np.asarray(item, dtype="int") return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 014581682ac59..17bc2773aad19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,8 @@ import numpy as np -from pandas.core.dtypes.base import ExtensionDtype - -from pandas.core.arrays import ExtensionArray +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype class JSONDtype(ExtensionDtype): @@ -76,17 +75,21 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == "bool": - return self._from_sequence([x for x, m in zip(self, item) if m]) - elif isinstance(item, abc.Iterable): - # fancy indexing - return type(self)([self.data[i] for i in item]) elif isinstance(item, slice) and item == slice(None): # Make sure we get a view return type(self)(self.data) - else: + elif isinstance(item, slice): # slice return type(self)(self.data[item]) + else: + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) def __setitem__(self, key, value): if isinstance(key, numbers.Integral): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 9119ca0a4511b..802bc43ae8052 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -371,6 +371,9 @@ def test_loc_index(self): result = df.loc[mask.values] tm.assert_frame_equal(result, expected) + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..4b92df581d164 --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index c2912cf3ce53f..925d657d7dd04 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -75,7 +75,7 @@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot index with vector containing NA / NaN values" + msg = "cannot mask with array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg):