diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 743d45e1fa400..66c32e0a1b9c6 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,6 +24,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -294,6 +295,14 @@ def __getitem__(self, item): if self._mask[item]: return self.dtype.na_value return self._data[item] + elif is_array_like(item) and is_bool_dtype(item.dtype): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self._data[take] + # output masked anywhere where self is masked, or the input was masked + omask = (self._mask | item._mask)[take] + return type(self)(result, omask) return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 71fe944827a74..1e4f618ba70b2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -46,6 +46,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d +from pandas.core.arrays import BooleanArray from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -1996,10 +1997,14 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] + elif com.is_bool_indexer(key) and isinstance(key, BooleanArray): + take = key._data | key._mask + values = self._codes[take] + values[key._mask[take]] = -1 else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + values = self._codes[key] + + return self._constructor(values=values, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f5d1e62f44fd0..d448141c27a40 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,6 +40,7 @@ from pandas._typing import DatetimeLikeScalar from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays import BooleanArray import pandas.core.common as com from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import make_invalid_op @@ -415,8 +416,15 @@ def __getitem__(self, key): val = getitem(key) return self._box_func(val) + mask = None if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + if isinstance(key, BooleanArray): + # TODO: Handle all boolean indexers. + mask = key._mask + key = key._data | mask + mask = mask[key] + else: + key = np.asarray(key, dtype=bool) if key.all(): key = slice(0, None, None) else: @@ -438,6 +446,10 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) + if mask is not None: + # TODO: Check that we've copied! + result[mask] = iNaT + if result.ndim > 1: # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d47e7e3df27e1..45ff238bd810b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,6 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -366,10 +367,20 @@ def fmt(x): return fmt def __getitem__(self, item): + from pandas.core.arrays import BooleanArray + if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] + elif is_array_like(item) and is_bool_dtype(item.dtype): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self._data[take] + # output masked anywhere where self is masked, or the input was masked + omask = (self._mask | item._mask)[take] + return type(self)(result, omask) return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2de19a3319cc5..59535803266db 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -13,7 +13,8 @@ from pandas import compat from pandas.core import ops -from pandas.core.arrays import PandasArray +from pandas.core.arrays import BooleanArray, PandasArray +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.missing import isna @@ -232,6 +233,19 @@ def __setitem__(self, key, value): super().__setitem__(key, value) + def __getitem__(self, item): + # Doing this here, as PandasArray.__getitem__ can't guarantee dtype stability + # when getting with a boolean mask. + if com.is_bool_indexer(item): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self[take] + # That copies, right? + result[item._mask[take]] = self.dtype.na_value + return result + return super().__getitem__(item) + def fillna(self, value=None, method=None, limit=None): # TODO: validate dtype return super().fillna(value, method, limit) diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..fdbedbbd4cddd 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -132,9 +132,9 @@ def is_bool_indexer(key: Any) -> bool: elif is_bool_dtype(key.dtype): # an ndarray with bool-dtype by definition has no missing values. # So we only need to check for NAs in ExtensionArrays - if is_extension_array_dtype(key.dtype): - if np.any(key.isna()): - raise ValueError(na_msg) + # if is_extension_array_dtype(key.dtype): + # if np.any(key.isna()): + # raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc2412ceaca0e..4145db3e57ee8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -65,7 +65,7 @@ from pandas.core import ops from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import BooleanArray, ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -4014,7 +4014,11 @@ def __getitem__(self, key): return promote(getitem(key)) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + if isinstance(key, BooleanArray): + # TODO: Handle all boolean indexers. + key = key._data | key._mask + else: + key = np.asarray(key, dtype=bool) key = com.values_from_object(key) result = getitem(key) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 341262313ddff..e97549b75fb03 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -8,6 +8,7 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( + is_extension_array_dtype, is_float, is_integer, is_iterator, @@ -19,6 +20,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com @@ -2309,6 +2311,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: If the index of the key is unalignable to index. """ result = key + mask = None if isinstance(key, ABCSeries) and not key.index.equals(index): result = result.reindex(index) mask = isna(result._values) @@ -2319,6 +2322,9 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "the indexed object do not match)." ) result = result.astype(bool)._values + elif is_array_like(result) and is_extension_array_dtype(result.dtype): + assert result.dtype.kind == "b" + mask = isna(result) else: if is_sparse(result): result = result.to_dense() @@ -2330,7 +2336,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "Item wrong length {} instead of {}.".format(len(result), len(index)) ) - return result + return result, mask def convert_missing_indexer(indexer): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8a543832b50fe..10fb1dbd9aea8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -291,7 +291,7 @@ def __setstate__(self, state): self.values = state[1] self.ndim = self.values.ndim - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ return self.values[slicer] @@ -1810,7 +1810,7 @@ def _can_hold_element(self, element: Any) -> bool: # We're doing the same as CategoricalBlock here. return True - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ # slice the category @@ -2307,7 +2307,7 @@ def to_dense(self): # expects that behavior. return np.asarray(self.values, dtype=_NS_DTYPE) - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ if isinstance(slicer, tuple): col, loc = slicer diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f312b88d9a0bc..44776a6e07aba 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -709,7 +709,7 @@ def combine(self, blocks, copy=True): return type(self)(new_blocks, axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, mask=None): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1505,11 +1505,13 @@ def _blklocs(self): """ compat with BlockManager """ return None - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, mask=None): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True) + return type(self)( + self._block._slice(slobj, mask=mask), self.index[slobj], fastpath=True + ) @property def index(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index 3eabb70581827..1c3af26f6047a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -892,11 +892,13 @@ def __getitem__(self, key): key = list(key) if com.is_bool_indexer(key): - key = check_bool_indexer(self.index, key) + key, mask = check_bool_indexer(self.index, key) + else: + mask = None - return self._get_with(key) + return self._get_with(key, mask) - def _get_with(self, key): + def _get_with(self, key, mask=None): # other: fancy integer or otherwise if isinstance(key, slice): return self._slice(key) @@ -917,12 +919,13 @@ def _get_with(self, key): return self._get_values(key) raise - if not isinstance(key, (list, np.ndarray, Series, Index)): + if not is_list_like(key): key = list(key) if isinstance(key, Index): key_type = key.inferred_type else: + # TODO: why not use key.dtype? key_type = lib.infer_dtype(key, skipna=False) if key_type == "integer": @@ -931,7 +934,7 @@ def _get_with(self, key): else: return self._get_values(key) elif key_type == "boolean": - return self._get_values(key) + return self._get_values(key, mask) if isinstance(key, (list, tuple)): # TODO: de-dup with tuple case handled above? @@ -960,10 +963,10 @@ def _get_values_tuple(self, key): self ) - def _get_values(self, indexer): + def _get_values(self, indexer, mask=None): try: return self._constructor( - self._data.get_slice(indexer), fastpath=True + self._data.get_slice(indexer, mask=mask), fastpath=True ).__finalize__(self) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index cd384d6fdbfad..fcebfe82b0b82 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2013,6 +2013,7 @@ def test_index_single_double_tuples(self, tpl): tm.assert_frame_equal(result, expected) def test_boolean_indexing(self): + # TODO: parametrize idx = list(range(3)) cols = ["A", "B", "C"] df1 = DataFrame( @@ -2036,6 +2037,7 @@ def test_boolean_indexing(self): df1[df1.index[:-1] > 2] = -1 def test_boolean_indexing_mixed(self): + # TODO: parametrize? df = DataFrame( { 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..9e1228cdfe6ac --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,58 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "values, expected, dtype", + [ + ([1, 2, 3], [1, None], "Int64"), + (["a", "b", "c"], ["a", pd.NA], "string"), + ([None, "b", "c"], [pd.NA, pd.NA], "string"), + ([True, False, None], [True, pd.NA], "boolean"), + ([None, False, None], [pd.NA, pd.NA], "boolean"), + pytest.param( + [pd.Timestamp("2000"), pd.Timestamp("2001"), pd.Timestamp("2002")], + [pd.Timestamp("2000"), pd.NaT], + "datetime64[ns]", + marks=pytest.mark.xfail(reson="TODO. Change DatetimeBlock._slice"), + ), + ( + [ + pd.Timestamp("2000", tz="CET"), + pd.Timestamp("2001", tz="CET"), + pd.Timestamp("2002", tz="CET"), + ], + [pd.Timestamp("2000", tz="CET"), pd.NaT], + "datetime64[ns, cet]", + ), + pytest.param( + [pd.Timedelta("1H"), pd.Timedelta("2H"), pd.Timedelta("3H")], + [pd.Timedelta("1H"), pd.NaT], + "timedelta64[ns]", + marks=pytest.mark.xfail(reson="TODO. Change TimeDeltaBlock._slice"), + ), + ( + [pd.Period("2000"), pd.Period("2001"), pd.Period("2002")], + [pd.Period("2000"), pd.NaT], + "period[A-DEC]", + ), + (["a", "b", "c"], ["a", None], pd.CategoricalDtype(["a", "b", "c"])), + ], +) +def test_mask_series(values, expected, dtype): + s = pd.Series(values, dtype=dtype, name="name") + orig = pd.Series(values, dtype=dtype, name="name") + mask = pd.array([True, False, None], dtype="boolean") + result = s[mask] + # expected = pd.Series([1, pd.NA], index=[0, 2], dtype="Int64") + expected = pd.Series(expected, dtype=dtype, name="name", index=[0, 2]) + tm.assert_series_equal(result, expected) + assert result.dtype == s.dtype + assert pd.isna(result.iloc[-1]) + + tm.assert_equal(s.array[mask], expected.array) + + # ensure no mutation + tm.assert_series_equal(s, orig)