From 045391c81217d11555e493f4ebb5fb6f4dbcc61a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Dec 2019 14:40:35 -0600 Subject: [PATCH] [WIP]: Indexing with BooleanArray This implements indexing a Series with a BooleanArray for some dtypes. NA values in the mask propagate. This required updating each EA's `__getitem__` to correctly propagate NA. This isn't an option for ndarray-backed Series, which don't understand BooleanArray. No need to review in detail. The primary motivation for pushing this up is to better understand what indexing with NA values will look like in practice. --- pandas/core/arrays/boolean.py | 9 +++ pandas/core/arrays/categorical.py | 11 +++- pandas/core/arrays/datetimelike.py | 14 ++++- pandas/core/arrays/integer.py | 11 ++++ pandas/core/arrays/string_.py | 16 +++++- pandas/core/common.py | 6 +- pandas/core/indexes/base.py | 8 ++- pandas/core/indexing.py | 8 ++- pandas/core/internals/blocks.py | 6 +- pandas/core/internals/managers.py | 8 ++- pandas/core/series.py | 17 +++--- pandas/tests/frame/indexing/test_indexing.py | 2 + pandas/tests/indexing/test_na_indexing.py | 58 ++++++++++++++++++++ 13 files changed, 150 insertions(+), 24 deletions(-) create mode 100644 pandas/tests/indexing/test_na_indexing.py diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 743d45e1fa400..66c32e0a1b9c6 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,6 +24,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -294,6 +295,14 @@ def __getitem__(self, item): if self._mask[item]: return self.dtype.na_value return self._data[item] + elif is_array_like(item) and is_bool_dtype(item.dtype): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self._data[take] + # output masked anywhere where self is masked, or the input was masked + omask = (self._mask | item._mask)[take] + return type(self)(result, omask) return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 71fe944827a74..1e4f618ba70b2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -46,6 +46,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d +from pandas.core.arrays import BooleanArray from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -1996,10 +1997,14 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] + elif com.is_bool_indexer(key) and isinstance(key, BooleanArray): + take = key._data | key._mask + values = self._codes[take] + values[key._mask[take]] = -1 else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + values = self._codes[key] + + return self._constructor(values=values, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f5d1e62f44fd0..d448141c27a40 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,6 +40,7 @@ from pandas._typing import DatetimeLikeScalar from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays import BooleanArray import pandas.core.common as com from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import make_invalid_op @@ -415,8 +416,15 @@ def __getitem__(self, key): val = getitem(key) return self._box_func(val) + mask = None if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + if isinstance(key, BooleanArray): + # TODO: Handle all boolean indexers. + mask = key._mask + key = key._data | mask + mask = mask[key] + else: + key = np.asarray(key, dtype=bool) if key.all(): key = slice(0, None, None) else: @@ -438,6 +446,10 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) + if mask is not None: + # TODO: Check that we've copied! + result[mask] = iNaT + if result.ndim > 1: # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d47e7e3df27e1..45ff238bd810b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,6 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -366,10 +367,20 @@ def fmt(x): return fmt def __getitem__(self, item): + from pandas.core.arrays import BooleanArray + if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] + elif is_array_like(item) and is_bool_dtype(item.dtype): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self._data[take] + # output masked anywhere where self is masked, or the input was masked + omask = (self._mask | item._mask)[take] + return type(self)(result, omask) return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2de19a3319cc5..59535803266db 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -13,7 +13,8 @@ from pandas import compat from pandas.core import ops -from pandas.core.arrays import PandasArray +from pandas.core.arrays import BooleanArray, PandasArray +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.missing import isna @@ -232,6 +233,19 @@ def __setitem__(self, key, value): super().__setitem__(key, value) + def __getitem__(self, item): + # Doing this here, as PandasArray.__getitem__ can't guarantee dtype stability + # when getting with a boolean mask. + if com.is_bool_indexer(item): + if isinstance(item, BooleanArray): + # items, mask = item._data, item._mask + take = item._data | item._mask + result = self[take] + # That copies, right? + result[item._mask[take]] = self.dtype.na_value + return result + return super().__getitem__(item) + def fillna(self, value=None, method=None, limit=None): # TODO: validate dtype return super().fillna(value, method, limit) diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..fdbedbbd4cddd 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -132,9 +132,9 @@ def is_bool_indexer(key: Any) -> bool: elif is_bool_dtype(key.dtype): # an ndarray with bool-dtype by definition has no missing values. # So we only need to check for NAs in ExtensionArrays - if is_extension_array_dtype(key.dtype): - if np.any(key.isna()): - raise ValueError(na_msg) + # if is_extension_array_dtype(key.dtype): + # if np.any(key.isna()): + # raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc2412ceaca0e..4145db3e57ee8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -65,7 +65,7 @@ from pandas.core import ops from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import BooleanArray, ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -4014,7 +4014,11 @@ def __getitem__(self, key): return promote(getitem(key)) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + if isinstance(key, BooleanArray): + # TODO: Handle all boolean indexers. + key = key._data | key._mask + else: + key = np.asarray(key, dtype=bool) key = com.values_from_object(key) result = getitem(key) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 341262313ddff..e97549b75fb03 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -8,6 +8,7 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( + is_extension_array_dtype, is_float, is_integer, is_iterator, @@ -19,6 +20,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com @@ -2309,6 +2311,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: If the index of the key is unalignable to index. """ result = key + mask = None if isinstance(key, ABCSeries) and not key.index.equals(index): result = result.reindex(index) mask = isna(result._values) @@ -2319,6 +2322,9 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "the indexed object do not match)." ) result = result.astype(bool)._values + elif is_array_like(result) and is_extension_array_dtype(result.dtype): + assert result.dtype.kind == "b" + mask = isna(result) else: if is_sparse(result): result = result.to_dense() @@ -2330,7 +2336,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "Item wrong length {} instead of {}.".format(len(result), len(index)) ) - return result + return result, mask def convert_missing_indexer(indexer): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8a543832b50fe..10fb1dbd9aea8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -291,7 +291,7 @@ def __setstate__(self, state): self.values = state[1] self.ndim = self.values.ndim - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ return self.values[slicer] @@ -1810,7 +1810,7 @@ def _can_hold_element(self, element: Any) -> bool: # We're doing the same as CategoricalBlock here. return True - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ # slice the category @@ -2307,7 +2307,7 @@ def to_dense(self): # expects that behavior. return np.asarray(self.values, dtype=_NS_DTYPE) - def _slice(self, slicer): + def _slice(self, slicer, mask=None): """ return a slice of my values """ if isinstance(slicer, tuple): col, loc = slicer diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f312b88d9a0bc..44776a6e07aba 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -709,7 +709,7 @@ def combine(self, blocks, copy=True): return type(self)(new_blocks, axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, mask=None): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1505,11 +1505,13 @@ def _blklocs(self): """ compat with BlockManager """ return None - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, mask=None): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True) + return type(self)( + self._block._slice(slobj, mask=mask), self.index[slobj], fastpath=True + ) @property def index(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index 3eabb70581827..1c3af26f6047a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -892,11 +892,13 @@ def __getitem__(self, key): key = list(key) if com.is_bool_indexer(key): - key = check_bool_indexer(self.index, key) + key, mask = check_bool_indexer(self.index, key) + else: + mask = None - return self._get_with(key) + return self._get_with(key, mask) - def _get_with(self, key): + def _get_with(self, key, mask=None): # other: fancy integer or otherwise if isinstance(key, slice): return self._slice(key) @@ -917,12 +919,13 @@ def _get_with(self, key): return self._get_values(key) raise - if not isinstance(key, (list, np.ndarray, Series, Index)): + if not is_list_like(key): key = list(key) if isinstance(key, Index): key_type = key.inferred_type else: + # TODO: why not use key.dtype? key_type = lib.infer_dtype(key, skipna=False) if key_type == "integer": @@ -931,7 +934,7 @@ def _get_with(self, key): else: return self._get_values(key) elif key_type == "boolean": - return self._get_values(key) + return self._get_values(key, mask) if isinstance(key, (list, tuple)): # TODO: de-dup with tuple case handled above? @@ -960,10 +963,10 @@ def _get_values_tuple(self, key): self ) - def _get_values(self, indexer): + def _get_values(self, indexer, mask=None): try: return self._constructor( - self._data.get_slice(indexer), fastpath=True + self._data.get_slice(indexer, mask=mask), fastpath=True ).__finalize__(self) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index cd384d6fdbfad..fcebfe82b0b82 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2013,6 +2013,7 @@ def test_index_single_double_tuples(self, tpl): tm.assert_frame_equal(result, expected) def test_boolean_indexing(self): + # TODO: parametrize idx = list(range(3)) cols = ["A", "B", "C"] df1 = DataFrame( @@ -2036,6 +2037,7 @@ def test_boolean_indexing(self): df1[df1.index[:-1] > 2] = -1 def test_boolean_indexing_mixed(self): + # TODO: parametrize? df = DataFrame( { 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..9e1228cdfe6ac --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,58 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "values, expected, dtype", + [ + ([1, 2, 3], [1, None], "Int64"), + (["a", "b", "c"], ["a", pd.NA], "string"), + ([None, "b", "c"], [pd.NA, pd.NA], "string"), + ([True, False, None], [True, pd.NA], "boolean"), + ([None, False, None], [pd.NA, pd.NA], "boolean"), + pytest.param( + [pd.Timestamp("2000"), pd.Timestamp("2001"), pd.Timestamp("2002")], + [pd.Timestamp("2000"), pd.NaT], + "datetime64[ns]", + marks=pytest.mark.xfail(reson="TODO. Change DatetimeBlock._slice"), + ), + ( + [ + pd.Timestamp("2000", tz="CET"), + pd.Timestamp("2001", tz="CET"), + pd.Timestamp("2002", tz="CET"), + ], + [pd.Timestamp("2000", tz="CET"), pd.NaT], + "datetime64[ns, cet]", + ), + pytest.param( + [pd.Timedelta("1H"), pd.Timedelta("2H"), pd.Timedelta("3H")], + [pd.Timedelta("1H"), pd.NaT], + "timedelta64[ns]", + marks=pytest.mark.xfail(reson="TODO. Change TimeDeltaBlock._slice"), + ), + ( + [pd.Period("2000"), pd.Period("2001"), pd.Period("2002")], + [pd.Period("2000"), pd.NaT], + "period[A-DEC]", + ), + (["a", "b", "c"], ["a", None], pd.CategoricalDtype(["a", "b", "c"])), + ], +) +def test_mask_series(values, expected, dtype): + s = pd.Series(values, dtype=dtype, name="name") + orig = pd.Series(values, dtype=dtype, name="name") + mask = pd.array([True, False, None], dtype="boolean") + result = s[mask] + # expected = pd.Series([1, pd.NA], index=[0, 2], dtype="Int64") + expected = pd.Series(expected, dtype=dtype, name="name", index=[0, 2]) + tm.assert_series_equal(result, expected) + assert result.dtype == s.dtype + assert pd.isna(result.iloc[-1]) + + tm.assert_equal(s.array[mask], expected.array) + + # ensure no mutation + tm.assert_series_equal(s, orig)