From 28486ac1f8c08378dec7143319891f6274e4dee2 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jun 2021 16:26:32 -0700 Subject: [PATCH 1/4] BUG: IntervalIndex.is_monotonic with np.nan --- pandas/_libs/intervaltree.pxi.in | 4 ++ pandas/core/indexes/interval.py | 41 +++++++++++-------- .../tests/indexes/interval/test_indexing.py | 11 +++++ .../tests/indexes/interval/test_interval.py | 10 +++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 55d67f000f93a..547fcc0b8aa07 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -36,6 +36,7 @@ cdef class IntervalTree(IntervalMixin): object dtype str closed object _is_overlapping, _left_sorter, _right_sorter + Py_ssize_t _na_count def __init__(self, left, right, closed='right', leaf_size=100): """ @@ -67,6 +68,7 @@ cdef class IntervalTree(IntervalMixin): # GH 23352: ensure no nan in nodes mask = ~np.isnan(self.left) + self._na_count = len(mask) - mask.sum() self.left = self.left[mask] self.right = self.right[mask] indices = indices[mask] @@ -116,6 +118,8 @@ cdef class IntervalTree(IntervalMixin): Return True if the IntervalTree is monotonic increasing (only equal or increasing values), else False """ + if self._na_count > 0: + return False values = [self.right, self.left] sort_order = np.lexsort(values) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b906f88d98a46..b2d0f22010da5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -62,6 +62,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.missing import isna from pandas.core.algorithms import ( take_nd, @@ -761,29 +762,37 @@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] """ indexer, missing = [], [] for i, key in enumerate(target): - try: - locs = self.get_loc(key) - if isinstance(locs, slice): - # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - except InvalidIndexError: - # i.e. non-scalar key e.g. a tuple. - # see test_append_different_columns_types_raises - missing.append(i) - locs = np.array([-1]) + if is_interval_dtype(target.dtype) and isna(key): + # self.get_loc(np.nan) will treat it as a float instead of as + # our own dtype. + # TODO: handle this in get_loc? + locs = self.isna().nonzero()[0] + if len(locs) == 0: + missing.append(i) + else: + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + except InvalidIndexError: + # i.e. non-scalar key e.g. a tuple. + # see test_append_different_columns_types_raises + missing.append(i) + locs = np.array([-1]) indexer.append(locs) indexer = np.concatenate(indexer) return ensure_platform_int(indexer), ensure_platform_int(missing) - @property + @cache_readonly def _index_as_unique(self) -> bool: - return not self.is_overlapping + return not self.is_overlapping and self._engine._na_count < 2 _requires_unique_msg = ( "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique" diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 3abc6e348748a..f71e5a6fd86da 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -326,6 +326,17 @@ def test_get_indexer_non_monotonic(self): expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + + assert not index._index_as_unique + + result = index.get_indexer_for(other) + expected = np.array([0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index cd61fcaa835a4..7bb048bf2c91c 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -318,6 +318,16 @@ def test_monotonic(self, closed): assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is True + def test_is_monotonic_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, np.nan]) + + assert not index.is_monotonic + assert not index._is_strictly_monotonic_increasing + assert not index.is_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing + assert not index.is_monotonic_decreasing + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) From 17267b9215ad581e24323d8d9503f9b949c52170 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jun 2021 16:42:07 -0700 Subject: [PATCH 2/4] TST: getitem and loc --- pandas/core/indexes/interval.py | 10 ++++++---- .../tests/indexing/interval/test_interval.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b2d0f22010da5..67c7993b15762 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -771,11 +771,13 @@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] missing.append(i) else: try: - locs = self.get_loc(key) - if isinstance(locs, slice): + nlocs = self.get_loc(key) + if isinstance(nlocs, slice): # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) + nlocs = np.arange( + nlocs.start, nlocs.stop, nlocs.step, dtype="intp" + ) + locs = np.array(nlocs, ndmin=1) except KeyError: missing.append(i) locs = np.array([-1]) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 503e39041a49f..ce54339df7143 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -114,6 +114,25 @@ def test_loc_getitem_frame(self): with pytest.raises(KeyError, match=r"\[10\] not in index"): df.loc[[10, 4]] + def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl, request): + # GH#41831 + + index = IntervalIndex([np.nan, np.nan]) + key = index[:-1] + + obj = frame_or_series(range(2), index=index) + if frame_or_series is DataFrame and indexer_sl is tm.setitem: + obj = obj.T + + result = indexer_sl(obj)[key] + expected = obj + + if frame_or_series is DataFrame and indexer_sl is tm.setitem: + mark = pytest.mark.xfail(reason="__contains__ returns False on np.nan") + request.node.add_marker(mark) + + tm.assert_equal(result, expected) + class TestIntervalIndexInsideMultiIndex: def test_mi_intervalindex_slicing_with_scalar(self): From b2e78a27bc4536f47f1be93fd211a7f800814c9f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jun 2021 19:31:27 -0700 Subject: [PATCH 3/4] IntervalIndex.__contains__, get_loc with nan --- pandas/core/indexes/interval.py | 51 +++++++++---------- .../tests/indexes/interval/test_indexing.py | 16 ++++++ .../tests/indexing/interval/test_interval.py | 6 +-- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 67c7993b15762..5b4f793d4ad95 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -62,7 +62,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import IntervalDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.algorithms import ( take_nd, @@ -402,6 +402,8 @@ def __contains__(self, key: Any) -> bool: """ hash(key) if not isinstance(key, Interval): + if is_valid_na_for_dtype(key, self.dtype): + return self.hasnans return False try: @@ -679,6 +681,8 @@ def get_loc( if self.closed != key.closed: raise KeyError(key) mask = (self.left == key.left) & (self.right == key.right) + elif is_valid_na_for_dtype(key, self.dtype): + mask = self.isna() else: # assume scalar op_left = le if self.closed_left else lt @@ -694,7 +698,12 @@ def get_loc( raise KeyError(key) elif matches == 1: return mask.argmax() - return lib.maybe_booleans_to_slice(mask.view("u1")) + + res = lib.maybe_booleans_to_slice(mask.view("u1")) + if isinstance(res, slice) and res.stop is None: + # TODO: DO this in maybe_booleans_to_slice? + res = slice(res.start, len(self), res.step) + return res def _get_indexer( self, @@ -762,30 +771,20 @@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] """ indexer, missing = [], [] for i, key in enumerate(target): - if is_interval_dtype(target.dtype) and isna(key): - # self.get_loc(np.nan) will treat it as a float instead of as - # our own dtype. - # TODO: handle this in get_loc? - locs = self.isna().nonzero()[0] - if len(locs) == 0: - missing.append(i) - else: - try: - nlocs = self.get_loc(key) - if isinstance(nlocs, slice): - # Only needed for get_indexer_non_unique - nlocs = np.arange( - nlocs.start, nlocs.stop, nlocs.step, dtype="intp" - ) - locs = np.array(nlocs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - except InvalidIndexError: - # i.e. non-scalar key e.g. a tuple. - # see test_append_different_columns_types_raises - missing.append(i) - locs = np.array([-1]) + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + except InvalidIndexError: + # i.e. non-scalar key e.g. a tuple. + # see test_append_different_columns_types_raises + missing.append(i) + locs = np.array([-1]) indexer.append(locs) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index f71e5a6fd86da..a5a921f42c3ef 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -6,9 +6,11 @@ from pandas.errors import InvalidIndexError from pandas import ( + NA, CategoricalIndex, Interval, IntervalIndex, + NaT, Timedelta, date_range, timedelta_range, @@ -168,6 +170,20 @@ def test_get_loc_non_scalar_errors(self, key): with pytest.raises(InvalidIndexError, match=msg): idx.get_loc(key) + def test_get_indexer_with_nans(self): + # GH#41831 + index = IntervalIndex([np.nan, Interval(1, 2), np.nan]) + + expected = np.array([True, False, True]) + for key in [None, np.nan, NA]: + assert key in index + result = index.get_loc(key) + tm.assert_numpy_array_equal(result, expected) + + for key in [NaT, np.timedelta64("NaT", "ns"), np.datetime64("NaT", "ns")]: + with pytest.raises(KeyError, match=str(key)): + index.get_loc(key) + class TestGetIndexer: @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index ce54339df7143..ccb16c5d97ecc 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -114,7 +114,7 @@ def test_loc_getitem_frame(self): with pytest.raises(KeyError, match=r"\[10\] not in index"): df.loc[[10, 4]] - def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl, request): + def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): # GH#41831 index = IntervalIndex([np.nan, np.nan]) @@ -127,10 +127,6 @@ def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl, request): result = indexer_sl(obj)[key] expected = obj - if frame_or_series is DataFrame and indexer_sl is tm.setitem: - mark = pytest.mark.xfail(reason="__contains__ returns False on np.nan") - request.node.add_marker(mark) - tm.assert_equal(result, expected) From 7f081a16ec061d0874cb14771d2f8dc37a834208 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Jun 2021 14:20:41 -0700 Subject: [PATCH 4/4] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff1c6ebf7aae2..e506fef313c62 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -949,6 +949,7 @@ Interval - Bug in :meth:`IntervalIndex.intersection` returning duplicates when at least one of the :class:`Index` objects have duplicates which are present in the other (:issue:`38743`) - :meth:`IntervalIndex.union`, :meth:`IntervalIndex.intersection`, :meth:`IntervalIndex.difference`, and :meth:`IntervalIndex.symmetric_difference` now cast to the appropriate dtype instead of raising a ``TypeError`` when operating with another :class:`IntervalIndex` with incompatible dtype (:issue:`39267`) - :meth:`PeriodIndex.union`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference`, :meth:`PeriodIndex.difference` now cast to object dtype instead of raising ``IncompatibleFrequency`` when operating with another :class:`PeriodIndex` with incompatible dtype (:issue:`39306`) +- Bug in :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.get_loc`, :meth:`IntervalIndex.get_indexer_for`, and :meth:`IntervalIndex.__contains__` when NA values are present (:issue:`41831`) Indexing ^^^^^^^^