From 1918d5f5a795b8a2fd467aa18d712f811ebf7e23 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 28 Jun 2019 11:43:04 -0500 Subject: [PATCH 1/9] API: Implement new interval behavior --- pandas/core/indexes/base.py | 14 +- pandas/core/indexes/interval.py | 276 ++++++++---------- pandas/core/indexing.py | 2 +- .../tests/indexes/interval/test_interval.py | 267 +++-------------- .../indexes/interval/test_interval_new.py | 50 ++-- .../tests/indexing/interval/test_interval.py | 155 +--------- .../indexing/interval/test_interval_new.py | 36 +-- pandas/tests/indexing/test_indexing.py | 5 - pandas/tests/reshape/test_concat.py | 18 +- 9 files changed, 220 insertions(+), 603 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cb5b4a6c8993c..fc9359f4afa1d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3239,8 +3239,9 @@ def reindex(self, target, method=None, level=None, limit=None, if self.equals(target): indexer = None else: - - if self.is_unique: + # check is_overlapping for IntervalIndex compat + if (self.is_unique and + not getattr(self, 'is_overlapping', False)): indexer = self.get_indexer(target, method=method, limit=limit, tolerance=tolerance) @@ -4902,13 +4903,6 @@ def _searchsorted_monotonic(self, label, side='left'): raise ValueError('index must be monotonic increasing or decreasing') - def _get_loc_only_exact_matches(self, key): - """ - This is overridden on subclasses (namely, IntervalIndex) to control - get_slice_bound. - """ - return self.get_loc(key) - def get_slice_bound(self, label, side, kind): """ Calculate slice bound that corresponds to given label. 
@@ -4942,7 +4936,7 @@ def get_slice_bound(self, label, side, kind): # we need to look up the label try: - slc = self._get_loc_only_exact_matches(label) + slc = self.get_loc(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 777fa2eadd289..73923ed213e0f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,4 +1,5 @@ """ define the IntervalIndex """ +from operator import le, lt import textwrap import warnings @@ -6,7 +7,7 @@ from pandas._config import get_option -from pandas._libs import Timedelta, Timestamp +from pandas._libs import Timedelta, Timestamp, lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -23,7 +24,7 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, _index_shared_docs, default_pprint, ensure_index) + Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range @@ -622,6 +623,23 @@ def _maybe_cast_indexed(self, key): return key + def _can_reindex(self, indexer): + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if self.is_overlapping and len(indexer): + raise ValueError("cannot reindex from an overlapping axis") + def _needs_i8_conversion(self, key): """ Check if a given key needs i8 conversion. 
Conversion is necessary for @@ -732,18 +750,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) - def _get_loc_only_exact_matches(self, key): - if isinstance(key, Interval): - - if not self.is_unique: - raise ValueError("cannot index with a slice Interval" - " and a non-unique index") - - # TODO: this expands to a tuple index, see if we can - # do better - return Index(self._multiindex.values).get_loc(key) - raise KeyError - def _find_non_overlapping_monotonic_bounds(self, key): if isinstance(key, IntervalMixin): start = self._searchsorted_monotonic( @@ -808,58 +814,28 @@ def get_loc(self, key, method=None): array([0, 1], dtype=int64) """ self._check_method(method) + if is_list_like(key): + raise KeyError(key) - original_key = key - key = self._maybe_cast_indexed(key) - - if self.is_non_overlapping_monotonic: - if isinstance(key, Interval): - left = self._maybe_cast_slice_bound(key.left, 'left', None) - right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed) - else: - key = self._maybe_cast_slice_bound(key, 'left', None) - - start, stop = self._find_non_overlapping_monotonic_bounds(key) - - if start is None or stop is None: - return slice(start, stop) - elif start + 1 == stop: - return start - elif start < stop: - return slice(start, stop) - else: - raise KeyError(original_key) - + if isinstance(key, Interval): + if self.closed != key.closed: + raise KeyError(key) + mask = (self.left == key.left) & (self.right == key.right) else: - # use the interval tree - key = self._maybe_convert_i8(key) - if isinstance(key, Interval): - left, right = _get_interval_closed_bounds(key) - return self._engine.get_loc_interval(left, right) - else: - return self._engine.get_loc(key) - - def get_value(self, series, key): - if com.is_bool_indexer(key): - loc = key - elif is_list_like(key): - loc = self.get_indexer(key) - elif isinstance(key, slice): - - if not 
(key.step is None or key.step == 1): - raise ValueError("cannot support not-default step in a slice") - + # assume scalar + op_left = le if self.closed_left else lt + op_right = le if self.closed_right else lt try: - loc = self.get_loc(key) + mask = op_left(self.left, key) & op_right(key, self.right) except TypeError: - # we didn't find exact intervals or are non-unique - msg = "unable to slice with this key: {key}".format(key=key) - raise ValueError(msg) + raise KeyError(key) - else: - loc = self.get_loc(key) - return series.iloc[loc] + matches = mask.sum() + if matches == 0: + raise KeyError(key) + elif matches == 1: + return mask.argmax() + return lib.maybe_booleans_to_slice(mask.view('u1')) @Substitution(**dict(_index_doc_kwargs, **{'raises_section': textwrap.dedent(""" @@ -873,109 +849,109 @@ def get_value(self, series, key): def get_indexer(self, target, method=None, limit=None, tolerance=None): self._check_method(method) - target = ensure_index(target) - target = self._maybe_cast_indexed(target) - - if self.equals(target): - return np.arange(len(self), dtype='intp') - - if self.is_non_overlapping_monotonic: - start, stop = self._find_non_overlapping_monotonic_bounds(target) - start_plus_one = start + 1 - if not ((start_plus_one < stop).any()): - return np.where(start_plus_one == stop, start, -1) + if self.is_overlapping: + msg = ('cannot handle overlapping indices; use ' + 'IntervalIndex.get_indexer_non_unique') + raise InvalidIndexError(msg) - if not self.is_unique: - raise ValueError("cannot handle non-unique indices") + try: + target = ensure_index(target) + except ValueError: + target = Index(target, dtype=object) - # IntervalIndex if isinstance(target, IntervalIndex): - indexer = self._get_reindexer(target) - - # non IntervalIndex + if self.equals(target): + return np.arange(len(self), dtype='intp') + elif self.closed != target.closed: + return np.repeat(np.intp(-1), len(target)) + + left_indexer = self.left.get_indexer(target.left) + right_indexer = 
self.right.get_indexer(target.right) + indexer = np.where(left_indexer == right_indexer, left_indexer, -1) + elif not is_object_dtype(target): + # homogeneous scalar index + target = self._maybe_convert_i8(target) + try: + indexer = self._engine.get_indexer(target.values) + except TypeError as e: + raise ValueError(e) else: - indexer = np.concatenate([self.get_loc(i) for i in target]) + # heterogeneous index: defer elementwise to get_loc + indexer = [] + for key in target: + try: + loc = self.get_loc(key) + except KeyError: + loc = -1 + indexer.append(loc) return ensure_platform_int(indexer) - def _get_reindexer(self, target): - """ - Return an indexer for a target IntervalIndex with self - """ - - # find the left and right indexers - left = self._maybe_convert_i8(target.left) - right = self._maybe_convert_i8(target.right) - lindexer = self._engine.get_indexer(left.values) - rindexer = self._engine.get_indexer(right.values) - - # we want to return an indexer on the intervals - # however, our keys could provide overlapping of multiple - # intervals, so we iterate thru the indexers and construct - # a set of indexers - - indexer = [] - n = len(self) - - for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)): - - target_value = target[i] - - # matching on the lhs bound - if (lhs != -1 and - self.closed == 'right' and - target_value.left == self[lhs].right): - lhs += 1 - - # matching on the lhs bound - if (rhs != -1 and - self.closed == 'left' and - target_value.right == self[rhs].left): - rhs -= 1 - - # not found - if lhs == -1 and rhs == -1: - indexer.append(np.array([-1])) - - elif rhs == -1: - - indexer.append(np.arange(lhs, n)) - - elif lhs == -1: - - # care about left/right closed here - value = self[i] + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + try: + target = ensure_index(target) + except ValueError: + target = Index(target, dtype=object) + + if isinstance(target, IntervalIndex) 
and self.closed != target.closed: + return np.repeat(-1, len(target)), np.arange(len(target)) + + if is_object_dtype(target) or isinstance(target, IntervalIndex): + indexer, missing = [], [] + for i, key in enumerate(target): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + locs = np.arange( + locs.start, locs.stop, locs.step, dtype='intp') + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + indexer.append(locs) + indexer = np.concatenate(indexer) + else: + target = self._maybe_convert_i8(target) + indexer, missing = self._engine.get_indexer_non_unique( + target.values) - # target.closed same as self.closed - if self.closed == target.closed: - if target_value.left < value.left: - indexer.append(np.array([-1])) - continue + return ensure_index(indexer), ensure_platform_int(missing) - # target.closed == 'left' - elif self.closed == 'right': - if target_value.left <= value.left: - indexer.append(np.array([-1])) - continue + def get_indexer_for(self, target, **kwargs): + """ + Guaranteed return of an indexer even when overlapping. - # target.closed == 'right' - elif self.closed == 'left': - if target_value.left <= value.left: - indexer.append(np.array([-1])) - continue + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. - indexer.append(np.arange(0, rhs + 1)) + Returns + ------- + numpy.ndarray + List of indices. 
+ """ + if self.is_overlapping: + return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer(target, **kwargs) + def get_value(self, series, key): + if com.is_bool_indexer(key): + loc = key + elif is_list_like(key): + if self.is_overlapping: + loc, missing = self.get_indexer_non_unique(key) + if len(missing): + raise KeyError else: - indexer.append(np.arange(lhs, rhs + 1)) - - return np.concatenate(indexer) - - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = self._maybe_cast_indexed(ensure_index(target)) - return super().get_indexer_non_unique(target) + loc = self.get_indexer(key) + elif isinstance(key, slice): + if not (key.step is None or key.step == 1): + raise ValueError("cannot support not-default step in a slice") + loc = self._convert_slice_indexer(key, kind='getitem') + else: + loc = self.get_loc(key) + return series.iloc[loc] @Appender(_index_shared_docs['where']) def where(self, cond, other=None): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1539feb2e0856..975ba8aceed02 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1074,7 +1074,7 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): raise_missing=raise_missing) return ax[indexer], indexer - if ax.is_unique: + if ax.is_unique and not getattr(ax, 'is_overlapping', False): # If we are trying to get actual keys from empty Series, we # patiently wait for a KeyError later on - otherwise, convert if len(ax) or not len(key): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index b2f409837344a..63b04cfacf3ea 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -393,7 +393,6 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected - # TODO: check this behavior is consistent with 
test_interval_new.py def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) @@ -414,120 +413,31 @@ def test_get_item(self, closed): closed=closed) tm.assert_index_equal(result, expected) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_loc_value(self): - with pytest.raises(KeyError, match="^0$"): - self.index.get_loc(0) - assert self.index.get_loc(0.5) == 0 - assert self.index.get_loc(1) == 0 - assert self.index.get_loc(1.5) == 1 - assert self.index.get_loc(2) == 1 - with pytest.raises(KeyError, match="^-1$"): - self.index.get_loc(-1) - with pytest.raises(KeyError, match="^3$"): - self.index.get_loc(3) - - idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) - assert idx.get_loc(0.5) == 0 - assert idx.get_loc(1) == 0 - tm.assert_numpy_array_equal(idx.get_loc(1.5), - np.array([0, 1], dtype='intp')) - tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)), - np.array([0, 1], dtype='intp')) - assert idx.get_loc(3) == 1 - with pytest.raises(KeyError, match=r"^3\.5$"): - idx.get_loc(3.5) - - idx = IntervalIndex.from_arrays([0, 2], [1, 3]) - with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def slice_locs_cases(self, breaks): - # TODO: same tests for more index types - index = IntervalIndex.from_breaks([0, 1, 2], closed='right') - assert index.slice_locs() == (0, 2) - assert index.slice_locs(0, 1) == (0, 1) - assert index.slice_locs(1, 1) == (0, 1) - assert index.slice_locs(0, 2) == (0, 2) - assert index.slice_locs(0.5, 1.5) == (0, 2) - assert index.slice_locs(0, 0.5) == (0, 1) - assert index.slice_locs(start=1) == (0, 2) - assert index.slice_locs(start=1.2) == (1, 2) - assert index.slice_locs(end=1) == (0, 1) - assert index.slice_locs(end=1.1) == (0, 2) - assert index.slice_locs(end=1.0) == (0, 1) - assert index.slice_locs(-1, -1) == (0, 0) - - index = IntervalIndex.from_breaks([0, 1, 2], 
closed='neither') - assert index.slice_locs(0, 1) == (0, 1) - assert index.slice_locs(0, 2) == (0, 2) - assert index.slice_locs(0.5, 1.5) == (0, 2) - assert index.slice_locs(1, 1) == (1, 1) - assert index.slice_locs(1, 2) == (1, 2) - - index = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], - closed='both') - assert index.slice_locs(1, 1) == (0, 1) - assert index.slice_locs(1, 2) == (0, 2) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_int64(self): - self.slice_locs_cases([0, 1, 2]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_float64(self): - self.slice_locs_cases([0.0, 1.0, 2.0]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def slice_locs_decreasing_cases(self, tuples): - index = IntervalIndex.from_tuples(tuples) - assert index.slice_locs(1.5, 0.5) == (1, 3) - assert index.slice_locs(2, 0) == (1, 3) - assert index.slice_locs(2, 1) == (1, 3) - assert index.slice_locs(3, 1.1) == (0, 3) - assert index.slice_locs(3, 3) == (0, 2) - assert index.slice_locs(3.5, 3.3) == (0, 1) - assert index.slice_locs(1, -3) == (2, 3) - - slice_locs = index.slice_locs(-1, -1) - assert slice_locs[0] == slice_locs[1] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_decreasing_int64(self): - self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_decreasing_float64(self): - self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_fails(self): - index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) - msg = ("'can only get slices from an IntervalIndex if bounds are" - " non-overlapping and all monotonic increasing or decreasing'") - with pytest.raises(KeyError, match=msg): - index.slice_locs(1, 2) - - # To be removed, 
replaced by test_interval_new.py (see #16316, #16386) - def test_get_loc_interval(self): - assert self.index.get_loc(Interval(0, 1)) == 0 - assert self.index.get_loc(Interval(0, 0.5)) == 0 - assert self.index.get_loc(Interval(0, 1, 'left')) == 0 - msg = r"Interval\(2, 3, closed='right'\)" - with pytest.raises(KeyError, match=msg): - self.index.get_loc(Interval(2, 3)) - msg = r"Interval\(-1, 0, closed='left'\)" - with pytest.raises(KeyError, match=msg): - self.index.get_loc(Interval(-1, 0, 'left')) + @pytest.mark.parametrize('scalar', [-1, 0, 0.5, 3, 4.5, 5, 6]) + def test_get_loc_length_one_scalar(self, scalar, closed): + # GH 20921 + index = IntervalIndex.from_tuples([(0, 5)], closed=closed) + if scalar in index[0]: + result = index.get_loc(scalar) + assert result == 0 + else: + with pytest.raises(KeyError): + index.get_loc(scalar) - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('item', [3, Interval(1, 4)]) - def test_get_loc_length_one(self, item, closed): + @pytest.mark.parametrize('other_closed', [ + 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize('left, right', [(0, 5), (-1, 4), (-1, 6), (6, 7)]) + def test_get_loc_length_one_interval( + self, left, right, closed, other_closed): # GH 20921 index = IntervalIndex.from_tuples([(0, 5)], closed=closed) - result = index.get_loc(item) - assert result == 0 + interval = Interval(left, right, closed=other_closed) + if interval == index[0]: + result = index.get_loc(interval) + assert result == 0 + else: + with pytest.raises(KeyError): + index.get_loc(interval) # Make consistent with test_interval_new.py (see #16316, #16386) @pytest.mark.parametrize('breaks', [ @@ -544,12 +454,11 @@ def test_get_loc_datetimelike_nonoverlapping(self, breaks): expected = 0 assert result == expected - interval = Interval(index[0].left, index[1].right) + interval = Interval(index[0].left, index[0].right) result = index.get_loc(interval) - expected = slice(0, 2) + expected = 0 
assert result == expected - # Make consistent with test_interval_new.py (see #16316, #16386) @pytest.mark.parametrize('arrays', [ (date_range('20180101', periods=4), date_range('20180103', periods=4)), (date_range('20180101', periods=4, tz='US/Eastern'), @@ -558,69 +467,19 @@ def test_get_loc_datetimelike_nonoverlapping(self, breaks): timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype)) def test_get_loc_datetimelike_overlapping(self, arrays): # GH 20636 - # overlapping = IntervalTree method with i8 conversion index = IntervalIndex.from_arrays(*arrays) value = index[0].mid + Timedelta('12 hours') - result = np.sort(index.get_loc(value)) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(result, expected) - - interval = Interval(index[0].left, index[1].right) - result = np.sort(index.get_loc(interval)) - expected = np.array([0, 1, 2], dtype='intp') - tm.assert_numpy_array_equal(result, expected) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_indexer(self): - actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(self.index) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='left') - actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index[:1]) - expected = np.array([0], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index) - expected = np.array([-1, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_indexer_subintervals(self): - - # TODO: 
is this right? - # return indexers for wholly contained subintervals - target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 1, 1], dtype='p') - tm.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 1, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(target[[0, -1]]) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) + result = index.get_loc(value) + expected = slice(0, 2, None) + assert result == expected - target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 0], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) + interval = Interval(index[0].left, index[0].right) + result = index.get_loc(interval) + expected = 0 + assert result == expected - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('item', [ - [3], np.arange(1, 5), [Interval(1, 4)], interval_range(1, 4)]) + @pytest.mark.parametrize('item', [[3], np.arange(0.5, 5, 0.5)]) def test_get_indexer_length_one(self, item, closed): # GH 17284 index = IntervalIndex.from_tuples([(0, 5)], closed=closed) @@ -628,22 +487,12 @@ def test_get_indexer_length_one(self, item, closed): expected = np.array([0] * len(item), dtype='intp') tm.assert_numpy_array_equal(result, expected) - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('arrays', [ - (date_range('20180101', periods=4), date_range('20180103', periods=4)), - (date_range('20180101', periods=4, tz='US/Eastern'), - date_range('20180103', periods=4, tz='US/Eastern')), - (timedelta_range('0 days', periods=4), - timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype)) - def 
test_get_reindexer_datetimelike(self, arrays): - # GH 20636 - index = IntervalIndex.from_arrays(*arrays) - tuples = [(index[0].left, index[0].left + pd.Timedelta('12H')), - (index[-1].right - pd.Timedelta('12H'), index[-1].right)] - target = IntervalIndex.from_tuples(tuples) - - result = index._get_reindexer(target) - expected = np.array([0, 3], dtype='intp') + @pytest.mark.parametrize('size', [1, 5]) + def test_get_indexer_length_one_interval(self, size, closed): + # GH 17284 + index = IntervalIndex.from_tuples([(0, 5)], closed=closed) + result = index.get_indexer([Interval(0, 5, closed)] * size) + expected = np.array([0] * size, dtype='intp') tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize('breaks', [ @@ -736,41 +585,6 @@ def test_maybe_convert_i8_errors(self, breaks1, breaks2, make_key): with pytest.raises(ValueError, match=msg): index._maybe_convert_i8(key) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_contains(self): - # Only endpoints are valid. 
- i = IntervalIndex.from_arrays([0, 1], [1, 2]) - - # Invalid - assert 0 not in i - assert 1 not in i - assert 2 not in i - - # Valid - assert Interval(0, 1) in i - assert Interval(0, 2) in i - assert Interval(0, 0.5) in i - assert Interval(3, 5) not in i - assert Interval(-1, 0, closed='left') not in i - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def testcontains(self): - # can select values that are IN the range of a value - i = IntervalIndex.from_arrays([0, 1], [1, 2]) - - assert i.contains(0.1) - assert i.contains(0.5) - assert i.contains(1) - assert i.contains(Interval(0, 1)) - assert i.contains(Interval(0, 2)) - - # these overlaps completely - assert i.contains(Interval(0, 3)) - assert i.contains(Interval(1, 3)) - - assert not i.contains(20) - assert not i.contains(-20) - def test_dropna(self, closed): expected = IntervalIndex.from_tuples( @@ -785,7 +599,6 @@ def test_dropna(self, closed): result = ii.dropna() tm.assert_index_equal(result, expected) - # TODO: check this behavior is consistent with test_interval_new.py def test_non_contiguous(self, closed): index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) target = [0.5, 1.5, 2.5] @@ -931,8 +744,8 @@ def test_datetime(self, tz): assert Timestamp('2000-01-01', tz=tz) not in index assert Timestamp('2000-01-01T12', tz=tz) not in index assert Timestamp('2000-01-02', tz=tz) not in index - iv_true = Interval(Timestamp('2000-01-01T08', tz=tz), - Timestamp('2000-01-01T18', tz=tz)) + iv_true = Interval(Timestamp('2000-01-02', tz=tz), + Timestamp('2000-01-03', tz=tz)) iv_false = Interval(Timestamp('1999-12-31', tz=tz), Timestamp('2000-01-01', tz=tz)) assert iv_true in index diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index 5599009dbc898..ce510b0ea2744 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -2,10 +2,9 @@ import pytest 
from pandas import Int64Index, Interval, IntervalIndex +from pandas.core.indexes.base import InvalidIndexError import pandas.util.testing as tm -pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316") - class TestIntervalIndex: @@ -127,41 +126,46 @@ def test_slice_locs_with_ints_and_floats_succeeds(self): # decreasing non-overlapping index = IntervalIndex.from_tuples([(3, 4), (1, 2), (0, 1)]) - assert index.slice_locs(0, 1) == (3, 2) - assert index.slice_locs(0, 2) == (3, 1) + assert index.slice_locs(0, 1) == (3, 3) + assert index.slice_locs(0, 2) == (3, 2) assert index.slice_locs(0, 3) == (3, 1) - assert index.slice_locs(3, 1) == (1, 2) - assert index.slice_locs(3, 4) == (1, 0) - assert index.slice_locs(0, 4) == (3, 0) + assert index.slice_locs(3, 1) == (1, 3) + assert index.slice_locs(3, 4) == (1, 1) + assert index.slice_locs(0, 4) == (3, 1) @pytest.mark.parametrize("query", [ - [0, 1], [0, 2], [0, 3], [3, 1], [3, 4], [0, 4]]) + [0, 1], [0, 2], [0, 3], [0, 4]]) @pytest.mark.parametrize("tuples", [ - [(0, 2), (1, 3), (2, 4)], [(2, 4), (1, 3), (0, 2)], - [(0, 2), (0, 2), (2, 4)], [(0, 2), (2, 4), (0, 2)], + [(0, 2), (1, 3), (2, 4)], + [(2, 4), (1, 3), (0, 2)], + [(0, 2), (0, 2), (2, 4)], + [(0, 2), (2, 4), (0, 2)], [(0, 2), (0, 2), (2, 4), (1, 3)]]) def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): + start, stop = query index = IntervalIndex.from_tuples(tuples) with pytest.raises(KeyError): - index.slice_locs(query) + index.slice_locs(start, stop) @pytest.mark.parametrize('query, expected', [ - ([Interval(1, 3, closed='right')], [1]), - ([Interval(1, 3, closed='left')], [-1]), - ([Interval(1, 3, closed='both')], [-1]), - ([Interval(1, 3, closed='neither')], [-1]), + ([Interval(2, 4, closed='right')], [1]), + ([Interval(2, 4, closed='left')], [-1]), + ([Interval(2, 4, closed='both')], [-1]), + ([Interval(2, 4, closed='neither')], [-1]), ([Interval(1, 4, closed='right')], [-1]), ([Interval(0, 4, closed='right')], [-1]), - 
([Interval(1, 2, closed='right')], [-1]), - ([Interval(2, 4, closed='right'), Interval(1, 3, closed='right')], - [2, 1]), - ([Interval(1, 3, closed='right'), Interval(0, 2, closed='right')], + ([Interval(0.5, 1.5, closed='right')], [-1]), + ([Interval(2, 4, closed='right'), Interval(0, 1, closed='right')], [1, -1]), - ([Interval(1, 3, closed='right'), Interval(1, 3, closed='left')], + ([Interval(2, 4, closed='right'), Interval(2, 4, closed='right')], + [1, 1]), + ([Interval(5, 7, closed='right'), Interval(2, 4, closed='right')], + [2, 1]), + ([Interval(2, 4, closed='right'), Interval(2, 4, closed='left')], [1, -1])]) def test_get_indexer_with_interval(self, query, expected): - tuples = [(0, 2.5), (1, 3), (2, 4)] + tuples = [(0, 2), (2, 4), (5, 7)] index = IntervalIndex.from_tuples(tuples, closed='right') result = index.get_indexer(query) @@ -204,7 +208,7 @@ def test_get_indexer_errors(self, tuples, closed): msg = ('cannot handle overlapping indices; use ' 'IntervalIndex.get_indexer_non_unique') - with pytest.raises(ValueError, match=msg): + with pytest.raises(InvalidIndexError, match=msg): index.get_indexer([0, 2]) @pytest.mark.parametrize('query, expected', [ @@ -238,7 +242,7 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): # TODO we may also want to test get_indexer for the case when # the intervals are duplicated, decreasing, non-monotonic, etc.. 
- def test_contains(self): + def test_contains_dunder(self): index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right') diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index d201b9644378f..76f0b94ea3904 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Interval, IntervalIndex, Series +from pandas import DataFrame, IntervalIndex, Series import pandas.util.testing as tm @@ -11,26 +11,6 @@ class TestIntervalIndex: def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_loc_with_scalar(self): - - s = self.s - - expected = s.iloc[:3] - tm.assert_series_equal(expected, s.loc[:3]) - tm.assert_series_equal(expected, s.loc[:2.5]) - tm.assert_series_equal(expected, s.loc[0.1:2.5]) - tm.assert_series_equal(expected, s.loc[-1:3]) - - expected = s.iloc[1:4] - tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) - tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) - tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) - - expected = s.iloc[2:5] - tm.assert_series_equal(expected, s.loc[s >= 2]) - - # TODO: check this behavior is consistent with test_interval_new.py def test_getitem_with_scalar(self): s = self.s @@ -39,7 +19,6 @@ def test_getitem_with_scalar(self): tm.assert_series_equal(expected, s[:3]) tm.assert_series_equal(expected, s[:2.5]) tm.assert_series_equal(expected, s[0.1:2.5]) - tm.assert_series_equal(expected, s[-1:3]) expected = s.iloc[1:4] tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]]) @@ -49,7 +28,6 @@ def test_getitem_with_scalar(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s >= 2]) - # TODO: check this behavior is consistent with test_interval_new.py @pytest.mark.parametrize('direction', 
['increasing', 'decreasing']) def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] @@ -83,137 +61,6 @@ def test_nonoverlapping_monotonic(self, direction, closed): assert s[key] == expected assert s.loc[key] == expected - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_interval(self): - - s = self.s - expected = 0 - - result = s.loc[Interval(0, 1)] - assert result == expected - - result = s[Interval(0, 1)] - assert result == expected - - expected = s.iloc[3:5] - result = s.loc[Interval(3, 6)] - tm.assert_series_equal(expected, result) - - expected = s.iloc[3:5] - result = s.loc[[Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - expected = s.iloc[3:5] - result = s.loc[[Interval(3, 5)]] - tm.assert_series_equal(expected, result) - - # missing - with pytest.raises(KeyError): - s.loc[Interval(-2, 0)] - - with pytest.raises(KeyError): - s[Interval(-2, 0)] - - with pytest.raises(KeyError): - s.loc[Interval(5, 6)] - - with pytest.raises(KeyError): - s[Interval(5, 6)] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_slices(self): - - s = self.s - - # slice of interval - with pytest.raises(NotImplementedError): - s.loc[Interval(3, 6):] - - with pytest.raises(NotImplementedError): - s[Interval(3, 6):] - - expected = s.iloc[3:5] - result = s[[Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - # slice of scalar with step != 1 - with pytest.raises(ValueError): - s[0:4:2] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_overlaps(self): - - s = self.s - expected = s.iloc[[3, 4, 3, 4]] - result = s.loc[[Interval(3, 6), Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - idx = IntervalIndex.from_tuples([(1, 5), (3, 7)]) - s = Series(range(len(idx)), index=idx) - - result = s[4] - expected = s - tm.assert_series_equal(expected, result) - - result = s[[4]] - expected = s - 
tm.assert_series_equal(expected, result) - - result = s.loc[[4]] - expected = s - tm.assert_series_equal(expected, result) - - result = s[Interval(3, 5)] - expected = s - tm.assert_series_equal(expected, result) - - result = s.loc[Interval(3, 5)] - expected = s - tm.assert_series_equal(expected, result) - - # doesn't intersect unique set of intervals - with pytest.raises(KeyError): - s[[Interval(3, 5)]] - - with pytest.raises(KeyError): - s.loc[[Interval(3, 5)]] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_non_unique(self): - - idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) - - s = Series(range(len(idx)), index=idx) - - result = s.loc[Interval(1, 3)] - assert result == 0 - - result = s.loc[[Interval(1, 3)]] - expected = s.iloc[0:1] - tm.assert_series_equal(expected, result) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_non_unique_moar(self): - - idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)]) - s = Series(range(len(idx)), index=idx) - - result = s.loc[Interval(1, 3)] - expected = s.iloc[[0, 1]] - tm.assert_series_equal(expected, result) - - # non-unique index and slices not allowed - with pytest.raises(ValueError): - s.loc[Interval(1, 3):] - - with pytest.raises(ValueError): - s[Interval(1, 3):] - - # non-unique - with pytest.raises(ValueError): - s[[Interval(1, 3)]] - - # TODO: check this behavior is consistent with test_interval_new.py def test_non_matching(self): s = self.s diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index a6c42dd0ec632..7ce150704fbc4 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -4,8 +4,6 @@ from pandas import Interval, IntervalIndex, Series import pandas.util.testing as tm -pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316") - class TestIntervalIndex: @@ -72,10 +70,9 @@ def 
test_loc_with_scalar(self): assert s.loc[1.5] == 1 assert s.loc[2] == 1 - # TODO with __getitem__ same rules as loc, or positional ? - # assert s[1] == 0 - # assert s[1.5] == 1 - # assert s[2] == 1 + assert s[1] == 0 + assert s[1.5] == 1 + assert s[2] == 1 expected = s.iloc[1:4] tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) @@ -107,22 +104,22 @@ def test_loc_with_slices(self): result = s[Interval(0, 1):Interval(2, 3)] tm.assert_series_equal(expected, result) - expected = s.iloc[4:] + expected = s.iloc[3:] result = s.loc[Interval(3, 4):] tm.assert_series_equal(expected, result) result = s[Interval(3, 4):] tm.assert_series_equal(expected, result) - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError): s.loc[Interval(3, 6):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError): s[Interval(3, 6):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError): s.loc[Interval(3, 4, closed='left'):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError): s[Interval(3, 4, closed='left'):] # TODO with non-existing intervals ? @@ -134,17 +131,14 @@ def test_loc_with_slices(self): tm.assert_series_equal(expected, s.loc[:3]) tm.assert_series_equal(expected, s.loc[:2.5]) tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) - # TODO should this work? (-1 is not contained in any of the Intervals) - # tm.assert_series_equal(expected, s.loc[-1:3]) - - # TODO with __getitem__ same rules as loc, or positional ? 
- # tm.assert_series_equal(expected, s[:3]) - # tm.assert_series_equal(expected, s[:2.5]) - # tm.assert_series_equal(expected, s[0.1:2.5]) + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + tm.assert_series_equal(expected, s[0.1:2.5]) # slice of scalar with step != 1 - with pytest.raises(NotImplementedError): + with pytest.raises(ValueError): s[0:4:2] def test_loc_with_overlap(self): @@ -169,10 +163,10 @@ def test_loc_with_overlap(self): # interval expected = 0 result = s.loc[Interval(1, 5)] - tm.assert_series_equal(expected, result) + result == expected result = s[Interval(1, 5)] - tm.assert_series_equal(expected, result) + result == expected expected = s result = s.loc[[Interval(1, 5), Interval(3, 7)]] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a0e3df182b129..1bd02bee8f5dc 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -81,7 +81,6 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): " ambiguous|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" - "unhashable type: 'numpy.ndarray'" # TypeError ) if (isinstance(obj, Series) and idxr_id == 'getitem' @@ -120,13 +119,9 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = (r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" "'pandas._libs.interval.IntervalTree' object has no attribute" " 'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError ) if ((idxr_id == 'iloc') diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 4f65251ebd923..74ede682dfb5f 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -878,13 +878,13 @@ def test_append_preserve_index_name(self): pd.Index(list('abc')), pd.CategoricalIndex('A B C'.split()), pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.IntervalIndex.from_breaks([7, 8, 9, 10]), pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 3, 7, 12)]), ] indexes_cannot_append_with_other = [ - pd.IntervalIndex.from_breaks([0, 1, 2, 3]), pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), ] @@ -946,7 +946,7 @@ def test_append_different_columns_types(self, df_columns, series_index): def test_append_different_columns_types_raises( self, index_can_append, index_cannot_append_with_other): # GH18359 - # Dataframe.append will raise if IntervalIndex/MultiIndex appends + # Dataframe.append will raise if MultiIndex appends # or is appended to a different index type # # See also test 'test_append_different_columns_types' above for @@ -955,16 +955,10 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) - 
msg = (r"unorderable types: (Interval|int)\(\) (<|>) " - r"(int|long|float|str|Timestamp)\(\)|" - r"Expected tuple, got (int|long|float|str)|" - r"Cannot compare type 'Timestamp' with type '(int|long)'|" - r"'(<|>)' not supported between instances of 'int' " - r"and '(str|Timestamp)'|" - r"the other index needs to be an IntervalIndex too, but was" - r" type {}|" - r"object of type '(int|float|Timestamp)' has no len\(\)|" - "Expected tuple, got str") + msg = (r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|") with pytest.raises(TypeError, match=msg): df.append(ser) From cf433b313e70f0a28e5722cfc456ea7775ba05d6 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 28 Jun 2019 14:09:17 -0500 Subject: [PATCH 2/9] add whatsnew --- doc/source/whatsnew/v0.25.0.rst | 127 ++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fd0257d93f45..41771a5f31f9e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -479,6 +479,133 @@ This change is backward compatible for direct usage of Pandas, but if you subcla Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). +.. _whatsnew_0250.api_breaking.interval_indexing: + + +Indexing an ``IntervalIndex`` with ``Interval`` objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Indexing methods for :class:`IntervalIndex` have been modified to return exact matches only for :class:`Interval` queries. +``IntervalIndex`` methods previously matched on any overlapping ``Interval``. Behavior with scalar points, e.g. querying +with an integer, is unchanged (:issue:`16316`). + +.. 
ipython:: python + + ii = pd.IntervalIndex.from_tuples([(0, 4), (1, 5), (5, 8)]) + ii + +The ``in`` operator (``__contains__``) now only returns ``True`` for exact matches to ``Intervals`` in the ``IntervalIndex``, whereas +this would previously return ``True`` for any ``Interval`` overlapping an ``Interval`` in the ``IntervalIndex``: + +*Previous behavior*: + +.. code-block:: python + + In [4]: pd.Interval(1, 2, closed='neither') in ii + Out[4]: True + + In [5]: pd.Interval(-10, 10, closed='both') in ii + Out[5]: True + +*New behavior*: + +.. ipython:: python + + pd.Interval(1, 2, closed='neither') in ii + pd.Interval(-10, 10, closed='both') in ii + +The ``get_loc`` method now only returns locations for exact matches to ``Interval`` queries, as opposed to the previous behavior of +returning locations for overlapping matches. A ``KeyError`` will be raised if an exact match is not found. + +*Previous behavior*: + +.. code-block:: python + + In [6]: ii.get_loc(pd.Interval(1, 5)) + Out[6]: array([0, 1]) + + In [7]: ii.get_loc(pd.Interval(2, 6)) + Out[7]: array([0, 1, 2]) + +*New behavior*: + +.. code-block:: python + + In [6]: ii.get_loc(pd.Interval(1, 5)) + Out[6]: 1 + + In [7]: ii.get_loc(pd.Interval(2, 6)) + --------------------------------------------------------------------------- + KeyError: Interval(2, 6, closed='right') + +Likewise, ``get_indexer`` and ``get_indexer_non_unique`` will also only return locations for exact matches to ``Interval`` queries, with +``-1`` denoting that an exact match was not found. + +These indexing changes extend to querying a :class:`Series` or :class:`DataFrame` with an ``IntervalIndex`` index. + +.. ipython:: python + + s = pd.Series(list('abc'), index=ii) + s + +Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ``loc`` now only returns exact matches for ``Interval`` queries. + +*Previous behavior*: + +.. 
code-block:: python + + In [8]: s[pd.Interval(1, 5)] + Out[8]: + (0, 4] a + (1, 5] b + dtype: object + + In [9]: s.loc[pd.Interval(1, 5)] + Out[9]: + (0, 4] a + (1, 5] b + dtype: object + +*New behavior*: + +.. ipython:: python + + s[pd.Interval(1, 5)] + s.loc[pd.Interval(1, 5)] + +Similarly, non-exact matches will now raise a ``KeyError``. + +*Previous behavior*: + +.. code-block:: python + + In [9]: s[pd.Interval(2, 6)] + Out[9]: + (0, 4] a + (1, 5] b + (5, 8] c + dtype: object + + In [10]: s.loc[pd.Interval(2, 6)] + Out[10]: + (0, 4] a + (1, 5] b + (5, 8] c + dtype: object + +*New behavior*: + +.. code-block:: python + + In [6]: s[pd.Interval(2, 6)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 6, closed='right') + + In [7]: s.loc[pd.Interval(2, 6)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 6, closed='right') + + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies From 55cf803ec6342537e3665ec5f490937ba79ecacb Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 28 Jun 2019 17:08:01 -0500 Subject: [PATCH 3/9] additional fixed issues --- ci/code_checks.sh | 4 ++-- doc/source/whatsnew/v0.25.0.rst | 5 +++-- pandas/core/indexes/interval.py | 15 ++++++++------- pandas/tests/arrays/categorical/test_operators.py | 14 ++++++++++++++ pandas/tests/frame/test_missing.py | 10 ++++++++++ pandas/tests/indexes/interval/test_interval.py | 13 +++++++++++++ pandas/tests/indexes/test_category.py | 14 ++++++++++++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ac86815569a0c..00c430064e4a5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -245,10 +245,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests interval classes' ; echo $MSG - pytest --doctest-modules -v \ + pytest -q --doctest-modules \ 
pandas/core/indexes/interval.py \ pandas/core/arrays/interval.py \ - -k"-from_arrays -from_breaks -from_intervals -from_tuples -get_loc -set_closed -to_tuples -interval_range" + -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 41771a5f31f9e..71f81c6e34a7f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -779,7 +779,8 @@ Categorical - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) - Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) -- +- Bug in :meth:`DataFrame.dropna` when the :class:`DataFrame` has a :class:`CategoricalIndex` containing :class:`Interval` objects incorrectly raised a ``TypeError`` (:issue:`25087`) +- Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) Datetimelike ^^^^^^^^^^^^ @@ -856,7 +857,7 @@ Interval - Construction of :class:`Interval` is restricted to numeric, :class:`Timestamp` and :class:`Timedelta` endpoints (:issue:`23013`) - Fixed bug in :class:`Series`/:class:`DataFrame` not displaying ``NaN`` in :class:`IntervalIndex` with missing values (:issue:`25984`) -- +- Bug in :meth:`IntervalIndex.get_loc` where a ``KeyError`` would be incorrectly raised for a decreasing :class:`IntervalIndex` (:issue:`25860`) Indexing ^^^^^^^^ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 73923ed213e0f..5a7477264595f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -797,11 +797,8 @@ def get_loc(self, key, 
method=None): >>> index.get_loc(1) 0 - You can also supply an interval or an location for a point inside an - interval. + You can also supply a point inside an interval. - >>> index.get_loc(pd.Interval(0, 2)) - array([0, 1], dtype=int64) >>> index.get_loc(1.5) 1 @@ -809,9 +806,13 @@ def get_loc(self, key, method=None): relevant intervals. >>> i3 = pd.Interval(0, 2) - >>> overlapping_index = pd.IntervalIndex([i2, i3]) - >>> overlapping_index.get_loc(1.5) - array([0, 1], dtype=int64) + >>> overlapping_index = pd.IntervalIndex([i1, i2, i3]) + >>> overlapping_index.get_loc(0.5) + array([ True, False, True]) + + Only exact matches will be returned if an interval is provided. + >>> index.get_loc(pd.Interval(0, 1)) + 0 """ self._check_method(method) if is_list_like(key): diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index a443408bf9479..cc8d9890ff6eb 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -392,3 +392,17 @@ def test_contains(self): c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) assert np.nan in c + + @pytest.mark.parametrize('item, expected', [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ('a', False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + (object(), False)], ids=str) + def test_contains_interval(self, item, expected): + # GH 23705 + cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) + result = item in cat + assert result is expected diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index c72951ac4cdfa..807931567847f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -201,6 +201,16 @@ def test_dropna_tz_aware_datetime(self): index=[0, 3]) assert_frame_equal(result, expected) + def test_dropna_categorical_interval_index(self): + # GH 25087 + ii = 
pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) + ci = pd.CategoricalIndex(ii) + df = pd.DataFrame({'A': list('abc')}, index=ci) + + expected = df + result = df.dropna() + tm.assert_frame_equal(result, expected) + def test_fillna_datetime(self, datetime_frame): tf = datetime_frame tf.loc[tf.index[:5], 'A'] = np.nan diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 63b04cfacf3ea..3431dc49bad4f 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -479,6 +479,19 @@ def test_get_loc_datetimelike_overlapping(self, arrays): expected = 0 assert result == expected + @pytest.mark.parametrize('values', [ + date_range('2018-01-04', periods=4, freq='-1D'), + date_range('2018-01-04', periods=4, freq='-1D', tz='US/Eastern'), + timedelta_range('3 days', periods=4, freq='-1D'), + np.arange(3.0, -1.0, -1.0), + np.arange(3, -1, -1)], ids=lambda x: str(x.dtype)) + def test_get_loc_decreasing(self, values): + # GH 25860 + index = IntervalIndex.from_arrays(values[1:], values[:-1]) + result = index.get_loc(index[0]) + expected = 0 + assert result == expected + @pytest.mark.parametrize('item', [[3], np.arange(0.5, 5, 0.5)]) def test_get_indexer_length_one(self, item, closed): # GH 17284 diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d89d282fb785b..581c6874248e7 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -250,6 +250,20 @@ def test_contains(self): list('aabbca') + [np.nan], categories=list('cabdef')) assert np.nan in ci + @pytest.mark.parametrize('item, expected', [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ('a', False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + (object(), False)], ids=str) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = 
CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + def test_map(self): ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), ordered=True) From 667acfc4ce4c7bb28794c95c5d20b645c9ee9f9d Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 28 Jun 2019 17:30:27 -0500 Subject: [PATCH 4/9] fix failed checks --- doc/source/whatsnew/v0.25.0.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 71f81c6e34a7f..97f643ea696e4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -495,7 +495,7 @@ with an integer, is unchanged (:issue:`16316`). ii The ``in`` operator (``__contains__``) now only returns ``True`` for exact matches to ``Intervals`` in the ``IntervalIndex``, whereas -this would previously return ``True`` for any ``Interval`` overlapping an ``Interval`` in the ``IntervalIndex``: +this would previously return ``True`` for any ``Interval`` overlapping an ``Interval`` in the ``IntervalIndex``. *Previous behavior*: @@ -533,7 +533,7 @@ returning locations for overlapping matches. A ``KeyError`` will be raised if a In [6]: ii.get_loc(pd.Interval(1, 5)) Out[6]: 1 - + In [7]: ii.get_loc(pd.Interval(2, 6)) --------------------------------------------------------------------------- KeyError: Interval(2, 6, closed='right') @@ -555,13 +555,13 @@ Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ` .. code-block:: python In [8]: s[pd.Interval(1, 5)] - Out[8]: + Out[8]: (0, 4] a (1, 5] b dtype: object In [9]: s.loc[pd.Interval(1, 5)] - Out[9]: + Out[9]: (0, 4] a (1, 5] b dtype: object @@ -579,15 +579,15 @@ Similarly, non-exact matches will now raise a ``KeyError``. .. 
code-block:: python - In [9]: s[pd.Interval(2, 6)] - Out[9]: + In [9]: s[pd.Interval(2, 6)] + Out[9]: (0, 4] a (1, 5] b (5, 8] c dtype: object - In [10]: s.loc[pd.Interval(2, 6)] - Out[10]: + In [10]: s.loc[pd.Interval(2, 6)] + Out[10]: (0, 4] a (1, 5] b (5, 8] c @@ -600,7 +600,7 @@ Similarly, non-exact matches will now raise a ``KeyError``. In [6]: s[pd.Interval(2, 6)] --------------------------------------------------------------------------- KeyError: Interval(2, 6, closed='right') - + In [7]: s.loc[pd.Interval(2, 6)] --------------------------------------------------------------------------- KeyError: Interval(2, 6, closed='right') From 50c257c6addd7937448be7a59392d5a6be82d641 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 29 Jun 2019 10:39:50 -0500 Subject: [PATCH 5/9] review edits --- doc/source/whatsnew/v0.25.0.rst | 40 +++++------ pandas/core/indexes/base.py | 6 +- pandas/core/indexes/interval.py | 70 ++++++++++++++----- .../arrays/categorical/test_operators.py | 3 +- .../indexes/interval/test_interval_new.py | 6 +- pandas/tests/indexes/test_category.py | 3 +- pandas/tests/indexing/test_indexing.py | 17 +++-- 7 files changed, 92 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 97f643ea696e4..afa90490b1609 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -511,10 +511,10 @@ this would previously return ``True`` for any ``Interval`` overlapping an ``Inte .. 
ipython:: python - pd.Interval(1, 2, closed='neither') in ii - pd.Interval(-10, 10, closed='both') in ii + pd.Interval(1, 2, closed='neither') in ii + pd.Interval(-10, 10, closed='both') in ii -The ``get_loc`` method now only returns locations for exact matches to ``Interval`` queries, as opposed to the previous behavior of +The :meth:`~IntervalIndex.get_loc` method now only returns locations for exact matches to ``Interval`` queries, as opposed to the previous behavior of returning locations for overlapping matches. A ``KeyError`` will be raised if an exact match is not found. *Previous behavior*: @@ -529,17 +529,14 @@ returning locations for overlapping matches. A ``KeyError`` will be raised if a *New behavior*: -.. code-block:: python - - In [6]: ii.get_loc(pd.Interval(1, 5)) - Out[6]: 1 +.. ipython:: python + :okexcept: - In [7]: ii.get_loc(pd.Interval(2, 6)) - --------------------------------------------------------------------------- - KeyError: Interval(2, 6, closed='right') + ii.get_loc(pd.Interval(1, 5)) + ii.get_loc(pd.Interval(2, 6)) -Likewise, ``get_indexer`` and ``get_indexer_non_unique`` will also only return locations for exact matches to ``Interval`` queries, with -``-1`` denoting that an exact match was not found. +Likewise, :meth:`~IntervalIndex.get_indexer` and :meth:`~IntervalIndex.get_indexer_non_unique` will also only return locations for exact matches +to ``Interval`` queries, with ``-1`` denoting that an exact match was not found. These indexing changes extend to querying a :class:`Series` or :class:`DataFrame` with an ``IntervalIndex`` index. @@ -570,8 +567,8 @@ Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ` .. ipython:: python - s[pd.Interval(1, 5)] - s.loc[pd.Interval(1, 5)] + s[pd.Interval(1, 5)] + s.loc[pd.Interval(1, 5)] Similarly, non-exact matches will now raise a ``KeyError``. @@ -595,15 +592,11 @@ Similarly, non-exact matches will now raise a ``KeyError``. *New behavior*: -.. 
code-block:: python - - In [6]: s[pd.Interval(2, 6)] - --------------------------------------------------------------------------- - KeyError: Interval(2, 6, closed='right') +.. ipython:: python + :okexcept: - In [7]: s.loc[pd.Interval(2, 6)] - --------------------------------------------------------------------------- - KeyError: Interval(2, 6, closed='right') + s[pd.Interval(2, 6)] + s.loc[pd.Interval(2, 6)] .. _whatsnew_0250.api_breaking.deps: @@ -780,7 +773,6 @@ Categorical - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) - Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) - Bug in :meth:`DataFrame.dropna` when the :class:`DataFrame` has a :class:`CategoricalIndex` containing :class:`Interval` objects incorrectly raised a ``TypeError`` (:issue:`25087`) -- Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) Datetimelike ^^^^^^^^^^^^ @@ -870,7 +862,7 @@ Indexing - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - +- Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains__``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc9359f4afa1d..0a22b6bcc0846 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4469,8 +4469,7 @@ def argsort(self, *args, **kwargs): result = np.array(self) return result.argsort(*args, **kwargs) - def get_value(self, series, key): - """ + _index_shared_docs['get_value'] = """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. @@ -4480,6 +4479,9 @@ def get_value(self, series, key): A value in the Series with the index of the key value in self. """ + @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + def get_value(self, series, key): + # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. 
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 5a7477264595f..239c27164c5f4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,6 +1,7 @@ """ define the IntervalIndex """ from operator import le, lt import textwrap +from typing import Any, Optional, Tuple, Union import warnings import numpy as np @@ -18,8 +19,10 @@ ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna +from pandas._typing import AnyArrayLike from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase @@ -623,7 +626,7 @@ def _maybe_cast_indexed(self, key): return key - def _can_reindex(self, indexer): + def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. @@ -776,7 +779,10 @@ def _find_non_overlapping_monotonic_bounds(self, key): stop = self._searchsorted_monotonic(key, 'right') return start, stop - def get_loc(self, key, method=None): + def get_loc(self, + key: Any, + method: Optional[str] = None + ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. 
@@ -815,6 +821,9 @@ def get_loc(self, key, method=None): 0 """ self._check_method(method) + + # list-like are invalid labels for II but in some cases may work, e.g + # single element array of comparable type, so guard against them early if is_list_like(key): raise KeyError(key) @@ -829,6 +838,7 @@ def get_loc(self, key, method=None): try: mask = op_left(self.left, key) & op_right(key, self.right) except TypeError: + # scalar is not comparable to II subtype --> invalid label raise KeyError(key) matches = mask.sum() @@ -847,7 +857,12 @@ def get_loc(self, key, method=None): None is specified as these are not yet implemented. """)})) @Appender(_index_shared_docs['get_indexer']) - def get_indexer(self, target, method=None, limit=None, tolerance=None): + def get_indexer(self, + target: AnyArrayLike, + method: Optional[str] = None, + limit: Optional[int] = None, + tolerance: Optional[Any] = None + ) -> np.ndarray: self._check_method(method) @@ -862,23 +877,29 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target = Index(target, dtype=object) if isinstance(target, IntervalIndex): + # equal indexes -> 1:1 positional match if self.equals(target): return np.arange(len(self), dtype='intp') - elif self.closed != target.closed: + + # different closed or incompatible subtype -> no matches + common_subtype = find_common_type([ + self.dtype.subtype, target.dtype.subtype]) + if self.closed != target.closed or is_object_dtype(common_subtype): return np.repeat(np.intp(-1), len(target)) + # non-overlapping -> at most one match per interval in target + # want exact matches -> need both left/right to match, so defer to + # left/right get_indexer, compare elementwise, equality -> match left_indexer = self.left.get_indexer(target.left) right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) elif not is_object_dtype(target): - # homogeneous scalar index + # homogeneous scalar index: use IntervalTree 
target = self._maybe_convert_i8(target) - try: - indexer = self._engine.get_indexer(target.values) - except TypeError as e: - raise ValueError(e) + indexer = self._engine.get_indexer(target.values) else: - # heterogeneous index: defer elementwise to get_loc + # heterogeneous scalar index: defer elementwise to get_loc + # (non-overlapping so get_loc guarantees scalar or KeyError) indexer = [] for key in target: try: @@ -890,16 +911,25 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return ensure_platform_int(indexer) @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): + def get_indexer_non_unique(self, + target: AnyArrayLike + ) -> Tuple[np.ndarray, np.ndarray]: + try: target = ensure_index(target) except ValueError: target = Index(target, dtype=object) - if isinstance(target, IntervalIndex) and self.closed != target.closed: - return np.repeat(-1, len(target)), np.arange(len(target)) + # check that target IntervalIndex is compatible + if isinstance(target, IntervalIndex): + common_subtype = find_common_type([ + self.dtype.subtype, target.dtype.subtype]) + if self.closed != target.closed or is_object_dtype(common_subtype): + # different closed or incompatible subtype -> no matches + return np.repeat(-1, len(target)), np.arange(len(target)) if is_object_dtype(target) or isinstance(target, IntervalIndex): + # target might contain intervals: defer elementwise to get_loc indexer, missing = [], [] for i, key in enumerate(target): try: @@ -918,9 +948,12 @@ def get_indexer_non_unique(self, target): indexer, missing = self._engine.get_indexer_non_unique( target.values) - return ensure_index(indexer), ensure_platform_int(missing) + return ensure_platform_int(indexer), ensure_platform_int(missing) - def get_indexer_for(self, target, **kwargs): + def get_indexer_for(self, + target: AnyArrayLike, + **kwargs + ) -> np.ndarray: """ Guaranteed return of an indexer even when overlapping. 
@@ -936,7 +969,12 @@ def get_indexer_for(self, target, **kwargs): return self.get_indexer_non_unique(target, **kwargs)[0] return self.get_indexer(target, **kwargs) - def get_value(self, series, key): + @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + def get_value(self, + series: ABCSeries, + key: Any + ) -> Any: + if com.is_bool_indexer(key): loc = key elif is_list_like(key): diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index cc8d9890ff6eb..af1d3ca0f9ad4 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -399,8 +399,7 @@ def test_contains(self): (pd.Interval(0.5, 1.5), False), ('a', False), (pd.Timestamp(1), False), - (pd.Timedelta(1), False), - (object(), False)], ids=str) + (pd.Timedelta(1), False)], ids=str) def test_contains_interval(self, item, expected): # GH 23705 cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index ce510b0ea2744..51fced7fd2702 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import Int64Index, Interval, IntervalIndex +from pandas import Interval, IntervalIndex from pandas.core.indexes.base import InvalidIndexError import pandas.util.testing as tm @@ -233,10 +233,10 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): index = IntervalIndex.from_tuples(tuples, closed='left') result_indexer, result_missing = index.get_indexer_non_unique(query) - expected_indexer = Int64Index(expected[0]) + expected_indexer = np.array(expected[0], dtype='intp') expected_missing = np.array(expected[1], dtype='intp') - tm.assert_index_equal(result_indexer, expected_indexer) + tm.assert_numpy_array_equal(result_indexer, 
expected_indexer) tm.assert_numpy_array_equal(result_missing, expected_missing) # TODO we may also want to test get_indexer for the case when diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 581c6874248e7..26a4463d421a4 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -256,8 +256,7 @@ def test_contains(self): (pd.Interval(0.5, 1.5), False), ('a', False), (pd.Timestamp(1), False), - (pd.Timedelta(1), False), - (object(), False)], ids=str) + (pd.Timedelta(1), False)], ids=str) def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 1bd02bee8f5dc..92966e721aedc 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -81,16 +81,20 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): " ambiguous|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" + "No matching signature found|" # TypeError + "unhashable type: 'numpy.ndarray'" # TypeError ) - if (isinstance(obj, Series) and idxr_id == 'getitem' - and index.inferred_type in [ + if (isinstance(obj, Series) and idxr_id == 'getitem' and + index.inferred_type in [ 'string', 'datetime64', 'period', 'timedelta64', 'boolean', 'categorical']): idxr[nd3] else: - if (isinstance(obj, DataFrame) and idxr_id == 'getitem' - and index.inferred_type == 'boolean'): + if (isinstance(obj, DataFrame) and idxr_id == 'getitem' and + index.inferred_type == 'boolean'): + error = TypeError + elif idxr_id == 'getitem' and index.inferred_type == 'interval': error = TypeError else: error = ValueError @@ -119,9 +123,14 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = (r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Only 1-dimensional input arrays are supported|" "'pandas._libs.interval.IntervalTree' object has no attribute" " 'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError + "No matching signature found|" # TypeError + r"^\[\[\[" # pandas.core.indexing.IndexingError ) if ((idxr_id == 'iloc') From e713a7e5029369352a128841a9cfa5ddd7b4a4c9 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 30 Jun 2019 09:49:55 -0500 Subject: [PATCH 6/9] review edits 2 --- doc/source/user_guide/advanced.rst | 22 ++++++++++ doc/source/whatsnew/v0.25.0.rst | 43 +++++++++++++------ pandas/core/indexes/interval.py | 4 +- .../indexing/interval/test_interval_new.py | 9 ++-- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 280eb05964787..15fc251024b79 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -965,6 +965,28 @@ If you select a label *contained* within an interval, this will also 
select the df.loc[2.5] df.loc[[2.5, 3.5]] +Selecting using an ``Interval`` will only return exact matches. + +.. ipython:: python + + df.loc[pd.Interval(1, 2)] + +Trying to select an ``Interval`` that is not exactly contained in the ``IntervalIndex`` will raise a ``KeyError``. + +.. code-block:: python + + In [7]: df.loc[pd.Interval(0.5, 2.5)] + --------------------------------------------------------------------------- + KeyError: Interval(0.5, 2.5, closed='right') + +Selecting all ``Intervals`` that overlap a given ``Interval`` can be performed using the +:meth:`~IntervalIndex.overlaps` method to create a boolean indexer. + +.. ipython:: python + + idxr = df.index.overlaps(pd.Interval(0.5, 2.5)) + df[idxr] + ``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``: .. ipython:: python diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index afa90490b1609..578b613977a14 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -485,7 +485,7 @@ you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). Indexing an ``IntervalIndex`` with ``Interval`` objects ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Indexing methods for :class:`IntervalIndex` have been modified to return exact matches only for :class:`Interval` queries. +Indexing methods for :class:`IntervalIndex` have been modified to require exact matches only for :class:`Interval` queries. ``IntervalIndex`` methods previously matched on any overlapping ``Interval``. Behavior with scalar points, e.g. querying with an integer, is unchanged (:issue:`16316`). @@ -529,11 +529,14 @@ returning locations for overlapping matches. A ``KeyError`` will be raised if a *New behavior*: -.. ipython:: python - :okexcept: +.. 
code-block:: python - ii.get_loc(pd.Interval(1, 5)) - ii.get_loc(pd.Interval(2, 6)) + In [6]: ii.get_loc(pd.Interval(1, 5)) + Out[6]: 1 + + In [7]: ii.get_loc(pd.Interval(2, 6)) + --------------------------------------------------------------------------- + KeyError: Interval(2, 6, closed='right') Likewise, :meth:`~IntervalIndex.get_indexer` and :meth:`~IntervalIndex.get_indexer_non_unique` will also only return locations for exact matches to ``Interval`` queries, with ``-1`` denoting that an exact match was not found. @@ -570,34 +573,46 @@ Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ` s[pd.Interval(1, 5)] s.loc[pd.Interval(1, 5)] -Similarly, non-exact matches will now raise a ``KeyError``. +Similarly, a ``KeyError`` will be raised for non-exact matches instead of returning overlapping matches. *Previous behavior*: .. code-block:: python - In [9]: s[pd.Interval(2, 6)] + In [9]: s[pd.Interval(2, 3)] Out[9]: (0, 4] a (1, 5] b - (5, 8] c dtype: object - In [10]: s.loc[pd.Interval(2, 6)] + In [10]: s.loc[pd.Interval(2, 3)] Out[10]: (0, 4] a (1, 5] b - (5, 8] c dtype: object *New behavior*: -.. ipython:: python - :okexcept: +.. code-block:: python - s[pd.Interval(2, 6)] - s.loc[pd.Interval(2, 6)] + In [6]: s[pd.Interval(2, 3)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 3, closed='right') + + In [7]: s.loc[pd.Interval(2, 3)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 3, closed='right') + +The :meth:`~IntervalIndex.overlaps` method can be used to create a boolean indexer that replicates the +previous behavior of returning overlapping matches. + +*New behavior*: + +.. ipython:: python + idxr = s.index.overlaps(pd.Interval(2, 3)) + s[idxr] + s.loc[idxr] .. 
_whatsnew_0250.api_breaking.deps: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 239c27164c5f4..220b661601c2b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -736,7 +736,8 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): 'increasing or decreasing') if isinstance(label, IntervalMixin): - raise NotImplementedError + msg = 'Interval objects are not currently supported' + raise NotImplementedError(msg) # GH 20921: "not is_monotonic_increasing" for the second condition # instead of "is_monotonic_decreasing" to account for single element @@ -817,6 +818,7 @@ def get_loc(self, array([ True, False, True]) Only exact matches will be returned if an interval is provided. + >>> index.get_loc(pd.Interval(0, 1)) 0 """ diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 7ce150704fbc4..aa016ac5dd1a7 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -110,16 +110,17 @@ def test_loc_with_slices(self): result = s[Interval(3, 4):] tm.assert_series_equal(expected, result) - with pytest.raises(NotImplementedError): + msg = 'Interval objects are not currently supported' + with pytest.raises(NotImplementedError, match=msg): s.loc[Interval(3, 6):] - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 6):] - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): s.loc[Interval(3, 4, closed='left'):] - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 4, closed='left'):] # TODO with non-existing intervals ? 
From 7329730ed3285c292369ea08953c732ef0c4ce65 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 30 Jun 2019 10:33:51 -0500 Subject: [PATCH 7/9] review edits 3 --- doc/source/user_guide/advanced.rst | 9 ++++++--- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/interval.py | 5 ----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 15fc251024b79..af5b01cfe026e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -938,9 +938,8 @@ for interval notation. The ``IntervalIndex`` allows some unique indexing and is also used as a return type for the categories in :func:`cut` and :func:`qcut`. -.. warning:: - - These indexing behaviors are provisional and may change in a future version of pandas. +Indexing with an ``IntervalIndex`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index. @@ -985,8 +984,12 @@ Selecting all ``Intervals`` that overlap a given ``Interval`` can be performed u .. ipython:: python idxr = df.index.overlaps(pd.Interval(0.5, 2.5)) + idxr df[idxr] +Binning data with ``cut`` and ``qcut`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``: .. ipython:: python diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 578b613977a14..348cb92f4ec31 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -611,6 +611,7 @@ previous behavior of returning overlapping matches. .. ipython:: python idxr = s.index.overlaps(pd.Interval(2, 3)) + idxr s[idxr] s.loc[idxr] diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index aaa4124182598..ff4859a41c57f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -41,11 +41,6 @@ .. versionadded:: %(versionadded)s -.. 
warning:: - - The indexing behaviors are provisional and may change in - a future version of pandas. - Parameters ---------- data : array-like (1-dimensional) From 9c2f3a9cd49a64194a1dccc9d460cb148eea6e5d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Jul 2019 10:17:56 -0400 Subject: [PATCH 8/9] remove try/except --- pandas/core/indexes/interval.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b4c0dcec24823..83bc5963f4f9e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -852,10 +852,7 @@ def get_indexer(self, 'IntervalIndex.get_indexer_non_unique') raise InvalidIndexError(msg) - try: - target = ensure_index(target) - except ValueError: - target = Index(target, dtype=object) + target = ensure_index(target) if isinstance(target, IntervalIndex): # equal indexes -> 1:1 positional match @@ -895,11 +892,7 @@ def get_indexer(self, def get_indexer_non_unique(self, target: AnyArrayLike ) -> Tuple[np.ndarray, np.ndarray]: - - try: - target = ensure_index(target) - except ValueError: - target = Index(target, dtype=object) + target = ensure_index(target) # check that target IntervalIndex is compatible if isinstance(target, IntervalIndex): From 6226bdd45d6b0c0ba50b454d7a0993ad29e2e561 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Jul 2019 10:21:23 -0400 Subject: [PATCH 9/9] add note on version --- doc/source/user_guide/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 7ac9c0eeeb9cb..a42ab4f0255bd 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -964,7 +964,7 @@ If you select a label *contained* within an interval, this will also select the df.loc[2.5] df.loc[[2.5, 3.5]] -Selecting using an ``Interval`` will only return exact matches. 
+Selecting using an ``Interval`` will only return exact matches (starting from pandas 0.25.0). .. ipython:: python