Skip to content

API: Implement new indexing behavior for intervals #27100

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jul 2, 2019
14 changes: 4 additions & 10 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3239,8 +3239,9 @@ def reindex(self, target, method=None, level=None, limit=None,
if self.equals(target):
indexer = None
else:

if self.is_unique:
# check is_overlapping for IntervalIndex compat
if (self.is_unique and
not getattr(self, 'is_overlapping', False)):
indexer = self.get_indexer(target, method=method,
limit=limit,
tolerance=tolerance)
Expand Down Expand Up @@ -4902,13 +4903,6 @@ def _searchsorted_monotonic(self, label, side='left'):

raise ValueError('index must be monotonic increasing or decreasing')

def _get_loc_only_exact_matches(self, key):
"""
This is overridden on subclasses (namely, IntervalIndex) to control
get_slice_bound.
"""
return self.get_loc(key)

def get_slice_bound(self, label, side, kind):
"""
Calculate slice bound that corresponds to given label.
Expand Down Expand Up @@ -4942,7 +4936,7 @@ def get_slice_bound(self, label, side, kind):

# we need to look up the label
try:
slc = self._get_loc_only_exact_matches(label)
slc = self.get_loc(label)
except KeyError as err:
try:
return self._searchsorted_monotonic(label, side)
Expand Down
276 changes: 126 additions & 150 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
""" define the IntervalIndex """
from operator import le, lt
import textwrap
import warnings

import numpy as np

from pandas._config import get_option

from pandas._libs import Timedelta, Timestamp
from pandas._libs import Timedelta, Timestamp, lib
from pandas._libs.interval import Interval, IntervalMixin, IntervalTree
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._exceptions import rewrite_exception
Expand All @@ -23,7 +24,7 @@
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
Index, _index_shared_docs, default_pprint, ensure_index)
Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index)
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.multi import MultiIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
Expand Down Expand Up @@ -622,6 +623,23 @@ def _maybe_cast_indexed(self, key):

return key

def _can_reindex(self, indexer):
"""
Check if we are allowing reindexing with this particular indexer.

Parameters
----------
indexer : an integer indexer

Raises
------
ValueError if its a duplicate axis
"""

# trying to reindex on an axis with duplicates
if self.is_overlapping and len(indexer):
raise ValueError("cannot reindex from an overlapping axis")

def _needs_i8_conversion(self, key):
"""
Check if a given key needs i8 conversion. Conversion is necessary for
Expand Down Expand Up @@ -732,18 +750,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False):

return sub_idx._searchsorted_monotonic(label, side)

def _get_loc_only_exact_matches(self, key):
if isinstance(key, Interval):

if not self.is_unique:
raise ValueError("cannot index with a slice Interval"
" and a non-unique index")

# TODO: this expands to a tuple index, see if we can
# do better
return Index(self._multiindex.values).get_loc(key)
raise KeyError

def _find_non_overlapping_monotonic_bounds(self, key):
if isinstance(key, IntervalMixin):
start = self._searchsorted_monotonic(
Expand Down Expand Up @@ -808,58 +814,28 @@ def get_loc(self, key, method=None):
array([0, 1], dtype=int64)
"""
self._check_method(method)
if is_list_like(key):
raise KeyError(key)

original_key = key
key = self._maybe_cast_indexed(key)

if self.is_non_overlapping_monotonic:
if isinstance(key, Interval):
left = self._maybe_cast_slice_bound(key.left, 'left', None)
right = self._maybe_cast_slice_bound(key.right, 'right', None)
key = Interval(left, right, key.closed)
else:
key = self._maybe_cast_slice_bound(key, 'left', None)

start, stop = self._find_non_overlapping_monotonic_bounds(key)

if start is None or stop is None:
return slice(start, stop)
elif start + 1 == stop:
return start
elif start < stop:
return slice(start, stop)
else:
raise KeyError(original_key)

if isinstance(key, Interval):
if self.closed != key.closed:
raise KeyError(key)
mask = (self.left == key.left) & (self.right == key.right)
else:
# use the interval tree
key = self._maybe_convert_i8(key)
if isinstance(key, Interval):
left, right = _get_interval_closed_bounds(key)
return self._engine.get_loc_interval(left, right)
else:
return self._engine.get_loc(key)

def get_value(self, series, key):
if com.is_bool_indexer(key):
loc = key
elif is_list_like(key):
loc = self.get_indexer(key)
elif isinstance(key, slice):

if not (key.step is None or key.step == 1):
raise ValueError("cannot support not-default step in a slice")

# assume scalar
op_left = le if self.closed_left else lt
op_right = le if self.closed_right else lt
try:
loc = self.get_loc(key)
mask = op_left(self.left, key) & op_right(key, self.right)
except TypeError:
# we didn't find exact intervals or are non-unique
msg = "unable to slice with this key: {key}".format(key=key)
raise ValueError(msg)
raise KeyError(key)

else:
loc = self.get_loc(key)
return series.iloc[loc]
matches = mask.sum()
if matches == 0:
raise KeyError(key)
elif matches == 1:
return mask.argmax()
return lib.maybe_booleans_to_slice(mask.view('u1'))

@Substitution(**dict(_index_doc_kwargs,
**{'raises_section': textwrap.dedent("""
Expand All @@ -873,109 +849,109 @@ def get_value(self, series, key):
def get_indexer(self, target, method=None, limit=None, tolerance=None):

self._check_method(method)
target = ensure_index(target)
target = self._maybe_cast_indexed(target)

if self.equals(target):
return np.arange(len(self), dtype='intp')

if self.is_non_overlapping_monotonic:
start, stop = self._find_non_overlapping_monotonic_bounds(target)

start_plus_one = start + 1
if not ((start_plus_one < stop).any()):
return np.where(start_plus_one == stop, start, -1)
if self.is_overlapping:
msg = ('cannot handle overlapping indices; use '
'IntervalIndex.get_indexer_non_unique')
raise InvalidIndexError(msg)

if not self.is_unique:
raise ValueError("cannot handle non-unique indices")
try:
target = ensure_index(target)
except ValueError:
target = Index(target, dtype=object)

# IntervalIndex
if isinstance(target, IntervalIndex):
indexer = self._get_reindexer(target)

# non IntervalIndex
if self.equals(target):
return np.arange(len(self), dtype='intp')
elif self.closed != target.closed:
return np.repeat(np.intp(-1), len(target))

left_indexer = self.left.get_indexer(target.left)
right_indexer = self.right.get_indexer(target.right)
indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
elif not is_object_dtype(target):
# homogeneous scalar index
target = self._maybe_convert_i8(target)
try:
indexer = self._engine.get_indexer(target.values)
except TypeError as e:
raise ValueError(e)
else:
indexer = np.concatenate([self.get_loc(i) for i in target])
# heterogeneous index: defer elementwise to get_loc
indexer = []
for key in target:
try:
loc = self.get_loc(key)
except KeyError:
loc = -1
indexer.append(loc)

return ensure_platform_int(indexer)

def _get_reindexer(self, target):
"""
Return an indexer for a target IntervalIndex with self
"""

# find the left and right indexers
left = self._maybe_convert_i8(target.left)
right = self._maybe_convert_i8(target.right)
lindexer = self._engine.get_indexer(left.values)
rindexer = self._engine.get_indexer(right.values)

# we want to return an indexer on the intervals
# however, our keys could provide overlapping of multiple
# intervals, so we iterate thru the indexers and construct
# a set of indexers

indexer = []
n = len(self)

for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)):

target_value = target[i]

# matching on the lhs bound
if (lhs != -1 and
self.closed == 'right' and
target_value.left == self[lhs].right):
lhs += 1

# matching on the rhs bound
if (rhs != -1 and
self.closed == 'left' and
target_value.right == self[rhs].left):
rhs -= 1

# not found
if lhs == -1 and rhs == -1:
indexer.append(np.array([-1]))

elif rhs == -1:

indexer.append(np.arange(lhs, n))

elif lhs == -1:

# care about left/right closed here
value = self[i]
@Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
try:
target = ensure_index(target)
except ValueError:
target = Index(target, dtype=object)

if isinstance(target, IntervalIndex) and self.closed != target.closed:
return np.repeat(-1, len(target)), np.arange(len(target))

if is_object_dtype(target) or isinstance(target, IntervalIndex):
indexer, missing = [], []
for i, key in enumerate(target):
try:
locs = self.get_loc(key)
if isinstance(locs, slice):
locs = np.arange(
locs.start, locs.stop, locs.step, dtype='intp')
locs = np.array(locs, ndmin=1)
except KeyError:
missing.append(i)
locs = np.array([-1])
indexer.append(locs)
indexer = np.concatenate(indexer)
else:
target = self._maybe_convert_i8(target)
indexer, missing = self._engine.get_indexer_non_unique(
target.values)

# target.closed same as self.closed
if self.closed == target.closed:
if target_value.left < value.left:
indexer.append(np.array([-1]))
continue
return ensure_index(indexer), ensure_platform_int(missing)

# target.closed == 'left'
elif self.closed == 'right':
if target_value.left <= value.left:
indexer.append(np.array([-1]))
continue
def get_indexer_for(self, target, **kwargs):
"""
Guaranteed return of an indexer even when overlapping.

# target.closed == 'right'
elif self.closed == 'left':
if target_value.left <= value.left:
indexer.append(np.array([-1]))
continue
This dispatches to get_indexer or get_indexer_non_unique
as appropriate.

indexer.append(np.arange(0, rhs + 1))
Returns
-------
numpy.ndarray
List of indices.
"""
if self.is_overlapping:
return self.get_indexer_non_unique(target, **kwargs)[0]
return self.get_indexer(target, **kwargs)

def get_value(self, series, key):
if com.is_bool_indexer(key):
loc = key
elif is_list_like(key):
if self.is_overlapping:
loc, missing = self.get_indexer_non_unique(key)
if len(missing):
raise KeyError
else:
indexer.append(np.arange(lhs, rhs + 1))

return np.concatenate(indexer)

@Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
target = self._maybe_cast_indexed(ensure_index(target))
return super().get_indexer_non_unique(target)
loc = self.get_indexer(key)
elif isinstance(key, slice):
if not (key.step is None or key.step == 1):
raise ValueError("cannot support not-default step in a slice")
loc = self._convert_slice_indexer(key, kind='getitem')
else:
loc = self.get_loc(key)
return series.iloc[loc]

@Appender(_index_shared_docs['where'])
def where(self, cond, other=None):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,7 +1074,7 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False):
raise_missing=raise_missing)
return ax[indexer], indexer

if ax.is_unique:
if ax.is_unique and not getattr(ax, 'is_overlapping', False):
# If we are trying to get actual keys from empty Series, we
# patiently wait for a KeyError later on - otherwise, convert
if len(ax) or not len(key):
Expand Down
Loading