Skip to content

ENH: Implement IntervalIndex.is_overlapping #23327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1671,6 +1671,7 @@ IntervalIndex Components
IntervalIndex.length
IntervalIndex.values
IntervalIndex.is_non_overlapping_monotonic
IntervalIndex.is_overlapping
IntervalIndex.get_loc
IntervalIndex.get_indexer
IntervalIndex.set_closed
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ Other Enhancements
- :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object.
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained an ``axis`` parameter (:issue:`8839`)
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)

.. _whatsnew_0240.api_breaking:

Expand Down
22 changes: 21 additions & 1 deletion pandas/_libs/intervaltree.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cdef class IntervalTree(IntervalMixin):
cdef:
readonly object left, right, root, dtype
readonly str closed
object _left_sorter, _right_sorter
object _is_overlapping, _left_sorter, _right_sorter

def __init__(self, left, right, closed='right', leaf_size=100):
"""
Expand Down Expand Up @@ -81,6 +81,26 @@ cdef class IntervalTree(IntervalMixin):
self._right_sorter = np.argsort(self.right)
return self._right_sorter

@property
def is_overlapping(self):
    """
    Report whether any intervals stored in the IntervalTree overlap.

    The answer is computed on first access and memoized on
    ``self._is_overlapping``; later accesses return the stored value.
    """
    cached = self._is_overlapping
    if cached is not None:
        return cached

    # when both sides are closed a shared endpoint is a shared point, so
    # the comparison has to be inclusive; otherwise it is strict
    if self.closed == 'both':
        compare = le
    else:
        compare = lt

    # visit the intervals ordered by their left (start) side: there is an
    # overlap if an interval starts before the end of the interval that
    # immediately precedes it in that ordering
    order = self.left_sorter
    starts = self.left[order[1:]]
    prior_ends = self.right[order[:-1]]
    self._is_overlapping = bool(compare(starts, prior_ends).any())

    return self._is_overlapping

def get_loc(self, scalar_t key):
"""Return all positions corresponding to intervals that overlap with
the given scalar key
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
length
values
is_non_overlapping_monotonic
%(extra_attributes)s\

Methods
-------
Expand Down Expand Up @@ -107,6 +108,7 @@
summary="Pandas array for interval data that are closed on the same side.",
versionadded="0.24.0",
name='',
extra_attributes='',
extra_methods='',
examples=textwrap.dedent("""\
Examples
Expand Down
60 changes: 60 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def _new_IntervalIndex(cls, d):
summary="Immutable index of intervals that are closed on the same side.",
name=_index_doc_kwargs['name'],
versionadded="0.20.0",
extra_attributes="is_overlapping\n",
extra_methods="contains\n",
examples=textwrap.dedent("""\
Examples
Expand Down Expand Up @@ -464,6 +465,61 @@ def is_unique(self):
def is_non_overlapping_monotonic(self):
return self._data.is_non_overlapping_monotonic

@property
def is_overlapping(self):
    """
    Indicate whether any intervals in the IntervalIndex overlap.

    Two intervals overlap when they share at least one common point;
    closed endpoints count as shared points. Intervals whose only common
    point is an open endpoint do not overlap.

    .. versionadded:: 0.24.0

    Returns
    -------
    bool
        True if the IntervalIndex has overlapping intervals, else False.

    See Also
    --------
    Interval.overlaps : Check whether two Interval objects overlap.
    IntervalIndex.overlaps : Check an IntervalIndex elementwise for
        overlaps.

    Examples
    --------
    >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)])
    >>> index
    IntervalIndex([(0, 2], (1, 3], (4, 5]],
                  closed='right',
                  dtype='interval[int64]')
    >>> index.is_overlapping
    True

    Intervals that share closed endpoints overlap:

    >>> index = pd.interval_range(0, 3, closed='both')
    >>> index
    IntervalIndex([[0, 1], [1, 2], [2, 3]],
                  closed='both',
                  dtype='interval[int64]')
    >>> index.is_overlapping
    True

    Intervals that only have an open endpoint in common do not overlap:

    >>> index = pd.interval_range(0, 3, closed='left')
    >>> index
    IntervalIndex([[0, 1), [1, 2), [2, 3)],
                  closed='left',
                  dtype='interval[int64]')
    >>> index.is_overlapping
    False
    """
    # GH 23309
    # the result is read straight from the index's engine
    engine = self._engine
    return engine.is_overlapping

@Appender(_index_shared_docs['_convert_scalar_indexer'])
def _convert_scalar_indexer(self, key, kind=None):
if kind == 'iloc':
Expand Down Expand Up @@ -570,6 +626,10 @@ def _maybe_convert_i8(self, key):
else:
# DatetimeIndex/TimedeltaIndex
key_dtype, key_i8 = key.dtype, Index(key.asi8)
if key.hasnans:
# convert NaT from its i8 value to np.nan so it's not viewed
# as a valid value, maybe causing errors (e.g. is_overlapping)
key_i8 = key_i8.where(~key._isnan)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Specifically, without this change a datetime-like IntervalIndex with closed='both' containing two or more instances of NaT would be marked as overlapping due to the NaT's.

Since NaT is converted to -9223372036854775808 during i8 conversion, the IntervalTree previously interpreted this as an Interval of length zero (same start/end), which would include the point in the closed='both' case. So, if two of these occurred they would be interpreted as overlapping at a point.

I've added a relevant _maybe_convert_i8 test for this behavior.


# ensure consistency with IntervalIndex subtype
subtype = self.dtype.subtype
Expand Down
61 changes: 61 additions & 0 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,23 @@ def test_maybe_convert_i8(self, breaks):
expected = Index(breaks.asi8)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('breaks', [
    date_range('2018-01-01', periods=5),
    timedelta_range('0 days', periods=5)])
def test_maybe_convert_i8_nat(self, breaks):
    """NaT keys are converted to NaN rather than to NaT's i8 value."""
    # GH 20636
    index = IntervalIndex.from_breaks(breaks)

    # input consisting solely of NaT
    all_nat = breaks._constructor([pd.NaT] * 3)
    all_nan = pd.Float64Index([np.nan] * 3)
    tm.assert_index_equal(index._maybe_convert_i8(all_nat), all_nan)

    # a valid value followed by NaT: the valid value keeps its i8 value,
    # represented as a float
    mixed = all_nat.insert(0, breaks[0])
    mixed_expected = all_nan.insert(0, float(breaks[0].value))
    tm.assert_index_equal(index._maybe_convert_i8(mixed), mixed_expected)

@pytest.mark.parametrize('breaks', [
np.arange(5, dtype='int64'),
np.arange(5, dtype='float64')], ids=lambda x: str(x.dtype))
Expand Down Expand Up @@ -1082,6 +1099,50 @@ def test_is_non_overlapping_monotonic(self, closed):
idx = IntervalIndex.from_breaks(range(4), closed=closed)
assert idx.is_non_overlapping_monotonic is True

@pytest.mark.parametrize('start, shift, na_value', [
    (0, 1, np.nan),
    (Timestamp('2018-01-01'), Timedelta('1 day'), pd.NaT),
    (Timedelta('0 days'), Timedelta('1 day'), pd.NaT)])
def test_is_overlapping(self, start, shift, na_value, closed):
    """Interface-level checks of IntervalIndex.is_overlapping."""
    # GH 23309
    # see test_interval_tree.py for extensive tests; interface tests here
    na_tuple = (na_value, na_value)

    def check(tuples, expected):
        # verify the plain tuples first, then the same tuples with an NA
        # interval added at each end
        for candidate in (tuples, [na_tuple] + tuples + [na_tuple]):
            index = IntervalIndex.from_tuples(candidate, closed=closed)
            assert index.is_overlapping is expected

    # non-overlapping: a gap separates consecutive intervals
    disjoint = [(start + n * shift, start + (n + 1) * shift)
                for n in (0, 2, 4)]
    check(disjoint, False)

    # overlapping: each interval extends past the start of the next one
    overlapping = [(start + n * shift, start + (n + 2) * shift)
                   for n in range(3)]
    check(overlapping, True)

    # common endpoints: only an overlap when both sides are closed
    adjacent = [(start + n * shift, start + (n + 1) * shift)
                for n in range(3)]
    check(adjacent, closed == 'both')

@pytest.mark.parametrize('tuples', [
lzip(range(10), range(1, 11)),
lzip(date_range('20170101', periods=10),
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/indexes/interval/test_interval_tree.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import division

from itertools import permutations

import numpy as np
import pytest

Expand Down Expand Up @@ -135,3 +137,36 @@ def test_get_indexer_closed(self, closed, leaf_size):

expected = found if tree.closed_right else not_found
tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5))

@pytest.mark.parametrize('left, right, expected', [
    (np.array([0, 1, 4]), np.array([2, 3, 5]), True),
    (np.array([0, 1, 2]), np.array([5, 4, 3]), True),
    (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True),
    (np.array([0, 2, 4]), np.array([1, 3, 5]), False),
    (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False)])
@pytest.mark.parametrize('order', [list(p) for p in permutations(range(3))])
def test_is_overlapping(self, closed, order, left, right, expected):
    """is_overlapping gives the expected result for every input ordering."""
    # GH 23309
    reordered_left, reordered_right = left[order], right[order]
    tree = IntervalTree(reordered_left, reordered_right, closed=closed)
    assert tree.is_overlapping is expected

@pytest.mark.parametrize('order', map(list, permutations(range(3))))
def test_is_overlapping_endpoints(self, closed, order):
    """
    Intervals that merely share endpoints overlap only when closed='both'.

    The tree is built from the intervals (0, 1), (1, 2), (2, 3), reordered
    by ``order``, for each value of ``closed``.
    """
    # GH 23309
    left, right = np.arange(3), np.arange(1, 4)
    tree = IntervalTree(left[order], right[order], closed=closed)
    result = tree.is_overlapping
    # compare by value: ``closed is 'both'`` tests object identity, which
    # only works when both strings happen to be interned and triggers a
    # SyntaxWarning on Python 3.8+
    expected = closed == 'both'
    assert result is expected

@pytest.mark.parametrize('left, right', [
    (np.array([], dtype='int64'), np.array([], dtype='int64')),
    (np.array([0], dtype='int64'), np.array([1], dtype='int64')),
    (np.array([np.nan]), np.array([np.nan])),
    (np.array([np.nan] * 3), np.array([np.nan] * 3))])
def test_is_overlapping_trivial(self, closed, left, right):
    """Empty, single-interval and all-NaN trees are never overlapping."""
    # GH 23309
    result = IntervalTree(left, right, closed=closed).is_overlapping
    assert result is False