diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 78fe2ae966896..de72f08515246 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -196,11 +196,20 @@ def setup(self, N): self.intv = IntervalIndex.from_arrays(left, right) self.intv._engine + self.left = IntervalIndex.from_breaks(np.arange(N)) + self.right = IntervalIndex.from_breaks(np.arange(N - 3, 2 * N - 3)) + def time_monotonic_inc(self, N): self.intv.is_monotonic_increasing def time_is_unique(self, N): self.intv.is_unique + def time_intersection(self, N): + self.left.intersection(self.right) + + def time_intersection_duplicate(self, N): + self.intv.intersection(self.right) + from .pandas_vb_common import setup # noqa: F401 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fb9b5ae695a0..0b88f347f3e7d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -510,6 +510,7 @@ Performance Improvements - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. 
Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) +- Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`) - Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero and float ``NaN``; by faster checking the string for the possibility of being a date (:issue:`25754`) - Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8538687ca3e91..d1636e88a7ff2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2440,9 +2440,7 @@ def _union(self, other, sort): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) - # TODO: standardize return type of non-union setops type(self vs other) - def intersection(self, other, sort=False): - """ + _index_shared_docs['intersection'] = """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. 
@@ -2476,6 +2474,10 @@ def intersection(self, other, sort=False): >>> idx1.intersection(idx2) Int64Index([3, 4], dtype='int64') """ + + # TODO: standardize return type of non-union setops type(self vs other) + @Appender(_index_shared_docs['intersection']) + def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other = ensure_index(other) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 41cf23c5542a9..7fc90c3fbfb0f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -97,6 +97,42 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) +class SetopCheck: + """ + This is called to decorate the set operations of IntervalIndex + to perform the type check in advance. + """ + def __init__(self, op_name): + self.op_name = op_name + + def __call__(self, setop): + def func(intvidx_self, other, sort=False): + intvidx_self._assert_can_do_setop(other) + other = ensure_index(other) + + if not isinstance(other, IntervalIndex): + result = getattr(intvidx_self.astype(object), + self.op_name)(other) + if self.op_name in ('difference',): + result = result.astype(intvidx_self.dtype) + return result + elif intvidx_self.closed != other.closed: + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + raise ValueError(msg) + + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + msg = ('can only do {op} between two IntervalIndex ' + 'objects that have compatible dtypes') + raise TypeError(msg.format(op=self.op_name)) + + return setop(intvidx_self, other, sort) + return func + + @Appender(_interval_shared_docs['class'] % dict( klass="IntervalIndex", summary="Immutable index of intervals that are closed on the same side.", @@ -1102,28 +1138,78 @@ def 
equals(self, other): def overlaps(self, other): return self._data.overlaps(other) - def _setop(op_name, sort=None): - def func(self, other, sort=sort): - self._assert_can_do_setop(other) - other = ensure_index(other) - if not isinstance(other, IntervalIndex): - result = getattr(self.astype(object), op_name)(other) - if op_name in ('difference',): - result = result.astype(self.dtype) - return result - elif self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - raise ValueError(msg) + @Appender(_index_shared_docs['intersection']) + @SetopCheck(op_name='intersection') + def intersection(self, other, sort=False): + if self.left.is_unique and self.right.is_unique: + taken = self._intersection_unique(other) + else: + # duplicates + taken = self._intersection_non_unique(other) - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): - msg = ('can only do {op} between two IntervalIndex ' - 'objects that have compatible dtypes') - raise TypeError(msg.format(op=op_name)) + if sort is None: + taken = taken.sort_values() + + return taken + + def _intersection_unique(self, other): + """ + Used when the IntervalIndex does not have any common endpoint, + no matter left or right. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + taken : IntervalIndex + """ + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + + return self.take(indexer) + def _intersection_non_unique(self, other): + """ + Used when the IntervalIndex does have some common endpoints, + on either side. + Return the intersection with another IntervalIndex. 
+ + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + taken : IntervalIndex + """ + mask = np.zeros(len(self), dtype=bool) + + if self.hasnans and other.hasnans: + first_nan_loc = np.arange(len(self))[self.isna()][0] + mask[first_nan_loc] = True + + lmiss = other.left.get_indexer_non_unique(self.left)[1] + lmatch = np.setdiff1d(np.arange(len(self)), lmiss) + + for i in lmatch: + potential = other.left.get_loc(self.left[i]) + if is_scalar(potential): + if self.right[i] == other.right[potential]: + mask[i] = True + elif self.right[i] in other.right[potential]: + mask[i] = True + + return self[mask] + + def _setop(op_name, sort=None): + @SetopCheck(op_name=op_name) + def func(self, other, sort=sort): result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) @@ -1148,7 +1234,6 @@ def is_all_dates(self): return False union = _setop('union') - intersection = _setop('intersection', sort=False) difference = _setop('difference') symmetric_difference = _setop('symmetric_difference') diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index f4f63aaecd336..b2f409837344a 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -795,140 +795,6 @@ def test_non_contiguous(self, closed): assert 1.5 not in index - @pytest.mark.parametrize("sort", [None, False]) - def test_union(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) - - expected = IntervalIndex.from_breaks(range(13), closed=closed) - result = index[::-1].union(other, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - result = other[::-1].union(index, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - 
tm.assert_index_equal(index.union(index, sort=sort), index) - tm.assert_index_equal(index.union(index[:1], sort=sort), index) - - # GH 19101: empty result, same dtype - index = IntervalIndex(np.array([], dtype='int64'), closed=closed) - result = index.union(index, sort=sort) - tm.assert_index_equal(result, index) - - # GH 19101: empty result, different dtypes - other = IntervalIndex(np.array([], dtype='float64'), closed=closed) - result = index.union(other, sort=sort) - tm.assert_index_equal(result, index) - - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) - - expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) - result = index[::-1].intersection(other, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - result = other[::-1].intersection(index, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - tm.assert_index_equal(index.intersection(index, sort=sort), index) - - # GH 19101: empty result, same dtype - other = IntervalIndex.from_breaks(range(300, 314), closed=closed) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - result = index.intersection(other, sort=sort) - tm.assert_index_equal(result, expected) - - # GH 19101: empty result, different dtypes - breaks = np.arange(300, 314, dtype='float64') - other = IntervalIndex.from_breaks(breaks, closed=closed) - result = index.intersection(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference(self, closed, sort): - index = IntervalIndex.from_arrays([1, 0, 3, 2], - [1, 2, 3, 4], - closed=closed) - result = index.difference(index[:1], sort=sort) - expected = index[1:] - if sort is None: - expected = expected.sort_values() - 
tm.assert_index_equal(result, expected) - - # GH 19101: empty result, same dtype - result = index.difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - tm.assert_index_equal(result, expected) - - # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) - result = index.difference(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_symmetric_difference(self, closed, sort): - index = self.create_index(closed=closed) - result = index[1:].symmetric_difference(index[:-1], sort=sort) - expected = IntervalIndex([index[0], index[-1]]) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - # GH 19101: empty result, same dtype - result = index.symmetric_difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) - result = index.symmetric_difference(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize('op_name', [ - 'union', 'intersection', 'difference', 'symmetric_difference']) - @pytest.mark.parametrize("sort", [None, False]) - def test_set_incompatible_types(self, closed, op_name, sort): - index = self.create_index(closed=closed) - set_op = getattr(index, op_name) - - # TODO: standardize return type of non-union setops type(self vs other) - # non-IntervalIndex - if op_name == 'difference': - expected = index - else: - expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) - result = set_op(Index([1, 2, 3]), sort=sort) - tm.assert_index_equal(result, expected) - - # mixed closed - msg = ('can 
only do set operations between two IntervalIndex objects ' - 'that are closed on the same side') - for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: - other = self.create_index(closed=other_closed) - with pytest.raises(ValueError, match=msg): - set_op(other, sort=sort) - - # GH 19016: incompatible dtypes - other = interval_range(Timestamp('20180101'), periods=9, closed=closed) - msg = ('can only do {op} between two IntervalIndex objects that have ' - 'compatible dtypes').format(op=op_name) - with pytest.raises(TypeError, match=msg): - set_op(other, sort=sort) - def test_isin(self, closed): index = self.create_index(closed=closed) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py new file mode 100644 index 0000000000000..9ab0d15cbe6a3 --- /dev/null +++ b/pandas/tests/indexes/interval/test_setops.py @@ -0,0 +1,184 @@ +import numpy as np +import pytest + +from pandas import Index, IntervalIndex, Timestamp, interval_range +import pandas.util.testing as tm + + +@pytest.fixture(scope='class', params=[None, 'foo']) +def name(request): + return request.param + + +@pytest.fixture(params=[None, False]) +def sort(request): + return request.param + + +def monotonic_index(start, end, dtype='int64', closed='right'): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), + closed=closed) + + +def empty_index(dtype='int64', closed='right'): + return IntervalIndex(np.array([], dtype=dtype), closed=closed) + + +class TestIntervalIndex: + + def test_union(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(0, 13, closed=closed) + result = index[::-1].union(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].union(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert 
tm.equalContents(result, expected) + + tm.assert_index_equal(index.union(index, sort=sort), index) + tm.assert_index_equal(index.union(index[:1], sort=sort), index) + + # GH 19101: empty result, same dtype + index = empty_index(dtype='int64', closed=closed) + result = index.union(index, sort=sort) + tm.assert_index_equal(result, index) + + # GH 19101: empty result, different dtypes + other = empty_index(dtype='float64', closed=closed) + result = index.union(other, sort=sort) + tm.assert_index_equal(result, index) + + def test_intersection(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(5, 11, closed=closed) + result = index[::-1].intersection(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].intersection(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + tm.assert_index_equal(index.intersection(index, sort=sort), index) + + # GH 19101: empty result, same dtype + other = monotonic_index(300, 314, closed=closed) + expected = empty_index(dtype='int64', closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = monotonic_index(300, 314, dtype='float64', closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 26225: nested intervals + index = IntervalIndex.from_tuples([(1, 2), (1, 3), (1, 4), (0, 2)]) + other = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate element + index = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3), (3, 4)]) + other = IntervalIndex.from_tuples([(1, 2), (2, 3)]) 
+ expected = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225 + index = IntervalIndex.from_tuples([(0, 3), (0, 2)]) + other = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(0, 2)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate nan element + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + expected = IntervalIndex([np.nan]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + def test_difference(self, closed, sort): + index = IntervalIndex.from_arrays([1, 0, 3, 2], + [1, 2, 3, 4], + closed=closed) + result = index.difference(index[:1], sort=sort) + expected = index[1:] + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, same dtype + result = index.difference(index, sort=sort) + expected = empty_index(dtype='int64', closed=closed) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays(index.left.astype('float64'), + index.right, closed=closed) + result = index.difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + result = index[1:].symmetric_difference(index[:-1], sort=sort) + expected = IntervalIndex([index[0], index[-1]]) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + # GH 19101: empty result, same dtype + result = index.symmetric_difference(index, sort=sort) + expected = empty_index(dtype='int64', closed=closed) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + # GH 19101: empty result, different dtypes + other = 
IntervalIndex.from_arrays(index.left.astype('float64'), + index.right, closed=closed) + result = index.symmetric_difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('op_name', [ + 'union', 'intersection', 'difference', 'symmetric_difference']) + @pytest.mark.parametrize("sort", [None, False]) + def test_set_incompatible_types(self, closed, op_name, sort): + index = monotonic_index(0, 11, closed=closed) + set_op = getattr(index, op_name) + + # TODO: standardize return type of non-union setops type(self vs other) + # non-IntervalIndex + if op_name == 'difference': + expected = index + else: + expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) + + # mixed closed + msg = ('can only do set operations between two IntervalIndex objects ' + 'that are closed on the same side') + for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: + other = monotonic_index(0, 11, closed=other_closed) + with pytest.raises(ValueError, match=msg): + set_op(other, sort=sort) + + # GH 19016: incompatible dtypes + other = interval_range(Timestamp('20180101'), periods=9, closed=closed) + msg = ('can only do {op} between two IntervalIndex objects that have ' + 'compatible dtypes').format(op=op_name) + with pytest.raises(TypeError, match=msg): + set_op(other, sort=sort)