From a5a1272b7b5e9f0eb774ce746e729dedd4862c89 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 21 Apr 2019 20:38:15 +0800 Subject: [PATCH 01/26] Gid rid of MultiIndex conversion in IntervalIndex.intersection --- pandas/core/indexes/interval.py | 59 ++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a3dbf2e03957b..e1795f49d1225 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -19,6 +19,7 @@ is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) from pandas.core.dtypes.missing import isna +import pandas.core.algorithms as algos from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase @@ -1090,6 +1091,63 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) + def intersection2(self, other, sort=False): + other = self._as_like_interval_index(other) + + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + msg = ('can only do intersection between two IntervalIndex ' + 'objects that have compatible dtypes') + raise TypeError(msg) + + try: + lindexer = other.left.get_indexer(self.left) + rindexer = other.right.get_indexer(self.right) + except Exception: + # duplicates + lindexer = algos.unique1d( + other.left.get_indexer_non_unique(self.left)[0]) + rindexer = algos.unique1d( + other.right.get_indexer_non_unique(self.right)[0]) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + taken = other.take(indexer) + + return taken + + def intersection(self, other, sort=False): + other = self._as_like_interval_index(other) + + # GH 19016: ensure set op will not return a prohibited dtype + 
subtypes = [self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + msg = ('can only do intersection between two IntervalIndex ' + 'objects that have compatible dtypes') + raise TypeError(msg) + + try: + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + except Exception: + # duplicates + lindexer = algos.unique1d( + self.left.get_indexer_non_unique(other.left)[0]) + rindexer = algos.unique1d( + self.right.get_indexer_non_unique(other.right)[0]) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + taken = self.take(indexer) + + if sort is None: + taken = taken.sort_values() + + return taken + def _setop(op_name, sort=None): def func(self, other, sort=sort): other = self._as_like_interval_index(other) @@ -1125,7 +1183,6 @@ def is_all_dates(self): return False union = _setop('union') - intersection = _setop('intersection', sort=False) difference = _setop('difference') symmetric_difference = _setop('symmetric_difference') From 3cd095a94db8a2e84a7ec06ac1e8c8d5324cafb3 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 27 Apr 2019 18:45:44 +0800 Subject: [PATCH 02/26] Add benchmark for IntervalIndex.intersection --- asv_bench/benchmarks/index_object.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 908eaa3a9d214..c5328d175d5bf 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -191,8 +191,14 @@ def setup(self, N): self.intv = IntervalIndex.from_arrays(left, right) self.intv._engine + self.left = IntervalIndex.from_breaks(np.arange(N)) + self.right = IntervalIndex.from_breaks(np.arange(N - 3, 2 * N - 3)) + def time_monotonic_inc(self, N): self.intv.is_monotonic_increasing + def time_intersection(self, N): + self.left.intersection(self.right) + from .pandas_vb_common import 
setup # noqa: F401 From 0486a4e1106a94993b5a94a317732c19c717fb30 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 27 Apr 2019 21:59:48 +0800 Subject: [PATCH 03/26] clear code --- pandas/core/indexes/interval.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e1795f49d1225..1e0d7530361f0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1091,33 +1091,6 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) - def intersection2(self, other, sort=False): - other = self._as_like_interval_index(other) - - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): - msg = ('can only do intersection between two IntervalIndex ' - 'objects that have compatible dtypes') - raise TypeError(msg) - - try: - lindexer = other.left.get_indexer(self.left) - rindexer = other.right.get_indexer(self.right) - except Exception: - # duplicates - lindexer = algos.unique1d( - other.left.get_indexer_non_unique(self.left)[0]) - rindexer = algos.unique1d( - other.right.get_indexer_non_unique(self.right)[0]) - - match = (lindexer == rindexer) & (lindexer != -1) - indexer = lindexer.take(match.nonzero()[0]) - taken = other.take(indexer) - - return taken - def intersection(self, other, sort=False): other = self._as_like_interval_index(other) From 09c89f1cbcd8c7bc3c207df1e94bf28ba2b57d35 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 27 Apr 2019 22:00:09 +0800 Subject: [PATCH 04/26] Add whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e0b4baf5d8543..b595af38e569e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -252,6 +252,8 @@ 
Performance Improvements - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) +- Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`) + .. _whatsnew_0250.bug_fixes: From 841a0b700d12ede5ed82e76657b7101f0f015096 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 10:30:19 +0800 Subject: [PATCH 05/26] Modity the case for duplicate index --- pandas/core/indexes/interval.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1e0d7530361f0..6987efcd90871 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1102,19 +1102,24 @@ def intersection(self, other, sort=False): 'objects that have compatible dtypes') raise TypeError(msg) - try: + if self.left.is_unique and self.right.is_unique: lindexer = self.left.get_indexer(other.left) rindexer = self.right.get_indexer(other.right) - except Exception: + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + taken = self.take(indexer) + else: # duplicates - lindexer = algos.unique1d( - self.left.get_indexer_non_unique(other.left)[0]) - rindexer = algos.unique1d( - self.right.get_indexer_non_unique(other.right)[0]) - - match = (lindexer == rindexer) & (lindexer != -1) - indexer = lindexer.take(match.nonzero()[0]) - taken = self.take(indexer) + lmiss = other.left.get_indexer_non_unique(self.left)[1] + lindexer = np.setdiff1d(np.arange(len(self)), lmiss) + rmiss = 
other.right.get_indexer_non_unique(self.right)[1] + rindexer = np.setdiff1d(np.arange(len(self)), rmiss) + indexer = np.intersect1d(lindexer, rindexer) + taken = self[indexer] + + #match = (lindexer == rindexer) & (lindexer != -1) + #indexer = lindexer.take(match.nonzero()[0]) + #taken = self.take(indexer) if sort is None: taken = taken.sort_values() From 8b22623df820ee0d53c01ec5116d7dd445e4603f Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 12:04:15 +0800 Subject: [PATCH 06/26] Combine the set operation to find indexer into one --- pandas/core/indexes/interval.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6987efcd90871..31d6cdee7dbd8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1111,16 +1111,12 @@ def intersection(self, other, sort=False): else: # duplicates lmiss = other.left.get_indexer_non_unique(self.left)[1] - lindexer = np.setdiff1d(np.arange(len(self)), lmiss) rmiss = other.right.get_indexer_non_unique(self.right)[1] - rindexer = np.setdiff1d(np.arange(len(self)), rmiss) - indexer = np.intersect1d(lindexer, rindexer) + import functools + indexer = functools.reduce(np.setdiff1d, (np.arange(len(self)), + lmiss, rmiss)) taken = self[indexer] - #match = (lindexer == rindexer) & (lindexer != -1) - #indexer = lindexer.take(match.nonzero()[0]) - #taken = self.take(indexer) - if sort is None: taken = taken.sort_values() From 32d4005ef6b45f75b2b4493803b9f1ba3470fbf2 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 17:21:53 +0800 Subject: [PATCH 07/26] Move setops tests to test_setops.py and add two tests --- .../tests/indexes/interval/test_interval.py | 131 -------------- pandas/tests/indexes/interval/test_setops.py | 162 ++++++++++++++++++ 2 files changed, 162 insertions(+), 131 deletions(-) create mode 100644 pandas/tests/indexes/interval/test_setops.py diff --git 
a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 61465d8454383..b2f409837344a 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -795,137 +795,6 @@ def test_non_contiguous(self, closed): assert 1.5 not in index - @pytest.mark.parametrize("sort", [None, False]) - def test_union(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) - - expected = IntervalIndex.from_breaks(range(13), closed=closed) - result = index[::-1].union(other, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - result = other[::-1].union(index, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - tm.assert_index_equal(index.union(index, sort=sort), index) - tm.assert_index_equal(index.union(index[:1], sort=sort), index) - - # GH 19101: empty result, same dtype - index = IntervalIndex(np.array([], dtype='int64'), closed=closed) - result = index.union(index, sort=sort) - tm.assert_index_equal(result, index) - - # GH 19101: empty result, different dtypes - other = IntervalIndex(np.array([], dtype='float64'), closed=closed) - result = index.union(other, sort=sort) - tm.assert_index_equal(result, index) - - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) - - expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) - result = index[::-1].intersection(other, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - result = other[::-1].intersection(index, sort=sort) - if sort is None: - tm.assert_index_equal(result, expected) - assert 
tm.equalContents(result, expected) - - tm.assert_index_equal(index.intersection(index, sort=sort), index) - - # GH 19101: empty result, same dtype - other = IntervalIndex.from_breaks(range(300, 314), closed=closed) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - result = index.intersection(other, sort=sort) - tm.assert_index_equal(result, expected) - - # GH 19101: empty result, different dtypes - breaks = np.arange(300, 314, dtype='float64') - other = IntervalIndex.from_breaks(breaks, closed=closed) - result = index.intersection(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference(self, closed, sort): - index = IntervalIndex.from_arrays([1, 0, 3, 2], - [1, 2, 3, 4], - closed=closed) - result = index.difference(index[:1], sort=sort) - expected = index[1:] - if sort is None: - expected = expected.sort_values() - tm.assert_index_equal(result, expected) - - # GH 19101: empty result, same dtype - result = index.difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - tm.assert_index_equal(result, expected) - - # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) - result = index.difference(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_symmetric_difference(self, closed, sort): - index = self.create_index(closed=closed) - result = index[1:].symmetric_difference(index[:-1], sort=sort) - expected = IntervalIndex([index[0], index[-1]]) - if sort is None: - tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) - - # GH 19101: empty result, same dtype - result = index.symmetric_difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) - if sort is None: - tm.assert_index_equal(result, expected) - 
assert tm.equalContents(result, expected) - - # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) - result = index.symmetric_difference(other, sort=sort) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize('op_name', [ - 'union', 'intersection', 'difference', 'symmetric_difference']) - @pytest.mark.parametrize("sort", [None, False]) - def test_set_operation_errors(self, closed, op_name, sort): - index = self.create_index(closed=closed) - set_op = getattr(index, op_name) - - # non-IntervalIndex - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type Int64Index') - with pytest.raises(TypeError, match=msg): - set_op(Index([1, 2, 3]), sort=sort) - - # mixed closed - msg = ('can only do set operations between two IntervalIndex objects ' - 'that are closed on the same side') - for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: - other = self.create_index(closed=other_closed) - with pytest.raises(ValueError, match=msg): - set_op(other, sort=sort) - - # GH 19016: incompatible dtypes - other = interval_range(Timestamp('20180101'), periods=9, closed=closed) - msg = ('can only do {op} between two IntervalIndex objects that have ' - 'compatible dtypes').format(op=op_name) - with pytest.raises(TypeError, match=msg): - set_op(other, sort=sort) - def test_isin(self, closed): index = self.create_index(closed=closed) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py new file mode 100644 index 0000000000000..d757200d00a3f --- /dev/null +++ b/pandas/tests/indexes/interval/test_setops.py @@ -0,0 +1,162 @@ +import numpy as np +import pytest + + +from pandas import Index, IntervalIndex, Timestamp, interval_range +import pandas.util.testing as tm + + +@pytest.fixture(scope='class', params=[None, 'foo']) +def name(request): + return request.param + + +class TestIntervalIndex: + + def 
create_index(self, closed='right'): + return IntervalIndex.from_breaks(range(11), closed=closed) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, closed, sort): + index = self.create_index(closed=closed) + other = IntervalIndex.from_breaks(range(5, 13), closed=closed) + + expected = IntervalIndex.from_breaks(range(13), closed=closed) + result = index[::-1].union(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].union(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + tm.assert_index_equal(index.union(index, sort=sort), index) + tm.assert_index_equal(index.union(index[:1], sort=sort), index) + + # GH 19101: empty result, same dtype + index = IntervalIndex(np.array([], dtype='int64'), closed=closed) + result = index.union(index, sort=sort) + tm.assert_index_equal(result, index) + + # GH 19101: empty result, different dtypes + other = IntervalIndex(np.array([], dtype='float64'), closed=closed) + result = index.union(other, sort=sort) + tm.assert_index_equal(result, index) + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, closed, sort): + index = self.create_index(closed=closed) + other = IntervalIndex.from_breaks(range(5, 13), closed=closed) + + expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) + result = index[::-1].intersection(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].intersection(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + tm.assert_index_equal(index.intersection(index, sort=sort), index) + + # GH 19101: empty result, same dtype + other = IntervalIndex.from_breaks(range(300, 314), closed=closed) + expected = IntervalIndex(np.array([], 
dtype='int64'), closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + breaks = np.arange(300, 314, dtype='float64') + other = IntervalIndex.from_breaks(breaks, closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 26225: nested intervals + index = IntervalIndex.from_tuples([(1, 2), (1, 3), (1, 4), (0, 2)]) + other = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate element + index = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3), (3, 4)]) + other = IntervalIndex.from_tuples([(1, 2), (2, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference(self, closed, sort): + index = IntervalIndex.from_arrays([1, 0, 3, 2], + [1, 2, 3, 4], + closed=closed) + result = index.difference(index[:1], sort=sort) + expected = index[1:] + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, same dtype + result = index.difference(index, sort=sort) + expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays(index.left.astype('float64'), + index.right, closed=closed) + result = index.difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_symmetric_difference(self, closed, sort): + index = self.create_index(closed=closed) + result = index[1:].symmetric_difference(index[:-1], sort=sort) + expected = IntervalIndex([index[0], 
index[-1]]) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + # GH 19101: empty result, same dtype + result = index.symmetric_difference(index, sort=sort) + expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays(index.left.astype('float64'), + index.right, closed=closed) + result = index.symmetric_difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('op_name', [ + 'union', 'intersection', 'difference', 'symmetric_difference']) + @pytest.mark.parametrize("sort", [None, False]) + def test_set_operation_errors(self, closed, op_name, sort): + index = self.create_index(closed=closed) + set_op = getattr(index, op_name) + + # non-IntervalIndex + msg = ('the other index needs to be an IntervalIndex too, but ' + 'was type Int64Index') + with pytest.raises(TypeError, match=msg): + set_op(Index([1, 2, 3]), sort=sort) + + # mixed closed + msg = ('can only do set operations between two IntervalIndex objects ' + 'that are closed on the same side') + for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: + other = self.create_index(closed=other_closed) + with pytest.raises(ValueError, match=msg): + set_op(other, sort=sort) + + # GH 19016: incompatible dtypes + other = interval_range(Timestamp('20180101'), periods=9, closed=closed) + msg = ('can only do {op} between two IntervalIndex objects that have ' + 'compatible dtypes').format(op=op_name) + with pytest.raises(TypeError, match=msg): + set_op(other, sort=sort) From d502fcb74cde9fcd8d51cc30ec5e49ba469e298d Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 18:12:31 +0800 Subject: [PATCH 08/26] Remove relundant line --- pandas/core/indexes/interval.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 31d6cdee7dbd8..d6af6a446a36d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -19,7 +19,6 @@ is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase From 8ec6366e4a9491a86783257b134e69afceb15831 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 18:49:53 +0800 Subject: [PATCH 09/26] Remove duplicate line in whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b595af38e569e..d7be8e32e0a8a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -254,7 +254,10 @@ Performance Improvements - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) - Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`) +<<<<<<< HEAD +======= +>>>>>>> Remove duplicate line in whatsnew note .. 
_whatsnew_0250.bug_fixes: Bug Fixes From 6000904d9b8aac2823beaa9fed8921ecdee1003c Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 20:09:35 +0800 Subject: [PATCH 10/26] Isort interval/test_setops.py --- pandas/tests/indexes/interval/test_setops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index d757200d00a3f..c6fe01bbf32c0 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -1,7 +1,6 @@ import numpy as np import pytest - from pandas import Index, IntervalIndex, Timestamp, interval_range import pandas.util.testing as tm From 7cb7d2cf9f1aed2e449ef937b63b0d0212f6f7c5 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 22:15:47 +0800 Subject: [PATCH 11/26] Split the intersection into two sub-functions --- pandas/core/indexes/interval.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index d6af6a446a36d..ded849fbd68bc 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1102,25 +1102,35 @@ def intersection(self, other, sort=False): raise TypeError(msg) if self.left.is_unique and self.right.is_unique: - lindexer = self.left.get_indexer(other.left) - rindexer = self.right.get_indexer(other.right) - match = (lindexer == rindexer) & (lindexer != -1) - indexer = lindexer.take(match.nonzero()[0]) - taken = self.take(indexer) + taken = self._intersection_unique(other) else: # duplicates - lmiss = other.left.get_indexer_non_unique(self.left)[1] - rmiss = other.right.get_indexer_non_unique(self.right)[1] - import functools - indexer = functools.reduce(np.setdiff1d, (np.arange(len(self)), - lmiss, rmiss)) - taken = self[indexer] + taken = self._intersection_non_unique(other) if sort is None: taken = taken.sort_values() return taken + def 
_intersection_unique(self, other): + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + + return self.take(indexer) + + def _intersection_non_unique(self, other): + lmiss = other.left.get_indexer_non_unique(self.left)[1] + rmiss = other.right.get_indexer_non_unique(self.right)[1] + + import functools + indexer = functools.reduce(np.setdiff1d, (np.arange(len(self)), + lmiss, rmiss)) + + return self[indexer] + def _setop(op_name, sort=None): def func(self, other, sort=sort): other = self._as_like_interval_index(other) From bcf36bb2c65979f602838b3f34101beacab68971 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 5 May 2019 14:09:22 +0800 Subject: [PATCH 12/26] Functionalize some indexes --- pandas/tests/indexes/interval/test_setops.py | 45 +++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index c6fe01bbf32c0..131899a11e205 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -10,17 +10,23 @@ def name(request): return request.param -class TestIntervalIndex: +def monotonic_index(start, end, dtype='int64', closed='right'): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), + closed=closed) + + +def empty_index(dtype='int64', closed='right'): + return IntervalIndex(np.array([], dtype=dtype), closed=closed) - def create_index(self, closed='right'): - return IntervalIndex.from_breaks(range(11), closed=closed) + +class TestIntervalIndex: @pytest.mark.parametrize("sort", [None, False]) def test_union(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) - expected 
= IntervalIndex.from_breaks(range(13), closed=closed) + expected = monotonic_index(0, 13, closed=closed) result = index[::-1].union(other, sort=sort) if sort is None: tm.assert_index_equal(result, expected) @@ -35,21 +41,21 @@ def test_union(self, closed, sort): tm.assert_index_equal(index.union(index[:1], sort=sort), index) # GH 19101: empty result, same dtype - index = IntervalIndex(np.array([], dtype='int64'), closed=closed) + index = empty_index(dtype='int64', closed=closed) result = index.union(index, sort=sort) tm.assert_index_equal(result, index) # GH 19101: empty result, different dtypes - other = IntervalIndex(np.array([], dtype='float64'), closed=closed) + other = empty_index(dtype='float64', closed=closed) result = index.union(other, sort=sort) tm.assert_index_equal(result, index) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, closed, sort): - index = self.create_index(closed=closed) - other = IntervalIndex.from_breaks(range(5, 13), closed=closed) + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) - expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) + expected = monotonic_index(5, 11, closed=closed) result = index[::-1].intersection(other, sort=sort) if sort is None: tm.assert_index_equal(result, expected) @@ -63,14 +69,13 @@ def test_intersection(self, closed, sort): tm.assert_index_equal(index.intersection(index, sort=sort), index) # GH 19101: empty result, same dtype - other = IntervalIndex.from_breaks(range(300, 314), closed=closed) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) + other = monotonic_index(300, 314, closed=closed) + expected = empty_index(dtype='int64', closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes - breaks = np.arange(300, 314, dtype='float64') - other = IntervalIndex.from_breaks(breaks, closed=closed) + other = 
monotonic_index(300, 314, dtype='float64', closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) @@ -101,7 +106,7 @@ def test_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) + expected = empty_index(dtype='int64', closed=closed) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes @@ -112,7 +117,7 @@ def test_difference(self, closed, sort): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, closed, sort): - index = self.create_index(closed=closed) + index = monotonic_index(0, 11, closed=closed) result = index[1:].symmetric_difference(index[:-1], sort=sort) expected = IntervalIndex([index[0], index[-1]]) if sort is None: @@ -121,7 +126,7 @@ def test_symmetric_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) - expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) + expected = empty_index(dtype='int64', closed=closed) if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) @@ -136,7 +141,7 @@ def test_symmetric_difference(self, closed, sort): 'union', 'intersection', 'difference', 'symmetric_difference']) @pytest.mark.parametrize("sort", [None, False]) def test_set_operation_errors(self, closed, op_name, sort): - index = self.create_index(closed=closed) + index = monotonic_index(0, 11, closed=closed) set_op = getattr(index, op_name) # non-IntervalIndex @@ -149,7 +154,7 @@ def test_set_operation_errors(self, closed, op_name, sort): msg = ('can only do set operations between two IntervalIndex objects ' 'that are closed on the same side') for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: - other = self.create_index(closed=other_closed) + other = monotonic_index(0, 11, 
closed=other_closed) with pytest.raises(ValueError, match=msg): set_op(other, sort=sort) From 745c0bb6ccd6e1270ddd6ab162f1774a17f9019f Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 5 May 2019 14:16:42 +0800 Subject: [PATCH 13/26] Remove relundant lines in whatsnew --- doc/source/whatsnew/v0.25.0.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d7be8e32e0a8a..c17707dad0499 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -254,10 +254,6 @@ Performance Improvements - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) - Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`) -<<<<<<< HEAD - -======= ->>>>>>> Remove duplicate line in whatsnew note .. _whatsnew_0250.bug_fixes: Bug Fixes From ff8bb97abb821eb1a482fad663a155e9edb50c65 Mon Sep 17 00:00:00 2001 From: makbigc Date: Mon, 6 May 2019 22:06:19 +0800 Subject: [PATCH 14/26] Fixturize the sort parameter --- pandas/tests/indexes/interval/test_setops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 131899a11e205..b6c80b03d3928 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -10,6 +10,11 @@ def name(request): return request.param +@pytest.fixture(params=[None, False]) +def sort(request): + return request.param + + def monotonic_index(start, end, dtype='int64', closed='right'): return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) @@ -21,7 +26,6 @@ def empty_index(dtype='int64', closed='right'): class TestIntervalIndex: - @pytest.mark.parametrize("sort", [None, False]) def test_union(self, closed, sort): index = monotonic_index(0, 11, closed=closed) other 
= monotonic_index(5, 13, closed=closed) @@ -50,7 +54,6 @@ def test_union(self, closed, sort): result = index.union(other, sort=sort) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, closed, sort): index = monotonic_index(0, 11, closed=closed) other = monotonic_index(5, 13, closed=closed) @@ -93,7 +96,6 @@ def test_intersection(self, closed, sort): result = index.intersection(other) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, closed, sort): index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], @@ -115,7 +117,6 @@ def test_difference(self, closed, sort): result = index.difference(other, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, closed, sort): index = monotonic_index(0, 11, closed=closed) result = index[1:].symmetric_difference(index[:-1], sort=sort) @@ -139,7 +140,6 @@ def test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) - @pytest.mark.parametrize("sort", [None, False]) def test_set_operation_errors(self, closed, op_name, sort): index = monotonic_index(0, 11, closed=closed) set_op = getattr(index, op_name) From 17d775f12ccd74756688b903236b9f133ff68b49 Mon Sep 17 00:00:00 2001 From: makbigc Date: Tue, 7 May 2019 12:52:03 +0800 Subject: [PATCH 15/26] Factor out the check and decorate the setops --- pandas/core/indexes/interval.py | 57 +++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ded849fbd68bc..f854f5743fa6f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,6 +1,7 @@ """ define the IntervalIndex """ import textwrap import warnings +import functools import numpy as np @@ -97,6 
+98,27 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) +class _setop_check(object): + + def __init__(self, op_name): + self.op_name = op_name + + def __call__(self, setop): + def func(intvidx_self, other, sort=False): + other = intvidx_self._as_like_interval_index(other) + + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + msg = ('can only do {op} between two IntervalIndex ' + 'objects that have compatible dtypes') + raise TypeError(msg.format(op=self.op_name)) + + return setop(intvidx_self, other, sort) + return func + + @Appender(_interval_shared_docs['class'] % dict( klass="IntervalIndex", summary="Immutable index of intervals that are closed on the same side.", @@ -1090,16 +1112,8 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) + @_setop_check(op_name='intersection') def intersection(self, other, sort=False): - other = self._as_like_interval_index(other) - - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): - msg = ('can only do intersection between two IntervalIndex ' - 'objects that have compatible dtypes') - raise TypeError(msg) if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) @@ -1113,6 +1127,19 @@ def intersection(self, other, sort=False): return taken def _intersection_unique(self, other): + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None}, optional + * default: matches where the label is within an interval only. 
+ + Returns + ------- + loc : int if unique index, slice if monotonic index, else mask + """ lindexer = self.left.get_indexer(other.left) rindexer = self.right.get_indexer(other.right) @@ -1125,24 +1152,14 @@ def _intersection_non_unique(self, other): lmiss = other.left.get_indexer_non_unique(self.left)[1] rmiss = other.right.get_indexer_non_unique(self.right)[1] - import functools indexer = functools.reduce(np.setdiff1d, (np.arange(len(self)), lmiss, rmiss)) return self[indexer] def _setop(op_name, sort=None): + @_setop_check(op_name=op_name) def func(self, other, sort=sort): - other = self._as_like_interval_index(other) - - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): - msg = ('can only do {op} between two IntervalIndex ' - 'objects that have compatible dtypes') - raise TypeError(msg.format(op=op_name)) - result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) From 03a989a6ac8d00d47fcd3e5adad77cffcf0ee64f Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 8 May 2019 13:49:51 +0800 Subject: [PATCH 16/26] Add docstring to two subfunction --- pandas/core/indexes/interval.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f854f5743fa6f..693501545409a 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1128,17 +1128,17 @@ def intersection(self, other, sort=False): def _intersection_unique(self, other): """ - Get integer location, slice or boolean mask for requested label. + Used when the IntervalIndex does not have any common endpoint, + no matter left or right. + Return the intersection with another IntervalIndex. 
Parameters ---------- - key : label - method : {None}, optional - * default: matches where the label is within an interval only. + other : IntervalIndex Returns ------- - loc : int if unique index, slice if monotonic index, else mask + taken : IntervalIndex """ lindexer = self.left.get_indexer(other.left) rindexer = self.right.get_indexer(other.right) @@ -1149,6 +1149,20 @@ def _intersection_unique(self, other): return self.take(indexer) def _intersection_non_unique(self, other): + """ + Used when the IntervalIndex does have some common endpoints, + on either side. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + taken : IntervalIndex + """ + lmiss = other.left.get_indexer_non_unique(self.left)[1] rmiss = other.right.get_indexer_non_unique(self.right)[1] From b36cbc8ff02f6b405e6fad05b44ceda5eb6a132d Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 8 May 2019 14:38:58 +0800 Subject: [PATCH 17/26] Add intersection into _index_shared_docs --- pandas/core/indexes/base.py | 6 ++++-- pandas/core/indexes/interval.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f7c562798ad52..77d8a91dbd2cb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2370,8 +2370,7 @@ def union(self, other, sort=None): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) - def intersection(self, other, sort=False): - """ + _index_shared_docs['intersection'] = """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. 
@@ -2405,6 +2404,9 @@ def intersection(self, other, sort=False): >>> idx1.intersection(idx2) Int64Index([3, 4], dtype='int64') """ + + @Appender(_index_shared_docs['intersection']) + def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other = ensure_index(other) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 693501545409a..574b0c887d852 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1112,6 +1112,7 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) + @Appender(_index_shared_docs['intersection']) @_setop_check(op_name='intersection') def intersection(self, other, sort=False): From 1cdb1703eafdbc62eaaab50430adced2044b8480 Mon Sep 17 00:00:00 2001 From: makbigc Date: Fri, 10 May 2019 16:04:08 +0800 Subject: [PATCH 18/26] Isort and change the decorator's name --- pandas/core/indexes/interval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 574b0c887d852..27a9b5d1d2007 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,7 @@ """ define the IntervalIndex """ +import functools import textwrap import warnings -import functools import numpy as np @@ -98,7 +98,7 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class _setop_check(object): +class setop_check(object): def __init__(self, op_name): self.op_name = op_name @@ -1113,7 +1113,7 @@ def overlaps(self, other): return self._data.overlaps(other) @Appender(_index_shared_docs['intersection']) - @_setop_check(op_name='intersection') + @setop_check(op_name='intersection') def intersection(self, other, sort=False): if self.left.is_unique and self.right.is_unique: @@ -1173,7 +1173,7 @@ def _intersection_non_unique(self, other): return self[indexer] def _setop(op_name, sort=None): - 
@_setop_check(op_name=op_name) + @setop_check(op_name=op_name) def func(self, other, sort=sort): result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) From 18c2d3727af1d009c01a6b21167aa9392b0db2af Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 11 May 2019 23:15:19 +0800 Subject: [PATCH 19/26] Remove object inheritance --- pandas/core/indexes/interval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 27a9b5d1d2007..7f9628d88e4e0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -98,7 +98,7 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class setop_check(object): +class setop_check: def __init__(self, op_name): self.op_name = op_name From 35594b03204e287e8e7c87632a01ce5a894177a8 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 16 May 2019 13:57:50 +0800 Subject: [PATCH 20/26] Add docstring to setop_check --- pandas/core/indexes/interval.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7f9628d88e4e0..ba20aa96d73b4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -99,7 +99,10 @@ def _new_IntervalIndex(cls, d): class setop_check: - + """ + This is called to decorate the set operations of IntervalIndex + to perform the type check in advance. 
+ """ def __init__(self, op_name): self.op_name = op_name From 9cf9b7e8c8c6a5720b2ea17aa5c9a0028663a40e Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 23 May 2019 20:33:20 +0800 Subject: [PATCH 21/26] complete merge --- pandas/core/indexes/interval.py | 15 --------------- pandas/tests/indexes/interval/test_setops.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3d54c1a7d12ad..b1b05181c1f09 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1117,21 +1117,6 @@ def overlaps(self, other): @Appender(_index_shared_docs['intersection']) @setop_check(op_name='intersection') def intersection(self, other, sort=False): - def _setop(op_name, sort=None): - def func(self, other, sort=sort): - self._assert_can_do_setop(other) - other = ensure_index(other) - if not isinstance(other, IntervalIndex): - result = getattr(self.astype(object), op_name)(other) - if op_name in ('difference',): - result = result.astype(self.dtype) - return result - elif self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - raise ValueError(msg) ->>>>>>> upstream/master - if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) else: diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index b6c80b03d3928..fa953bef11d16 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -140,15 +140,19 @@ def test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) - def test_set_operation_errors(self, closed, op_name, sort): + @pytest.mark.parametrize("sort", [None, False]) + def test_set_incompatible_types(self, closed, op_name, sort): index = 
monotonic_index(0, 11, closed=closed) set_op = getattr(index, op_name) + # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - msg = ('the other index needs to be an IntervalIndex too, but ' - 'was type Int64Index') - with pytest.raises(TypeError, match=msg): - set_op(Index([1, 2, 3]), sort=sort) + if op_name == 'difference': + expected = index + else: + expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) # mixed closed msg = ('can only do set operations between two IntervalIndex objects ' From ab67edda36b3b14d97c61cd32ceff5e5c3014fba Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 26 May 2019 00:00:00 +0800 Subject: [PATCH 22/26] 2nd approach --- pandas/core/indexes/interval.py | 14 ++++++++++---- pandas/tests/indexes/interval/test_setops.py | 7 +++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b1b05181c1f09..3d3c1755e7ca6 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1164,14 +1164,20 @@ def _intersection_non_unique(self, other): ------- taken : IntervalIndex """ + mask = np.zeros(len(self), dtype=bool) lmiss = other.left.get_indexer_non_unique(self.left)[1] - rmiss = other.right.get_indexer_non_unique(self.right)[1] + lmatch = np.setdiff1d(np.arange(len(self)), lmiss) - indexer = functools.reduce(np.setdiff1d, (np.arange(len(self)), - lmiss, rmiss)) + for i in lmatch: + potential = other.left.get_loc(self.left[i]) + if is_scalar(potential): + if self.right[i] == other.right[potential]: + mask[i] = True + elif self.right[i] in other.right[potential]: + mask[i] = True - return self[indexer] + return self[mask] def _setop(op_name, sort=None): @setop_check(op_name=op_name) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 
fa953bef11d16..961daa549389a 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -96,6 +96,13 @@ def test_intersection(self, closed, sort): result = index.intersection(other) tm.assert_index_equal(result, expected) + # GH 26225 + index = IntervalIndex.from_tuples([(0, 3), (0, 2)]) + other = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(0, 2)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + def test_difference(self, closed, sort): index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], From 402b09cf58c7b6c5bc4fcebeb6f046c3cc810cc2 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 26 May 2019 00:20:28 +0800 Subject: [PATCH 23/26] Add a new benchmark --- asv_bench/benchmarks/index_object.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index c5328d175d5bf..42a79cdc4c1dd 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -200,5 +200,8 @@ def time_monotonic_inc(self, N): def time_intersection(self, N): self.left.intersection(self.right) + def time_intersection_duplicate(self, N): + self.intv.intersection(self.right) + from .pandas_vb_common import setup # noqa: F401 From b4f130de708fe3f638fd01428a31812717568dff Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 26 May 2019 01:06:01 +0800 Subject: [PATCH 24/26] Fix linting issue --- pandas/core/indexes/interval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3d3c1755e7ca6..50ddddd71de40 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,5 +1,4 @@ """ define the IntervalIndex """ -import functools import textwrap import warnings From 3ff4c64ead75d5be315ba0f0efa7c03b62d7390b Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 26 May 2019 10:16:30 +0800 
Subject: [PATCH 25/26] Change the decorator name to SetopCheck --- pandas/core/indexes/interval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 50ddddd71de40..a1e2bea5532ea 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -97,7 +97,7 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class setop_check: +class SetopCheck: """ This is called to decorate the set operations of IntervalIndex to perform the type check in advance. @@ -1114,7 +1114,7 @@ def overlaps(self, other): return self._data.overlaps(other) @Appender(_index_shared_docs['intersection']) - @setop_check(op_name='intersection') + @SetopCheck(op_name='intersection') def intersection(self, other, sort=False): if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) @@ -1179,7 +1179,7 @@ def _intersection_non_unique(self, other): return self[mask] def _setop(op_name, sort=None): - @setop_check(op_name=op_name) + @SetopCheck(op_name=op_name) def func(self, other, sort=sort): result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) From 3db3130bf2dece5394aaff5c919f18de4e342912 Mon Sep 17 00:00:00 2001 From: makbigc Date: Tue, 28 May 2019 13:46:20 +0800 Subject: [PATCH 26/26] Amend and add test for a more corner case --- pandas/core/indexes/interval.py | 4 ++++ pandas/tests/indexes/interval/test_setops.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a1e2bea5532ea..cf405e43be9a8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1165,6 +1165,10 @@ def _intersection_non_unique(self, other): """ mask = np.zeros(len(self), dtype=bool) + if self.hasnans and other.hasnans: + first_nan_loc = np.arange(len(self))[self.isna()][0] + mask[first_nan_loc] = True + lmiss = 
other.left.get_indexer_non_unique(self.left)[1] lmatch = np.setdiff1d(np.arange(len(self)), lmiss) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 961daa549389a..9ab0d15cbe6a3 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -103,6 +103,13 @@ def test_intersection(self, closed, sort): result = index.intersection(other) tm.assert_index_equal(result, expected) + # GH 26225: duplicate nan element + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + expected = IntervalIndex([np.nan]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + def test_difference(self, closed, sort): index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4],