Skip to content

[PERF] Get rid of MultiIndex conversion in IntervalIndex.intersection #26225

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Jun 6, 2019
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a5a1272
Gid rid of MultiIndex conversion in IntervalIndex.intersection
makbigc Apr 21, 2019
3cd095a
Add benchmark for IntervalIndex.intersection
makbigc Apr 27, 2019
0486a4e
clear code
makbigc Apr 27, 2019
09c89f1
Add whatsnew note
makbigc Apr 27, 2019
841a0b7
Modity the case for duplicate index
makbigc May 1, 2019
8b22623
Combine the set operation to find indexer into one
makbigc May 1, 2019
32d4005
Move setops tests to test_setops.py and add two tests
makbigc May 1, 2019
d502fcb
Remove relundant line
makbigc May 1, 2019
8ec6366
Remove duplicate line in whatsnew note
makbigc May 1, 2019
6000904
Isort interval/test_setops.py
makbigc May 1, 2019
7cb7d2c
Split the intersection into two sub-functions
makbigc May 1, 2019
bcf36bb
Functionalize some indexes
makbigc May 5, 2019
745c0bb
Remove relundant lines in whatsnew
makbigc May 5, 2019
ff8bb97
Fixturize the sort parameter
makbigc May 6, 2019
17d775f
Factor out the check and decorate the setops
makbigc May 7, 2019
03a989a
Add docstring to two subfunction
makbigc May 8, 2019
b36cbc8
Add intersection into _index_shared_docs
makbigc May 8, 2019
1cdb170
Isort and change the decorator's name
makbigc May 10, 2019
18c2d37
Remove object inheritance
makbigc May 11, 2019
d229677
merge master
makbigc May 14, 2019
35594b0
Add docstring to setop_check
makbigc May 16, 2019
0834206
Merge master again
makbigc May 16, 2019
3cf5be8
merge again
makbigc May 23, 2019
9cf9b7e
complete merge
makbigc May 23, 2019
ab67edd
2nd approach
makbigc May 25, 2019
402b09c
Add a new benchmark
makbigc May 25, 2019
b4f130d
Fix linting issue
makbigc May 25, 2019
3ff4c64
Change the decorator name to SetopCheck
makbigc May 26, 2019
3db3130
Amend and add test for a more corner case
makbigc May 28, 2019
1f25adb
Merge commit to resolve conflict
makbigc May 28, 2019
4a9cd29
merge master
makbigc May 29, 2019
1467e94
merge
makbigc Jun 4, 2019
ea2550a
merge again
makbigc Jun 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,17 @@ def setup(self, N):
self.intv = IntervalIndex.from_arrays(left, right)
self.intv._engine

self.left = IntervalIndex.from_breaks(np.arange(N))
self.right = IntervalIndex.from_breaks(np.arange(N - 3, 2 * N - 3))

def time_monotonic_inc(self, N):
self.intv.is_monotonic_increasing

def time_intersection(self, N):
self.left.intersection(self.right)

def time_intersection_duplicate(self, N):
self.intv.intersection(self.right)


from .pandas_vb_common import setup # noqa: F401
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ Performance Improvements
- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`)
- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`)
- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`)
- Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`)
- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero
and float NaN; by faster checking the string for the possibility of being a date (:issue:`25754`)

Expand Down
8 changes: 5 additions & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2439,9 +2439,7 @@ def _union(self, other, sort):
def _wrap_setop_result(self, other, result):
return self._constructor(result, name=get_op_result_name(self, other))

# TODO: standardize return type of non-union setops type(self vs other)
def intersection(self, other, sort=False):
"""
_index_shared_docs['intersection'] = """
Form the intersection of two Index objects.

This returns a new Index with elements common to the index and `other`.
Expand Down Expand Up @@ -2475,6 +2473,10 @@ def intersection(self, other, sort=False):
>>> idx1.intersection(idx2)
Int64Index([3, 4], dtype='int64')
"""

# TODO: standardize return type of non-union setops type(self vs other)
@Appender(_index_shared_docs['intersection'])
def intersection(self, other, sort=False):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
other = ensure_index(other)
Expand Down
123 changes: 102 additions & 21 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,42 @@ def _new_IntervalIndex(cls, d):
return cls.from_arrays(**d)


class SetopCheck:
"""
This is called to decorate the set operations of IntervalIndex
to perform the type check in advance.
"""
def __init__(self, op_name):
self.op_name = op_name

def __call__(self, setop):
def func(intvidx_self, other, sort=False):
intvidx_self._assert_can_do_setop(other)
other = ensure_index(other)

if not isinstance(other, IntervalIndex):
result = getattr(intvidx_self.astype(object),
self.op_name)(other)
if self.op_name in ('difference',):
result = result.astype(intvidx_self.dtype)
return result
elif intvidx_self.closed != other.closed:
msg = ('can only do set operations between two IntervalIndex '
'objects that are closed on the same side')
raise ValueError(msg)

# GH 19016: ensure set op will not return a prohibited dtype
subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype]
common_subtype = find_common_type(subtypes)
if is_object_dtype(common_subtype):
msg = ('can only do {op} between two IntervalIndex '
'objects that have compatible dtypes')
raise TypeError(msg.format(op=self.op_name))

return setop(intvidx_self, other, sort)
return func


@Appender(_interval_shared_docs['class'] % dict(
klass="IntervalIndex",
summary="Immutable index of intervals that are closed on the same side.",
Expand Down Expand Up @@ -1077,28 +1113,74 @@ def equals(self, other):
def overlaps(self, other):
return self._data.overlaps(other)

def _setop(op_name, sort=None):
def func(self, other, sort=sort):
self._assert_can_do_setop(other)
other = ensure_index(other)
if not isinstance(other, IntervalIndex):
result = getattr(self.astype(object), op_name)(other)
if op_name in ('difference',):
result = result.astype(self.dtype)
return result
elif self.closed != other.closed:
msg = ('can only do set operations between two IntervalIndex '
'objects that are closed on the same side')
raise ValueError(msg)
@Appender(_index_shared_docs['intersection'])
@SetopCheck(op_name='intersection')
def intersection(self, other, sort=False):
if self.left.is_unique and self.right.is_unique:
taken = self._intersection_unique(other)
else:
# duplicates
taken = self._intersection_non_unique(other)

# GH 19016: ensure set op will not return a prohibited dtype
subtypes = [self.dtype.subtype, other.dtype.subtype]
common_subtype = find_common_type(subtypes)
if is_object_dtype(common_subtype):
msg = ('can only do {op} between two IntervalIndex '
'objects that have compatible dtypes')
raise TypeError(msg.format(op=op_name))
if sort is None:
taken = taken.sort_values()

return taken

def _intersection_unique(self, other):
"""
Used when the IntervalIndex does not have any common endpoint,
no mater left or right.
Return the intersection with another IntervalIndex.

Parameters
----------
other : IntervalIndex

Returns
-------
taken : IntervalIndex
"""
lindexer = self.left.get_indexer(other.left)
rindexer = self.right.get_indexer(other.right)

match = (lindexer == rindexer) & (lindexer != -1)
indexer = lindexer.take(match.nonzero()[0])

return self.take(indexer)

def _intersection_non_unique(self, other):
"""
Used when the IntervalIndex does have some common endpoints,
on either sides.
Return the intersection with another IntervalIndex.

Parameters
----------
other : IntervalIndex

Returns
-------
taken : IntervalIndex
"""
mask = np.zeros(len(self), dtype=bool)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be an issue with this approach when dupes are present in self and other. For other index types, such a scenario can result in more dupes being present in the intersection than in self. This behavior looks a bit buggy and inconsistent though, so I'm not sure if we actually want IntervalIndex to be consistent with it.

Some examples of the buggy and inconsistent behavior with Index:

In [2]: idx2 = pd.Index(list('aa')) 
   ...: idx3 = pd.Index(list('aaa')) 
   ...: idx3b = pd.Index(list('baaa'))

In [3]: idx2.intersection(idx3)
Out[3]: Index(['a', 'a', 'a', 'a'], dtype='object')

In [4]: idx3.intersection(idx3)
Out[4]: Index(['a', 'a', 'a'], dtype='object')

In [5]: idx2.intersection(idx3)
Out[5]: Index(['a', 'a', 'a', 'a'], dtype='object')

In [6]: idx2.intersection(idx3b)
Out[6]: Index(['a', 'a', 'a'], dtype='object')

It seems strange that [3] has more dupes present than in either original index but [4] does not. Similarly, it seems like [5] and [6] should be identical, as the presence of a non-intersecting element shouldn't impact the number of dupes returned.

@jreback : Do you know what the expected behavior for intersection with dupes should be? Or if there are any dependencies on the behavior of intersection that would dictate this?

If we treat indexes like multisets, then the intersection should contain the minimum multiplicity of dupes, e.g. idx2.intersection(idx3) and idx3.intersection(idx2) should both have length 2, so you maintain the property of the intersection being a subset of the original indexes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah this is weird as these are set ops

what happens (meaning how much breakage) if

  • raise if left and right are not unique
  • uniquify left and right

prob need to do this for all set ops

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't had time to extensively test this out but I made the two changes you suggested in indexes/base.py for intersection and both resulted in some breakage. Aside some from breakage in the index set ops tests, there was also some breakage in tests/reshape/test_merge.py.


lmiss = other.left.get_indexer_non_unique(self.left)[1]
lmatch = np.setdiff1d(np.arange(len(self)), lmiss)

for i in lmatch:
potential = other.left.get_loc(self.left[i])
if is_scalar(potential):
if self.right[i] == other.right[potential]:
mask[i] = True
elif self.right[i] in other.right[potential]:
mask[i] = True

return self[mask]

def _setop(op_name, sort=None):
@SetopCheck(op_name=op_name)
def func(self, other, sort=sort):
result = getattr(self._multiindex, op_name)(other._multiindex,
sort=sort)
result_name = get_op_result_name(self, other)
Expand All @@ -1123,7 +1205,6 @@ def is_all_dates(self):
return False

union = _setop('union')
intersection = _setop('intersection', sort=False)
difference = _setop('difference')
symmetric_difference = _setop('symmetric_difference')

Expand Down
134 changes: 0 additions & 134 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,140 +795,6 @@ def test_non_contiguous(self, closed):

assert 1.5 not in index

@pytest.mark.parametrize("sort", [None, False])
def test_union(self, closed, sort):
index = self.create_index(closed=closed)
other = IntervalIndex.from_breaks(range(5, 13), closed=closed)

expected = IntervalIndex.from_breaks(range(13), closed=closed)
result = index[::-1].union(other, sort=sort)
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

result = other[::-1].union(index, sort=sort)
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

tm.assert_index_equal(index.union(index, sort=sort), index)
tm.assert_index_equal(index.union(index[:1], sort=sort), index)

# GH 19101: empty result, same dtype
index = IntervalIndex(np.array([], dtype='int64'), closed=closed)
result = index.union(index, sort=sort)
tm.assert_index_equal(result, index)

# GH 19101: empty result, different dtypes
other = IntervalIndex(np.array([], dtype='float64'), closed=closed)
result = index.union(other, sort=sort)
tm.assert_index_equal(result, index)

@pytest.mark.parametrize("sort", [None, False])
def test_intersection(self, closed, sort):
index = self.create_index(closed=closed)
other = IntervalIndex.from_breaks(range(5, 13), closed=closed)

expected = IntervalIndex.from_breaks(range(5, 11), closed=closed)
result = index[::-1].intersection(other, sort=sort)
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

result = other[::-1].intersection(index, sort=sort)
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

tm.assert_index_equal(index.intersection(index, sort=sort), index)

# GH 19101: empty result, same dtype
other = IntervalIndex.from_breaks(range(300, 314), closed=closed)
expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
result = index.intersection(other, sort=sort)
tm.assert_index_equal(result, expected)

# GH 19101: empty result, different dtypes
breaks = np.arange(300, 314, dtype='float64')
other = IntervalIndex.from_breaks(breaks, closed=closed)
result = index.intersection(other, sort=sort)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("sort", [None, False])
def test_difference(self, closed, sort):
index = IntervalIndex.from_arrays([1, 0, 3, 2],
[1, 2, 3, 4],
closed=closed)
result = index.difference(index[:1], sort=sort)
expected = index[1:]
if sort is None:
expected = expected.sort_values()
tm.assert_index_equal(result, expected)

# GH 19101: empty result, same dtype
result = index.difference(index, sort=sort)
expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
tm.assert_index_equal(result, expected)

# GH 19101: empty result, different dtypes
other = IntervalIndex.from_arrays(index.left.astype('float64'),
index.right, closed=closed)
result = index.difference(other, sort=sort)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("sort", [None, False])
def test_symmetric_difference(self, closed, sort):
index = self.create_index(closed=closed)
result = index[1:].symmetric_difference(index[:-1], sort=sort)
expected = IntervalIndex([index[0], index[-1]])
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

# GH 19101: empty result, same dtype
result = index.symmetric_difference(index, sort=sort)
expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
if sort is None:
tm.assert_index_equal(result, expected)
assert tm.equalContents(result, expected)

# GH 19101: empty result, different dtypes
other = IntervalIndex.from_arrays(index.left.astype('float64'),
index.right, closed=closed)
result = index.symmetric_difference(other, sort=sort)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('op_name', [
'union', 'intersection', 'difference', 'symmetric_difference'])
@pytest.mark.parametrize("sort", [None, False])
def test_set_incompatible_types(self, closed, op_name, sort):
index = self.create_index(closed=closed)
set_op = getattr(index, op_name)

# TODO: standardize return type of non-union setops type(self vs other)
# non-IntervalIndex
if op_name == 'difference':
expected = index
else:
expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3]))
result = set_op(Index([1, 2, 3]), sort=sort)
tm.assert_index_equal(result, expected)

# mixed closed
msg = ('can only do set operations between two IntervalIndex objects '
'that are closed on the same side')
for other_closed in {'right', 'left', 'both', 'neither'} - {closed}:
other = self.create_index(closed=other_closed)
with pytest.raises(ValueError, match=msg):
set_op(other, sort=sort)

# GH 19016: incompatible dtypes
other = interval_range(Timestamp('20180101'), periods=9, closed=closed)
msg = ('can only do {op} between two IntervalIndex objects that have '
'compatible dtypes').format(op=op_name)
with pytest.raises(TypeError, match=msg):
set_op(other, sort=sort)

def test_isin(self, closed):
index = self.create_index(closed=closed)

Expand Down
Loading