Skip to content

REF: de-duplicate Index._intersection + MultiIndex._intersection #41824

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 25 additions & 12 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3116,7 +3116,6 @@ def _intersection(self, other: Index, sort=False):
intersection specialized to the case with matching dtypes.
"""
# TODO(EA): setops-refactor, clean all this up
lvals = self._values

if self.is_monotonic and other.is_monotonic:
try:
Expand All @@ -3128,21 +3127,35 @@ def _intersection(self, other: Index, sort=False):
res = algos.unique1d(result)
return ensure_wrapped_if_datetimelike(res)

try:
indexer = other.get_indexer(lvals)
except InvalidIndexError:
# InvalidIndexError raised by get_indexer if non-unique
indexer, _ = other.get_indexer_non_unique(lvals)
res_values = self._intersection_via_get_indexer(other, sort=sort)
res_values = _maybe_try_sort(res_values, sort)
return res_values

mask = indexer != -1
indexer = indexer.take(mask.nonzero()[0])
def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike:
    """
    Find the intersection of two Indexes using get_indexer.

    Both operands are de-duplicated first, the de-duplicated ``other`` is
    located inside the de-duplicated ``self``, and the matching positions
    are taken from ``self``.

    Parameters
    ----------
    other : Index
        The index to intersect with ``self``.
    sort : False or None
        Only ``sort is False`` is acted on here: the matching positions are
        put in ascending order so the result follows the order of ``self``.
        For any other value the positions are left in the order in which
        the matches occur in the de-duplicated ``other``; the caller is
        expected to sort the result afterwards.

    Returns
    -------
    np.ndarray or ExtensionArray
        The returned array will be unique.
    """
    # Note: drop_duplicates vs unique matters for MultiIndex, though
    #  it should not, see GH#41823
    left_unique = self.drop_duplicates()
    right_unique = other.drop_duplicates()

    # For each element of right_unique: its position in left_unique,
    # or -1 when it is absent from left_unique.
    indexer = left_unique.get_indexer(right_unique)

    mask = indexer != -1

    # Keep only the positions of elements present in both operands.
    taker = indexer.take(mask.nonzero()[0])
    if sort is False:
        # sort because we want the elements in the same order they are in
        # self; unnecessary in the case with sort=None because the caller
        # will sort later
        taker = np.sort(taker)

    # Both inputs were de-duplicated, so every position occurs at most once
    # and the taken values are unique.
    result = left_unique.take(taker)._values
    return result

@final
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3624,9 +3624,7 @@ def _intersection(self, other, sort=False) -> MultiIndex:
uniq_tuples = algos.unique(inner_tuples)

if uniq_tuples is None:
left_unique = self.drop_duplicates()
indexer = left_unique.get_indexer(other.drop_duplicates())
uniq_tuples = left_unique.take(np.sort(indexer[indexer != -1]))
uniq_tuples = self._intersection_via_get_indexer(other, sort)

if sort is None:
uniq_tuples = sorted(uniq_tuples)
Expand Down