Skip to content

BUG: Fix bug with symmetric difference of two equal MultiIndexes GH13490 #13504

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ Bug Fixes
- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`)
- Bug in ``.rolling()`` that allowed a negative integer window in construction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`)


- Bug in ``MultiIndex.symmetric_difference`` with two equal MultiIndexes (:issue:`13490`)
- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`)
- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`)
- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`)
Expand Down Expand Up @@ -526,4 +526,4 @@ Bug Fixes
- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`)


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
73 changes: 61 additions & 12 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2042,9 +2042,7 @@ def intersection(self, other):
other_tuples = other._values
uniq_tuples = sorted(set(self_tuples) & set(other_tuples))
if len(uniq_tuples) == 0:
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names, verify_integrity=False)
return self._create_as_empty(names=result_names)
else:
return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,
names=result_names)
Expand All @@ -2053,6 +2051,10 @@ def difference(self, other):
"""
Compute sorted set difference of two MultiIndex objects

Parameters
----------
other : MultiIndex or array / Index of tuples

Returns
-------
diff : MultiIndex
Expand All @@ -2064,20 +2066,69 @@ def difference(self, other):
return self

if self.equals(other):
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names, verify_integrity=False)
return self._create_as_empty(names=result_names)

difference = sorted(set(self._values) - set(other._values))

if len(difference) == 0:
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names, verify_integrity=False)
return self._create_as_empty(names=result_names)
else:
return MultiIndex.from_tuples(difference, sortorder=0,
names=result_names)

def _create_as_empty(self, nlevels=None, names=None,
                     verify_integrity=False):
    """
    Create an empty MultiIndex.

    Parameters
    ----------
    nlevels : int, optional
        The number of levels in the empty MultiIndex. If None, defaults
        to the number of levels of this index.
    names : sequence of objects, optional
        Names for each of the index levels. If None, defaults to the
        names of this index.
    verify_integrity : boolean, default False
        Check that the levels/labels are consistent and valid

    Returns
    -------
    empty : MultiIndex
    """
    if nlevels is None:
        nlevels = len(self.levels)
    if names is None:
        names = self.names

    # Build one distinct empty list per level; ``[[]] * nlevels`` would
    # repeat a single shared list object ``nlevels`` times.
    return MultiIndex(levels=[[] for _ in range(nlevels)],
                      labels=[[] for _ in range(nlevels)],
                      names=names, verify_integrity=verify_integrity)

def symmetric_difference(self, other, result_name=None):
"""
Compute sorted set symmetric difference of two MultiIndex objects
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add parameters section


Returns
-------
diff : MultiIndex
"""
self._assert_can_do_setop(other)
other, result_name_update = self._convert_can_do_setop(other)

if result_name is None:
result_name = result_name_update

if self.equals(other):
return self._create_as_empty(names=result_name)

difference = sorted(set((self.difference(other)).
union(other.difference(self))))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a comment:

This sorting breaks with mixed-int values (same issue as in #13432). I suppose this line may be safely replaced with the code from PR #13514, slightly changed, namely:

        this = self._get_unique_index()
        other = other._get_unique_index()
        indexer = this.get_indexer(other)

        # {this} minus {other}
        common_indexer = indexer.take((indexer != -1).nonzero()[0])
        left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,
                                    assume_unique=True)
        left_diff = this.values.take(left_indexer)

        # {other} minus {this}
        right_indexer = (indexer == -1).nonzero()[0]
        right_diff = other.values.take(right_indexer)

        the_diff = _concat._concat_compat([left_diff, right_diff])

Then if the_diff is empty, an empty MI should be returned, otherwise:

        return self._shallow_copy(the_diff, names=result_name).sortlevel()

(In comparison to Index.symmetric_difference sorting is postponed until the end.)

Similar change may be applied to MultiIndex.difference, I guess.


return MultiIndex.from_tuples(difference, sortorder=0,
names=result_name)

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
if not is_object_dtype(np.dtype(dtype)):
Expand All @@ -2092,9 +2143,7 @@ def _convert_can_do_setop(self, other):

if not hasattr(other, 'names'):
if len(other) == 0:
other = MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
verify_integrity=False)
other = self._create_as_empty()
else:
msg = 'other must be a MultiIndex or a list of tuples'
try:
Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,13 +752,6 @@ def test_symmetric_difference(self):
self.assertTrue(tm.equalContents(result, expected))
self.assertIsNone(result.name)

# multiIndex
idx1 = MultiIndex.from_tuples(self.tuples)
idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)])
result = idx1.symmetric_difference(idx2)
expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
self.assertTrue(tm.equalContents(result, expected))

# nans:
# GH #6444, sorting of nans. Make sure the number of nans is right
# and the correct non-nan values are there. punt on sorting.
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,20 @@ def test_difference(self):
assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list"
" of tuples", first.difference, [1, 2, 3, 4, 5])

def test_symmetric_difference(self):
    # Two named MultiIndexes; the symmetric difference is expected to
    # contain only ('foo', 'two').
    idx1 = MultiIndex.from_tuples(self.index, names=('A', 'B'))
    idx2 = MultiIndex.from_tuples([('foo', 'one'), ('bar', 'one'),
                                   ('baz', 'two'), ('qux', 'two'),
                                   ('qux', 'one')], names=('A', 'B'))
    result = idx1.symmetric_difference(idx2)
    expected = MultiIndex.from_tuples([('foo', 'two')], names=('A', 'B'))
    tm.assert_index_equal(result, expected)

    # Test for equal multiIndexes: the result must be empty. The expected
    # value is built from the input index rather than from ``result`` so
    # the assertion does not depend on the value under test.
    result = self.index.symmetric_difference(self.index)
    expected = self.index._create_as_empty()
    self.assertEqual(len(result), 0)
    tm.assert_index_equal(result, expected)

def test_from_tuples(self):
assertRaisesRegexp(TypeError, 'Cannot infer number of levels from'
' empty list', MultiIndex.from_tuples, [])
Expand Down