Skip to content

Issue 28518 multiindex intersection #28735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ Missing
MultiIndex
^^^^^^^^^^

- Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if the ``verify_integrity`` parameter is ``True`` (the default)
-
-

Expand Down Expand Up @@ -288,6 +289,7 @@ Reshaping
- Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`)
- Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue:`28301`)
- Bug in :func:`merge_asof` where :class:`datetime.timedelta` could not be used for the ``tolerance`` kwarg (:issue:`28098`)
- Bug in :func:`merge` where suffixes were not appended correctly with MultiIndex (:issue:`28518`)

Sparse
^^^^^^
Expand Down
26 changes: 21 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,15 @@ def _verify_integrity(self, codes=None, levels=None):
"Level values must be unique: {values} on "
"level {level}".format(values=[value for value in level], level=i)
)
if self.sortorder is not None:
if self.sortorder > self._lexsort_depth():
raise ValueError(
"Value for sortorder must be inferior or equal "
"to actual lexsort_depth: "
"sortorder {sortorder} with lexsort_depth {lexsort_depth}".format(
sortorder=self.sortorder, lexsort_depth=self._lexsort_depth()
)
)

codes = [
self._validate_codes(level, code) for level, code in zip(levels, codes)
Expand Down Expand Up @@ -1783,16 +1792,23 @@ def is_lexsorted(self):
@cache_readonly
def lexsort_depth(self):
    """
    Return the lexsort depth of this MultiIndex.

    An explicitly set ``sortorder`` takes precedence; otherwise the depth
    is computed by ``_lexsort_depth``.

    Returns
    -------
    int
    """
    # A declared sortorder is reported as-is. The previous mapping
    # (0 -> nlevels, anything else -> 0) reported the wrong depth.
    if self.sortorder is not None:
        return self.sortorder

    return self._lexsort_depth()

def _lexsort_depth(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right was wanting a
def _lexsort_depth(self) -> int: here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sorry, I didn't get the "hint".
Joking aside, does the function need typing because its result is cached?

"""
Compute and return the lexsort_depth, the number of levels of the
MultiIndex that are sorted lexically

Returns
-------
int
"""
int64_codes = [ensure_int64(level_codes) for level_codes in self.codes]
for k in range(self.nlevels, 0, -1):
if libalgos.is_lexsorted(int64_codes[:k]):
return k

return 0

def _sort_levels_monotonic(self):
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def test_slice_locs_not_contained():
index = MultiIndex(
levels=[[0, 2, 4, 6], [0, 2, 4]],
codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]],
sortorder=0,
)

result = index.slice_locs((1, 0), (5, 2))
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2094,3 +2094,33 @@ def test_merge_equal_cat_dtypes2():

# Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False)


def test_merge_multiindex_columns():
    # GH 28518
    # Merging two frames that share MultiIndex columns must give the
    # suffixed outer labels. The root cause of the issue was a bug in
    # lexsort_depth, which is tested in test_lexsort_depth.
    outer_labels = ["a", "b", "c", "d"]
    inner_labels = ["1", "2", "3"]
    columns = pd.MultiIndex.from_product(
        (outer_labels, inner_labels), names=["outer", "inner"]
    )

    left = pd.DataFrame(columns=columns)
    left["id"] = ""
    right = pd.DataFrame(columns=columns)
    right["id"] = ""

    left_suffix = "_x"
    right_suffix = "_y"
    result = left.merge(right, on="id", suffixes=(left_suffix, right_suffix))

    # Build the expected frame: each outer label appears once per suffix
    suffixed_labels = [label + left_suffix for label in outer_labels]
    suffixed_labels += [label + right_suffix for label in outer_labels]
    expected_columns = pd.MultiIndex.from_product(
        [suffixed_labels, inner_labels], names=["outer", "inner"]
    )
    expected = pd.DataFrame(columns=expected_columns)
    expected["id"] = ""

    tm.assert_frame_equal(result, expected)
47 changes: 47 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2056,6 +2056,53 @@ def test_is_lexsorted(self):
assert not index.is_lexsorted()
assert index.lexsort_depth == 0

def test_raise_invalid_sortorder(self):
    # GH 28518
    # The MultiIndex constructor must raise when an incorrect sortorder
    # is given.
    levels = [[0, 1], [0, 1, 2]]

    # Valid sortorder: construction must not raise
    MultiIndex(
        levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
    )

    # (codes, claimed sortorder, expected error-message pattern)
    rejected = [
        (
            [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]],
            2,
            r".* sortorder 2 with lexsort_depth 1.*",
        ),
        (
            [[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]],
            1,
            r".* sortorder 1 with lexsort_depth 0.*",
        ),
    ]
    for codes, sortorder, pattern in rejected:
        with pytest.raises(ValueError, match=pattern):
            MultiIndex(levels=levels, codes=codes, sortorder=sortorder)

def test_lexsort_depth(self):
    # GH 28518
    # lexsort_depth must return the sortorder that was passed to the
    # MultiIndex constructor.
    levels = [[0, 1], [0, 1, 2]]

    # (codes, sortorder given to the constructor)
    cases = [
        ([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], 2),
        ([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], 1),
        ([[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], 0),
    ]
    for codes, sortorder in cases:
        index = MultiIndex(levels=levels, codes=codes, sortorder=sortorder)
        assert index.lexsort_depth == sortorder

def test_sort_index_and_reconstruction(self):

# 15622
Expand Down