Skip to content

Issue 28518 multiindex intersection #28735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
18 changes: 13 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,15 @@ def _verify_integrity(self, codes=None, levels=None):
"Level values must be unique: {values} on "
"level {level}".format(values=[value for value in level], level=i)
)
if self.sortorder is not None:
if int(self.sortorder) > self._lexsort_depth():
raise ValueError(
"Value for sortorder must be inferior or equal "
"to actual lexsort_depth: "
"sortorder {sortorder} with lexsort_depth {lexsort_depth}".format(
sortorder=self.sortorder, lexsort_depth=self._lexsort_depth()
)
)

codes = [
self._validate_codes(level, code) for level, code in zip(levels, codes)
Expand Down Expand Up @@ -1783,16 +1792,15 @@ def is_lexsorted(self):
@cache_readonly
def lexsort_depth(self):
if self.sortorder is not None:
if self.sortorder == 0:
return self.nlevels
else:
return 0
return int(self.sortorder)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for int(...) here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took the line from the constructor, line 278, but if it is already cast in the constructor (maybe to handle passing False for sortorder), it may not be necessary here. I will check whether the tests pass without it.


return self._lexsort_depth()

def _lexsort_depth(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I was wanting a
def _lexsort_depth(self) -> int: here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sorry, I didn't get the "hint".
Joking aside, is the reason the function needs typing related to the caching of its result?

int64_codes = [ensure_int64(level_codes) for level_codes in self.codes]
for k in range(self.nlevels, 0, -1):
if libalgos.is_lexsorted(int64_codes[:k]):
return k

return 0

def _sort_levels_monotonic(self):
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def test_slice_locs_not_contained():
index = MultiIndex(
levels=[[0, 2, 4, 6], [0, 2, 4]],
codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]],
sortorder=0,
)

result = index.slice_locs((1, 0), (5, 2))
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2094,3 +2094,33 @@ def test_merge_equal_cat_dtypes2():

# Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False)


def test_merge_multiindex_columns():
# Issue #28518
# Verify that merging two dataframes give the expected labels
# The original cause of this issue come from a bug lexsort_depth and is tested in
# test_lexsort_depth

index_tuples = []
letters = ["a", "b", "c", "d"]
numbers = ["1", "2", "3"]

for l in letters:
for n in numbers:
index_tuples.append([l, n])

index = pd.MultiIndex.from_tuples(index_tuples, names=["outer", "inner"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you not just use from_product here instead to replace the two loops above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, clearly.


frame_x = pd.DataFrame(columns=index)
frame_x["id"] = ""

frame_y = pd.DataFrame(columns=index)
frame_y["id"] = ""

l_suf = "_x"
r_suf = "_y"
expected_labels = sum(([l + l_suf, l + r_suf] for l in letters), [])
merged_frame = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))).columns
for label in expected_labels:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of asserting within a loop, it would be easier if you built a result and an expected frame and used tm.assert_frame_equal to compare them. You will find examples of that in other tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't think about it this way. It does make more sense.

assert label in merged_frame
47 changes: 47 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2056,6 +2056,53 @@ def test_is_lexsorted(self):
assert not index.is_lexsorted()
assert index.lexsort_depth == 0

def test_raise_invalid_sortorder(self):
# Test that the MultiIndex constructor raises a ValueError when the given
# sortorder is greater than the actual lexsort depth of the codes.
# Issue #28518

levels = [[0, 1], [0, 1, 2]]

# Correct sortorder: both code arrays are lexsorted, so this must not raise
MultiIndex(
levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
)

# Second level is out of order within the last group: the expected error
# message reports lexsort_depth 1, which is below the requested sortorder=2
with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"):
MultiIndex(
levels=levels,
codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]],
sortorder=2,
)

# First level is itself unsorted: the expected error message reports
# lexsort_depth 0, which is below the requested sortorder=1
with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"):
MultiIndex(
levels=levels,
codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]],
sortorder=1,
)

def test_lexsort_depth(self):
# Test that lexsort_depth returns the sortorder that was passed
# to the MultiIndex constructor.
# Issue #28518

levels = [[0, 1], [0, 1, 2]]

# Both code arrays are lexsorted; sortorder=2 is reported back unchanged
index = MultiIndex(
levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2
)
assert index.lexsort_depth == 2

# Second level is out of order within the last group; sortorder=1 is reported
index = MultiIndex(
levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1
)
assert index.lexsort_depth == 1

# First level is itself unsorted; sortorder=0 is reported
index = MultiIndex(
levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0
)
assert index.lexsort_depth == 0

def test_sort_index_and_reconstruction(self):

# 15622
Expand Down