From f2ff405037f5d741407226ed1beca6fd3e931bc6 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sat, 25 Jan 2020 15:35:31 +0100 Subject: [PATCH 01/11] TST: Fix MultiIndex intersection test not testing sort=False order --- pandas/tests/indexes/multi/test_setops.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index f949db537de67..dad4d39d88e78 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -20,21 +20,17 @@ def test_set_ops_error_cases(idx, case, sort, method): @pytest.mark.parametrize("sort", [None, False]) def test_intersection_base(idx, sort): - first = idx[:5] - second = idx[:3] - intersect = first.intersection(second, sort=sort) - - if sort is None: - tm.assert_index_equal(intersect, second.sort_values()) - assert tm.equalContents(intersect, second) + first = idx[2::-1] # first 3 elements reversed + second = idx[:5] - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.intersection(case, sort=sort) + array_like_cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in [second, *array_like_cases]: + intersect = first.intersection(case, sort=sort) if sort is None: - tm.assert_index_equal(result, second.sort_values()) - assert tm.equalContents(result, second) + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(intersect, expected) msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): From ffa3f94ccc95ab47365b4f217fa1f86f410febb0 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sat, 25 Jan 2020 15:37:39 +0100 Subject: [PATCH 02/11] TST: Fix MultiIndex union test not testing sort=False order --- pandas/tests/indexes/multi/test_setops.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index dad4d39d88e78..151e460f8fab0 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -39,21 +39,17 @@ def test_intersection_base(idx, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_base(idx, sort): - first = idx[3:] + first = idx[::-1] second = idx[:5] - everything = idx - union = first.union(second, sort=sort) - if sort is None: - tm.assert_index_equal(union, everything.sort_values()) - assert tm.equalContents(union, everything) - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case, sort=sort) + array_like_cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in [second, *array_like_cases]: + union = first.union(case, sort=sort) if sort is None: - tm.assert_index_equal(result, everything.sort_values()) - assert tm.equalContents(result, everything) + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(union, expected) msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): From 391548e38fc653672c858f83ad40ec415fc1f737 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sat, 25 Jan 2020 16:43:46 +0100 Subject: [PATCH 03/11] BUG: Fix MultiIndex intersection with sort=False --- pandas/core/indexes/multi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 94d6564d372c7..6a2b4f415c053 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3314,9 +3314,11 @@ def intersection(self, other, sort=False): if self.equals(other): return self - self_tuples = self._ndarray_values - other_tuples = other._ndarray_values - uniq_tuples = set(self_tuples) & set(other_tuples) + uniq_other = set(other.values) + seen = set() + uniq_tuples = [ + x for x in self.values if x in uniq_other and not (x in seen or seen.add(x)) + ] if sort is None: uniq_tuples = sorted(uniq_tuples) From 0f64c567ebd3db2eeae81ac9c874ef2168aacec2 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sun, 26 Jan 2020 11:25:21 +0100 Subject: [PATCH 04/11] Parametrize MultiIndex intersection and union tests --- pandas/tests/indexes/multi/test_setops.py | 40 +++++++++++++---------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 151e460f8fab0..627127f7b5b53 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -19,18 +19,20 @@ def test_set_ops_error_cases(idx, case, sort, method): @pytest.mark.parametrize("sort", [None, False]) -def test_intersection_base(idx, sort): +@pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) +def test_intersection_base(idx, sort, klass): first = idx[2::-1] # first 3 elements reversed second = idx[:5] - array_like_cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in [second, *array_like_cases]: - intersect = first.intersection(case, sort=sort) - if sort is None: - expected = first.sort_values() - else: - expected = first - tm.assert_index_equal(intersect, expected) + if klass is not MultiIndex: + second = klass(second.values) + + intersect = first.intersection(second, sort=sort) + if sort is None: + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(intersect, expected) msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): @@ -38,18 +40,20 @@ def test_intersection_base(idx, sort): @pytest.mark.parametrize("sort", [None, False]) -def test_union_base(idx, sort): +@pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) +def test_union_base(idx, sort, klass): first = idx[::-1] second = idx[:5] - array_like_cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in [second, *array_like_cases]: - union = first.union(case, sort=sort) - if sort is None: - expected = first.sort_values() - else: - expected = first - tm.assert_index_equal(union, expected) + if klass is not MultiIndex: + second = klass(second.values) + + union = first.union(second, sort=sort) + if sort is None: + expected = first.sort_values() + else: + expected = first + tm.assert_index_equal(union, expected) msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): From 8858e7869fbf9e45cb90a0bd7df49e3142411e05 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 3 Feb 2020 17:50:54 +0100 Subject: [PATCH 05/11] Improve performance of intersection of MultiIndex --- asv_bench/benchmarks/multiindex_object.py | 26 +++++++++++++++++++++++ pandas/core/indexes/multi.py | 10 +++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 0e188c58012fa..35eb94566d9a2 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -160,4 +160,30 @@ def time_equals_non_object_index(self): self.mi_large_slow.equals(self.idx_non_object) +class SetOperations: + + params = [ + ("monotonic", "non_monotonic"), + ("intersection", "union", "symmetric_difference"), + ] + param_names = ["index_structure", "method"] + + def setup(self, index_structure, method): + N = 10 ** 5 + values = np.arange(N) + + if index_structure == "monotonic": + mi = MultiIndex.from_arrays([values, values]) + else: + mi = MultiIndex.from_arrays( + [np.concatenate([values[::2], values[1::2]]), values] + ) + + self.left = mi + self.right = mi[:-1] + + def time_operation(self, index_structure, method): + getattr(self.left, method)(self.right) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6a2b4f415c053..1b19a4cadf25c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3314,10 +3314,16 @@ def intersection(self, other, sort=False): if self.equals(other): return self - uniq_other = set(other.values) + lvals = self._ndarray_values + rvals = other._ndarray_values + + if self.is_monotonic and other.is_monotonic: + return self._inner_indexer(lvals, rvals)[0] + + other_uniq = set(rvals) seen = set() uniq_tuples = [ - x for x in self.values if x in uniq_other and not (x in seen or seen.add(x)) + x for x in lvals if x in other_uniq and not (x in seen or seen.add(x)) ] if sort is None: From a0d3a3d1ae4bb28820c967b0a6c420538eb76024 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 3 Feb 2020 23:06:56 +0100 Subject: [PATCH 06/11] BUG: Fix MultiIndex intersection on non-numeric monotonic indexes --- pandas/core/indexes/multi.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1b19a4cadf25c..b106342d1c665 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3317,14 +3317,19 @@ def intersection(self, other, sort=False): lvals = self._ndarray_values rvals = other._ndarray_values + uniq_tuples = None if self.is_monotonic and other.is_monotonic: - return self._inner_indexer(lvals, rvals)[0] + try: + uniq_tuples = self._inner_indexer(lvals, rvals)[0] + except TypeError: + pass - other_uniq = set(rvals) - seen = set() - uniq_tuples = [ - x for x in lvals if x in other_uniq and not (x in seen or seen.add(x)) - ] + if uniq_tuples is None: + other_uniq = set(rvals) + seen = set() + uniq_tuples = [ + x for x in lvals if x in other_uniq and not (x in seen or seen.add(x)) + ] if sort is None: uniq_tuples = sorted(uniq_tuples) From 058912c25fe95adfef48aef946f653334a1d5e2d Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 3 Feb 2020 23:31:06 +0100 Subject: [PATCH 07/11] PERF: Benchmark MultiIndex set ops on multiple dtypes --- asv_bench/benchmarks/multiindex_object.py | 37 +++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 35eb94566d9a2..793f0c7c03c77 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -164,25 +164,38 @@ class SetOperations: params = [ ("monotonic", "non_monotonic"), + ("datetime", "int", "string"), ("intersection", "union", "symmetric_difference"), ] - param_names = ["index_structure", "method"] + param_names = ["index_structure", "dtype", "method"] - def setup(self, index_structure, method): + def setup(self, index_structure, dtype, method): N = 10 ** 5 - values = np.arange(N) + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_left = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_left = MultiIndex.from_product([level1, level2]) + + level2 = tm.makeStringIndex(N // 1000).values + str_left = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_left, + "int": int_left, + "string": str_left, + } - if index_structure == "monotonic": - mi = MultiIndex.from_arrays([values, values]) - else: - mi = MultiIndex.from_arrays( - [np.concatenate([values[::2], values[1::2]]), values] - ) + if index_structure == "non_monotonic": + data = {k: mi[::-1] for k, mi in data.items()} - self.left = mi - self.right = mi[:-1] + data = {k: {"left": mi, "right": mi[:-1]} for k, mi in data.items()} + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] - def time_operation(self, index_structure, method): + def time_operation(self, index_structure, dtype, method): getattr(self.left, method)(self.right) From ed33d00e4d8e8a6e004c5dcda0bc8528621c7930 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 4 Feb 2020 11:43:49 +0100 Subject: [PATCH 08/11] PERF: Avoid sorting twice in intersection of numeric MultiIndex --- pandas/core/indexes/multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b106342d1c665..30685e6fb12a9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3321,6 +3321,7 @@ def intersection(self, other, sort=False): if self.is_monotonic and other.is_monotonic: try: uniq_tuples = self._inner_indexer(lvals, rvals)[0] + sort = False # uniq_tuples is already sorted except TypeError: pass From 0fb095176b4495e2679149bc2f4ff01df087195b Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 10 Feb 2020 10:41:09 +0100 Subject: [PATCH 09/11] Comment approach taken in MultiIndex.intersection --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 30685e6fb12a9..4edfa078dd919 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3317,7 +3317,7 @@ def intersection(self, other, sort=False): lvals = self._ndarray_values rvals = other._ndarray_values - uniq_tuples = None + uniq_tuples = None # flag whether _inner_indexer was succesful if self.is_monotonic and other.is_monotonic: try: uniq_tuples = self._inner_indexer(lvals, rvals)[0] From 7fb060d1126fdf54541daf10bbaa8ddaad8dc413 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 10 Feb 2020 12:31:43 +0100 Subject: [PATCH 10/11] DOC: whatsnew for #31325 --- doc/source/whatsnew/v1.1.0.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40abb8f83de2f..d19d6e86292be 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -175,6 +175,16 @@ MultiIndex index=[["a", "a", "b", "b"], [1, 2, 1, 2]]) # Rows are now ordered as the requested keys df.loc[(['b', 'a'], [2, 1]), :] + +- Bug in :meth:`MultiIndex.intersection` was not preserving order when ``sort=False``. (:issue:`31325`) + +.. ipython:: python + + left = pd.MultiIndex.from_arrays([["b", "a"], [2, 1]]) + right = pd.MultiIndex.from_arrays([["a", "b", "c"], [1, 2, 3]]) + # Common elements are now ordered by the left side + left.intersection(right, sort=False) + - I/O From b790aeffcb5f12b71296aca0e304f701fdbc180f Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 11 Feb 2020 11:38:02 +0100 Subject: [PATCH 11/11] DOC: whatsnew #31325 more explicit --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d19d6e86292be..aeed59f5e80f2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -176,13 +176,13 @@ MultiIndex # Rows are now ordered as the requested keys df.loc[(['b', 'a'], [2, 1]), :] -- Bug in :meth:`MultiIndex.intersection` was not preserving order when ``sort=False``. (:issue:`31325`) +- Bug in :meth:`MultiIndex.intersection` was not guaranteed to preserve order when ``sort=False``. (:issue:`31325`) .. ipython:: python left = pd.MultiIndex.from_arrays([["b", "a"], [2, 1]]) right = pd.MultiIndex.from_arrays([["a", "b", "c"], [1, 2, 3]]) - # Common elements are now ordered by the left side + # Common elements are now guaranteed to be ordered by the left side left.intersection(right, sort=False) -