From 8c9f29bd7f1212ce40e248cf908b23d6849c406a Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 17 Oct 2019 13:04:16 -0400 Subject: [PATCH 01/11] PERF: Check shape compatibility before full array equivalence --- pandas/core/indexes/multi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b0a1ed0650f7c..407450d6743a9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3146,16 +3146,16 @@ def equals(self, other): if not isinstance(other, Index): return False - if not isinstance(other, MultiIndex): - other_vals = com.values_from_object(ensure_index(other)) - return array_equivalent(self._ndarray_values, other_vals) - if self.nlevels != other.nlevels: return False if len(self) != len(other): return False + if not isinstance(other, MultiIndex): + other_vals = com.values_from_object(ensure_index(other)) + return array_equivalent(self._ndarray_values, other_vals) + for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] From 606bb8ed8339d09173cee3b3bc67c8b05400a6b5 Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 17 Oct 2019 15:25:36 -0400 Subject: [PATCH 02/11] Allow d-level MultiIndex to be equivalent to d-tuple object Index --- pandas/core/indexes/multi.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 407450d6743a9..dc90b9210dd2d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3146,16 +3146,20 @@ def equals(self, other): if not isinstance(other, Index): return False - if self.nlevels != other.nlevels: - return False - if len(self) != len(other): return False if not isinstance(other, MultiIndex): + if not other.is_object(): # d-level MultiIndex can equal d-tuple Index + if self.nlevels != other.nlevels: + return False + other_vals = com.values_from_object(ensure_index(other)) return array_equivalent(self._ndarray_values, other_vals) + if self.nlevels != other.nlevels: + return False + for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] From c3235c073a0205ca5a17f5a1d98e98f148ead8bf Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 17 Oct 2019 15:43:20 -0400 Subject: [PATCH 03/11] Use is_object_type() instead of Index.is_object() --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dc90b9210dd2d..b626b4099f995 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3150,7 +3150,7 @@ def equals(self, other): return False if not isinstance(other, MultiIndex): - if not other.is_object(): # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(other): # d-level MultiIndex can equal d-tuple Index if self.nlevels != other.nlevels: return False From f52b992de7e3913d4c7e8491f39cf57404256dba Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 17 Oct 2019 15:48:05 -0400 Subject: [PATCH 04/11] Revert length comparison to previous position --- pandas/core/indexes/multi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b626b4099f995..5fc806fd9dce7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3146,9 +3146,6 @@ def equals(self, other): if not isinstance(other, Index): return False - if len(self) != len(other): - return False - if not isinstance(other, MultiIndex): if not is_object_dtype(other): # d-level MultiIndex can equal d-tuple Index if self.nlevels != other.nlevels: @@ -3160,6 +3157,9 @@ def equals(self, other): if self.nlevels != other.nlevels: return False + if len(self) != len(other): + return False + for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] From 61ec0f2e7caef34e2b01d09c7d168bac006972ee Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 17 Oct 2019 16:11:13 -0400 Subject: [PATCH 05/11] PERF: Check shape compatibility before full array equivalence --- pandas/core/indexes/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 526b2c2e2c412..fe02515ae3ad5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4333,6 +4333,11 @@ def equals(self, other): # if other is not object, use other's logic for coercion return other.equals(self) + if isinstance(other, ABCMultiIndex): + if not is_object_dtype(self): # d-level MultiIndex can equal d-tuple Index + if self.nlevels != other.nlevels: + return False + return array_equivalent( com.values_from_object(self), com.values_from_object(other) ) From b2b6f05ff61c000c78b2b9d119b0c4faf2725dd2 Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Mon, 21 Oct 2019 14:54:37 -0400 Subject: [PATCH 06/11] Add whatsnew entry for issue #29134 --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 48c1173a372a7..7437ec61bd6e2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -242,6 +242,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`) - Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`) +- Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) .. _whatsnew_1000.bug_fixes: From bde3d382a6e4beb18c0db8baf7dddbdb54ce9241 Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Mon, 2 Dec 2019 18:00:12 -0500 Subject: [PATCH 07/11] Invoke is_object_dtype() on dtype attribute --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fe02515ae3ad5..f23b5d712a17f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4334,7 +4334,7 @@ def equals(self, other): return other.equals(self) if isinstance(other, ABCMultiIndex): - if not is_object_dtype(self): # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(self.dtype): # d-level MultiIndex can equal d-tuple Index if self.nlevels != other.nlevels: return False diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5fc806fd9dce7..1b83e5a3ef2f4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3147,7 +3147,7 @@ def equals(self, other): return False if not isinstance(other, MultiIndex): - if not is_object_dtype(other): # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(other.dtype): # d-level MultiIndex can equal d-tuple Index if self.nlevels != other.nlevels: return False From c76005e41e9f479f9ff2b04595006d33a2ba389c Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Tue, 3 Dec 2019 14:21:18 -0500 Subject: [PATCH 08/11] Add ASV benchmarks for index-multiindex equality --- asv_bench/benchmarks/index_object.py | 14 ++++++++++++++ asv_bench/benchmarks/multiindex_object.py | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index a94960d494707..de31eaec9d502 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -7,6 +7,7 @@ Float64Index, Index, IntervalIndex, + MultiIndex, RangeIndex, Series, date_range, @@ -111,6 +112,19 @@ def time_get_loc_dec(self): self.idx_dec.get_loc(100000) +class IndexEquals: + def setup(self): + idx_large_fast = RangeIndex(1000000) + idx_small_slow = date_range(start="1/1/2012", periods=1) + self.mi_large_slow = MultiIndex.from_product( + [idx_large_fast, idx_small_slow]) + + self.idx_non_object = RangeIndex(1) + + def time_non_object_equals_multiindex(self): + self.idx_non_object.equals(self.mi_large_slow) + + class IndexAppend: def setup(self): diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 3f4fd7ad911c1..93c89d698097a 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import DataFrame, MultiIndex, date_range +from pandas import DataFrame, MultiIndex, RangeIndex, date_range import pandas.util.testing as tm @@ -147,4 +147,17 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) +class Equals: + def setup(self): + idx_large_fast = RangeIndex(1000000) + idx_small_slow = date_range(start="1/1/2012", periods=1) + self.mi_large_slow = MultiIndex.from_product( + [idx_large_fast, idx_small_slow]) + + self.idx_non_object = RangeIndex(1) + + def time_equals_non_object_index(self): + self.mi_large_slow.equals(self.idx_non_object) + + from .pandas_vb_common import setup # noqa: F401 isort:skip From b27e30faa7a866899ac2081c275cb06fcae3062e Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Tue, 3 Dec 2019 14:28:40 -0500 Subject: [PATCH 09/11] Adjust lines to within size limit --- pandas/core/indexes/base.py | 3 ++- pandas/core/indexes/multi.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f23b5d712a17f..228c0016a85ee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4334,7 +4334,8 @@ def equals(self, other): return other.equals(self) if isinstance(other, ABCMultiIndex): - if not is_object_dtype(self.dtype): # d-level MultiIndex can equal d-tuple Index + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(self.dtype): if self.nlevels != other.nlevels: return False diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1b83e5a3ef2f4..da8bd4a8c79ff 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3147,7 +3147,8 @@ def equals(self, other): return False if not isinstance(other, MultiIndex): - if not is_object_dtype(other.dtype): # d-level MultiIndex can equal d-tuple Index + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(other.dtype): if self.nlevels != other.nlevels: return False From 767d96d094ebe7216c4130ff651e8be9273ece94 Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Tue, 3 Dec 2019 15:02:11 -0500 Subject: [PATCH 10/11] Fix formatting --- asv_bench/benchmarks/index_object.py | 3 +-- asv_bench/benchmarks/multiindex_object.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 55a7fc18d4591..57225406bb61f 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -116,8 +116,7 @@ class IndexEquals: def setup(self): idx_large_fast = RangeIndex(1000000) idx_small_slow = date_range(start="1/1/2012", periods=1) - self.mi_large_slow = MultiIndex.from_product( - [idx_large_fast, idx_small_slow]) + self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) self.idx_non_object = RangeIndex(1) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 93c89d698097a..dd28519fed8e9 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -151,8 +151,7 @@ class Equals: def setup(self): idx_large_fast = RangeIndex(1000000) idx_small_slow = date_range(start="1/1/2012", periods=1) - self.mi_large_slow = MultiIndex.from_product( - [idx_large_fast, idx_small_slow]) + self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) self.idx_non_object = RangeIndex(1) From 5310fffc73bf6651aa5593101d6e823f8a0944a0 Mon Sep 17 00:00:00 2001 From: tlaytongoogle Date: Thu, 5 Dec 2019 11:55:01 -0500 Subject: [PATCH 11/11] Reduce runtime of ASV benchmarks --- asv_bench/benchmarks/index_object.py | 2 +- asv_bench/benchmarks/multiindex_object.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 57225406bb61f..d69799eb70040 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -114,7 +114,7 @@ def time_get_loc_dec(self): class IndexEquals: def setup(self): - idx_large_fast = RangeIndex(1000000) + idx_large_fast = RangeIndex(100000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index dd28519fed8e9..5a396c9f0deff 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -149,7 +149,7 @@ def time_categorical_level(self): class Equals: def setup(self): - idx_large_fast = RangeIndex(1000000) + idx_large_fast = RangeIndex(100000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow])