From 2c2a783999b4688613eb96713097440f1a9e362f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 12 Sep 2023 07:13:42 -0400 Subject: [PATCH 1/3] PERF: Index.difference --- pandas/core/indexes/base.py | 18 ++++-------------- pandas/tests/indexes/datetimes/test_setops.py | 4 +++- pandas/tests/indexes/timedeltas/test_setops.py | 4 +++- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cd55997ad5f69..69b5355359d77 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3615,21 +3615,11 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + sort = False if self.is_monotonic_increasing else sort + other = other.unique() + the_diff = self[other.get_indexer_for(self) == -1] + the_diff = the_diff if self.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 2e7b38abf4212..b56bad7f2e833 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -343,9 +343,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index cb6dce1e7ad80..6cdd6944e90ea 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -219,9 +219,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) From a6f575c0ee730262f829f6260d2d68e477f72624 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 12 Sep 2023 07:19:13 -0400 Subject: [PATCH 2/3] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 07be496a95adc..8a18694bee22d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -161,6 +161,7 @@ Performance improvements - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - From c873e241e0d736b57eb7c21bbeb0ced4890c7f7b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 12 Sep 2023 17:36:49 -0400 Subject: [PATCH 3/3] remove is_monotonic check --- pandas/core/indexes/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 69b5355359d77..8756bb3f3c81b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3615,7 +3615,6 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - sort = False if self.is_monotonic_increasing else sort other = other.unique() the_diff = self[other.get_indexer_for(self) == -1] the_diff = the_diff if self.is_unique else the_diff.unique()