From dddae1b6d09bf2040d778e5bf5553e3905127414 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 09:54:23 -0800 Subject: [PATCH 1/7] PERF: Add short circuiting to RangeIndex._shallow_copy --- pandas/core/indexes/range.py | 48 ++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 20286cad58df9..8412330de0534 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -22,7 +22,6 @@ index as libindex, lib, ) -from pandas._libs.algos import unique_deltas from pandas._libs.lib import no_default from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -62,6 +61,37 @@ _dtype_int64 = np.dtype(np.int64) +def has_range_delta(values) -> bool | int: + """ + Check if values have unique difference for RangeIndex._shallow_copy. + + values must have more than 2 values. + If there is a unique diff, it cannot be zero. + + Parameters + ---------- + values : iterable + + Returns + ------- + bool or int + False if there isn't a unique delta + int if there's a unique delta + """ + if len(values) < 2: + return False + unique_diffs = set() + first_val = values[0] + for val in values[1:]: + diff = val - first_val + if diff == 0: + return False + unique_diffs.add(diff) + if len(unique_diffs) > 1: + return False + return unique_diffs.pop() + + class RangeIndex(Index): """ Immutable Index implementing a monotonic integer range. 
@@ -469,15 +499,13 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - # GH 46675 & 43885: If values is equally spaced, return a - # more memory-compact RangeIndex instead of Index with 64-bit dtype - unique_diffs = unique_deltas(values) - if len(unique_diffs) == 1 and unique_diffs[0] != 0: - diff = unique_diffs[0] - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) - else: - return self._constructor._simple_new(values, name=name) + if values.dtype.kind == "i": + # GH 46675 & 43885: If values is equally spaced, return a + # more memory-compact RangeIndex instead of Index with 64-bit dtype + if diff := has_range_delta(values): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) + return self._constructor._simple_new(values, name=name) def _view(self) -> Self: result = type(self)._simple_new(self._range, name=self._name) From a83c1fd717c84e33318531ab35eaa16ef6363f54 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 11:14:15 -0800 Subject: [PATCH 2/7] Fix bug --- pandas/core/indexes/range.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8412330de0534..d74ba9d7ff23a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -70,7 +70,7 @@ def has_range_delta(values) -> bool | int: Parameters ---------- - values : iterable + values : 1D iterable Returns ------- @@ -81,14 +81,15 @@ def has_range_delta(values) -> bool | int: if len(values) < 2: return False unique_diffs = set() - first_val = values[0] + curr = values[0] for val in values[1:]: - diff = val - first_val + diff = val - curr if diff == 0: return False unique_diffs.add(diff) if len(unique_diffs) > 1: return False + curr = val 
return unique_diffs.pop() @@ -499,7 +500,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - if values.dtype.kind == "i": + if values.dtype.kind == "i" and values.ndim == 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype if diff := has_range_delta(values): From de9aafbc37cae133ffe1f5407daa840b47bee1d8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 11:21:40 -0800 Subject: [PATCH 3/7] Avoid set altogether --- pandas/core/indexes/range.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d74ba9d7ff23a..263ce240175b2 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -76,21 +76,21 @@ def has_range_delta(values) -> bool | int: ------- bool or int False if there isn't a unique delta - int if there's a unique delta + int if there's a non-zero, unique delta """ if len(values) < 2: return False - unique_diffs = set() - curr = values[0] - for val in values[1:]: - diff = val - curr - if diff == 0: - return False - unique_diffs.add(diff) - if len(unique_diffs) > 1: + diff = values[1] - values[0] + if diff == 0: + return False + curr = values[1] + for val in values[2:]: + new_diff = val - curr + if new_diff != diff: return False curr = val - return unique_diffs.pop() + diff = new_diff + return diff class RangeIndex(Index): From a96af34ec0a9bb91a2542eac788fa369f3b8f5de Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 11:23:02 -0800 Subject: [PATCH 4/7] Remove redundant assignment --- pandas/core/indexes/range.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index
263ce240175b2..d26dd28085265 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -89,7 +89,6 @@ def has_range_delta(values) -> bool | int: if new_diff != diff: return False curr = val - diff = new_diff return diff From 35c9ebd92f6d08f6845a5392c3a02d0a8d2e947d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:57:50 -0800 Subject: [PATCH 5/7] Use is_range_indexer --- pandas/core/indexes/range.py | 13 +++++++++---- pandas/tests/indexes/ranges/test_setops.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d26dd28085265..0507f35893bdd 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -499,12 +499,17 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - if values.dtype.kind == "i" and values.ndim == 1: + if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype - if diff := has_range_delta(values): - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) + diff = values[1] - values[0] + if diff != 0: + maybe_range_indexer, remainder = np.divmod(values - values[0], diff) + if (remainder == 0).all() and lib.is_range_indexer( + maybe_range_indexer, len(maybe_range_indexer) + ): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index d417b8b743dc5..0aadacfae37b4 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ 
b/pandas/tests/indexes/ranges/test_setops.py @@ -284,8 +284,8 @@ def test_union_noncomparable(self, sort): ids=lambda x: repr(x) if isinstance(x, RangeIndex) else x, ) def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted): - res1 = idx1.union(idx2, sort=None) - tm.assert_index_equal(res1, expected_sorted, exact=True) + # res1 = idx1.union(idx2, sort=None) + # tm.assert_index_equal(res1, expected_sorted, exact=True) res1 = idx1.union(idx2, sort=False) tm.assert_index_equal(res1, expected_notsorted, exact=True) From 18146d5ae27f7a810ac4b2bedbaf10b9502d8e4e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:58:21 -0800 Subject: [PATCH 6/7] Remove helper --- pandas/core/indexes/range.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0507f35893bdd..2dd6f672eca45 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -61,37 +61,6 @@ _dtype_int64 = np.dtype(np.int64) -def has_range_delta(values) -> bool | int: - """ - Check if values have unique difference for RangeIndex._shallow_copy. - - values must have more than 2 values. - If there is a unique diff, it cannot be zero. - - Parameters - ---------- - values : 1D iterable - - Returns - ------- - bool or int - False if there isn't a unique delta - int if there's a non-zero, unique delta - """ - if len(values) < 2: - return False - diff = values[1] - values[0] - if diff == 0: - return False - curr = values[1] - for val in values[2:]: - new_diff = val - curr - if new_diff != diff: - return False - curr = val - return diff - - class RangeIndex(Index): """ Immutable Index implementing a monotonic integer range. 
From 33150182c391da1e8490d8c80072e18195f6c947 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:59:20 -0800 Subject: [PATCH 7/7] Add back test --- pandas/tests/indexes/ranges/test_setops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 0aadacfae37b4..d417b8b743dc5 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -284,8 +284,8 @@ def test_union_noncomparable(self, sort): ids=lambda x: repr(x) if isinstance(x, RangeIndex) else x, ) def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted): - # res1 = idx1.union(idx2, sort=None) - # tm.assert_index_equal(res1, expected_sorted, exact=True) + res1 = idx1.union(idx2, sort=None) + tm.assert_index_equal(res1, expected_sorted, exact=True) res1 = idx1.union(idx2, sort=False) tm.assert_index_equal(res1, expected_notsorted, exact=True)