From aa5589ce90f65a68c73a9562738b95b629ce0e47 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:46:12 -0800 Subject: [PATCH 01/16] PERF: RangeIndex.take with 1 value return RangeIndex --- pandas/core/indexes/range.py | 8 +++++++- pandas/tests/indexes/ranges/test_range.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 09d635b53c482..76068c8ee901e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1260,4 +1260,10 @@ def take( # type: ignore[override] if self.start != 0: taken += self.start - return self._shallow_copy(taken, name=self.name) + if len(taken) == 1: + start = taken[0] + return self._simple_new( + range(start, start + self.step, self.step), name=self.name + ) + else: + return self._shallow_copy(taken, name=self.name) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 898548d1cc4dc..2b81d4794627d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -639,6 +639,21 @@ def test_take_return_rangeindex(): tm.assert_index_equal(result, expected, exact=True) +@pytest.mark.parametrize( + "rng, exp_rng", + [ + [range(5), range(3, 4)], + [range(0, -10, -2), range(-6, -8, -2)], + [range(0, 10, 2), range(6, 8, 2)], + ], +) +def test_take_1_value_returns_rangeindex(rng, exp_rng): + ri = RangeIndex(rng, name="foo") + result = ri.take([3]) + expected = RangeIndex(exp_rng, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) From 47af1ce9abaa3e112a995d10ecb76ae48ef4bbf5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:47:17 -0800 Subject: [PATCH 02/16] add issue number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7802ef4798659..8941af2a5b463 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -258,7 +258,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) -- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) +- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) From e7e0fcba0cf6fb2f94ceb5d9b8ae8ec4b5131f39 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:08:26 -0800 Subject: [PATCH 03/16] Move to _shallow_copy, support empty join as well --- doc/source/whatsnew/v3.0.0.rst | 4 +-- pandas/core/indexes/base.py | 1 - pandas/core/indexes/range.py | 44 +++++++++++++++-------- pandas/tests/indexes/ranges/test_join.py | 8 ++++- pandas/tests/indexes/ranges/test_range.py | 20 +++++++++++ 5 files changed, 58 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8941af2a5b463..70fe03000290b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -256,8 +256,8 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) -- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) -- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) +- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c72c5fa019bd7..4e5cc986b7325 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4241,7 +4241,6 @@ def join( return self._join_via_get_indexer(other, how, sort) - @final def _join_empty( self, other: Index, how: JoinHow, sort: bool ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 76068c8ee901e..573bd6cf14bcb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1: + if values.dtype.kind == "i" and values.ndim == 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype + if len(values) == 0: + return type(self)._simple_new(_empty_range, name=self.name) + elif len(values) == 1: + start = values[0] + new_range = range(start, start + self.step, self.step) + return type(self)._simple_new(new_range, name=self.name) diff = values[1] - values[0] if not missing.isna(diff) and diff != 0: - maybe_range_indexer, remainder = np.divmod(values - values[0], diff) - if ( - lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) - and not remainder.any() - ): + if len(values) == 2: + # Can skip is_range_indexer check new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) + return type(self)._simple_new(new_range, name=self.name) + else: + maybe_range_indexer, remainder = np.divmod(values - values[0], diff) + if ( + lib.is_range_indexer( + maybe_range_indexer, len(maybe_range_indexer) + ) + and not remainder.any() + ): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: @@ -897,12 +910,19 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + if other.dtype.kind == "i": + other = self._shallow_copy(other._values, name=other.name) + return super()._join_empty(other, how=how, sort=sort) + def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # This currently only gets called for the monotonic increasing case if not isinstance(other, type(self)): - maybe_ri = self._shallow_copy(other._values) + maybe_ri = self._shallow_copy(other._values, name=other.name) if not isinstance(maybe_ri, type(self)): return super()._join_monotonic(other, how=how) other = maybe_ri @@ -1260,10 +1280,4 @@ def take( # type: ignore[override] if self.start != 0: taken += self.start - if len(taken) == 1: - start = taken[0] - return self._simple_new( - range(start, start + self.step, self.step), name=self.name - ) - else: - return self._shallow_copy(taken, name=self.name) + return self._shallow_copy(taken, name=self.name) diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index ca3af607c0a38..09db30b1d4c51 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -207,9 +207,15 @@ def test_join_self(self, join_type): [-1, -1, 0, 1], "outer", ], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"], ], ) -@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))]) +@pytest.mark.parametrize( + "right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)] +) def test_join_preserves_rangeindex( left, right, expected, expected_lidx, expected_ridx, how, right_type ): diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 2b81d4794627d..528315078277d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self): tm.assert_index_equal(result, expected) +def test_reindex_1_value_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([2]) + expected = RangeIndex(2, 4, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_empty_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([]) + expected = RangeIndex(0, 0, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + def test_reindex_returns_rangeindex(): ri = RangeIndex(2, name="foo") result, result_indexer = ri.reindex([1, 2, 3]) From 67f19987a86d11d046e4d7f63d58caca3b217a42 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:09:59 -0800 Subject: [PATCH 04/16] Fix self.name --- pandas/core/indexes/range.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 573bd6cf14bcb..4fa075d10c9e0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -476,17 +476,17 @@ def _shallow_copy(self, values, name: Hashable = no_default): # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype if len(values) == 0: - return type(self)._simple_new(_empty_range, name=self.name) + return type(self)._simple_new(_empty_range, name=name) elif len(values) == 1: start = values[0] new_range = range(start, start + self.step, self.step) - return type(self)._simple_new(new_range, name=self.name) + return type(self)._simple_new(new_range, name=name) diff = values[1] - values[0] if not missing.isna(diff) and diff != 0: if len(values) == 2: # Can skip is_range_indexer check new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=self.name) + return type(self)._simple_new(new_range, name=name) else: maybe_range_indexer, remainder = np.divmod(values - values[0], diff) if ( From 2cc41fc25fc245cf4cb96716685b5e19a770f719 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:55:15 -0800 Subject: [PATCH 05/16] FIx error message --- pandas/tests/indexing/test_loc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 9c33d15c01cd6..61a3a7fbe87f2 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -508,7 +508,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" + msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] From c7da6acc720613e6494868ea0e87a40db4d6e280 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:41:44 -0800 Subject: [PATCH 06/16] Fix hdf test --- pandas/tests/io/pytables/test_append.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 529d6d789596f..72abbcec63357 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -967,6 +967,8 @@ def test_append_to_multiple_min_itemsize(setup_path): } ) expected = df.iloc[[0]] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(list(range(len(expected.index)))) with ensure_clean_store(setup_path) as store: store.append_to_multiple( From 90ff39cb278c6d922a2d4a207fe4348efb82d0e4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:13:25 -0800 Subject: [PATCH 07/16] PERF: Allow ensure_index_from_sequence to return RangeIndex --- pandas/core/indexes/base.py | 38 ++++++++++++++++++++++++++++++- pandas/core/indexes/range.py | 24 ++++--------------- pandas/tests/indexes/test_base.py | 8 ++++--- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4e5cc986b7325..cb26bd3ea2565 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7167,6 +7167,41 @@ def shape(self) -> Shape: return (len(self),) +def maybe_sequence_to_range(sequence) -> Any | range: + """ + Convert a 1D sequence to a range if possible. + + Returns the input if not possible. + + Parameters + ---------- + sequence : 1D sequence + names : sequence of str + + Returns + ------- + Any : input or range + """ + np_sequence = np.asarray(sequence) + if np_sequence.dtype.kind != "i" or len(sequence) == 1: + return sequence + elif len(sequence) == 0: + return range(0) + diff = np_sequence[1] - np_sequence[0] + if isna(diff) or diff == 0: + return sequence + elif len(sequence) == 2: + return range(sequence[0], sequence[1] + diff, diff) + maybe_range_indexer, remainder = np.divmod(np_sequence - np_sequence[0], diff) + if ( + lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) + and not remainder.any() + ): + return range(sequence[0], sequence[-1] + diff, diff) + else: + return sequence + + def ensure_index_from_sequences(sequences, names=None) -> Index: """ Construct an index from sequences of data. @@ -7202,8 +7237,9 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: if len(sequences) == 1: if names is not None: names = names[0] - return Index(sequences[0], name=names) + return Index(maybe_sequence_to_range(sequences[0]), name=names) else: + # TODO: Apply maybe_sequence_to_range to sequences? return MultiIndex.from_arrays(sequences, names=names) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fcb47de2b3c0d..07a7c49b411c0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,7 +29,6 @@ doc, ) -from pandas.core.dtypes import missing from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_platform_int, @@ -475,28 +474,13 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "i" and values.ndim == 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype - if len(values) == 0: - return type(self)._simple_new(_empty_range, name=name) - elif len(values) == 1: + if len(values) == 1: start = values[0] new_range = range(start, start + self.step, self.step) return type(self)._simple_new(new_range, name=name) - diff = values[1] - values[0] - if not missing.isna(diff) and diff != 0: - if len(values) == 2: - # Can skip is_range_indexer check - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) - else: - maybe_range_indexer, remainder = np.divmod(values - values[0], diff) - if ( - lib.is_range_indexer( - maybe_range_indexer, len(maybe_range_indexer) - ) - and not remainder.any() - ): - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) + maybe_range = ibase.maybe_sequence_to_range(values) + if isinstance(maybe_range, range): + return type(self)._simple_new(maybe_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 4c703c3af944b..beee14197bfb8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1514,8 +1514,10 @@ class TestIndexUtils: @pytest.mark.parametrize( "data, names, expected", [ - ([[1, 2, 3]], None, Index([1, 2, 3])), - ([[1, 2, 3]], ["name"], Index([1, 2, 3], name="name")), + ([[1, 2, 4]], None, Index([1, 2, 4])), + ([[1, 2, 4]], ["name"], Index([1, 2, 4], name="name")), + ([[1, 2, 3]], None, RangeIndex(1, 4)), + ([[1, 2, 3]], ["name"], RangeIndex(1, 4, name="name")), ( [["a", "a"], ["c", "d"]], None, @@ -1530,7 +1532,7 @@ class TestIndexUtils: ) def test_ensure_index_from_sequences(self, data, names, expected): result = ensure_index_from_sequences(data, names) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) def test_ensure_index_mixed_closed_intervals(self): # GH27172 From 66d3456aa7d677cea42affadcb54ab987e62e66a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:01:13 -0800 Subject: [PATCH 08/16] Ignore Index and Series objects --- pandas/core/indexes/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cb26bd3ea2565..1c2210cd74a77 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7182,6 +7182,8 @@ def maybe_sequence_to_range(sequence) -> Any | range: ------- Any : input or range """ + if hasattr(sequence, "dtype") and not isinstance(sequence, np.ndarray): + return sequence np_sequence = np.asarray(sequence) if np_sequence.dtype.kind != "i" or len(sequence) == 1: return sequence From 7e6fcea924525c39916cf9266c05d20961c4f5a8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:52:17 -0800 Subject: [PATCH 09/16] Fix doctest --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 98a8ebfd7dd08..8e3b35b9d67d2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7210,8 +7210,8 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: Examples -------- - >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"]) - Index([1, 2, 3], dtype='int64', name='name') + >>> ensure_index_from_sequences([[1, 2, 4]], names=["name"]) + Index([1, 2, 4], dtype='int64', name='name') >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) MultiIndex([('a', 'a'), From b5144f46b2ad2d5d8d82abf645e641d0e2721ad7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Mar 2024 17:23:49 -0700 Subject: [PATCH 10/16] More specific check --- pandas/core/indexes/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8e3b35b9d67d2..2d0d33c627772 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7170,7 +7170,8 @@ def maybe_sequence_to_range(sequence) -> Any | range: ------- Any : input or range """ - if hasattr(sequence, "dtype") and not isinstance(sequence, np.ndarray): + dtype = getattr(sequence, "dtype", None) + if not (dtype is None or isinstance(dtype, np.dtype)): return sequence np_sequence = np.asarray(sequence) if np_sequence.dtype.kind != "i" or len(sequence) == 1: From 08da810dfa748ac3d5b1021f84a5c5fb3d833794 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:38:47 -0700 Subject: [PATCH 11/16] Only allow int64, fix indexing --- pandas/core/indexes/base.py | 8 ++++++-- pandas/tests/io/test_stata.py | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2d0d33c627772..beebfc70a88c1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7174,7 +7174,11 @@ def maybe_sequence_to_range(sequence) -> Any | range: if not (dtype is None or isinstance(dtype, np.dtype)): return sequence np_sequence = np.asarray(sequence) - if np_sequence.dtype.kind != "i" or len(sequence) == 1: + if ( + not (np_sequence.dtype.kind == "i" and np_sequence.dtype.alignment == 8) + or len(sequence) == 1 + ): + # TODO: Coerce non int64 to ranges? return sequence elif len(sequence) == 0: return range(0) @@ -7188,7 +7192,7 @@ def maybe_sequence_to_range(sequence) -> Any | range: lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) and not remainder.any() ): - return range(sequence[0], sequence[-1] + diff, diff) + return range(sequence[0], sequence[len(sequence) - 1] + diff, diff) else: return sequence diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9078ca865042d..4a1030e733126 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2049,7 +2049,6 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten reread = read_stata(fp, index_col="index") expected = df - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(reread, expected) From b7e5dc1f5f53a675a35524e9235a40c90e8e48a4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:07:05 -0700 Subject: [PATCH 12/16] use Index --- pandas/core/indexes/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 520feeabae9df..9f866ac0a1903 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7172,26 +7172,26 @@ def maybe_sequence_to_range(sequence) -> Any | range: dtype = getattr(sequence, "dtype", None) if not (dtype is None or isinstance(dtype, np.dtype)): return sequence - np_sequence = np.asarray(sequence) + idx_sequence = Index(sequence) if ( - not (np_sequence.dtype.kind == "i" and np_sequence.dtype.alignment == 8) + not (idx_sequence.dtype.kind == "i" and idx_sequence.dtype.alignment == 8) or len(sequence) == 1 ): # TODO: Coerce non int64 to ranges? return sequence - elif len(sequence) == 0: + elif len(idx_sequence) == 0: return range(0) - diff = np_sequence[1] - np_sequence[0] + diff = idx_sequence[1] - idx_sequence[0] if isna(diff) or diff == 0: return sequence - elif len(sequence) == 2: - return range(sequence[0], sequence[1] + diff, diff) - maybe_range_indexer, remainder = np.divmod(np_sequence - np_sequence[0], diff) + elif len(idx_sequence) == 2: + return range(idx_sequence[0], idx_sequence[1] + diff, diff) + maybe_range_indexer, remainder = np.divmod(idx_sequence - idx_sequence[0], diff) if ( lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) and not remainder.any() ): - return range(sequence[0], sequence[len(sequence) - 1] + diff, diff) + return range(idx_sequence[0], idx_sequence[-1] + diff, diff) else: return sequence From f0592c5e4a8c439f6baebcb74abe7b08eea327e2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:52:24 -0700 Subject: [PATCH 13/16] Use np_values --- pandas/core/indexes/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9f866ac0a1903..b602328898c78 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7186,12 +7186,13 @@ def maybe_sequence_to_range(sequence) -> Any | range: return sequence elif len(idx_sequence) == 2: return range(idx_sequence[0], idx_sequence[1] + diff, diff) - maybe_range_indexer, remainder = np.divmod(idx_sequence - idx_sequence[0], diff) + np_values = idx_sequence._data + maybe_range_indexer, remainder = np.divmod(np_values - np_values[0], diff) if ( lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) and not remainder.any() ): - return range(idx_sequence[0], idx_sequence[-1] + diff, diff) + return range(np_values[0], np_values[-1] + diff, diff) else: return sequence From 33fa1f4a432d8fde0ef60a75783b3cff9438da92 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 16:33:17 -0700 Subject: [PATCH 14/16] Add back int32 --- pandas/tests/io/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 4a1030e733126..9078ca865042d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2049,6 +2049,7 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten reread = read_stata(fp, index_col="index") expected = df + expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(reread, expected) From 5fb9516bc8f77f868e24fd53f9e2a2d148f26453 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:53:22 -0700 Subject: [PATCH 15/16] Ignore Series and Index objects --- pandas/core/indexes/base.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b602328898c78..d8e3811bf9804 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7156,7 +7156,7 @@ def shape(self) -> Shape: def maybe_sequence_to_range(sequence) -> Any | range: """ - Convert a 1D sequence to a range if possible. + Convert a 1D, non-pandas sequence to a range if possible. Returns the input if not possible. @@ -7169,30 +7169,24 @@ def maybe_sequence_to_range(sequence) -> Any | range: ------- Any : input or range """ - dtype = getattr(sequence, "dtype", None) - if not (dtype is None or isinstance(dtype, np.dtype)): + if isinstance(sequence, (ABCSeries, Index)): return sequence - idx_sequence = Index(sequence) - if ( - not (idx_sequence.dtype.kind == "i" and idx_sequence.dtype.alignment == 8) - or len(sequence) == 1 - ): - # TODO: Coerce non int64 to ranges? + np_sequence = np.asarray(sequence) + if np_sequence.dtype.kind == "i" or len(np_sequence) == 1: return sequence - elif len(idx_sequence) == 0: + elif len(np_sequence) == 0: return range(0) - diff = idx_sequence[1] - idx_sequence[0] - if isna(diff) or diff == 0: + diff = np_sequence[1] - np_sequence[0] + if diff == 0: return sequence - elif len(idx_sequence) == 2: - return range(idx_sequence[0], idx_sequence[1] + diff, diff) - np_values = idx_sequence._data - maybe_range_indexer, remainder = np.divmod(np_values - np_values[0], diff) + elif len(np_sequence) == 2: + return range(np_sequence[0], np_sequence[1] + diff, diff) + maybe_range_indexer, remainder = np.divmod(np_sequence - np_sequence[0], diff) if ( lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) and not remainder.any() ): - return range(np_values[0], np_values[-1] + diff, diff) + return range(np_sequence[0], np_sequence[-1] + diff, diff) else: return sequence From 4e7cf98ced86dc3b7e77ef91f8b7da01757f075c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:50:51 -0700 Subject: [PATCH 16/16] Wrong condition --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d8e3811bf9804..3c01778e05f3d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7172,7 +7172,7 @@ def maybe_sequence_to_range(sequence) -> Any | range: if isinstance(sequence, (ABCSeries, Index)): return sequence np_sequence = np.asarray(sequence) - if np_sequence.dtype.kind == "i" or len(np_sequence) == 1: + if np_sequence.dtype.kind != "i" or len(np_sequence) == 1: return sequence elif len(np_sequence) == 0: return range(0)