Skip to content

Commit e915b0a

Browse files
authored
ENH: RangeIndex._shallow_copy can return RangeIndex (#47557)
1 parent 8e6ca28 commit e915b0a

File tree

4 files changed

+123
-72
lines changed

4 files changed

+123
-72
lines changed

doc/source/whatsnew/v1.5.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ Other enhancements
277277
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
278278
- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
279279
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
280-
-
280+
- :meth:`RangeIndex.union` can now return a :class:`RangeIndex` instead of an :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
281281

282282
.. ---------------------------------------------------------------------------
283283
.. _whatsnew_150.notable_bug_fixes:
@@ -1009,7 +1009,7 @@ Reshaping
10091009
- Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`)
10101010
- Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`)
10111011
- Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`)
1012-
-
1012+
- Bug in :func:`concat` when ``axis=1`` and ``sort=False`` where the resulting Index was an :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`46675`)
10131013

10141014
Sparse
10151015
^^^^^^

pandas/core/indexes/range.py

+79-52
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
index as libindex,
2121
lib,
2222
)
23+
from pandas._libs.algos import unique_deltas
2324
from pandas._libs.lib import no_default
2425
from pandas._typing import (
2526
Dtype,
@@ -436,7 +437,15 @@ def _shallow_copy(self, values, name: Hashable = no_default):
436437

437438
if values.dtype.kind == "f":
438439
return Float64Index(values, name=name)
439-
return Int64Index._simple_new(values, name=name)
440+
# GH 46675 & 43885: If values is equally spaced, return a
441+
# more memory-compact RangeIndex instead of Int64Index
442+
unique_diffs = unique_deltas(values)
443+
if len(unique_diffs) == 1 and unique_diffs[0] != 0:
444+
diff = unique_diffs[0]
445+
new_range = range(values[0], values[-1] + diff, diff)
446+
return type(self)._simple_new(new_range, name=name)
447+
else:
448+
return Int64Index._simple_new(values, name=name)
440449

441450
def _view(self: RangeIndex) -> RangeIndex:
442451
result = type(self)._simple_new(self._range, name=self._name)
@@ -638,6 +647,17 @@ def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]:
638647
old_t, t = t, old_t - quotient * t
639648
return old_r, old_s, old_t
640649

650+
def _range_in_self(self, other: range) -> bool:
651+
"""Check if other range is contained in self"""
652+
# https://stackoverflow.com/a/32481015
653+
if not other:
654+
return True
655+
if not self._range:
656+
return False
657+
if len(other) > 1 and other.step % self._range.step:
658+
return False
659+
return other.start in self._range and other[-1] in self._range
660+
641661
def _union(self, other: Index, sort):
642662
"""
643663
Form the union of two Index objects and sorts if possible
@@ -647,64 +667,71 @@ def _union(self, other: Index, sort):
647667
other : Index or array-like
648668
649669
sort : False or None, default None
650-
Whether to sort resulting index. ``sort=None`` returns a
651-
monotonically increasing ``RangeIndex`` if possible or a sorted
652-
``Int64Index`` if not. ``sort=False`` always returns an
653-
unsorted ``Int64Index``
670+
Whether to sort (monotonically increasing) the resulting index.
671+
``sort=None`` returns a ``RangeIndex`` if possible or a sorted
672+
``Int64Index`` if not.
673+
``sort=False`` can return a ``RangeIndex`` if self is monotonically
674+
increasing and other is fully contained in self. Otherwise, returns
675+
an unsorted ``Int64Index``
654676
655677
.. versionadded:: 0.25.0
656678
657679
Returns
658680
-------
659681
union : Index
660682
"""
661-
if isinstance(other, RangeIndex) and sort is None:
662-
start_s, step_s = self.start, self.step
663-
end_s = self.start + self.step * (len(self) - 1)
664-
start_o, step_o = other.start, other.step
665-
end_o = other.start + other.step * (len(other) - 1)
666-
if self.step < 0:
667-
start_s, step_s, end_s = end_s, -step_s, start_s
668-
if other.step < 0:
669-
start_o, step_o, end_o = end_o, -step_o, start_o
670-
if len(self) == 1 and len(other) == 1:
671-
step_s = step_o = abs(self.start - other.start)
672-
elif len(self) == 1:
673-
step_s = step_o
674-
elif len(other) == 1:
675-
step_o = step_s
676-
start_r = min(start_s, start_o)
677-
end_r = max(end_s, end_o)
678-
if step_o == step_s:
679-
if (
680-
(start_s - start_o) % step_s == 0
681-
and (start_s - end_o) <= step_s
682-
and (start_o - end_s) <= step_s
683-
):
684-
return type(self)(start_r, end_r + step_s, step_s)
685-
if (
686-
(step_s % 2 == 0)
687-
and (abs(start_s - start_o) == step_s / 2)
688-
and (abs(end_s - end_o) == step_s / 2)
689-
):
690-
# e.g. range(0, 10, 2) and range(1, 11, 2)
691-
# but not range(0, 20, 4) and range(1, 21, 4) GH#44019
692-
return type(self)(start_r, end_r + step_s / 2, step_s / 2)
693-
694-
elif step_o % step_s == 0:
695-
if (
696-
(start_o - start_s) % step_s == 0
697-
and (start_o + step_s >= start_s)
698-
and (end_o - step_s <= end_s)
699-
):
700-
return type(self)(start_r, end_r + step_s, step_s)
701-
elif step_s % step_o == 0:
702-
if (
703-
(start_s - start_o) % step_o == 0
704-
and (start_s + step_o >= start_o)
705-
and (end_s - step_o <= end_o)
706-
):
707-
return type(self)(start_r, end_r + step_o, step_o)
683+
if isinstance(other, RangeIndex):
684+
if sort is None or (
685+
sort is False and self.step > 0 and self._range_in_self(other._range)
686+
):
687+
# GH 47557: Can still return a RangeIndex
688+
# if other range in self and sort=False
689+
start_s, step_s = self.start, self.step
690+
end_s = self.start + self.step * (len(self) - 1)
691+
start_o, step_o = other.start, other.step
692+
end_o = other.start + other.step * (len(other) - 1)
693+
if self.step < 0:
694+
start_s, step_s, end_s = end_s, -step_s, start_s
695+
if other.step < 0:
696+
start_o, step_o, end_o = end_o, -step_o, start_o
697+
if len(self) == 1 and len(other) == 1:
698+
step_s = step_o = abs(self.start - other.start)
699+
elif len(self) == 1:
700+
step_s = step_o
701+
elif len(other) == 1:
702+
step_o = step_s
703+
start_r = min(start_s, start_o)
704+
end_r = max(end_s, end_o)
705+
if step_o == step_s:
706+
if (
707+
(start_s - start_o) % step_s == 0
708+
and (start_s - end_o) <= step_s
709+
and (start_o - end_s) <= step_s
710+
):
711+
return type(self)(start_r, end_r + step_s, step_s)
712+
if (
713+
(step_s % 2 == 0)
714+
and (abs(start_s - start_o) == step_s / 2)
715+
and (abs(end_s - end_o) == step_s / 2)
716+
):
717+
# e.g. range(0, 10, 2) and range(1, 11, 2)
718+
# but not range(0, 20, 4) and range(1, 21, 4) GH#44019
719+
return type(self)(start_r, end_r + step_s / 2, step_s / 2)
720+
721+
elif step_o % step_s == 0:
722+
if (
723+
(start_o - start_s) % step_s == 0
724+
and (start_o + step_s >= start_s)
725+
and (end_o - step_s <= end_s)
726+
):
727+
return type(self)(start_r, end_r + step_s, step_s)
728+
elif step_s % step_o == 0:
729+
if (
730+
(start_s - start_o) % step_o == 0
731+
and (start_s + step_o >= start_o)
732+
and (end_s - step_o <= end_o)
733+
):
734+
return type(self)(start_r, end_r + step_o, step_o)
708735

709736
return super()._union(other, sort=sort)
710737

pandas/tests/indexes/ranges/test_setops.py

+20-18
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,9 @@ def test_union_noncomparable(self, sort):
145145
expected = Index(np.concatenate((other, index)))
146146
tm.assert_index_equal(result, expected)
147147

148-
@pytest.fixture(
149-
params=[
148+
@pytest.mark.parametrize(
149+
"idx1, idx2, expected_sorted, expected_notsorted",
150+
[
150151
(
151152
RangeIndex(0, 10, 1),
152153
RangeIndex(0, 10, 1),
@@ -157,13 +158,13 @@ def test_union_noncomparable(self, sort):
157158
RangeIndex(0, 10, 1),
158159
RangeIndex(5, 20, 1),
159160
RangeIndex(0, 20, 1),
160-
Int64Index(range(20)),
161+
RangeIndex(0, 20, 1),
161162
),
162163
(
163164
RangeIndex(0, 10, 1),
164165
RangeIndex(10, 20, 1),
165166
RangeIndex(0, 20, 1),
166-
Int64Index(range(20)),
167+
RangeIndex(0, 20, 1),
167168
),
168169
(
169170
RangeIndex(0, -10, -1),
@@ -175,7 +176,7 @@ def test_union_noncomparable(self, sort):
175176
RangeIndex(0, -10, -1),
176177
RangeIndex(-10, -20, -1),
177178
RangeIndex(-19, 1, 1),
178-
Int64Index(range(0, -20, -1)),
179+
RangeIndex(0, -20, -1),
179180
),
180181
(
181182
RangeIndex(0, 10, 2),
@@ -205,7 +206,7 @@ def test_union_noncomparable(self, sort):
205206
RangeIndex(0, 100, 5),
206207
RangeIndex(0, 100, 20),
207208
RangeIndex(0, 100, 5),
208-
Int64Index(range(0, 100, 5)),
209+
RangeIndex(0, 100, 5),
209210
),
210211
(
211212
RangeIndex(0, -100, -5),
@@ -230,7 +231,7 @@ def test_union_noncomparable(self, sort):
230231
RangeIndex(0, 100, 2),
231232
RangeIndex(100, 150, 200),
232233
RangeIndex(0, 102, 2),
233-
Int64Index(range(0, 102, 2)),
234+
RangeIndex(0, 102, 2),
234235
),
235236
(
236237
RangeIndex(0, -100, -2),
@@ -242,13 +243,13 @@ def test_union_noncomparable(self, sort):
242243
RangeIndex(0, -100, -1),
243244
RangeIndex(0, -50, -3),
244245
RangeIndex(-99, 1, 1),
245-
Int64Index(list(range(0, -100, -1))),
246+
RangeIndex(0, -100, -1),
246247
),
247248
(
248249
RangeIndex(0, 1, 1),
249250
RangeIndex(5, 6, 10),
250251
RangeIndex(0, 6, 5),
251-
Int64Index([0, 5]),
252+
RangeIndex(0, 10, 5),
252253
),
253254
(
254255
RangeIndex(0, 10, 5),
@@ -274,16 +275,17 @@ def test_union_noncomparable(self, sort):
274275
Int64Index([1, 5, 6]),
275276
Int64Index([1, 5, 6]),
276277
),
277-
]
278+
# GH 43885
279+
(
280+
RangeIndex(0, 10),
281+
RangeIndex(0, 5),
282+
RangeIndex(0, 10),
283+
RangeIndex(0, 10),
284+
),
285+
],
286+
ids=lambda x: repr(x) if isinstance(x, RangeIndex) else x,
278287
)
279-
def unions(self, request):
280-
"""Inputs and expected outputs for RangeIndex.union tests"""
281-
return request.param
282-
283-
def test_union_sorted(self, unions):
284-
285-
idx1, idx2, expected_sorted, expected_notsorted = unions
286-
288+
def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted):
287289
res1 = idx1.union(idx2, sort=None)
288290
tm.assert_index_equal(res1, expected_sorted, exact=True)
289291

pandas/tests/reshape/concat/test_index.py

+22
Original file line numberDiff line numberDiff line change
@@ -432,3 +432,25 @@ def test_concat_index_find_common(self, dtype):
432432
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
433433
)
434434
tm.assert_frame_equal(result, expected)
435+
436+
def test_concat_axis_1_sort_false_rangeindex(self):
437+
# GH 46675
438+
s1 = Series(["a", "b", "c"])
439+
s2 = Series(["a", "b"])
440+
s3 = Series(["a", "b", "c", "d"])
441+
s4 = Series([], dtype=object)
442+
result = concat(
443+
[s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
444+
)
445+
expected = DataFrame(
446+
[
447+
["a"] * 3 + [np.nan],
448+
["b"] * 3 + [np.nan],
449+
["c", np.nan] * 2,
450+
[np.nan] * 2 + ["d"] + [np.nan],
451+
],
452+
dtype=object,
453+
)
454+
tm.assert_frame_equal(
455+
result, expected, check_index_type=True, check_column_type=True
456+
)

0 commit comments

Comments
 (0)