From a14be3aeedb9594c678c5539aa5a8ce7c1010f44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 17:09:05 -0700 Subject: [PATCH 1/7] ENH: RangeIndex._shallow_copy can return RangeIndex --- pandas/core/indexes/range.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5b384fbc97c1a..7b5fed11821bb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -435,6 +435,10 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Float64Index(values, name=name) + unique_diffs = np.unique(np.diff(values)) + if len(unique_diffs) == 1: + new_range = range(values[0], values[-1], unique_diffs[0]) + return type(self)._simple_new(new_range, name=name) return Int64Index._simple_new(values, name=name) def _view(self: RangeIndex) -> RangeIndex: From c7dde8c3c3be22be21e4d18c393bb0e40b5898d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 5 Jul 2022 11:23:11 -0700 Subject: [PATCH 2/7] unique_deltas --- pandas/core/indexes/range.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7b5fed11821bb..96ac66fbd4c26 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -19,6 +19,7 @@ index as libindex, lib, ) +from pandas._libs.algos import unique_deltas from pandas._libs.lib import no_default from pandas._typing import ( Dtype, @@ -435,7 +436,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Float64Index(values, name=name) - unique_diffs = np.unique(np.diff(values)) + unique_diffs = unique_deltas(values) if len(unique_diffs) == 1: new_range = range(values[0], values[-1], unique_diffs[0]) return type(self)._simple_new(new_range, name=name) From 76fbb724c641236aeb494e3884a64aad1cea467f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 5 Jul 2022 15:44:32 -0700 
Subject: [PATCH 3/7] off by diff --- pandas/core/indexes/range.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 96ac66fbd4c26..086da8f478d6d 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -438,7 +438,8 @@ def _shallow_copy(self, values, name: Hashable = no_default): return Float64Index(values, name=name) unique_diffs = unique_deltas(values) if len(unique_diffs) == 1: - new_range = range(values[0], values[-1], unique_diffs[0]) + diff = unique_diffs[0] + new_range = range(values[0], values[-1] + diff, diff) return type(self)._simple_new(new_range, name=name) return Int64Index._simple_new(values, name=name) From 0c1d519fc1f17554bf7b5b80fd601b5ae123ca76 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Jul 2022 11:46:38 -0700 Subject: [PATCH 4/7] Add whatsnew and address 0 diff --- doc/source/whatsnew/v1.5.0.rst | 4 +-- pandas/core/indexes/range.py | 5 ++-- pandas/tests/indexes/ranges/test_setops.py | 30 +++++++++------------- pandas/tests/reshape/concat/test_index.py | 22 ++++++++++++++++ 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0b450fab53137..37d0fd1fe525e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,7 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) -- +- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -1008,7 +1008,7 @@ Reshaping - Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) - Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`) - Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`) -- +- Bug in :func:`concat` when ``axis=1`` and ``sort=False`` where the resulting Index was a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`46675`) Sparse ^^^^^^ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 086da8f478d6d..f8625a8358fef 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -437,11 +437,12 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Float64Index(values, name=name) unique_diffs = unique_deltas(values) - if len(unique_diffs) == 1: + if len(unique_diffs) == 1 and unique_diffs[0] != 0: diff = unique_diffs[0] new_range = range(values[0], values[-1] + diff, diff) return type(self)._simple_new(new_range, name=name) - return Int64Index._simple_new(values, name=name) + else: + return Int64Index._simple_new(values, name=name) def _view(self: RangeIndex) -> RangeIndex: result = type(self)._simple_new(self._range, name=self._name) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 2942010af2720..7cf4aec9d648d 100644 --- 
a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -145,8 +145,9 @@ def test_union_noncomparable(self, sort): expected = Index(np.concatenate((other, index))) tm.assert_index_equal(result, expected) - @pytest.fixture( - params=[ + @pytest.mark.parametrize( + "idx1, idx2, expected_sorted, expected_notsorted", + [ ( RangeIndex(0, 10, 1), RangeIndex(0, 10, 1), @@ -157,13 +158,13 @@ def test_union_noncomparable(self, sort): RangeIndex(0, 10, 1), RangeIndex(5, 20, 1), RangeIndex(0, 20, 1), - Int64Index(range(20)), + RangeIndex(0, 20, 1), ), ( RangeIndex(0, 10, 1), RangeIndex(10, 20, 1), RangeIndex(0, 20, 1), - Int64Index(range(20)), + RangeIndex(0, 20, 1), ), ( RangeIndex(0, -10, -1), @@ -175,7 +176,7 @@ def test_union_noncomparable(self, sort): RangeIndex(0, -10, -1), RangeIndex(-10, -20, -1), RangeIndex(-19, 1, 1), - Int64Index(range(0, -20, -1)), + RangeIndex(0, -20, -1), ), ( RangeIndex(0, 10, 2), @@ -205,7 +206,7 @@ def test_union_noncomparable(self, sort): RangeIndex(0, 100, 5), RangeIndex(0, 100, 20), RangeIndex(0, 100, 5), - Int64Index(range(0, 100, 5)), + RangeIndex(0, 100, 5), ), ( RangeIndex(0, -100, -5), @@ -230,7 +231,7 @@ def test_union_noncomparable(self, sort): RangeIndex(0, 100, 2), RangeIndex(100, 150, 200), RangeIndex(0, 102, 2), - Int64Index(range(0, 102, 2)), + RangeIndex(0, 102, 2), ), ( RangeIndex(0, -100, -2), @@ -242,13 +243,13 @@ def test_union_noncomparable(self, sort): RangeIndex(0, -100, -1), RangeIndex(0, -50, -3), RangeIndex(-99, 1, 1), - Int64Index(list(range(0, -100, -1))), + RangeIndex(0, -100, -1), ), ( RangeIndex(0, 1, 1), RangeIndex(5, 6, 10), RangeIndex(0, 6, 5), - Int64Index([0, 5]), + RangeIndex(0, 10, 5), ), ( RangeIndex(0, 10, 5), @@ -274,16 +275,9 @@ def test_union_noncomparable(self, sort): Int64Index([1, 5, 6]), Int64Index([1, 5, 6]), ), - ] + ], ) - def unions(self, request): - """Inputs and expected outputs for RangeIndex.union tests""" - return request.param - - def 
test_union_sorted(self, unions): - - idx1, idx2, expected_sorted, expected_notsorted = unions - + def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted): res1 = idx1.union(idx2, sort=None) tm.assert_index_equal(res1, expected_sorted, exact=True) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 66382eb0e95a9..c42d6712ff690 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -409,3 +409,25 @@ def test_concat_index_keep_dtype(self, dtype): [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype=dtype) ) tm.assert_frame_equal(result, expected) + + def test_concat_axis_1_sort_false_rangeindex(self): + # GH 46675 + s1 = Series(["a", "b", "c"]) + s2 = Series(["a", "b"]) + s3 = Series(["a", "b", "c", "d"]) + s4 = Series([], dtype=object) + result = concat( + [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 + ) + expected = DataFrame( + [ + ["a"] * 3 + [np.nan], + ["b"] * 3 + [np.nan], + ["c", np.nan] * 2, + [np.nan] * 2 + ["d"] + [np.nan], + ], + dtype=object, + ) + tm.assert_frame_equal( + result, expected, check_index_type=True, check_column_type=True + ) From 7d5d21689278b575ba249f5b5cb818e7f853f51c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Jul 2022 14:56:34 -0700 Subject: [PATCH 5/7] Add additional unit test --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/indexes/ranges/test_setops.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 37d0fd1fe525e..d69da68a07f91 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,7 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and 
``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) -- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`) +- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 7cf4aec9d648d..7b9c514390702 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -275,6 +275,13 @@ def test_union_noncomparable(self, sort): Int64Index([1, 5, 6]), Int64Index([1, 5, 6]), ), + # GH 43885 + ( + RangeIndex(0, 10), + RangeIndex(0, 5), + RangeIndex(0, 10), + RangeIndex(0, 10), + ), ], ) def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted): From da4af1d92e9d0fb3946427d332dddbdea2c83d02 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 8 Jul 2022 09:53:22 -0700 Subject: [PATCH 6/7] Add comment --- pandas/core/indexes/range.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 63ec0fdc49080..2c829605962a5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -437,6 +437,8 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Float64Index(values, name=name) + # GH 46675 & 43885: If values is equally spaced, return a + # more memory-compact 
RangeIndex instead of Int64Index unique_diffs = unique_deltas(values) if len(unique_diffs) == 1 and unique_diffs[0] != 0: diff = unique_diffs[0] From 63e7894aadfc5db8462c8c4c67d960a9ed2c1731 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 8 Jul 2022 14:41:46 -0700 Subject: [PATCH 7/7] Still return rangeindex if sort=False --- pandas/core/indexes/range.py | 120 ++++++++++++--------- pandas/tests/indexes/ranges/test_setops.py | 1 + 2 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 2c829605962a5..12a995c7de99a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -647,6 +647,17 @@ def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: old_t, t = t, old_t - quotient * t return old_r, old_s, old_t + def _range_in_self(self, other: range) -> bool: + """Check if other range is contained in self""" + # https://stackoverflow.com/a/32481015 + if not other: + return True + if not self._range: + return False + if len(other) > 1 and other.step % self._range.step: + return False + return other.start in self._range and other[-1] in self._range + def _union(self, other: Index, sort): """ Form the union of two Index objects and sorts if possible @@ -656,10 +667,12 @@ def _union(self, other: Index, sort): other : Index or array-like sort : False or None, default None - Whether to sort resulting index. ``sort=None`` returns a - monotonically increasing ``RangeIndex`` if possible or a sorted - ``Int64Index`` if not. ``sort=False`` always returns an - unsorted ``Int64Index`` + Whether to sort (monotonically increasing) the resulting index. + ``sort=None`` returns a ``RangeIndex`` if possible or a sorted + ``Int64Index`` if not. + ``sort=False`` can return a ``RangeIndex`` if self is monotonically + increasing and other is fully contained in self. Otherwise, returns + an unsorted ``Int64Index`` .. 
versionadded:: 0.25.0 @@ -667,53 +680,58 @@ def _union(self, other: Index, sort): ------- union : Index """ - if isinstance(other, RangeIndex) and sort is None: - start_s, step_s = self.start, self.step - end_s = self.start + self.step * (len(self) - 1) - start_o, step_o = other.start, other.step - end_o = other.start + other.step * (len(other) - 1) - if self.step < 0: - start_s, step_s, end_s = end_s, -step_s, start_s - if other.step < 0: - start_o, step_o, end_o = end_o, -step_o, start_o - if len(self) == 1 and len(other) == 1: - step_s = step_o = abs(self.start - other.start) - elif len(self) == 1: - step_s = step_o - elif len(other) == 1: - step_o = step_s - start_r = min(start_s, start_o) - end_r = max(end_s, end_o) - if step_o == step_s: - if ( - (start_s - start_o) % step_s == 0 - and (start_s - end_o) <= step_s - and (start_o - end_s) <= step_s - ): - return type(self)(start_r, end_r + step_s, step_s) - if ( - (step_s % 2 == 0) - and (abs(start_s - start_o) == step_s / 2) - and (abs(end_s - end_o) == step_s / 2) - ): - # e.g. 
range(0, 10, 2) and range(1, 11, 2) - # but not range(0, 20, 4) and range(1, 21, 4) GH#44019 - return type(self)(start_r, end_r + step_s / 2, step_s / 2) - - elif step_o % step_s == 0: - if ( - (start_o - start_s) % step_s == 0 - and (start_o + step_s >= start_s) - and (end_o - step_s <= end_s) - ): - return type(self)(start_r, end_r + step_s, step_s) - elif step_s % step_o == 0: - if ( - (start_s - start_o) % step_o == 0 - and (start_s + step_o >= start_o) - and (end_s - step_o <= end_o) - ): - return type(self)(start_r, end_r + step_o, step_o) + if isinstance(other, RangeIndex): + if sort is None or ( + sort is False and self.step > 0 and self._range_in_self(other._range) + ): + # GH 47557: Can still return a RangeIndex + # if other range in self and sort=False + start_s, step_s = self.start, self.step + end_s = self.start + self.step * (len(self) - 1) + start_o, step_o = other.start, other.step + end_o = other.start + other.step * (len(other) - 1) + if self.step < 0: + start_s, step_s, end_s = end_s, -step_s, start_s + if other.step < 0: + start_o, step_o, end_o = end_o, -step_o, start_o + if len(self) == 1 and len(other) == 1: + step_s = step_o = abs(self.start - other.start) + elif len(self) == 1: + step_s = step_o + elif len(other) == 1: + step_o = step_s + start_r = min(start_s, start_o) + end_r = max(end_s, end_o) + if step_o == step_s: + if ( + (start_s - start_o) % step_s == 0 + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): + return type(self)(start_r, end_r + step_s, step_s) + if ( + (step_s % 2 == 0) + and (abs(start_s - start_o) == step_s / 2) + and (abs(end_s - end_o) == step_s / 2) + ): + # e.g. 
range(0, 10, 2) and range(1, 11, 2) + # but not range(0, 20, 4) and range(1, 21, 4) GH#44019 + return type(self)(start_r, end_r + step_s / 2, step_s / 2) + + elif step_o % step_s == 0: + if ( + (start_o - start_s) % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): + return type(self)(start_r, end_r + step_s, step_s) + elif step_s % step_o == 0: + if ( + (start_s - start_o) % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): + return type(self)(start_r, end_r + step_o, step_o) return super()._union(other, sort=sort) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 7b9c514390702..71bd2f5590b8f 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -283,6 +283,7 @@ def test_union_noncomparable(self, sort): RangeIndex(0, 10), ), ], + ids=lambda x: repr(x) if isinstance(x, RangeIndex) else x, ) def test_union_sorted(self, idx1, idx2, expected_sorted, expected_notsorted): res1 = idx1.union(idx2, sort=None)