From 78683def0dfe9d91de848af6c0f553ebae0668cc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 14 Oct 2021 18:53:26 -0700 Subject: [PATCH 1/2] REF: dispatch DTI/TDI setops to RangeIndex --- pandas/core/indexes/datetimelike.py | 74 +++++++++++++++++----- pandas/core/indexes/range.py | 10 ++- pandas/tests/indexes/ranges/test_setops.py | 8 +++ 3 files changed, 72 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d309dfc21eb95..be01ce57301ce 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -28,6 +28,7 @@ Resolution, Tick, parsing, + to_offset, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -61,6 +62,7 @@ NDArrayBackedExtensionIndex, inherit_names, ) +from pandas.core.indexes.range import RangeIndex from pandas.core.tools.timedeltas import to_timedelta if TYPE_CHECKING: @@ -433,12 +435,60 @@ def values(self) -> np.ndarray: # -------------------------------------------------------------------- # Set Operation Methods + @cache_readonly + def _as_range_index(self) -> RangeIndex: + # Convert our i8 representations to RangeIndex + # Caller is responsible for checking isinstance(self.freq, Tick) + tick = self.freq.delta.value + rng = range(self[0].value, self[-1].value + tick, tick) + return RangeIndex(rng) + + def _can_range_setop(self, other): + return isinstance(self.freq, Tick) and isinstance(other.freq, Tick) + + def _wrap_range_setop(self, other, res_i8): + new_freq = None + if not len(res_i8): + # RangeIndex defaults to step=1, which we don't want. + new_freq = self.freq + elif isinstance(res_i8, RangeIndex): + new_freq = to_offset(Timedelta(res_i8.step)) + res_i8 = res_i8 + + # TODO: we cannot just do + # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) + # because test_setops_preserve_freq fails with _validate_frequency raising. + # This raising is incorrect, as 'on_freq' is incorrect. This will + # be fixed by GH#41493 + res_values = res_i8.values.view(self._data._ndarray.dtype) + result = type(self._data)._simple_new( + res_values, dtype=self.dtype, freq=new_freq + ) + return self._wrap_setop_result(other, result) + + def _range_intersect(self, other, sort): + # Dispatch to RangeIndex intersection logic. + left = self._as_range_index + right = other._as_range_index + res_i8 = left.intersection(right, sort=sort) + return self._wrap_range_setop(other, res_i8) + + def _range_union(self, other, sort): + # Dispatch to RangeIndex union logic. + left = self._as_range_index + right = other._as_range_index + res_i8 = left.union(right, sort=sort) + return self._wrap_range_setop(other, res_i8) + def _intersection(self, other: Index, sort=False) -> Index: """ intersection specialized to the case with matching dtypes and both non-empty. """ other = cast("DatetimeTimedeltaMixin", other) + if self._can_range_setop(other): + return self._range_intersect(other, sort=sort) + if not self._can_fast_intersect(other): result = Index._intersection(self, other, sort=sort) # We need to invalidate the freq because Index._intersection @@ -453,7 +503,6 @@ def _intersection(self, other: Index, sort=False) -> Index: return self._fast_intersect(other, sort) def _fast_intersect(self, other, sort): - # to make our life easier, "sort" the two ranges if self[0] <= other[0]: left, right = self, other @@ -485,19 +534,9 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: # Because freq is not None, we must then be monotonic decreasing return False - elif self.freq.is_anchored(): - # this along with matching freqs ensure that we "line up", - # so intersection will preserve freq - # GH#42104 - return self.freq.n == 1 - - elif isinstance(self.freq, Tick): - # We "line up" if and only if the difference between two of our points - # is a multiple of our freq - diff = self[0] - other[0] - remainder = diff % self.freq.delta - return remainder == Timedelta(0) - + # this along with matching freqs ensure that we "line up", + # so intersection will preserve freq + # Note we are assuming away Ticks, as those go through _range_intersect # GH#42104 return self.freq.n == 1 @@ -516,6 +555,7 @@ def _can_fast_union(self: _T, other: _T) -> bool: return False if len(self) == 0 or len(other) == 0: + # only reached via union_many return True # to make our life easier, "sort" the two ranges @@ -544,10 +584,7 @@ def _fast_union(self: _TDT, other: _TDT, sort=None) -> _TDT: loc = right.searchsorted(left_start, side="left") right_chunk = right._values[:loc] dates = concat_compat((left._values, right_chunk)) - # With sort being False, we can't infer that result.freq == self.freq - # TODO: no tests rely on the _with_freq("infer"); needed? result = type(self)._simple_new(dates, name=self.name) - result = result._with_freq("infer") return result else: left, right = other, self @@ -573,6 +610,9 @@ def _union(self, other, sort): assert isinstance(other, type(self)) assert self.dtype == other.dtype + if self._can_range_setop(other): + return self._range_union(other, sort=sort) + if self._can_fast_union(other): result = self._fast_union(other, sort=sort) # in the case with sort=None, the _can_fast_union check ensures diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 90649ad2dcbc1..bd865ceb24644 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -562,8 +562,11 @@ def _intersection(self, other: Index, sort=False): if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] + if sort is None: - new_index = new_index.sort_values() + # TODO: can revert to just `if sort is None` after GH#43666 + if new_index.step < 0: + new_index = new_index[::-1] return new_index @@ -635,9 +638,10 @@ def _union(self, other: Index, sort): return type(self)(start_r, end_r + step_s, step_s) if ( (step_s % 2 == 0) - and (abs(start_s - start_o) <= step_s / 2) - and (abs(end_s - end_o) <= step_s / 2) + and (abs(start_s - start_o) == step_s / 2) + and (abs(end_s - end_o) == step_s / 2) ): + # e.g. range(0, 10, 2) and range(1, 11, 2) return type(self)(start_r, end_r + step_s / 2, step_s / 2) elif step_o % step_s == 0: if ( diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index e81271d8ee306..a7b611128747c 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -290,6 +290,14 @@ def test_union_sorted(self, unions): tm.assert_index_equal(res2, expected_sorted, exact=True) tm.assert_index_equal(res3, expected_sorted, exact="equiv") + def test_union_same_step_misaligned(self): + left = RangeIndex(range(0, 20, 4)) + right = RangeIndex(range(1, 21, 4)) + + result = left.union(right) + expected = Int64Index([0, 1, 4, 5, 8, 9, 12, 13, 16, 17]) + tm.assert_index_equal(result, expected, exact=True) + def test_difference(self): # GH#12034 Cases where we operate against another RangeIndex and may # get back another RangeIndex From 3888d0daa9e3451445705168ac03dfb3bb1661d9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Oct 2021 08:11:19 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/indexes/datetimelike.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index be01ce57301ce..4cff33f96de27 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -439,7 +439,8 @@ def values(self) -> np.ndarray: def _as_range_index(self) -> RangeIndex: # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) - tick = self.freq.delta.value + freq = cast(Tick, self.freq) + tick = freq.delta.value rng = range(self[0].value, self[-1].value + tick, tick) return RangeIndex(rng)