From 8987a0ebc30b1c05e6cd176c700fee3310b0c376 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 12 Sep 2020 11:59:43 -0700 Subject: [PATCH 01/10] POC: back IntervalArray by array instead of Index --- pandas/_testing.py | 22 ++++++--- pandas/core/arrays/interval.py | 47 +++++++++++-------- pandas/core/indexes/interval.py | 28 +++++++---- pandas/tests/series/indexing/test_getitem.py | 2 +- .../util/test_assert_interval_array_equal.py | 14 +++--- 5 files changed, 71 insertions(+), 42 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 9db0c3496e290..ff69d20ebcf16 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -976,8 +976,16 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") """ _check_isinstance(left, right, IntervalArray) - assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") - assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") + if left.left.dtype.kind in ["m", "M"]: + # We have a DatetimeArray or Timed + # TODO: `exact` keyword? + assert_equal(left.left, right.left, obj=f"{obj}.left", check_freq=False) + assert_equal(left.right, right.right, obj=f"{obj}.left", check_freq=False) + else: + # doesnt take check_freq + assert_equal(left.left, right.left, obj=f"{obj}.left") # TODO: exact? + assert_equal(left.right, right.right, obj=f"{obj}.left") + assert_attr_equal("closed", left, right, obj=obj) @@ -988,20 +996,22 @@ def assert_period_array_equal(left, right, obj="PeriodArray"): assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj="DatetimeArray"): +def assert_datetime_array_equal(left, right, obj="DatetimeArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1dbd3cfc6dca6..b64ef96ae290d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -32,7 +32,6 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCIndexClass, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, @@ -43,7 +42,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import array +from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index @@ -162,7 +161,7 @@ def __new__( ): if isinstance(data, ABCSeries) and is_interval_dtype(data.dtype): - data = data._values + data = data._values # TODO: extract_array? if isinstance(data, (cls, ABCIntervalIndex)): left = data.left @@ -243,8 +242,8 @@ def _simple_new( ) raise ValueError(msg) - result._left = left - result._right = right + result._left = extract_array(array(left), extract_numpy=True) + result._right = extract_array(array(right), extract_numpy=True) result._closed = closed if verify_integrity: result._validate() @@ -512,14 +511,13 @@ def __getitem__(self, value): right = self.right[value] # scalar - if not isinstance(left, ABCIndexClass): + if not isinstance(left, (np.ndarray, ExtensionArray)): if is_scalar(left) and isna(left): return self._fill_value if np.ndim(left) > 1: # GH#30588 multi-dimensional indexer disallowed raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) - return self._shallow_copy(left, right) def __setitem__(self, key, value): @@ -559,12 +557,12 @@ def __setitem__(self, key, value): # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. - left = self.left.copy(deep=True) - left._values[key] = value_left + left = self.left.copy() + left[key] = value_left self._left = left - right = self.right.copy(deep=True) - right._values[key] = value_right + right = self.right.copy() + right[key] = value_right self._right = right def __eq__(self, other): @@ -657,8 +655,10 @@ def fillna(self, value=None, method=None, limit=None): self._check_closed_matches(value, name="value") - left = self.left.fillna(value=value.left) - right = self.right.fillna(value=value.right) + from pandas import Index + + left = Index(self.left).fillna(value=value.left) + right = Index(self.right).fillna(value=value.right) return self._shallow_copy(left, right) @property @@ -684,6 +684,7 @@ def astype(self, dtype, copy=True): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. """ + from pandas import Index from pandas.core.arrays.string_ import StringDtype if dtype is not None: @@ -695,8 +696,10 @@ def astype(self, dtype, copy=True): # need to cast to different subtype try: - new_left = self.left.astype(dtype.subtype) - new_right = self.right.astype(dtype.subtype) + # We need to use Index rules for astype to prevent casting + # np.nan entries to int subtypes + new_left = Index(self.left).astype(dtype.subtype) + new_right = Index(self.right).astype(dtype.subtype) except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" @@ -758,13 +761,13 @@ def copy(self): ------- IntervalArray """ - left = self.left.copy(deep=True) - right = self.right.copy(deep=True) + left = self.left.copy() + right = self.right.copy() closed = self.closed # TODO: Could skip verify_integrity here. return type(self).from_arrays(left, right, closed=closed) - def isna(self): + def isna(self) -> np.ndarray: return isna(self.left) @property @@ -790,7 +793,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": empty_len = min(abs(periods), len(self)) if isna(fill_value): - fill_value = self.left._na_value + from pandas import Index + + fill_value = Index(self.left)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: empty = self._from_sequence([fill_value] * empty_len) @@ -854,7 +859,9 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left = fill_right = fill_value if allow_fill: if fill_value is None: - fill_left = fill_right = self.left._na_value + from pandas import Index + + fill_left = fill_right = Index(self.left)._na_value elif is_interval(fill_value): self._check_closed_matches(fill_value, name="fill_value") fill_left, fill_right = fill_value.left, fill_value.right diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ad0a7ea32a1cc..63277dcafa52c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -183,12 +183,8 @@ def func(intvidx_self, other, sort=False): ) ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) -@inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray -) -@inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True -) +@inherit_names(["__array__", "overlaps", "contains"], IntervalArray) +@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] @@ -407,7 +403,7 @@ def __reduce__(self): return _new_IntervalIndex, (type(self), d), None @Appender(Index.astype.__doc__) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) if is_interval_dtype(new_values.dtype): @@ -436,7 +432,7 @@ def is_monotonic_decreasing(self) -> bool: return self[::-1].is_monotonic_increasing @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return True if the IntervalIndex contains unique elements, else False. """ @@ -891,6 +887,22 @@ def _convert_list_indexer(self, keyarr): # -------------------------------------------------------------------- + @cache_readonly + def left(self) -> Index: + return Index(self._values.left) + + @cache_readonly + def right(self) -> Index: + return Index(self._values.right) + + @cache_readonly + def mid(self): + return Index(self._data.mid) + + @property + def length(self): + return Index(self._data.length) + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 6b7cda89a4714..5b585e8802752 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -101,7 +101,7 @@ def test_getitem_intlist_intindex_periodvalues(self): @pytest.mark.parametrize("box", [list, np.array, pd.Index]) def test_getitem_intlist_intervalindex_non_int(self, box): # GH#33404 fall back to positional since ints are unambiguous - dti = date_range("2000-01-03", periods=3) + dti = date_range("2000-01-03", periods=3)._with_freq(None) ii = pd.IntervalIndex.from_breaks(dti) ser = Series(range(len(ii)), index=ii) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index 96f2973a1528c..2e8699536c72a 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -41,9 +41,9 @@ def test_interval_array_equal_periods_mismatch(): msg = """\ IntervalArray.left are different -IntervalArray.left length are different -\\[left\\]: 5, Int64Index\\(\\[0, 1, 2, 3, 4\\], dtype='int64'\\) -\\[right\\]: 6, Int64Index\\(\\[0, 1, 2, 3, 4, 5\\], dtype='int64'\\)""" +IntervalArray.left shapes are different +\\[left\\]: \\(5,\\) +\\[right\\]: \\(6,\\)""" with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) @@ -58,8 +58,8 @@ def test_interval_array_equal_end_mismatch(): IntervalArray.left are different IntervalArray.left values are different \\(80.0 %\\) -\\[left\\]: Int64Index\\(\\[0, 2, 4, 6, 8\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[0, 4, 8, 12, 16\\], dtype='int64'\\)""" +\\[left\\]: \\[0, 2, 4, 6, 8\\] +\\[right\\]: \\[0, 4, 8, 12, 16\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) @@ -74,8 +74,8 @@ def test_interval_array_equal_start_mismatch(): IntervalArray.left are different IntervalArray.left values are different \\(100.0 %\\) -\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" +\\[left\\]: \\[0, 1, 2, 3\\] +\\[right\\]: \\[1, 2, 3, 4\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) From 81640996defdfa60086a157ef9ac29deab8d757b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 12 Sep 2020 20:05:58 -0700 Subject: [PATCH 02/10] Fix failing copy/view tests --- pandas/core/arrays/interval.py | 33 ++++++++++++------------- pandas/tests/extension/test_interval.py | 4 +-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b64ef96ae290d..5f819b4b9b7f7 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -160,10 +160,12 @@ def __new__( verify_integrity: bool = True, ): - if isinstance(data, ABCSeries) and is_interval_dtype(data.dtype): + if isinstance(data, (ABCSeries, ABCIntervalIndex)) and is_interval_dtype( + data.dtype + ): data = data._values # TODO: extract_array? - if isinstance(data, (cls, ABCIntervalIndex)): + if isinstance(data, cls): left = data.left right = data.right closed = closed or data.closed @@ -242,8 +244,12 @@ def _simple_new( ) raise ValueError(msg) - result._left = extract_array(array(left), extract_numpy=True) - result._right = extract_array(array(right), extract_numpy=True) + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) + right = maybe_upcast_datetimelike_array(right) + result._left = extract_array(left, extract_numpy=True) + result._right = extract_array(right, extract_numpy=True) result._closed = closed if verify_integrity: result._validate() @@ -510,14 +516,14 @@ def __getitem__(self, value): left = self.left[value] right = self.right[value] - # scalar if not isinstance(left, (np.ndarray, ExtensionArray)): + # scalar if is_scalar(left) and isna(left): return self._fill_value - if np.ndim(left) > 1: - # GH#30588 multi-dimensional indexer disallowed - raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return self._shallow_copy(left, right) def __setitem__(self, key, value): @@ -555,15 +561,8 @@ def __setitem__(self, key, value): key = check_array_indexer(self, key) - # Need to ensure that left and right are updated atomically, so we're - # forced to copy, update the copy, and swap in the new values. - left = self.left.copy() - left[key] = value_left - self._left = left - - right = self.right.copy() - right[key] = value_right - self._right = right + self._left[key] = value_left + self._right[key] = value_right # TODO: needs tests for not breaking views def __eq__(self, other): # ensure pandas array for list-like and eliminate non-interval scalars diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 2411f6cfbd936..4fdcf930d224f 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,9 +147,7 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") - def test_setitem_preserves_views(self, data): - super().test_setitem_preserves_views(data) + pass class TestPrinting(BaseInterval, base.BasePrintingTests): From d545dacbdc8da583afd6b4044715718d3ff089d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Sep 2020 08:31:26 -0700 Subject: [PATCH 03/10] mypy fixup --- pandas/core/indexes/interval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 63277dcafa52c..9909f2aed8c72 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -197,6 +197,8 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): _mask = None _data: IntervalArray + _values: IntervalArray + # -------------------------------------------------------------------- # Constructors From bd6231cec6cd7d59bb6723aa1097f2496d9d9464 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Sep 2020 11:34:23 -0700 Subject: [PATCH 04/10] Avoid having left and right view the same data --- pandas/core/arrays/interval.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 63dbedce6e05d..33ab9b4af55ae 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -245,9 +245,18 @@ def _simple_new( from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array left = maybe_upcast_datetimelike_array(left) + right = extract_array(right, extract_numpy=True) + left = extract_array(left, extract_numpy=True) right = maybe_upcast_datetimelike_array(right) - result._left = extract_array(left, extract_numpy=True) - result._right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + + result._left = left + result._right = right result._closed = closed if verify_integrity: result._validate() From c4a222990d77fefba2e27f95c5cafb244e088f23 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Sep 2020 17:45:30 -0700 Subject: [PATCH 05/10] Restore left and right as Indexes --- pandas/_testing.py | 10 ++--- pandas/core/arrays/interval.py | 82 ++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index ff69d20ebcf16..33a893fe37b53 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -976,15 +976,15 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") """ _check_isinstance(left, right, IntervalArray) - if left.left.dtype.kind in ["m", "M"]: + if left._left.dtype.kind in ["m", "M"]: # We have a DatetimeArray or Timed # TODO: `exact` keyword? - assert_equal(left.left, right.left, obj=f"{obj}.left", check_freq=False) - assert_equal(left.right, right.right, obj=f"{obj}.left", check_freq=False) + assert_equal(left._left, right._left, obj=f"{obj}.left", check_freq=False) + assert_equal(left._right, right._right, obj=f"{obj}.left", check_freq=False) else: # doesnt take check_freq - assert_equal(left.left, right.left, obj=f"{obj}.left") # TODO: exact? - assert_equal(left.right, right.right, obj=f"{obj}.left") + assert_equal(left._left, right._left, obj=f"{obj}.left") # TODO: exact? + assert_equal(left._right, right._right, obj=f"{obj}.left") assert_attr_equal("closed", left, right, obj=obj) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3acf996ade03f..9b8c148b64a4c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -164,8 +164,8 @@ def __new__( data = data._values # TODO: extract_array? if isinstance(data, cls): - left = data.left - right = data.right + left = data._left + right = data._right closed = closed or data.closed else: @@ -488,18 +488,18 @@ def _validate(self): if self.closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {self.closed}" raise ValueError(msg) - if len(self.left) != len(self.right): + if len(self._left) != len(self._right): msg = "left and right must have the same length" raise ValueError(msg) - left_mask = notna(self.left) - right_mask = notna(self.right) + left_mask = notna(self._left) + right_mask = notna(self._right) if not (left_mask == right_mask).all(): msg = ( "missing values must be missing in the same " "location both left and right sides" ) raise ValueError(msg) - if not (self.left[left_mask] <= self.right[left_mask]).all(): + if not (self._left[left_mask] <= self._right[left_mask]).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -510,12 +510,12 @@ def __iter__(self): return iter(np.asarray(self)) def __len__(self) -> int: - return len(self.left) + return len(self._left) def __getitem__(self, value): value = check_array_indexer(self, value) - left = self.left[value] - right = self.right[value] + left = self._left[value] + right = self._right[value] if not isinstance(left, (np.ndarray, ExtensionArray)): # scalar @@ -551,7 +551,7 @@ def __setitem__(self, key, value): # list-like of intervals try: array = IntervalArray(value) - value_left, value_right = array.left, array.right + value_left, value_right = array._left, array.right except TypeError as err: # wrong type: not interval or NA msg = f"'value' should be an interval type, got {type(value)} instead." @@ -594,7 +594,7 @@ def __eq__(self, other): if is_interval_dtype(other_dtype): if self.closed != other.closed: return np.zeros(len(self), dtype=bool) - return (self.left == other.left) & (self.right == other.right) + return (self._left == other.left) & (self._right == other.right) # non-interval/non-object dtype -> no matches if not is_object_dtype(other_dtype): @@ -607,8 +607,8 @@ def __eq__(self, other): if ( isinstance(obj, Interval) and self.closed == obj.closed - and self.left[i] == obj.left - and self.right[i] == obj.right + and self._left[i] == obj.left + and self._right[i] == obj.right ): result[i] = True @@ -654,7 +654,7 @@ def fillna(self, value=None, method=None, limit=None): @property def dtype(self): - return IntervalDtype(self.left.dtype) + return IntervalDtype(self._left.dtype) def astype(self, dtype, copy=True): """ @@ -689,8 +689,8 @@ def astype(self, dtype, copy=True): try: # We need to use Index rules for astype to prevent casting # np.nan entries to int subtypes - new_left = Index(self.left).astype(dtype.subtype) - new_right = Index(self.right).astype(dtype.subtype) + new_left = Index(self._left).astype(dtype.subtype) + new_right = Index(self._right).astype(dtype.subtype) except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" @@ -752,23 +752,23 @@ def copy(self): ------- IntervalArray """ - left = self.left.copy() - right = self.right.copy() + left = self._left.copy() + right = self._right.copy() closed = self.closed # TODO: Could skip verify_integrity here. return type(self).from_arrays(left, right, closed=closed) def isna(self) -> np.ndarray: - return isna(self.left) + return isna(self._left) @property def nbytes(self) -> int: - return self.left.nbytes + self.right.nbytes + return self._left.nbytes + self._right.nbytes @property def size(self) -> int: # Avoid materializing self.values - return self.left.size + return self._left.size def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: @@ -786,7 +786,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if isna(fill_value): from pandas import Index - fill_value = Index(self.left)._na_value + fill_value = Index(self._left)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: empty = self._from_sequence([fill_value] * empty_len) @@ -852,10 +852,10 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( - self.left, indices, allow_fill=allow_fill, fill_value=fill_left + self._left, indices, allow_fill=allow_fill, fill_value=fill_left ) right_take = take( - self.right, indices, allow_fill=allow_fill, fill_value=fill_right + self._right, indices, allow_fill=allow_fill, fill_value=fill_right ) return self._shallow_copy(left_take, right_take) @@ -983,7 +983,9 @@ def left(self): Return the left endpoints of each Interval in the IntervalArray as an Index. """ - return self._left + from pandas import Index + + return Index(self._left) @property def right(self): @@ -991,7 +993,9 @@ def right(self): Return the right endpoints of each Interval in the IntervalArray as an Index. """ - return self._right + from pandas import Index + + return Index(self._right) @property def closed(self): @@ -1049,7 +1053,7 @@ def set_closed(self, closed): raise ValueError(msg) return type(self)._simple_new( - left=self.left, right=self.right, closed=closed, verify_integrity=False + left=self._left, right=self._right, closed=closed, verify_integrity=False ) @property @@ -1102,15 +1106,15 @@ def is_non_overlapping_monotonic(self): # at a point when both sides of intervals are included if self.closed == "both": return bool( - (self.right[:-1] < self.left[1:]).all() - or (self.left[:-1] > self.right[1:]).all() + (self._right[:-1] < self._left[1:]).all() + or (self._left[:-1] > self._right[1:]).all() ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping return bool( - (self.right[:-1] <= self.left[1:]).all() - or (self.left[:-1] >= self.right[1:]).all() + (self._right[:-1] <= self._left[1:]).all() + or (self._left[:-1] >= self._right[1:]).all() ) # Conversion @@ -1119,8 +1123,8 @@ def __array__(self, dtype=None) -> np.ndarray: Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ - left = self.left - right = self.right + left = self._left + right = self._right mask = self.isna() closed = self._closed @@ -1150,8 +1154,8 @@ def __arrow_array__(self, type=None): interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pyarrow.StructArray.from_arrays( [ - pyarrow.array(self.left, type=subtype, from_pandas=True), - pyarrow.array(self.right, type=subtype, from_pandas=True), + pyarrow.array(self._left, type=subtype, from_pandas=True), + pyarrow.array(self._right, type=subtype, from_pandas=True), ], names=["left", "right"], ) @@ -1205,7 +1209,7 @@ def __arrow_array__(self, type=None): _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") ) def to_tuples(self, na_tuple=True): - tuples = com.asarray_tuplesafe(zip(self.left, self.right)) + tuples = com.asarray_tuplesafe(zip(self._left, self._right)) if not na_tuple: # GH 18756 tuples = np.where(~self.isna(), tuples, np.nan) @@ -1269,8 +1273,8 @@ def contains(self, other): if isinstance(other, Interval): raise NotImplementedError("contains not implemented for two intervals") - return (self.left < other if self.open_left else self.left <= other) & ( - other < self.right if self.open_right else other <= self.right + return (self._left < other if self.open_left else self._left <= other) & ( + other < self._right if self.open_right else other <= self._right ) _interval_shared_docs["overlaps"] = textwrap.dedent( @@ -1345,7 +1349,7 @@ def overlaps(self, other): # overlaps is equivalent negation of two interval being disjoint: # disjoint = (A.left > B.right) or (B.left > A.right) # (simplifying the negation allows this to be done in less operations) - return op1(self.left, other.right) & op2(other.left, self.right) + return op1(self._left, other.right) & op2(other.left, self._right) def maybe_convert_platform_interval(values): From bfa13bbe6edaa7a5a22d6594f2c386351fca2270 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 18 Sep 2020 16:40:01 -0700 Subject: [PATCH 06/10] TST: test_left_right_dont_share_data --- pandas/tests/indexes/interval/test_constructors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index fa881df8139c6..aec7de549744f 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -262,6 +262,12 @@ def test_length_one(self): expected = IntervalIndex.from_breaks([]) tm.assert_index_equal(result, expected) + def test_left_right_dont_share_data(self): + # GH#36310 + breaks = np.arange(5) + result = IntervalIndex.from_breaks(breaks)._data + assert result._left.base is None or result._left.base is not result._right.base + class TestFromTuples(Base): """Tests specific to IntervalIndex.from_tuples""" From 4efdc086a294e7dc07415c4775b38afb70b6249d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 22 Sep 2020 15:16:42 -0700 Subject: [PATCH 07/10] pass copy=False --- pandas/core/indexes/interval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f379489e76f56..b0cd8d2b1301a 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -865,19 +865,19 @@ def _convert_list_indexer(self, keyarr): @cache_readonly def left(self) -> Index: - return Index(self._values.left) + return Index(self._values.left, copy=False) @cache_readonly def right(self) -> Index: - return Index(self._values.right) + return Index(self._values.right, copy=False) @cache_readonly def mid(self): - return Index(self._data.mid) + return Index(self._data.mid, copy=False) @property def length(self): - return Index(self._data.length) + return Index(self._data.length, copy=False) @Appender(Index.where.__doc__) def where(self, cond, other=None): From ed6a9324f5aeee742f1cd7cacef2285a1f48b1dd Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 30 Sep 2020 14:51:31 -0700 Subject: [PATCH 08/10] perf note --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 65a54beefd025..206665cebcbfa 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -280,6 +280,7 @@ Performance improvements - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) +- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) .. --------------------------------------------------------------------------- From 1a220958ab844b1e9f95868fa5f3400de503444a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 30 Sep 2020 19:15:03 -0700 Subject: [PATCH 09/10] revert whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d226999fe33c0..82a360442f8c0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -168,7 +168,6 @@ Other enhancements - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) -- :func:`pandas._testing.assert_datetime_array_equal` and :func:`pandas._testing.assert_timedelta_array_equal` now have a ``check_freq=True`` keyword that allows disabling the check for matching ``freq`` attribute (:issue:`36310`) .. _whatsnew_120.api_breaking.python: From 865b3fcc540e96e269509c9ffdc9d7a9fe7e7cce Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Oct 2020 09:39:33 -0700 Subject: [PATCH 10/10] update per comments --- pandas/_testing.py | 1 - pandas/core/arrays/interval.py | 2 +- pandas/core/indexes/interval.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 50b6120ce849a..cf6272edc4c05 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -982,7 +982,6 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") # We have a DatetimeArray or TimedeltaArray kwargs["check_freq"] = False - # TODO: `exact` keyword? assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ab643557d12e9..413430942575d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -564,7 +564,7 @@ def __setitem__(self, key, value): key = check_array_indexer(self, key) self._left[key] = value_left - self._right[key] = value_right # TODO: needs tests for not breaking views + self._right[key] = value_right def __eq__(self, other): # ensure pandas array for list-like and eliminate non-interval scalars diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fc41271a981f0..a56f6a5bb0340 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -865,11 +865,11 @@ def _convert_list_indexer(self, keyarr): @cache_readonly def left(self) -> Index: - return Index(self._values.left, copy=False) + return Index(self._data.left, copy=False) @cache_readonly def right(self) -> Index: - return Index(self._values.right, copy=False) + return Index(self._data.right, copy=False) @cache_readonly def mid(self):