From 34b44cbcb60d5867952b3bd3404730d3c6a776f7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Oct 2020 11:54:21 -0700 Subject: [PATCH 1/3] REF: back IntervalArray by a _single_ ndarray --- pandas/core/arrays/interval.py | 150 ++++++++++-------- pandas/core/indexes/interval.py | 4 +- pandas/tests/base/test_conversion.py | 2 +- pandas/tests/frame/test_api.py | 4 +- .../indexes/interval/test_constructors.py | 6 +- 5 files changed, 97 insertions(+), 69 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 413430942575d..830f4c13cb824 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -19,6 +19,7 @@ is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_interval_dtype, is_list_like, @@ -169,6 +170,17 @@ def __new__( left = data._left right = data._right closed = closed or data.closed + + if dtype is None or data.dtype == dtype: + # This path will preserve id(result._combined) + # TODO: could also validate dtype before going to simple_new + combined = data._combined + if copy: + combined = combined.copy() + result = cls._simple_new(combined, closed=closed) + if verify_integrity: + result._validate() + return result else: # don't allow scalars @@ -186,21 +198,30 @@ def __new__( ) closed = closed or infer_closed - return cls._simple_new( + result = cls._complicated_new( left, right, closed, copy=copy, dtype=dtype, - verify_integrity=verify_integrity, ) + if verify_integrity: + result._validate() + return result @classmethod - def _simple_new( - cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True - ): + def _simple_new(cls, data, closed="right"): result = IntervalMixin.__new__(cls) + result._combined = data + result._left = data[:, 0] + result._right = data[:, 1] + result._closed = closed + return result + + @classmethod + def _complicated_new(cls, left, right, closed=None, copy=False, dtype=None): + closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) @@ -238,32 +259,15 @@ def _simple_new( msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + # TODO: use tz_compare? 
msg = ( "left and right must have the same time zone, got " f"'{left.tz}' and '{right.tz}'" ) raise ValueError(msg) - # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array - - left = maybe_upcast_datetimelike_array(left) - left = extract_array(left, extract_numpy=True) - right = maybe_upcast_datetimelike_array(right) - right = extract_array(right, extract_numpy=True) - - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() - - result._left = left - result._right = right - result._closed = closed - if verify_integrity: - result._validate() - return result + combined = _get_combined_data(left, right) + return cls._simple_new(combined, closed=closed) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): @@ -397,10 +401,12 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) + if len(left) != len(right): + raise ValueError("left and right must have the same length") - return cls._simple_new( - left, right, closed, copy=copy, dtype=dtype, verify_integrity=True - ) + result = cls._complicated_new(left, right, closed, copy=copy, dtype=dtype) + result._validate() + return result _interval_shared_docs["from_tuples"] = textwrap.dedent( """ @@ -506,19 +512,6 @@ def _validate(self): msg = "left side of interval must be <= right side" raise ValueError(msg) - def _shallow_copy(self, left, right): - """ - Return a new IntervalArray with the replacement attributes - - Parameters - ---------- - left : Index - Values to be used for the left-side of the intervals. - right : Index - Values to be used for the right-side of the intervals. 
- """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - # --------------------------------------------------------------------- # Descriptive @@ -546,18 +539,20 @@ def __len__(self) -> int: def __getitem__(self, value): value = check_array_indexer(self, value) - left = self._left[value] - right = self._right[value] - if not isinstance(left, (np.ndarray, ExtensionArray)): - # scalar - if is_scalar(left) and isna(left): + result = self._combined[value] + + if is_integer(value): + left, right = result[0], result[1] + if isna(left): return self._fill_value return Interval(left, right, self.closed) - if np.ndim(left) > 1: + + # TODO: need to watch out for incorrectly-reducing getitem + if np.ndim(result) > 2: # GH#30588 multi-dimensional indexer disallowed raise ValueError("multi-dimensional indexing not allowed") - return self._shallow_copy(left, right) + return type(self)._simple_new(result, closed=self.closed) def __setitem__(self, key, value): value_left, value_right = self._validate_setitem_value(value) @@ -651,7 +646,8 @@ def fillna(self, value=None, method=None, limit=None): left = self.left.fillna(value=value_left) right = self.right.fillna(value=value_right) - return self._shallow_copy(left, right) + combined = _get_combined_data(left, right) + return type(self)._simple_new(combined, closed=self.closed) def astype(self, dtype, copy=True): """ @@ -693,7 +689,9 @@ def astype(self, dtype, copy=True): f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" ) raise TypeError(msg) from err - return self._shallow_copy(new_left, new_right) + # TODO: do astype directly on self._combined + combined = _get_combined_data(new_left, new_right) + return type(self)._simple_new(combined, closed=self.closed) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) elif isinstance(dtype, StringDtype): @@ -724,9 +722,11 @@ def _concat_same_type(cls, to_concat): raise ValueError("Intervals must all be closed on the same side.") closed = closed.pop() + # TODO: will this mess up on dt64tz? left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) - return cls._simple_new(left, right, closed=closed, copy=False) + combined = _get_combined_data(left, right) # TODO: 1-stage concat + return cls._simple_new(combined, closed=closed) def copy(self): """ @@ -736,11 +736,8 @@ def copy(self): ------- IntervalArray """ - left = self._left.copy() - right = self._right.copy() - closed = self.closed - # TODO: Could skip verify_integrity here. 
- return type(self).from_arrays(left, right, closed=closed) + combined = self._combined.copy() + return type(self)._simple_new(combined, closed=self.closed) def isna(self) -> np.ndarray: return isna(self._left) @@ -833,7 +830,8 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): self._right, indices, allow_fill=allow_fill, fill_value=fill_right ) - return self._shallow_copy(left_take, right_take) + combined = _get_combined_data(left_take, right_take) + return type(self)._simple_new(combined, closed=self.closed) def _validate_listlike(self, value): # list-like of intervals @@ -1160,10 +1158,7 @@ def set_closed(self, closed): if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) - - return type(self)._simple_new( - left=self._left, right=self._right, closed=closed, verify_integrity=False - ) + return type(self)._simple_new(self._combined, closed=closed) _interval_shared_docs[ "is_non_overlapping_monotonic" @@ -1304,9 +1299,8 @@ def to_tuples(self, na_tuple=True): @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) - left_repeat = self.left.repeat(repeats) - right_repeat = self.right.repeat(repeats) - return self._shallow_copy(left=left_repeat, right=right_repeat) + combined = self._combined.repeat(repeats, 0) + return type(self)._simple_new(combined, closed=self.closed) _interval_shared_docs["contains"] = textwrap.dedent( """ @@ -1389,3 +1383,31 @@ def maybe_convert_platform_interval(values): values = np.asarray(values) return maybe_convert_platform(values) + + +def _get_combined_data(left, right): + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) + left = extract_array(left, extract_numpy=True) + right = maybe_upcast_datetimelike_array(right) + right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + + if isinstance(left, np.ndarray): + combined = np.concatenate( + [left.reshape(-1, 1), right.reshape(-1, 1)], + axis=1, + ) + else: + combined = type(left)._concat_same_type( + [left.reshape(-1, 1), right.reshape(-1, 1)], + axis=1, + ) + return combined diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a56f6a5bb0340..d9dabf9e86f35 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -897,7 +897,7 @@ def delete(self, loc): """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) - result = self._data._shallow_copy(new_left, new_right) + result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) return self._shallow_copy(result) def insert(self, loc, item): @@ -919,7 +919,7 @@ def insert(self, loc, item): new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) - result = self._data._shallow_copy(new_left, new_right) + result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) return self._shallow_copy(result) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index b5595ba220a15..26ad6fc1c6572 100644 --- 
a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -241,7 +241,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): (pd.Categorical(["a", "b"]), "_codes"), (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), (pd.core.arrays.integer_array([0, np.nan]), "_data"), - (IntervalArray.from_breaks([0, 1]), "_left"), + (IntervalArray.from_breaks([0, 1]), "_combined"), (SparseArray([0, 1]), "_sparse_values"), (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), # tz-aware Datetime diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8b5d0c7ade56c..377bde31946fd 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -605,7 +605,9 @@ def test_constructor_expanddim_lookup(self): # raise NotImplementedError df = DataFrame() - inspect.getmembers(df) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # _AXIS_NAMES deprecated + inspect.getmembers(df) with pytest.raises(NotImplementedError, match="Not supported for DataFrames!"): df._constructor_expanddim(np.arange(27).reshape(3, 3, 3)) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index aec7de549744f..c0ca0b415ba8e 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -266,7 +266,11 @@ def test_left_right_dont_share_data(self): # GH#36310 breaks = np.arange(5) result = IntervalIndex.from_breaks(breaks)._data - assert result._left.base is None or result._left.base is not result._right.base + left = result._left + right = result._right + + left[:] = 10000 + assert not (right == 10000).any() class TestFromTuples(Base): From 014c13e163fc806f3a80fdedb06aae5b5e7b7d49 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Oct 2020 13:54:26 -0700 Subject: [PATCH 2/3] Remove _complicated_new --- pandas/core/arrays/interval.py | 114 ++++++++++++++++----------------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1bd39f18fd412..479e42f30221c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -198,13 +198,10 @@ def __new__( ) closed = closed or infer_closed - result = cls._complicated_new( - left, - right, - closed, - copy=copy, - dtype=dtype, - ) + closed = closed or "right" + left, right = _maybe_cast_inputs(left, right, copy, dtype) + combined = _get_combined_data(left, right) + result = cls._simple_new(combined, closed=closed) if verify_integrity: result._validate() return result @@ -219,56 +216,6 @@ def _simple_new(cls, data, closed="right"): result._closed = closed return result - @classmethod - def _complicated_new(cls, left, right, closed=None, copy=False, dtype=None): - - closed = closed or "right" - left = ensure_index(left, copy=copy) - right = ensure_index(right, copy=copy) - - if dtype is not None: - # GH 19262: dtype must be an IntervalDtype to override inferred - dtype = pandas_dtype(dtype) - if not is_interval_dtype(dtype): - msg = f"dtype must be an IntervalDtype, got {dtype}" - raise TypeError(msg) - elif dtype.subtype is not None: - left = left.astype(dtype.subtype) - right = right.astype(dtype.subtype) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != 
type(right): - msg = ( - f"must not have differing left [{type(left).__name__}] and " - f"right [{type(right).__name__}] types" - ) - raise ValueError(msg) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH 19016 - msg = ( - "category, object, and string subtypes are not supported " - "for IntervalArray" - ) - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = "Period dtypes are not supported, use a PeriodIndex instead" - raise ValueError(msg) - elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): - # TODO: use tz_compare? - msg = ( - "left and right must have the same time zone, got " - f"'{left.tz}' and '{right.tz}'" - ) - raise ValueError(msg) - - combined = _get_combined_data(left, right) - return cls._simple_new(combined, closed=closed) - @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars, dtype=dtype, copy=copy) @@ -404,7 +351,11 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): if len(left) != len(right): raise ValueError("left and right must have the same length") - result = cls._complicated_new(left, right, closed, copy=copy, dtype=dtype) + closed = closed or "right" + left, right = _maybe_cast_inputs(left, right, copy, dtype) + combined = _get_combined_data(left, right) + + result = cls._simple_new(combined, closed) result._validate() return result @@ -1385,6 +1336,53 @@ def maybe_convert_platform_interval(values): return maybe_convert_platform(values) +def _maybe_cast_inputs(left, right, copy, dtype): + left = ensure_index(left, copy=copy) + right = ensure_index(right, copy=copy) + + if dtype is not None: + # GH#19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) + elif dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" + ) + raise ValueError(msg) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH#19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = "Period dtypes are not supported, use a PeriodIndex instead" + raise ValueError(msg) + elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + # TODO: use tz_compare? 
+ msg = ( + "left and right must have the same time zone, got " + f"'{left.tz}' and '{right.tz}'" + ) + raise ValueError(msg) + + return left, right + + def _get_combined_data(left, right): # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array From a9adb761941b6022b11fab740e7d8373605c1752 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Oct 2020 10:51:23 -0700 Subject: [PATCH 3/3] annotate --- pandas/core/arrays/interval.py | 37 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 47e2b86926e0b..d943fe3df88c5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,5 +1,6 @@ from operator import le, lt import textwrap +from typing import TYPE_CHECKING, Optional, Tuple, Union, cast import numpy as np @@ -11,6 +12,7 @@ IntervalMixin, intervals_to_interval_bounds, ) +from pandas._typing import ArrayLike, Dtype from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -18,6 +20,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_any_dtype, + is_dtype_equal, is_float_dtype, is_integer, is_integer_dtype, @@ -46,6 +49,10 @@ from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index +if TYPE_CHECKING: + from pandas import Index + from pandas.core.arrays import DatetimeArray, TimedeltaArray + _interval_shared_docs = {} _shared_docs_kwargs = dict( @@ -1346,9 +1353,14 @@ def maybe_convert_platform_interval(values): return maybe_convert_platform(values) -def _maybe_cast_inputs(left, right, copy, dtype): - left = ensure_index(left, copy=copy) - right = ensure_index(right, copy=copy) +def _maybe_cast_inputs( + left_orig: Union["Index", ArrayLike], + right_orig: Union["Index", ArrayLike], + copy: bool, + dtype: Optional[Dtype], +) -> Tuple["Index", "Index"]: + left = ensure_index(left_orig, copy=copy) + right = ensure_index(right_orig, copy=copy) if dtype is not None: # GH#19262: dtype must be an IntervalDtype to override inferred @@ -1356,7 +1368,8 @@ def _maybe_cast_inputs(left, right, copy, dtype): if not is_interval_dtype(dtype): msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) - elif dtype.subtype is not None: + dtype = cast(IntervalDtype, dtype) + if dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) @@ -1382,18 +1395,23 @@ def _maybe_cast_inputs(left, right, copy, dtype): elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) - elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): - # TODO: use tz_compare? 
+ elif isinstance(left, ABCDatetimeIndex) and not is_dtype_equal( + left.dtype, right.dtype + ): + left_arr = cast("DatetimeArray", left._data) + right_arr = cast("DatetimeArray", right._data) msg = ( "left and right must have the same time zone, got " - f"'{left.tz}' and '{right.tz}'" + f"'{left_arr.tz}' and '{right_arr.tz}'" ) raise ValueError(msg) return left, right -def _get_combined_data(left, right): +def _get_combined_data( + left: Union["Index", ArrayLike], right: Union["Index", ArrayLike] +) -> Union[np.ndarray, "DatetimeArray", "TimedeltaArray"]: # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array @@ -1409,11 +1427,14 @@ def _get_combined_data(left, right): right = right.copy() if isinstance(left, np.ndarray): + assert isinstance(right, np.ndarray) # for mypy combined = np.concatenate( [left.reshape(-1, 1), right.reshape(-1, 1)], axis=1, ) else: + left = cast(Union["DatetimeArray", "TimedeltaArray"], left) + right = cast(Union["DatetimeArray", "TimedeltaArray"], right) combined = type(left)._concat_same_type( [left.reshape(-1, 1), right.reshape(-1, 1)], axis=1,
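
A minimal standalone sketch of the layout these commits introduce, assuming plain integer bounds rather than datetime-like arrays: the IntervalArray is backed by one (N, 2) array, with column 0 holding the left bounds and column 1 the right bounds; `_get_combined_data` builds that array and `_simple_new` slices it back into the `_left`/`_right` views.

    import numpy as np

    # Column-stack the bounds, as _get_combined_data does for ndarray inputs.
    left = np.array([0, 1, 2])
    right = np.array([1, 2, 3])
    combined = np.concatenate([left.reshape(-1, 1), right.reshape(-1, 1)], axis=1)
    assert combined.shape == (3, 2)

    # _simple_new recovers the bounds as views on the same 2-D buffer...
    assert (combined[:, 0] == left).all()
    assert (combined[:, 1] == right).all()

    # ...so a positional __getitem__ is just a row, and copy/take/repeat
    # become single operations on `combined`.
    row = combined[1]  # array([1, 2]), i.e. Interval(1, 2, closed="right")

One practical consequence, visible in the updated test_left_right_dont_share_data, is that `_left` and `_right` now share a buffer by construction but occupy different columns, so the old check on the arrays' bases is replaced by mutating the left bounds and asserting the right bounds are unaffected.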