Skip to content

REF: IntervalArray comparisons #37124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Nov 3, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np

from pandas._libs import lib
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly, doc
from pandas.util._validators import validate_fillna_kwargs
Expand Down Expand Up @@ -139,7 +138,6 @@ def repeat(self: _T, repeats, axis=None) -> _T:
--------
numpy.ndarray.repeat
"""
nv.validate_repeat(tuple(), dict(axis=axis))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this no longer needed? Or is this change unrelated?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you comment here, @jbrockmendel?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In IntervalArray.repeat we call self._combined.repeat(repeats, 0); when _combined is a DTA/TDA, that call gets here with axis=0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since there is still discussion about whether _combined is meant to stay (#37047), can you leave out this clean-up from this PR?

(unless the clean-up is not tied to having _combined, but from your comment it seems so)

new_data = self._ndarray.repeat(repeats, axis=axis)
return self._from_backing_data(new_data)

Expand Down
84 changes: 66 additions & 18 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import operator
from operator import le, lt
import textwrap
from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
Expand Down Expand Up @@ -48,7 +49,7 @@
from pandas.core.construction import array, extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.indexes.base import ensure_index
from pandas.core.ops import unpack_zerodim_and_defer
from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer

if TYPE_CHECKING:
from pandas import Index
Expand Down Expand Up @@ -520,16 +521,15 @@ def __setitem__(self, key, value):
self._left[key] = value_left
self._right[key] = value_right

@unpack_zerodim_and_defer("__eq__")
def __eq__(self, other):
def _cmp_method(self, other, op):
# ensure pandas array for list-like and eliminate non-interval scalars
if is_list_like(other):
if len(self) != len(other):
raise ValueError("Lengths must match to compare")
other = array(other)
elif not isinstance(other, Interval):
# non-interval scalar -> no matches
return np.zeros(len(self), dtype=bool)
return invalid_comparison(self, other, op)

# determine the dtype of the elements we want to compare
if isinstance(other, Interval):
Expand All @@ -543,33 +543,81 @@ def __eq__(self, other):
# extract intervals if we have interval categories with matching closed
if is_interval_dtype(other_dtype):
if self.closed != other.categories.closed:
return np.zeros(len(self), dtype=bool)
return invalid_comparison(self, other, op)
other = other.categories.take(other.codes)

# interval-like -> need same closed and matching endpoints
if is_interval_dtype(other_dtype):
if self.closed != other.closed:
return np.zeros(len(self), dtype=bool)
return (self._left == other.left) & (self._right == other.right)
return invalid_comparison(self, other, op)
if isinstance(other, Interval):
other = type(self)._from_sequence([other])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we use other.left / other.right scalars here? (and then the broadcasting for array vs scalar will work fine, and we don't have to deal with len-1 arrays?)

if self._left.dtype.kind in ["m", "M"]:
# Need to repeat because we do not broadcast length-1
# TODO: would be helpful to have a tile method to do
# this without copies
other = other.repeat(len(self))
else:
other = type(self)(other)

if op is operator.eq:
return (self._left == other._left) & (self._right == other._right)
elif op is operator.ne:
return (self._left != other._left) | (self._right != other._right)
elif op is operator.gt:
return (self._left > other._left) | (
(self._left == other._left) & (self._right > other._right)
)
elif op is operator.ge:
return (self == other) | (self > other)
elif op is operator.lt:
return (self._left < other._left) | (
(self._left == other._left) & (self._right < other._right)
)
else:
# operator.le
return (self == other) | (self < other)

# non-interval/non-object dtype -> no matches
if not is_object_dtype(other_dtype):
return np.zeros(len(self), dtype=bool)
return invalid_comparison(self, other, op)

# object dtype -> iteratively check for intervals
result = np.zeros(len(self), dtype=bool)
for i, obj in enumerate(other):
# need object to be an Interval with same closed and endpoints
if (
isinstance(obj, Interval)
and self.closed == obj.closed
and self._left[i] == obj.left
and self._right[i] == obj.right
):
result[i] = True

try:
for i, obj in enumerate(other):
result[i] = op(self[i], obj)
except TypeError:
# e.g. pd.NA: redo the elementwise comparison into an object-dtype result
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example (or test) that runs into this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see the removed special case below in the tests. Now, this is certainly in a messy state, but I am not sure the conversion to object dtype is needed here. Currently, we still return False (and not NA as for nullable dtypes) on comparisons with NA:

In [90]: arr = pd.interval_range(0,3).array

In [91]: arr == pd.NA
Out[91]: array([False, False, False])

In [92]: arr[0] == pd.NA
Out[92]: False

result = np.zeros(len(self), dtype=object)
for i, obj in enumerate(other):
result[i] = op(self[i], obj)
return result

@unpack_zerodim_and_defer("__eq__")
def __eq__(self, other):
return self._cmp_method(other, operator.eq)

@unpack_zerodim_and_defer("__ne__")
def __ne__(self, other):
return self._cmp_method(other, operator.ne)

@unpack_zerodim_and_defer("__gt__")
def __gt__(self, other):
return self._cmp_method(other, operator.gt)

@unpack_zerodim_and_defer("__ge__")
def __ge__(self, other):
return self._cmp_method(other, operator.ge)

@unpack_zerodim_and_defer("__lt__")
def __lt__(self, other):
return self._cmp_method(other, operator.lt)

@unpack_zerodim_and_defer("__le__")
def __le__(self, other):
return self._cmp_method(other, operator.le)

def fillna(self, value=None, method=None, limit=None):
"""
Fill NA/NaN values using the specified method.
Expand Down
13 changes: 0 additions & 13 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,19 +1105,6 @@ def _is_all_dates(self) -> bool:

# TODO: arithmetic operations

# GH#30817 until IntervalArray implements inequalities, get them from Index
def __lt__(self, other):
return Index.__lt__(self, other)

def __le__(self, other):
return Index.__le__(self, other)

def __gt__(self, other):
return Index.__gt__(self, other)

def __ge__(self, other):
return Index.__ge__(self, other)


def _is_valid_endpoint(endpoint) -> bool:
"""
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/arithmetic/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,6 @@ def test_compare_list_like_nan(self, op, array, nulls_fixture, request):
result = op(array, other)
expected = self.elementwise_comparison(op, array, other)

if nulls_fixture is pd.NA and array.dtype.subtype != "i8":
reason = "broken for non-integer IntervalArray; see GH 31882"
mark = pytest.mark.xfail(reason=reason)
request.node.add_marker(mark)

tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
@pytest.mark.parametrize(
"repeats, kwargs, error, msg",
[
(2, dict(axis=1), ValueError, "'axis"),
(2, dict(axis=1), ValueError, "axis"),
(-1, dict(), ValueError, "negative"),
([1, 2], dict(), ValueError, "shape"),
(2, dict(foo="bar"), TypeError, "'foo'"),
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,11 @@ def test_comparison(self):
actual = self.index == self.index.left
tm.assert_numpy_array_equal(actual, np.array([False, False]))

msg = (
"not supported between instances of 'int' and "
"'pandas._libs.interval.Interval'"
msg = "|".join(
[
"not supported between instances of 'int' and '.*.Interval'",
r"Invalid comparison between dtype=interval\[int64\] and ",
]
)
with pytest.raises(TypeError, match=msg):
self.index > 0
Expand Down