Skip to content

WIP: Dispatch Series comparison methods to Index implementations #19288

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 65 additions & 52 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
import datetime

from pandas._libs import (lib, index as libindex,
from pandas._libs import (lib,
tslib as libts, algos as libalgos, iNaT)

from pandas import compat
Expand Down Expand Up @@ -39,8 +39,7 @@
from pandas.core.dtypes.generic import (
ABCSeries,
ABCDataFrame,
ABCIndex, ABCDatetimeIndex,
ABCPeriodIndex)
ABCIndex, ABCDatetimeIndex, ABCIndexClass)

# -----------------------------------------------------------------------------
# Functions that add arithmetic methods to objects, given arithmetic factory
Expand Down Expand Up @@ -748,7 +747,7 @@ def _get_series_op_result_name(left, right):
def _comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, list):
y = construct_1d_object_array_from_listlike(y)
if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)):
if not is_object_dtype(y.dtype):
y = y.astype(np.object_)

Expand All @@ -775,8 +774,9 @@ def na_op(x, y):
return op(x, y)
elif is_categorical_dtype(y) and not is_scalar(y):
return op(y, x)
# TODO: Does this make sense or should op be reversed?

if is_object_dtype(x.dtype):
elif is_object_dtype(x.dtype):
result = _comp_method_OBJECT_ARRAY(op, x, y)
else:

Expand All @@ -797,15 +797,9 @@ def na_op(x, y):

# we have a datetime/timedelta and may need to convert
mask = None
if (needs_i8_conversion(x) or
(not is_scalar(y) and needs_i8_conversion(y))):

if is_scalar(y):
mask = isna(x)
y = libindex.convert_scalar(x, _values_from_object(y))
else:
mask = isna(x) | isna(y)
y = y.view('i8')
if not is_scalar(y) and needs_i8_conversion(y):
mask = isna(x) | isna(y)
y = y.view('i8')
x = x.view('i8')

try:
Expand All @@ -821,54 +815,72 @@ def na_op(x, y):

return result

def wrapper(self, other, axis=None):
# Validate the axis parameter
if axis is not None:
self._get_axis_number(axis)

if isinstance(other, ABCSeries):
name = _maybe_match_name(self, other)
if not self._indexed_same(other):
msg = 'Can only compare identically-labeled Series objects'
raise ValueError(msg)
return self._constructor(na_op(self.values, other.values),
index=self.index, name=name)
elif isinstance(other, ABCDataFrame): # pragma: no cover
def wrapper(self, other):
if isinstance(other, ABCDataFrame): # pragma: no cover
return NotImplemented
elif isinstance(other, (np.ndarray, pd.Index)):
# do not check length of zerodim array
# as it will broadcast
if (not is_scalar(lib.item_from_zerodim(other)) and
len(self) != len(other)):
raise ValueError('Lengths must match to compare')

if isinstance(other, ABCPeriodIndex):
# temp workaround until fixing GH 13637
# tested in test_nat_comparisons
# (pandas.tests.series.test_operators.TestSeriesOperators)
return self._constructor(na_op(self.values,
other.astype(object).values),
index=self.index)
elif isinstance(other, ABCSeries) and not self._indexed_same(other):
msg = 'Can only compare identically-labeled Series objects'
raise ValueError(msg)

return self._constructor(na_op(self.values, np.asarray(other)),
index=self.index).__finalize__(self)
res_name = _get_series_op_result_name(self, other)

elif isinstance(other, pd.Categorical):
if not is_categorical_dtype(self):
msg = ("Cannot compare a Categorical for op {op} with Series "
"of dtype {typ}.\nIf you want to compare values, use "
"'series <op> np.asarray(other)'.")
raise TypeError(msg.format(op=op, typ=self.dtype))
if is_timedelta64_dtype(self):
res = op(pd.TimedeltaIndex(self), other)
return self._constructor(res, index=self.index, name=res_name)

elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self):
# kludge; DatetimeIndex refuses to compare against None or
# datetime.date; until the "right" behavior is resolved, we cast
# these types here to types that DatetimeIndex understand.
if type(other) is datetime.date:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use isinstance.

what do you mean refuses to compare?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use isinstance.

Will change, but note the isinstance version is pretty verbose: if isinstance(other, datetime.date) and not isinstance(other, datetime.datetime).

what do you mean refuses to compare?

DatetimeIndex comparison with a) datetime.date or b) None raises TypeError; Series casts to datetime and NaT, respectively.

dti = pd.date_range('2016-01-01', periods=2)
ser = pd.Series(dti)

>>> dti ==  datetime.date(2016, 1, 1)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "pandas/core/indexes/datetimes.py", line 119, in wrapper
    other = _ensure_datetime64(other)
  File "pandas/core/indexes/datetimes.py", line 145, in _ensure_datetime64
    raise TypeError('%s type object %s' % (type(other), str(other)))
TypeError: <type 'datetime.date'> type object 2016-01-01

>>> ser == datetime.date(2016, 1, 1)
0     True
1    False
dtype: bool

>>> dti != None
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "pandas/core/indexes/datetimes.py", line 119, in wrapper
    other = _ensure_datetime64(other)
  File "pandas/core/indexes/datetimes.py", line 145, in _ensure_datetime64
    raise TypeError('%s type object %s' % (type(other), str(other)))
TypeError: <type 'NoneType'> type object None

>>> ser != None
0    True
1    True
dtype: bool

So really one of these two behaviors should be chosen as canonical and the other changed (and tested), at which point these shims will be unnecessary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these comparison should work for both DTI and None. These work for scalars as well already.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am totally on board with the "DTI comparison should behave like a vectorized Timestamp comparison" approach, but it isn't 100% straightforward:

For datetime.date comparison, Timestamp.__eq__ is always False and Timestamp.__ne__ is always True; lt, le, gt, ge all raise TypeError.

dti = pd.date_range('2016-01-01', periods=2)
ts = dti[0]
dt = ts.to_pydatetime()
d = dt.date()

>>> ts == d
False
>>> ts < d
  File "pandas/_libs/tslibs/timestamps.pyx", line 121, in pandas._libs.tslibs.timestamps._Timestamp.__richcmp__
    raise TypeError('Cannot compare type %r with type %r' %
TypeError: Cannot compare type 'Timestamp' with type 'date'

Same behavior for None, i.e. it is not treated being equivalent to pd.NaT.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I would prefer a patch independetly of this one to fix these 2 cases, these already work with DTI

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#19301 is intended to do that. Did you have something else in mind?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes saw that after

other = datetime.datetime(other.year, other.month, other.day)
elif other is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this None check needed? do we not do this in the op comparison directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and I don't like this code path at all, meaning you are special casing things, which IMHO should be handle inside the DTI comparisons.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes; the next step is to put this on the back-burner and fix the DTI comparisons.

other = pd.NaT
res = op(pd.DatetimeIndex(self), other)
return self._constructor(res, index=self.index, name=res_name)

if is_categorical_dtype(self):
elif isinstance(other, ABCSeries):
# Note: Ordering matters; this needs to go before
# is_categorical_dtype(self) branch.
res = na_op(self.values, other.values)
return self._constructor(res,
index=self.index, name=res_name)

elif is_categorical_dtype(self):
# cats are a special case as get_values() would return an ndarray,
# which would then not take categories ordering into account
# we can go directly to op, as the na_op would just test again and
# dispatch to it.
with np.errstate(all='ignore'):
res = op(self.values, other)
# Note: self.values is a pd.Categorical object
return self._constructor(res, index=self.index,
name=res_name, dtype='bool')

elif isinstance(other, pd.Categorical):
# Ordering of conditions here matters; we know at this point
# that not is_categorical_dtype(self)
msg = ("Cannot compare a Categorical for op {op} with Series "
"of dtype {typ}.\nIf you want to compare values, use "
"'series <op> np.asarray(other)'.")
raise TypeError(msg.format(op=op, typ=self.dtype))

elif isinstance(other, (np.ndarray, pd.Index)):
# do not check length of zerodim array
# as it will broadcast
if (not is_scalar(lib.item_from_zerodim(other)) and
len(self) != len(other)):
raise ValueError('Lengths must match to compare')

res = na_op(self.values, np.asarray(other))
return self._constructor(res,
index=self.index).__finalize__(self)
# TODO: Why __finalize__ here but not elsewhere?

else:
values = self.get_values()
# TODO: why get_values() here and just values elsewhere?
if isinstance(other, (list, np.ndarray)):
other = np.asarray(other)

Expand All @@ -881,9 +893,10 @@ def wrapper(self, other, axis=None):
# always return a full value series here
res = _values_from_object(res)

res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
return res

res = pd.Series(res, index=self.index, name=res_name, dtype='bool')
# TODO: Why not use self._constructor here?
# TODO: pass dtype='bool' in other locations?
return res
return wrapper


Expand Down
15 changes: 8 additions & 7 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,11 +760,12 @@ def test_equals_op(self):
if isinstance(index_a, PeriodIndex):
return

len_msg = "Lengths must match|different lengths"
n = len(index_a)
index_b = index_a[0:-1]
index_c = index_a[0:-1].append(index_a[-2:-1])
index_d = index_a[0:1]
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == index_b
expected1 = np.array([True] * n)
expected2 = np.array([True] * (n - 1) + [False])
Expand All @@ -776,7 +777,7 @@ def test_equals_op(self):
array_b = np.array(index_a[0:-1])
array_c = np.array(index_a[0:-1].append(index_a[-2:-1]))
array_d = np.array(index_a[0:1])
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == array_b
tm.assert_numpy_array_equal(index_a == array_a, expected1)
tm.assert_numpy_array_equal(index_a == array_c, expected2)
Expand All @@ -786,22 +787,22 @@ def test_equals_op(self):
series_b = Series(array_b)
series_c = Series(array_c)
series_d = Series(array_d)
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == series_b
tm.assert_numpy_array_equal(index_a == series_a, expected1)
tm.assert_numpy_array_equal(index_a == series_c, expected2)

# cases where length is 1 for one of them
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == index_d
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == series_d
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
index_a == array_d
msg = "Can only compare identically-labeled Series objects"
with tm.assert_raises_regex(ValueError, msg):
series_a == series_d
with tm.assert_raises_regex(ValueError, "Lengths must match"):
with tm.assert_raises_regex(ValueError, len_msg):
series_a == array_d

# comparing with a scalar should broadcast; note that we are excluding
Expand Down