Skip to content

Dispatch categorical Series ops to Categorical #19582

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -686,3 +686,5 @@ Other
^^^^^

- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
- Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`)
-
3 changes: 3 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def f(self, other):
# results depending whether categories are the same or not is kind of
# insane, so be a bit stricter here and use the python3 idea of
# comparing only things of equal type.
if isinstance(other, ABCSeries):
return NotImplemented

if not self.ordered:
if op in ['__lt__', '__gt__', '__le__', '__ge__']:
raise TypeError("Unordered Categoricals can only compare "
Expand Down
25 changes: 17 additions & 8 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import operator

import numpy as np
from pandas._libs import index as libindex

Expand Down Expand Up @@ -726,7 +728,9 @@ def _codes_for_groupby(self, sort):
def _add_comparison_methods(cls):
""" add in comparison methods """

def _make_compare(opname):
def _make_compare(op):
opname = '__{op}__'.format(op=op.__name__)

def _evaluate_compare(self, other):

# if we have a Categorical type, then must have the same
Expand All @@ -749,16 +753,21 @@ def _evaluate_compare(self, other):
"have the same categories and ordered "
"attributes")

return getattr(self.values, opname)(other)
result = op(self.values, other)
if isinstance(result, ABCSeries):
# Dispatch to pd.Categorical returned NotImplemented
# and we got a Series back; down-cast to ndarray
result = result.values
return result

return compat.set_function_name(_evaluate_compare, opname, cls)

cls.__eq__ = _make_compare('__eq__')
cls.__ne__ = _make_compare('__ne__')
cls.__lt__ = _make_compare('__lt__')
cls.__gt__ = _make_compare('__gt__')
cls.__le__ = _make_compare('__le__')
cls.__ge__ = _make_compare('__ge__')
cls.__eq__ = _make_compare(operator.eq)
cls.__ne__ = _make_compare(operator.ne)
cls.__lt__ = _make_compare(operator.lt)
cls.__gt__ = _make_compare(operator.gt)
cls.__le__ = _make_compare(operator.le)
cls.__ge__ = _make_compare(operator.ge)

def _delegate_method(self, name, *args, **kwargs):
""" method delegation to the ._values """
Expand Down
72 changes: 42 additions & 30 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,7 @@ def dispatch_to_index_op(op, left, right, index_class):
# avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes,
# left_idx may inherit a freq from a cached DatetimeIndex.
# See discussion in GH#19147.
if left_idx.freq is not None:
if getattr(left_idx, 'freq', None) is not None:
left_idx = left_idx._shallow_copy(freq=None)
try:
result = op(left_idx, right)
Expand Down Expand Up @@ -774,9 +774,8 @@ def na_op(x, y):

# dispatch to the categorical if we have a categorical
# in either operand
if is_categorical_dtype(x):
return op(x, y)
elif is_categorical_dtype(y) and not is_scalar(y):
if is_categorical_dtype(y) and not is_scalar(y):
# The `not is_scalar(y)` check excludes the string "category"
return op(y, x)

elif is_object_dtype(x.dtype):
Expand Down Expand Up @@ -824,17 +823,36 @@ def wrapper(self, other, axis=None):
if axis is not None:
self._get_axis_number(axis)

res_name = _get_series_op_result_name(self, other)

if isinstance(other, ABCDataFrame): # pragma: no cover
# Defer to DataFrame implementation; fail early
return NotImplemented

elif isinstance(other, ABCSeries) and not self._indexed_same(other):
raise ValueError("Can only compare identically-labeled "
"Series objects")

elif is_categorical_dtype(self):
# Dispatch to Categorical implementation; pd.CategoricalIndex
# behavior is non-canonical GH#19513
res_values = dispatch_to_index_op(op, self, other, pd.Categorical)
return self._constructor(res_values, index=self.index,
name=res_name)

elif is_timedelta64_dtype(self):
res_values = dispatch_to_index_op(op, self, other,
pd.TimedeltaIndex)
return self._constructor(res_values, index=self.index,
name=res_name)

elif isinstance(other, ABCSeries):
name = com._maybe_match_name(self, other)
if not self._indexed_same(other):
msg = 'Can only compare identically-labeled Series objects'
raise ValueError(msg)
# By this point we have checked that self._indexed_same(other)
res_values = na_op(self.values, other.values)
return self._constructor(res_values, index=self.index, name=name)
# rename is needed in case res_name is None and res_values.name
# is not.
return self._constructor(res_values, index=self.index,
name=res_name).rename(res_name)

elif isinstance(other, (np.ndarray, pd.Index)):
# do not check length of zerodim array
Expand All @@ -844,15 +862,17 @@ def wrapper(self, other, axis=None):
raise ValueError('Lengths must match to compare')

res_values = na_op(self.values, np.asarray(other))
return self._constructor(res_values,
index=self.index).__finalize__(self)

elif (isinstance(other, pd.Categorical) and
not is_categorical_dtype(self)):
raise TypeError("Cannot compare a Categorical for op {op} with "
"Series of dtype {typ}.\nIf you want to compare "
"values, use 'series <op> np.asarray(other)'."
.format(op=op, typ=self.dtype))
result = self._constructor(res_values, index=self.index)
# rename is needed in case res_name is None and self.name
# is not.
return result.__finalize__(self).rename(res_name)

elif isinstance(other, pd.Categorical):
# ordering of checks matters; by this point we know
# that not is_categorical_dtype(self)
res_values = op(self.values, other)
return self._constructor(res_values, index=self.index,
name=res_name)

elif is_scalar(other) and isna(other):
# numpy does not like comparisons vs None
Expand All @@ -863,16 +883,9 @@ def wrapper(self, other, axis=None):
return self._constructor(res_values, index=self.index,
name=self.name, dtype='bool')

if is_categorical_dtype(self):
# cats are a special case as get_values() would return an ndarray,
# which would then not take categories ordering into account
# we can go directly to op, as the na_op would just test again and
# dispatch to it.
with np.errstate(all='ignore'):
res = op(self.values, other)
else:
values = self.get_values()
if isinstance(other, (list, np.ndarray)):
if isinstance(other, list):
other = np.asarray(other)

with np.errstate(all='ignore'):
Expand All @@ -882,10 +895,9 @@ def wrapper(self, other, axis=None):
.format(typ=type(other)))

# always return a full value series here
res = com._values_from_object(res)

res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
return res
res_values = com._values_from_object(res)
return pd.Series(res_values, index=self.index,
name=res_name, dtype='bool')

return wrapper

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ def test_equals_op(self):
series_d = Series(array_d)
with tm.assert_raises_regex(ValueError, "Lengths must match"):
index_a == series_b

tm.assert_numpy_array_equal(index_a == series_a, expected1)
tm.assert_numpy_array_equal(index_a == series_c, expected2)

Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/series/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,40 @@ def test_ser_flex_cmp_return_dtypes_empty(self, opname):
result = getattr(empty, opname)(const).get_dtype_counts()
tm.assert_series_equal(result, Series([1], ['bool']))

@pytest.mark.parametrize('op', [operator.eq, operator.ne,
operator.le, operator.lt,
operator.ge, operator.gt])
@pytest.mark.parametrize('names', [(None, None, None),
('foo', 'bar', None),
('baz', 'baz', 'baz')])
def test_ser_cmp_result_names(self, names, op):
# datetime64 dtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did any of these raise / not work before?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None of these work at the moment. They currently retain the series's name.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so if you are effectively enabling something, then pls add a whatsnew note

dti = pd.date_range('1949-06-07 03:00:00',
freq='H', periods=5, name=names[0])
ser = Series(dti).rename(names[1])
result = op(ser, dti)
assert result.name == names[2]

# datetime64tz dtype
dti = dti.tz_localize('US/Central')
ser = Series(dti).rename(names[1])
result = op(ser, dti)
assert result.name == names[2]

# timedelta64 dtype
tdi = dti - dti.shift(1)
ser = Series(tdi).rename(names[1])
result = op(ser, tdi)
assert result.name == names[2]

# categorical
if op in [operator.eq, operator.ne]:
# categorical dtype comparisons raise for inequalities
cidx = tdi.astype('category')
ser = Series(cidx).rename(names[1])
result = op(ser, cidx)
assert result.name == names[2]


class TestTimestampSeriesComparison(object):
def test_dt64ser_cmp_period_scalar(self):
Expand Down