From e18a291a8eabd31b3205c4de45daa0f90212fa50 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Aug 2019 20:17:50 -0700 Subject: [PATCH 01/11] REF: implement should_extension_dispatch --- pandas/core/ops/__init__.py | 103 +++++++++--------- .../arrays/categorical/test_operators.py | 8 +- pandas/tests/extension/test_categorical.py | 2 +- 3 files changed, 57 insertions(+), 56 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4ab1941e3493f..f5fd1f1f6fa48 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -48,7 +48,7 @@ import pandas as pd from pandas._typing import ArrayLike -from pandas.core.construction import extract_array +from pandas.core.construction import array, extract_array from pandas.core.ops import missing from pandas.core.ops.docstrings import ( _arith_doc_FRAME, @@ -457,6 +457,33 @@ def masked_arith_op(x, y, op): # Dispatch logic +def should_extension_dispatch(left, right) -> bool: + """ + Identify cases where Series operation should use dispatch_to_extension_op. + + Parameters + ---------- + left : Series + right : object + + Returns + ------- + bool + """ + if ( + is_extension_array_dtype(left.dtype) + or is_datetime64_dtype(left.dtype) + or is_timedelta64_dtype(left.dtype) + ): + return True + + if is_extension_array_dtype(right) and not is_scalar(right): + # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + return True + + return False + + def should_series_dispatch(left, right, op): """ Identify cases where a DataFrame operation should dispatch to its @@ -561,19 +588,18 @@ def dispatch_to_extension_op(op, left, right): apply the operator defined by op. """ + if left.dtype.kind in "mM": + # We need to cast datetime64 and timedelta64 ndarrays to + # DatetimeArray/TimedeltaArray. But we avoid wrapping others in + # PandasArray as that behaves poorly with e.g. IntegerArray. + left = array(left) + # The op calls will raise TypeError if the op is not defined # on the ExtensionArray # unbox Series and Index to arrays - if isinstance(left, (ABCSeries, ABCIndexClass)): - new_left = left._values - else: - new_left = left - - if isinstance(right, (ABCSeries, ABCIndexClass)): - new_right = right._values - else: - new_right = right + new_left = extract_array(left, extract_numpy=True) + new_right = extract_array(right, extract_numpy=True) try: res_values = op(new_left, new_right) @@ -899,56 +925,27 @@ def wrapper(left, right): res_name = get_op_result_name(left, right) right = maybe_upcast_for_op(right, left.shape) - if is_categorical_dtype(left): - raise TypeError( - "{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep) - ) - - elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): - from pandas.core.arrays import DatetimeArray - - result = dispatch_to_extension_op(op, DatetimeArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_extension_array_dtype(left) or ( - is_extension_array_dtype(right) and not is_scalar(right) - ): - # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + if should_extension_dispatch(left, right): result = dispatch_to_extension_op(op, left, right) - return construct_result(left, result, index=left.index, name=res_name) - elif is_timedelta64_dtype(left): - from pandas.core.arrays import TimedeltaArray - - result = dispatch_to_extension_op(op, TimedeltaArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_timedelta64_dtype(right): - # We should only get here with non-scalar values for right - # upcast by maybe_upcast_for_op + elif is_timedelta64_dtype(right) or isinstance( + right, (ABCDatetimeArray, ABCDatetimeIndex) + ): + # We should only get here with td64 right with non-scalar values + # for right upcast by maybe_upcast_for_op assert not isinstance(right, (np.timedelta64, np.ndarray)) - result = op(left._values, right) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - return construct_result(left, result, index=left.index, name=res_name) - - elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)): - result = op(left._values, right) - return construct_result(left, result, index=left.index, name=res_name) + else: + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) - lvalues = left.values - rvalues = right - if isinstance(rvalues, (ABCSeries, ABCIndexClass)): - rvalues = rvalues._values + with np.errstate(all="ignore"): + result = na_op(lvalues, rvalues) - with np.errstate(all="ignore"): - result = na_op(lvalues, rvalues) - return construct_result( - left, result, index=left.index, name=res_name, dtype=None - ) + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. + return construct_result(left, result, index=left.index, name=res_name) wrapper.__name__ = op_name return wrapper diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 9a09ea8422b1f..22c1d5373372a 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -349,7 +349,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(df, op)(df) @@ -375,7 +377,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(s, op)(2) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index f7456d24ad6d3..0c0e8b0123c03 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -211,7 +211,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_add_series_with_extension_array(self, data): ser = pd.Series(data) - with pytest.raises(TypeError, match="cannot perform"): + with pytest.raises(TypeError, match="cannot perform|unsupported operand"): ser + data def test_divmod_series_array(self): From 10eacc4521941c31bb7217d928dbaa968a06a4e6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Aug 2019 20:48:00 -0700 Subject: [PATCH 02/11] dummy to force CI From 79d6abebc53bcb9363fe04ba0154c169d8c7942c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Aug 2019 09:09:00 -0700 Subject: [PATCH 03/11] Dummy to force CI From 9f5bf6e28f240643be92e3396073457ed0b1227b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 07:11:43 -0700 Subject: [PATCH 04/11] add types --- pandas/core/ops/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 30a2c4e33e769..843f12c20b07b 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -460,7 +460,7 @@ def masked_arith_op(x, y, op): # Dispatch logic -def should_extension_dispatch(left, right) -> bool: +def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: """ Identify cases where Series operation should use dispatch_to_extension_op. From 69ea8a29b3b298cd9822b374997b3fd4c06b8e5d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 09:15:36 -0700 Subject: [PATCH 05/11] BUG: fix Sparse reduction (#27890) * BUG: fix Sparse reduction --- doc/source/whatsnew/v0.25.1.rst | 2 +- pandas/core/arrays/sparse.py | 3 +++ pandas/tests/series/test_ufunc.py | 5 +---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index b97f45efbeae9..dfa216b1db56e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -133,7 +133,7 @@ Reshaping Sparse ^^^^^^ - +- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`) - - - diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 8aa83c3fbc37d..2234167fe0193 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1693,6 +1693,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): for sp_value, fv in zip(sp_values, fill_value) ) return arrays + elif is_scalar(sp_values): + # e.g. reductions + return sp_values return self._simple_new( sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index c024e9caba156..8144a3931b9b8 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -252,10 +252,7 @@ def __add__(self, other): "values", [ pd.array([1, 3, 2]), - pytest.param( - pd.array([1, 10, 0], dtype="Sparse[int]"), - marks=pytest.mark.xfail(resason="GH-27080. Bug in SparseArray"), - ), + pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), From 8b8bca08e8f44a02a023ad2285c35d0802920353 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 17:29:07 -0700 Subject: [PATCH 06/11] use should_extension_dispatch --- pandas/core/ops/__init__.py | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 843f12c20b07b..81eb0b3a88855 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -831,12 +831,6 @@ def wrapper(self, other, axis=None): if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") - elif ( - is_list_like(other) - and len(other) != len(self) - and not isinstance(other, (set, frozenset)) - ): - raise ValueError("Lengths must match") elif isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # TODO: make this treatment consistent across ops and classes. @@ -845,28 +839,7 @@ def wrapper(self, other, axis=None): if len(self) != len(other): raise ValueError("Lengths must match to compare") - if is_categorical_dtype(self): - # Dispatch to Categorical implementation; CategoricalIndex - # behavior is non-canonical GH#19513 - res_values = dispatch_to_extension_op(op, self, other) - - elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): - # Dispatch to DatetimeIndex to ensure identical - # Series/Index behavior - from pandas.core.arrays import DatetimeArray - - res_values = dispatch_to_extension_op(op, DatetimeArray(self), other) - - elif is_timedelta64_dtype(self): - from pandas.core.arrays import TimedeltaArray - - res_values = dispatch_to_extension_op(op, TimedeltaArray(self), other) - - elif is_extension_array_dtype(self) or ( - is_extension_array_dtype(other) and not is_scalar(other) - ): - # Note: the `not is_scalar(other)` condition rules out - # e.g. other == "category" + if should_extension_dispatch(self, other): res_values = dispatch_to_extension_op(op, self, other) elif is_scalar(other) and isna(other): From 57892a1e840f2f714aa2b224f77292af55c255c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2019 17:49:02 -0700 Subject: [PATCH 07/11] simplify comparison method --- pandas/core/ops/__init__.py | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 81eb0b3a88855..844098b9f36ef 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -23,7 +23,6 @@ is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, is_integer_dtype, @@ -32,7 +31,6 @@ is_period_dtype, is_scalar, is_timedelta64_dtype, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -758,7 +756,6 @@ def _comp_method_SERIES(cls, op, special): code duplication. """ op_name = _get_op_name(op, special) - masker = _gen_eval_kwargs(op_name).get("masker", False) def na_op(x, y): # TODO: @@ -777,31 +774,11 @@ def na_op(x, y): else: - # we want to compare like types - # we only want to convert to integer like if - # we are not NotImplemented, otherwise - # we would allow datetime64 (but viewed as i8) against - # integer comparisons - - # we have a datetime/timedelta and may need to convert - assert not needs_i8_conversion(x) - mask = None - if not is_scalar(y) and needs_i8_conversion(y): - mask = isna(x) | isna(y) - y = y.view("i8") - x = x.view("i8") - method = getattr(x, op_name, None) - if method is not None: - with np.errstate(all="ignore"): - result = method(y) - if result is NotImplemented: - return invalid_comparison(x, y, op) - else: - result = op(x, y) - - if mask is not None and mask.any(): - result[mask] = masker + with np.errstate(all="ignore"): + result = method(y) + if result is NotImplemented: + return invalid_comparison(x, y, op) return result From 4b9ce68dceff24f7ded742a0812e2fd212b8c4b6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 12:52:35 -0700 Subject: [PATCH 08/11] fix numpy warning --- pandas/core/ops/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 844098b9f36ef..e82b998338efb 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -603,7 +603,8 @@ def dispatch_to_extension_op(op, left, right): new_right = extract_array(right, extract_numpy=True) try: - res_values = op(new_left, new_right) + with np.errstate(all="ignore"): + res_values = op(new_left, new_right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. @@ -611,6 +612,10 @@ def dispatch_to_extension_op(op, left, right): "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) ) + if is_scalar(res_values): + # numpy returned False instead of operating pointwise + assert op.__name__ in ["eq", "ne", "ge", "gt", "le", "lt"], op + return invalid_comparison(new_left, new_right, op) return res_values From 69f7db2d52ea1a8ad48d66381a612f1e2c68601e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 20:30:03 -0700 Subject: [PATCH 09/11] fix deprecationwaring --- pandas/core/ops/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index e82b998338efb..6ba212123715a 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -6,6 +6,7 @@ import datetime import operator from typing import Any, Callable, Tuple +import warnings import numpy as np @@ -603,8 +604,10 @@ def dispatch_to_extension_op(op, left, right): new_right = extract_array(right, extract_numpy=True) try: - with np.errstate(all="ignore"): - res_values = op(new_left, new_right) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + with np.errstate(invalid="raise"): + res_values = op(new_left, new_right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. @@ -612,7 +615,7 @@ def dispatch_to_extension_op(op, left, right): "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) ) - if is_scalar(res_values): + except TypeError: # numpy returned False instead of operating pointwise assert op.__name__ in ["eq", "ne", "ge", "gt", "le", "lt"], op return invalid_comparison(new_left, new_right, op) From c2df30aad779b730508ebe1a0f272df05748f981 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2019 20:38:56 -0700 Subject: [PATCH 10/11] catch better --- pandas/core/ops/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 6ba212123715a..fd68f32515dec 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -606,8 +606,7 @@ def dispatch_to_extension_op(op, left, right): try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) - with np.errstate(invalid="raise"): - res_values = op(new_left, new_right) + res_values = op(new_left, new_right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. @@ -615,7 +614,7 @@ def dispatch_to_extension_op(op, left, right): "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) ) - except TypeError: + if isinstance(res_values, bool): # numpy returned False instead of operating pointwise assert op.__name__ in ["eq", "ne", "ge", "gt", "le", "lt"], op return invalid_comparison(new_left, new_right, op) From 97f75dcddc6728f40891ff7ea0872e3b3b5ef020 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2019 07:35:01 -0700 Subject: [PATCH 11/11] Fix check --- pandas/core/ops/__init__.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index dcaa92692cb2c..abd1a22d690dc 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -6,7 +6,6 @@ import datetime import operator from typing import Any, Callable, Tuple -import warnings import numpy as np @@ -22,7 +21,6 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, @@ -37,6 +35,7 @@ ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, + ABCExtensionArray, ABCIndex, ABCIndexClass, ABCSeries, @@ -604,9 +603,7 @@ def dispatch_to_extension_op(op, left, right): new_right = extract_array(right, extract_numpy=True) try: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - res_values = op(new_left, new_right) + res_values = op(new_left, new_right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. @@ -614,10 +611,6 @@ def dispatch_to_extension_op(op, left, right): "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) ) - if isinstance(res_values, bool): - # numpy returned False instead of operating pointwise - assert op.__name__ in ["eq", "ne", "ge", "gt", "le", "lt"], op - return invalid_comparison(new_left, new_right, op) return res_values @@ -811,7 +804,9 @@ def wrapper(self, other, axis=None): if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") - elif isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): + elif isinstance( + other, (np.ndarray, ABCExtensionArray, ABCIndexClass, ABCSeries) + ): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803