diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 82d4b5e34e4f8..8bd271815549d 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -676,11 +676,11 @@ similar to an ndarray: # only show the first 5 rows df[:5].T +.. _dsintro.numpy_interop: + DataFrame interoperability with NumPy functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _dsintro.numpy_interop: - Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions can be used with no issues on Series and DataFrame, assuming the data within are numeric: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index da593bcb6e923..abbb6feef6056 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -920,3 +920,29 @@ filling missing values beforehand. A similar situation occurs when using Series or DataFrame objects in ``if`` statements, see :ref:`gotchas.truth`. + +NumPy ufuncs +------------ + +:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs +work with ``NA``, and generally return ``NA``: + +.. ipython:: python + + np.log(pd.NA) + np.add(pd.NA, 1) + +.. warning:: + + Currently, ufuncs involving an ndarray and ``NA`` will return an + object-dtype array filled with NA values. + + .. ipython:: python + + a = np.array([1, 2, 3]) + np.greater(a, pd.NA) + + The return type here may change to return a different array type + in the future. + +See :ref:`dsintro.numpy_interop` for more on ufuncs. 
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index f1cfa0978c3a0..afaf9115abfd3 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -14,6 +14,7 @@ from pandas._libs.tslibs.np_datetime cimport ( get_timedelta64_value, get_datetime64_value) from pandas._libs.tslibs.nattype cimport ( checknull_with_nat, c_NaT as NaT, is_null_datetimelike) +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit @@ -290,16 +291,29 @@ cdef inline bint is_null_period(v): # Implementation of NA singleton -def _create_binary_propagating_op(name, divmod=False): +def _create_binary_propagating_op(name, is_divmod=False): def method(self, other): if (other is C_NA or isinstance(other, str) - or isinstance(other, (numbers.Number, np.bool_))): - if divmod: + or isinstance(other, (numbers.Number, np.bool_)) + or isinstance(other, np.ndarray) and not other.shape): + # Need the other.shape clause to handle NumPy scalars, + # since we do a setitem on `out` below, which + # won't work for NumPy scalars. 
+ if is_divmod: return NA, NA else: return NA + elif isinstance(other, np.ndarray): + out = np.empty(other.shape, dtype=object) + out[:] = NA + + if is_divmod: + return out, out.copy() + else: + return out + return NotImplemented method.__name__ = name @@ -369,8 +383,8 @@ class NAType(C_NAType): __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") __mod__ = _create_binary_propagating_op("__mod__") __rmod__ = _create_binary_propagating_op("__rmod__") - __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True) - __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True) + __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) + __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) # __lshift__ and __rshift__ are not implemented __eq__ = _create_binary_propagating_op("__eq__") @@ -397,6 +411,8 @@ class NAType(C_NAType): return type(other)(1) else: return NA + elif isinstance(other, np.ndarray): + return np.where(other == 0, other.dtype.type(1), NA) return NotImplemented @@ -408,6 +424,8 @@ class NAType(C_NAType): return other else: return NA + elif isinstance(other, np.ndarray): + return np.where((other == 1) | (other == -1), other, NA) return NotImplemented @@ -440,6 +458,31 @@ class NAType(C_NAType): __rxor__ = __xor__ + __array_priority__ = 1000 + _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + types = self._HANDLED_TYPES + (NAType,) + for x in inputs: + if not isinstance(x, types): + return NotImplemented + + if method != "__call__": + raise ValueError(f"ufunc method '{method}' not supported for NA") + result = maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is NotImplemented: + # For a NumPy ufunc that's not a binop, like np.logaddexp + index = [i for i, x in enumerate(inputs) if x is NA][0] + result = np.broadcast_arrays(*inputs)[index] + if result.ndim == 0: + result 
= result.item() + if ufunc.nout > 1: + result = (NA,) * ufunc.nout + + return result + C_NA = NAType() # C-visible NA = C_NA # Python-visible diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx new file mode 100644 index 0000000000000..f6ecef2038cf3 --- /dev/null +++ b/pandas/_libs/ops_dispatch.pyx @@ -0,0 +1,94 @@ +DISPATCHED_UFUNCS = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + "or", + "xor", + "and", +} +UFUNC_ALIASES = { + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", + "bitwise_or": "or", + "bitwise_and": "and", + "bitwise_xor": "xor", +} + +# For op(., Array) -> Array.__r{op}__ +REVERSED_NAMES = { + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": "__eq__", + "ne": "__ne__", +} + + +def maybe_dispatch_ufunc_to_dunder_op( + object self, object ufunc, str method, *inputs, **kwargs +): + """ + Dispatch a ufunc to the equivalent dunder method. + + Parameters + ---------- + self : ArrayLike + The array whose dunder method we dispatch to + ufunc : Callable + A NumPy ufunc + method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} + inputs : ArrayLike + The input arrays. + kwargs : Any + The additional keyword arguments, e.g. ``out``. 
+ + Returns + ------- + result : Any + The result of applying the ufunc + """ + # DISPATCHED_UFUNCS has the ufuncs we dispatch to the dunder op on + + op_name = ufunc.__name__ + op_name = UFUNC_ALIASES.get(op_name, op_name) + + def not_implemented(*args, **kwargs): + return NotImplemented + + if (method == "__call__" + and op_name in DISPATCHED_UFUNCS + and kwargs.get("out") is None): + if isinstance(inputs[0], type(self)): + name = f"__{op_name}__" + return getattr(self, name, not_implemented)(inputs[1]) + else: + name = REVERSED_NAMES.get(op_name, f"__r{op_name}__") + result = getattr(self, name, not_implemented)(inputs[0]) + return result + else: + return NotImplemented diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 1b868f7c10602..f51d71d5507a0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,6 +10,7 @@ import numpy as np from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype @@ -31,7 +32,6 @@ ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( _arith_doc_FRAME, diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index f35279378dc65..61a3032c7a02c 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,12 +1,10 @@ """ Functions for defining unary operations. 
""" -from typing import Any, Callable, Union +from typing import Any, Union import numpy as np -from pandas._typing import ArrayLike - from pandas.core.dtypes.common import ( is_datetime64_dtype, is_extension_array_dtype, @@ -126,94 +124,3 @@ def dispatch_to_extension_op( # on the ExtensionArray res_values = op(left, right) return res_values - - -def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any -): - """ - Dispatch a ufunc to the equivalent dunder method. - - Parameters - ---------- - self : ArrayLike - The array whose dunder method we dispatch to - ufunc : Callable - A NumPy ufunc - method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} - inputs : ArrayLike - The input arrays. - kwargs : Any - The additional keyword arguments, e.g. ``out``. - - Returns - ------- - result : Any - The result of applying the ufunc - """ - # special has the ufuncs we dispatch to the dunder op on - special = { - "add", - "sub", - "mul", - "pow", - "mod", - "floordiv", - "truediv", - "divmod", - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "remainder", - "matmul", - "or", - "xor", - "and", - } - aliases = { - "subtract": "sub", - "multiply": "mul", - "floor_divide": "floordiv", - "true_divide": "truediv", - "power": "pow", - "remainder": "mod", - "divide": "div", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - "bitwise_or": "or", - "bitwise_and": "and", - "bitwise_xor": "xor", - } - - # For op(., Array) -> Array.__r{op}__ - flipped = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", - } - - op_name = ufunc.__name__ - op_name = aliases.get(op_name, op_name) - - def not_implemented(*args, **kwargs): - return NotImplemented - - if method == "__call__" and op_name in special and kwargs.get("out") is None: - if isinstance(inputs[0], type(self)): - name = 
f"__{op_name}__" - return getattr(self, name, not_implemented)(inputs[1]) - else: - name = flipped.get(op_name, f"__r{op_name}__") - return getattr(self, name, not_implemented)(inputs[0]) - else: - return NotImplemented diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 384bf171738bc..a72378e02bec6 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -58,12 +58,6 @@ def test_comparison_ops(): assert (NA >= other) is NA assert (NA < other) is NA assert (NA <= other) is NA - - if isinstance(other, (np.int64, np.bool_)): - # for numpy scalars we get a deprecation warning and False as result - # for equality or error for larger/lesser than - continue - assert (other == NA) is NA assert (other != NA) is NA assert (other > NA) is NA @@ -87,9 +81,17 @@ def test_comparison_ops(): np.float_(-0), ], ) -def test_pow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_pow_special(value, asarray): + if asarray: + value = np.array([value]) result = pd.NA ** value - assert isinstance(result, type(value)) + + if asarray: + result = result[0] + else: + # this assertion isn't possible for ndarray. 
+ assert isinstance(result, type(value)) assert result == 1 @@ -108,12 +110,20 @@ def test_pow_special(value): np.float_(-1), ], ) -def test_rpow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_special(value, asarray): + if asarray: + value = np.array([value]) result = value ** pd.NA - assert result == value - if not isinstance(value, (np.float_, np.bool_, np.int_)): + + if asarray: + result = result[0] + elif not isinstance(value, (np.float_, np.bool_, np.int_)): + # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) + assert result == value + def test_unary_ops(): assert +NA is NA @@ -162,6 +172,19 @@ def test_logical_not(): assert ~NA is NA +@pytest.mark.parametrize( + "shape", [(3,), (3, 3), (1, 2, 3)], +) +def test_arithmetic_ndarray(shape, all_arithmetic_functions): + op = all_arithmetic_functions + a = np.zeros(shape) + if op.__name__ == "pow": + a += 5 + result = op(pd.NA, a) + expected = np.full(a.shape, pd.NA, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_is_scalar(): assert is_scalar(NA) is True @@ -177,6 +200,55 @@ def test_series_isna(): tm.assert_series_equal(s.isna(), expected) +def test_ufunc(): + assert np.log(pd.NA) is pd.NA + assert np.add(pd.NA, 1) is pd.NA + result = np.divmod(pd.NA, 1) + assert result[0] is pd.NA and result[1] is pd.NA + + result = np.frexp(pd.NA) + assert result[0] is pd.NA and result[1] is pd.NA + + +def test_ufunc_raises(): + with pytest.raises(ValueError, match="ufunc method 'at'"): + np.log.at(pd.NA, 0) + + +def test_binary_input_not_dunder(): + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + result = np.logaddexp(a, pd.NA) + tm.assert_numpy_array_equal(result, expected) + + result = np.logaddexp(pd.NA, a) + tm.assert_numpy_array_equal(result, expected) + + # all NA, multiple inputs + assert np.logaddexp(pd.NA, pd.NA) is pd.NA + + result = np.modf(pd.NA, pd.NA) + assert len(result) == 2 
+ assert all(x is pd.NA for x in result) + + +def test_divmod_ufunc(): + # binary in, binary out. + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + + result = np.divmod(a, pd.NA) + assert isinstance(result, tuple) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + result = np.divmod(pd.NA, a) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + def test_integer_hash_collision_dict(): # GH 30013 result = {NA: "foo", hash(NA): "bar"} diff --git a/setup.py b/setup.py index 489a9602511e8..076b77bf5d4df 100755 --- a/setup.py +++ b/setup.py @@ -596,6 +596,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): }, "_libs.reduction": {"pyxfile": "_libs/reduction"}, "_libs.ops": {"pyxfile": "_libs/ops"}, + "_libs.ops_dispatch": {"pyxfile": "_libs/ops_dispatch"}, "_libs.properties": {"pyxfile": "_libs/properties"}, "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []}, "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]},