Skip to content

Add __array_ufunc__ to Series / Array #23293

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 60 commits into from
Jul 1, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
2fffac3
Add __array_ufunc__ to Series / Array
jorisvandenbossche Oct 23, 2018
c5a4664
expand IntegerArray.__array_ufunc__
jorisvandenbossche Oct 23, 2018
dd332a4
fix Series.__array_ufunc__ and consolidate dispatch
jorisvandenbossche Oct 23, 2018
71c058e
test Series array_ufunc fallback to numpy array for DecimalArray
jorisvandenbossche Oct 23, 2018
a0d11d9
fix import
jorisvandenbossche Oct 23, 2018
4cfeb9b
first dispatch before getting underlying values (eg for Series[Period…
jorisvandenbossche Oct 23, 2018
607f8a6
fix Categorical: disallow all ufunc apart from ops
jorisvandenbossche Oct 23, 2018
c4fcae7
simplify calling ufunc on underlying values
jorisvandenbossche Oct 23, 2018
65dea1b
fix categorical not existing ops
jorisvandenbossche Oct 23, 2018
134df14
np.positive not available for older numpy versions
jorisvandenbossche Oct 24, 2018
5239b70
fix multiple return values
jorisvandenbossche Oct 24, 2018
3d91885
skip IntegerArray tests for older numpy versions
jorisvandenbossche Oct 24, 2018
429f15c
also deal with no return value
jorisvandenbossche Oct 24, 2018
41f4158
clean-up debugging left-over
jorisvandenbossche Oct 24, 2018
0d6a663
TST: Additional tests for Series ufuncs
TomAugspurger Jun 19, 2019
8f46391
fixup release note
TomAugspurger Jun 19, 2019
44e3c7e
fixups
TomAugspurger Jun 19, 2019
e179913
remove stale comment
TomAugspurger Jun 19, 2019
27208c1
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 19, 2019
0b1e745
xfail ufunc(series, index)
TomAugspurger Jun 20, 2019
9be1dff
32-bit compat
TomAugspurger Jun 20, 2019
775c2ef
fixup
TomAugspurger Jun 20, 2019
4d7f249
wip
TomAugspurger Jun 20, 2019
0b359d7
fixup release note
TomAugspurger Jun 19, 2019
bbbf269
Merge remote-tracking branch 'upstream/master' into series-array-ufunc
TomAugspurger Jun 21, 2019
64d8908
more
TomAugspurger Jun 21, 2019
d1788b0
lint
TomAugspurger Jun 21, 2019
ef5d508
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 21, 2019
fe0ee4e
Merge branch 'series-array-ufunc' into jorisvandenbossche-array-ufunc
TomAugspurger Jun 21, 2019
971e347
fixup! more
TomAugspurger Jun 21, 2019
95e8aef
remove dead code
TomAugspurger Jun 21, 2019
7bfd584
todos
TomAugspurger Jun 21, 2019
06e5739
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 21, 2019
feee015
remove compat
TomAugspurger Jun 21, 2019
3702b9b
object dtype tests
TomAugspurger Jun 21, 2019
a0f84ed
wip
TomAugspurger Jun 21, 2019
d83fe7a
doc, types
TomAugspurger Jun 21, 2019
edad466
compat
TomAugspurger Jun 22, 2019
e4ae8dc
fixups
TomAugspurger Jun 23, 2019
db60f6c
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 24, 2019
a9bd6ef
added matmul
TomAugspurger Jun 24, 2019
1a8b807
start docs
TomAugspurger Jun 24, 2019
0b0466d
compat
TomAugspurger Jun 24, 2019
1f67866
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 26, 2019
d3089bd
ignore for numpydev
TomAugspurger Jun 27, 2019
6e770e8
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 27, 2019
15a3fb1
handle reduce
TomAugspurger Jun 27, 2019
b5e7f45
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 27, 2019
4f4bd93
update
TomAugspurger Jun 27, 2019
5dbff49
fixups
TomAugspurger Jun 29, 2019
b623be2
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jun 29, 2019
2237233
raise for reduce
TomAugspurger Jun 29, 2019
5b5c547
more tests
TomAugspurger Jun 29, 2019
10bc2cc
more tests
TomAugspurger Jun 29, 2019
5380b77
35 compat
TomAugspurger Jun 29, 2019
9f4d110
remove old test
TomAugspurger Jun 30, 2019
6c15ee7
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jul 1, 2019
ab48bd8
fixup
TomAugspurger Jul 1, 2019
30fced8
Merge remote-tracking branch 'upstream/master' into jorisvandenbossch…
TomAugspurger Jul 1, 2019
7486d26
Fixups
TomAugspurger Jul 1, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numbers
import sys
import warnings
import copy
Expand All @@ -10,6 +11,7 @@
from pandas.compat import set_function_name

from pandas.core import nanops
from pandas.core import ops
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -293,6 +295,44 @@ def __array__(self, dtype=None):
"""
return self._coerce_to_ndarray()

_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):

out = kwargs.get('out', ())

for x in inputs + out:
if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
return NotImplemented

# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs)
if result is not None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why you use a different sentinel value None rather than Python's standard NotImplemented?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a specific reason, just that I didn't return anything in the maybe_dispatch_ufunc_to_dunder_op, which means the result is None (and our own dunder ops will never return None).
But can make it more explicit.

return result

if (method == '__call__'
and ufunc.signature is None
and ufunc.nout == 1):
# only supports IntegerArray for now
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should check for isinstance(.., IntegerArray)?

args = [a._data for a in inputs]
masks = [a._mask for a in inputs]
result = ufunc(*args, **kwargs)
mask = np.logical_or.reduce(masks)
if result.dtype.kind in ('i', 'u'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blank line

return IntegerArray(result, mask)
else:
result[mask] = np.nan
return result

# fall back to array for other ufuncs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be a generic function that one can just call (e.g. put it in ops.py and pass in self, ufuc, method, inputs, kwargs)? IOW is this called anywhere else

inputs = tuple(
np.array(x) if isinstance(x, type(self)) else x
for x in inputs
)
return np.array(self).__array_ufunc__(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably want to call the public ufunc again here, e.g., return getattr(ufunc, method)(*inputs, **kwargs). Otherwise you don't invoke the dispatching mechanism again.

See NDArrayOperatorsMixin for an example implementation.

ufunc, method, *inputs, **kwargs)

def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
Expand Down
44 changes: 6 additions & 38 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from pandas._libs import lib
import pandas.core.algorithms as algos
import pandas.io.formats.printing as printing
import pandas.core.ops as ops


# ----------------------------------------------------------------------------
Expand Down Expand Up @@ -1447,44 +1448,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
return NotImplemented

special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv',
'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'}
if compat.PY2:
special.add('div')
aliases = {
'subtract': 'sub',
'multiply': 'mul',
'floor_divide': 'floordiv',
'true_divide': 'truediv',
'power': 'pow',
'remainder': 'mod',
'divide': 'div',
'equal': 'eq',
'not_equal': 'ne',
'less': 'lt',
'less_equal': 'le',
'greater': 'gt',
'greater_equal': 'ge',
}

flipped = {
'lt': '__gt__',
'le': '__ge__',
'gt': '__lt__',
'ge': '__le__',
'eq': '__eq__',
'ne': '__ne__',
}

op_name = ufunc.__name__
op_name = aliases.get(op_name, op_name)

if op_name in special and kwargs.get('out') is None:
if isinstance(inputs[0], type(self)):
return getattr(self, '__{}__'.format(op_name))(inputs[1])
else:
name = flipped.get(op_name, '__r{}__'.format(op_name))
return getattr(self, name)(inputs[0])
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs)
if result is not None:
return result

if len(inputs) == 1:
# No alignment necessary.
Expand Down
43 changes: 43 additions & 0 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2146,3 +2146,46 @@ def wrapper(self, other):

wrapper.__name__ = op_name
return wrapper


def maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs):
    """
    Dispatch a ufunc call to the equivalent dunder op defined on ``self``.

    Parameters
    ----------
    self : object
        The array-like whose arithmetic/comparison dunder methods should
        implement the ufunc.
    ufunc : numpy.ufunc
    method : str
        The ufunc method being invoked ('__call__', 'reduce', ...).
    inputs : tuple
        The ufunc operands; ``self`` is one of them.
    kwargs : dict
        Keyword arguments passed to the ufunc.

    Returns
    -------
    The result of the dispatched dunder op, or None when the call is not
    a plain binary/comparison op that can be dispatched (callers treat
    None as "not handled here").
    """
    # ufunc names whose spelling differs from the dunder-op name.
    aliases = {
        'subtract': 'sub',
        'multiply': 'mul',
        'floor_divide': 'floordiv',
        'true_divide': 'truediv',
        'power': 'pow',
        'remainder': 'mod',
        'divide': 'div',
        'equal': 'eq',
        'not_equal': 'ne',
        'less': 'lt',
        'less_equal': 'le',
        'greater': 'gt',
        'greater_equal': 'ge',
    }
    op_name = aliases.get(ufunc.__name__, ufunc.__name__)

    # Ops for which ``self`` provides dunder methods we can delegate to.
    dispatchable = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv',
                    'truediv', 'divmod', 'eq', 'ne', 'lt', 'gt', 'le',
                    'ge', 'remainder'}
    if compat.PY2:
        dispatchable.add('div')

    if (method != '__call__' or op_name not in dispatchable
            or kwargs.get('out') is not None):
        # Not a plain binary-op call we know how to dispatch.
        return None

    if isinstance(inputs[0], type(self)):
        # ``self`` is the left operand -> use the forward op.
        return getattr(self, '__{}__'.format(op_name))(inputs[1])

    # ``self`` is the right operand -> use the reflected op; comparisons
    # have no __r*__ form, so they are flipped instead.
    flipped = {
        'lt': '__gt__',
        'le': '__ge__',
        'gt': '__lt__',
        'ge': '__le__',
        'eq': '__eq__',
        'ne': '__ne__',
    }
    dunder = flipped.get(op_name, '__r{}__'.format(op_name))
    return getattr(self, dunder)(inputs[0])
26 changes: 24 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,29 @@ def view(self, dtype=None):
return self._constructor(self._values.view(dtype),
index=self.index).__finalize__(self)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally we recommend doing a type check and returning NotImplemented if you see an unknown type . If __array_ufunc__ always tries to do the operation, you can get non-commutative operations, e.g., a + b != b + a.

For example, consider another library like xarray that implements its own version of a pandas.Series. Series + DataArray and DataArray + Series should consistently give either Series, DataArray or an error (probably the safest choice) -- they shouldn't just return a result of the same type as the first argument.

This might even be an issue for Series + DataFrame or np.add(Series, DataFrame) -- you probably want to delegate to the DataFrame in that case, not the Series.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thinking was: the __array_ufunc__ of the underlying values that we call here already does the check, so I don't need to repeat it at this level.
But indeed, that will probably mess up such cases.

The problem is that for now the __array_ufunc__ uses our own implementations of all arithmetic/comparison operators, so I should somehow list here everything that all those different ops accept, which might be a bit annoying.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt you need to list types dependent on the operators -- it's probably enough to list the types that can be used in valid operations with pandas.Series.

It's true that you would possibly need to recurse into extension arrays to figure this out all the valid types, and it may not be easy to enumerate all scalar types, e.g., if IPAddressExtensionArray can handle arithmetic with IPAddress, how could pandas.Series know about that?

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Oct 23, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think it is mainly the scalar values that might be hard to list (apart from those, we would probably accept Series, Index, array, and a bunch of array-likes (list, tuple, ..) for the Series case).
But eg a Series with datetimelike values can already handle our own Timestamp, Timedelta and offset objects, numpy scalars, datetime standard library scalars, integer, ..), and as I understand those should all be listed here?
(but anyhow, if we implement an __array_ufunc__ for eg PeriodArray, we would need to do that there as well, so need to make this list in any case)

But what you mention about custom ExtensionArrays is then even another problem, that we don't know what they would accept. By "recurse into extension arrays", would you mean something like checking the ExtensionArray._HANDLED_TYPES (and then expecting EA authors to set this specific property)? Or some other way to inspect?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "recurse into extension arrays", would you mean something like checking the ExtensionArray._HANDLED_TYPES (and then expecting EA authors to set this specific property)? Or some other way to inspect?

Yeah, that's about the best I can imagine. But I agree, it doesn't seem easy! There's no general way to detect what types another object's __array_ufunc__ method can handle without actually calling it.

It is probably reasonable to trade-off correctness for generality here, but we should be cognizant that we are making this choice. Alternatively, we might maintain an explicit "blacklist" of types which pandas can't handle, though this gets tricky with non-required imports (e.g., dask and xarray objects).

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Oct 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shoyer in light of the above, how would you define the handled types for an object dtyped Series, in which you in principle can put any python object you want? For numpy arrays, things like addition work element-wise for object dtype, so the following works:

In [101]: class A():
     ...:     def __init__(self, x):
     ...:         self.x = x
     ...:     def __add__(self, other):
     ...:         return A(self.x + other.x)
     ...:     def __repr__(self):
     ...:         return "A(x={})".format(self.x)
     ...:          

In [102]: a = np.array([A(1), A(2)], dtype=object)

In [103]: np.add(a, A(3))
Out[103]: array([A(x=4), A(x=5)], dtype=object)

In [104]: np.add(pd.Series(a), A(3))
Out[104]: 
0    A(x=4)
1    A(x=5)
dtype: object

Doing a more strict check for allowed types, then the above will no longer work.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is numpy actually dealing with the above? It makes an exception for object dtype?

So two ideas that might be useful here:

  • We can make an exception for object dtype, and in that case allow more unknown objects
  • We could check for objects that implement __array_ufunc__ themselves? If they have the attribute, and are not known to us (not in our _HANDLED_TYPES list, e.g. dask or xarray objects), we return NotImplemented, otherwise we pass through.
    That at least solves the problem for the more known objects that will probably have the interface, but of course not yet for custom container types that do not implement it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are good questions. In principle NumPy follows the rules described at http://www.numpy.org/neps/nep-0013-ufunc-overrides.html#behavior-in-combination-with-python-s-binary-operations but I think there is something missing for unknown scalars / object arrays...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See numpy/numpy#12258 for a description of how NumPy does things.

Copy link
Contributor

@TomAugspurger TomAugspurger Jun 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make sure I understand this thread: @shoyer kicked it off by saying

Normally we recommend doing a type check and returning NotImplemented if you see an unknown type

We can handle the scalar types of EAs by requiring that they implement a _HANDLED_TYPES, which would include their scalar type. Then if all the (unboxed from Series / Index) inputs are in handled types, we proceed.

object-dtype complicates things. Did we decide what to do there? Skip the check?

inputs = tuple(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a few comments here on what you are doing

x._values if isinstance(x, type(self)) else x
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: could move type(self) outside this expression, so that it's only evaluated once.

for x in inputs
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.g. I think you are calling the same here (as i commented above)

# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs)
if result is not None:
return result

if hasattr(self._values, '__array_ufunc__'):
result = self._values.__array_ufunc__(
ufunc, method, *inputs, **kwargs)
else:
result = np.array(self._values).__array_ufunc__(
ufunc, method, *inputs, **kwargs)
if result is NotImplemented:
raise TypeError("The '{0}' operation is not supported for "
"dtype {1}.".format(ufunc.__name__, self.dtype))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of all this (checking for __array_ufunc__ and then the branch/error message), you could just reinvoke the public ufunc again (as I suggested for IntegerArray). That would also make me more confidence that the proper delegation is happening.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, that sounds like a better idea :)

return self._constructor(result, index=self.index,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that not every ufunc returns one result, so you should either handle or explicitly error for the other cases. See NDArrayOperatorsMixin for how to handle this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, do you have the same problem with the current implementation of __array_wrap__ ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To answer my own question: yes, at least for ufuncs that return a scalar:

In [33]: s = pd.Series([1, 2, 3])

In [34]: np.add.reduce(s)
Out[34]: 
0    6
1    6
2    6
dtype: int64

In [35]: pd.__version__
Out[35]: '0.23.4'

But for ufuncs that return multiple values it seems to work with pandas master (wrapping each return element I suppose).

copy=False).__finalize__(self)

def __array__(self, result=None):
"""
the array interface, return my values
Expand All @@ -640,10 +663,9 @@ def __array_prepare__(self, result, context=None):
"""
Gets called prior to a ufunc
"""

# nice error message for non-ufunc types
if (context is not None and
not isinstance(self._values, (np.ndarray, ABCSparseArray))):
not isinstance(self._values, (np.ndarray, ExtensionArray))):
obj = context[1][0]
raise TypeError("{obj} with dtype {dtype} cannot perform "
"the numpy op {op}".format(
Expand Down
64 changes: 64 additions & 0 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,70 @@ def test_astype_nansafe():
arr.astype('uint32')


@pytest.mark.parametrize(
    'ufunc', [np.abs, np.positive, np.negative])
def test_ufuncs_single_int(ufunc):
    # Unary ufuncs with an integer-valued result should preserve the
    # IntegerArray dtype (mask included), both on the bare array and on
    # a Series wrapping it.
    arr = integer_array([1, 2, -3, np.nan])
    as_float = arr.astype(float)

    result = ufunc(arr)
    tm.assert_extension_array_equal(result, integer_array(ufunc(as_float)))

    result = ufunc(pd.Series(arr))
    tm.assert_series_equal(
        result, pd.Series(integer_array(ufunc(as_float))))


@pytest.mark.parametrize(
    'ufunc', [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    # Float-producing unary ufuncs cannot stay masked-integer: the result
    # should match applying the ufunc to the float-cast data (NaN where
    # masked).  errstate silences warnings from the negative/NaN entries.
    arr = integer_array([1, 2, -3, np.nan])
    ser = pd.Series(arr)

    with np.errstate(invalid='ignore'):
        tm.assert_numpy_array_equal(ufunc(arr), ufunc(arr.astype(float)))
        tm.assert_series_equal(ufunc(ser), ufunc(ser.astype(float)))


@pytest.mark.parametrize(
    'ufunc', [np.add, np.subtract])
def test_ufuncs_binary_int(ufunc):
    # Binary ufuncs mixing IntegerArray with another IntegerArray, a
    # plain ndarray, or a scalar (in either operand position) should
    # match the float computation recast as an IntegerArray.
    a = integer_array([1, 2, -3, np.nan])

    def check(left, right):
        result = ufunc(left, right)
        floored = tuple(
            op.astype(float) if isinstance(op, type(a)) else op
            for op in (left, right)
        )
        tm.assert_extension_array_equal(
            result, integer_array(ufunc(*floored)))

    # two IntegerArrays
    check(a, a)

    # IntegerArray with numpy array, both operand orders
    arr = np.array([1, 2, 3, 4])
    check(a, arr)
    check(arr, a)

    # IntegerArray with scalar, both operand orders
    check(a, 1)
    check(1, a)


def test_ufunc_fallback():
    # ufunc methods other than __call__ (here ``reduce``) fall back to
    # the plain ndarray path, so the masked entry becomes NaN and
    # propagates through the reduction.
    masked = integer_array([1, 2, -3, np.nan])
    total = np.add.reduce(masked)
    assert pd.isna(total)


# TODO(jreback) - these need testing / are broken

# shift
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,3 +364,11 @@ def test_divmod_array(reverse, expected_div, expected_mod):

tm.assert_extension_array_equal(div, expected_div)
tm.assert_extension_array_equal(mod, expected_mod)


def test_ufunc_fallback(data):
    # Applying a ufunc to a Series backed by the decimal extension array
    # should match applying it to the object-dtype cast of the same
    # values, and the (non-default) index must be preserved.
    index = range(3, 8)
    values = data[:5]
    ser = pd.Series(values, index=index)

    expected = pd.Series(np.abs(values.astype(object)), index=index)
    tm.assert_series_equal(np.abs(ser), expected)