Skip to content

ENH: Add prod to masked_reductions #33442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 12, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).


.. ---------------------------------------------------------------------------
Expand Down
48 changes: 31 additions & 17 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
for missing values.
"""

from typing import Callable

import numpy as np

from pandas._libs import missing as libmissing
Expand All @@ -11,14 +13,19 @@
from pandas.core.nanops import check_below_min_count


def sum(
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
skipna: bool = True,
min_count: int = 0,
):
"""
Sum for 1D masked array.
Sum or product for 1D masked array.

Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
Expand All @@ -31,23 +38,21 @@ def sum(
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any():
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
if check_below_min_count(values.shape, None, min_count):
return libmissing.NA
return np.sum(values)
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
return func(values[~mask])
else:
return np.sum(values, where=~mask)
return func(values, where=~mask)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
"""
Reduction for 1D masked array.

Expand All @@ -63,18 +68,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
Whether to skip NA.
"""
if not skipna:
if mask.any():
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
if values.size:
return func(values)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
return func(values)
else:
subset = values[~mask]
if subset.size:
return func(values[~mask])
return func(subset)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
Expand All @@ -86,3 +88,15 @@ def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True):

def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
return _minmax(np.max, values=values, mask=mask, skipna=skipna)


def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
)
18 changes: 10 additions & 8 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,9 +696,17 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)
result = op(data, mask, skipna=skipna, **kwargs)

Copy link
Contributor

@jreback jreback Apr 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you try using maybe_cast_result_dtype here (you will have to add the prod operation in side that)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand, that should not be needed. Numpy should give the desired result for booleans.

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did you need to add this back?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was getting a failure on some builds for \tests\arrays\boolean\test_reduction.py where we're checking that the product has int dtype so it might be needed:

        elif op == "prod":
>           assert isinstance(getattr(s, op)(), np.int64)
E           AssertionError: assert False

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, then maybe the test needs to be edited. In any case, we should investigate why it is failing / what we should be expecting.

From a quick test, it seems that numpy returns int64:

In [6]: np.array([], dtype="bool").prod()                                                                                                                                                                          
Out[6]: 1

In [7]: type(_)                                                                                                                                                                                                    
Out[7]: numpy.int64

In [8]: np.array([True, False], dtype="bool").prod()                                                                                                                                                               
Out[8]: 0

In [9]: type(_)                                                                                                                                                                                                    
Out[9]: numpy.int64

So I think we should follow that behaviour here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int_result = np.int64(result)
if int_result == result:
result = int_result

return result

# coerce to a nan-aware float if needed
if self._hasna:
Expand All @@ -710,12 +718,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if np.isnan(result):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
8 changes: 1 addition & 7 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -581,12 +581,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in ["any", "all"]:
pass

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op in {"sum", "min", "max"}:
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,12 @@ def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
if np.isnan(expected):
if skipna:
expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna)
elif s.isna().any():
expected = pd.NA
else:
expected = getattr(s.astype("int64"), op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)


Expand Down