Skip to content

ENH: Add prod to masked_reductions #33442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 12, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).


.. ---------------------------------------------------------------------------
Expand Down
48 changes: 31 additions & 17 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
for missing values.
"""

from typing import Callable

import numpy as np

from pandas._libs import missing as libmissing
Expand All @@ -11,14 +13,19 @@
from pandas.core.nanops import check_below_min_count


def sum(
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
skipna: bool = True,
min_count: int = 0,
):
"""
Sum for 1D masked array.
Sum or product for 1D masked array.

Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
Expand All @@ -31,23 +38,33 @@ def sum(
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any():
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
if check_below_min_count(values.shape, None, min_count):
return libmissing.NA
return np.sum(values)
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
return func(values[~mask])
else:
return np.sum(values, where=~mask)
return func(values, where=~mask)


def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
"""
Reduction for 1D masked array.

Expand All @@ -63,18 +80,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
Whether to skip NA.
"""
if not skipna:
if mask.any():
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
if values.size:
return func(values)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
return func(values)
else:
subset = values[~mask]
if subset.size:
return func(values[~mask])
return func(subset)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
Expand Down
10 changes: 2 additions & 8 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.missing import isna

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
Expand Down Expand Up @@ -686,7 +686,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -700,12 +700,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if np.isnan(result):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
9 changes: 1 addition & 8 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
import pandas.core.common as com
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -557,7 +556,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -576,12 +575,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in ["any", "all"]:
pass

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/boolean/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
if op == "sum":
assert isinstance(getattr(s, op)(), np.int_)
elif op == "prod":
assert isinstance(getattr(s, op)(), np.int64)
assert isinstance(getattr(s, op)(), np.int_)
elif op in ("min", "max"):
assert isinstance(getattr(s, op)(), np.bool_)
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op in {"sum", "min", "max"}:
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,10 @@ def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
if np.isnan(expected):
if not skipna and s.isna().any():
expected = pd.NA
else:
expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)


Expand Down