Skip to content

ENH: Add prod to masked_reductions #33442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 12, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).


.. ---------------------------------------------------------------------------
Expand Down
48 changes: 31 additions & 17 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
for missing values.
"""

from typing import Callable

import numpy as np

from pandas._libs import missing as libmissing
Expand All @@ -11,14 +13,19 @@
from pandas.core.nanops import check_below_min_count


def sum(
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
skipna: bool = True,
min_count: int = 0,
):
"""
Sum for 1D masked array.
Sum or product for 1D masked array.

Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
Expand All @@ -31,23 +38,33 @@ def sum(
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any():
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
if check_below_min_count(values.shape, None, min_count):
return libmissing.NA
return np.sum(values)
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
return func(values[~mask])
else:
return np.sum(values, where=~mask)
return func(values, where=~mask)


def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
"""
Reduction for 1D masked array.

Expand All @@ -63,18 +80,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
Whether to skip NA.
"""
if not skipna:
if mask.any():
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
if values.size:
return func(values)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
return func(values)
else:
subset = values[~mask]
if subset.size:
return func(values[~mask])
return func(subset)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
Expand Down
16 changes: 7 additions & 9 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.cast import astype_nansafe, maybe_cast_result_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_extension_array_dtype,
Expand Down Expand Up @@ -686,9 +686,13 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)
result = op(data, mask, skipna=skipna, **kwargs)
dtype = maybe_cast_result_dtype(dtype=data.dtype, how=name)
if notna(result) and (dtype != result.dtype):
result = result.astype(dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am still not sure we actually need to do this. We could also choose to follow numpy's behaviour to return platform int (any idea what we do for "bool" dtype?)

BTW, this should also change the result for sum, so this was not tested?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche I think you're right and it was actually the test that needed to change here (np.int64 -> np.int_)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, in any case that's what I did for sum, I see now (so if we decide on following numpy vs always returning int64, we should do it for both sum and prod)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(but I am fine with following numpy)

return result

# coerce to a nan-aware float if needed
if self._hasna:
Expand All @@ -700,12 +704,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if np.isnan(result):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
9 changes: 1 addition & 8 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
import pandas.core.common as com
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -557,7 +556,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -576,12 +575,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in ["any", "all"]:
pass

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
1 change: 1 addition & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
(np.dtype(np.bool), "add"): np.dtype(np.int64),
(np.dtype(np.bool), "cumsum"): np.dtype(np.int64),
(np.dtype(np.bool), "sum"): np.dtype(np.int64),
(np.dtype(np.bool), "prod"): np.dtype(np.int64),
}
return d.get((dtype, how), dtype)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from pandas._libs import NaT, algos as libalgos, lib, writers
import pandas._libs.internals as libinternals
from pandas._libs.tslibs import Timedelta, conversion
from pandas._libs.tslibs import conversion
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike
from pandas.util._validators import validate_bool_kwarg
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op in {"sum", "min", "max"}:
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,10 @@ def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
if np.isnan(expected):
if not skipna and s.isna().any():
expected = pd.NA
else:
expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)


Expand Down