diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 718de09a0c3e4..b31041886cf15 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -377,7 +377,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). +- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index b3723340cefd6..1b9ed014f27b7 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -3,6 +3,8 @@ for missing values. """ +from typing import Callable + import numpy as np from pandas._libs import missing as libmissing @@ -11,14 +13,19 @@ from pandas.core.nanops import check_below_min_count -def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, +def _sumprod( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + skipna: bool = True, + min_count: int = 0, ): """ - Sum for 1D masked array. + Sum or product for 1D masked array. Parameters ---------- + func : np.sum or np.prod values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). @@ -31,23 +38,33 @@ def sum( ``min_count`` non-NA values are present the result will be NA. """ if not skipna: - if mask.any(): + if mask.any() or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - if check_below_min_count(values.shape, None, min_count): - return libmissing.NA - return np.sum(values) + return func(values) else: if check_below_min_count(values.shape, mask, min_count): return libmissing.NA if _np_version_under1p17: - return np.sum(values[~mask]) + return func(values[~mask]) else: - return np.sum(values, where=~mask) + return func(values, where=~mask) + + +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) -def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): """ Reduction for 1D masked array. @@ -63,18 +80,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): Whether to skip NA. """ if not skipna: - if mask.any(): + if mask.any() or not values.size: + # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: - if values.size: - return func(values) - else: - # min/max with empty array raise in numpy, pandas returns NA - return libmissing.NA + return func(values) else: subset = values[~mask] if subset.size: - return func(values[~mask]) + return func(subset) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 40c838cbbd1df..685a9ec48228f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,7 +24,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions @@ -686,7 +686,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -700,12 +700,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if np.isnan(result): return libmissing.NA - # if we have numeric op that would result in an int, coerce to int if possible - if name == "prod" and notna(result): - int_result = np.int64(result) - if int_result == result: - result = int_result - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5d6f49852e696..37620edfd9a95 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,6 @@ from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions -import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer @@ -557,7 +556,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -576,12 +575,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in ["any", "all"]: pass - # if we have a preservable numeric op, - # provide coercion back to an integer type if possible - elif name == "prod": - # GH#31409 more performant than casting-then-checking - result = com.cast_scalar_indexer(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 5dd5620162a8a..a5c18a25f8e16 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -52,7 +52,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if op == "sum": assert isinstance(getattr(s, op)(), np.int_) elif op == "prod": - assert isinstance(getattr(s, op)(), np.int64) + assert isinstance(getattr(s, op)(), np.int_) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 515013e95c717..a02501e2dcbf2 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -34,7 +34,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op in {"sum", "min", "max"}: + if op in {"sum", "prod", "min", "max"}: assert isinstance(result, np.int64) else: assert isinstance(result, int) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index f55ec75b47dfa..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -238,9 +238,10 @@ def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - if np.isnan(expected): + if not skipna and s.isna().any(): expected = pd.NA + else: + expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected)