From 3e1f06ca2ce93734f6e619907c9e797b9ddb67b6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 8 Apr 2020 15:01:48 -0500 Subject: [PATCH 01/13] REF: Add prod to masked_reductions --- pandas/core/array_algos/masked_reductions.py | 49 +++++++++++++------- pandas/core/arrays/boolean.py | 8 +--- pandas/core/arrays/integer.py | 8 +--- pandas/tests/arrays/integer/test_dtypes.py | 2 +- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index b3723340cefd6..7d38b9dd683cb 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -3,6 +3,8 @@ for missing values. """ +from typing import Callable + import numpy as np from pandas._libs import missing as libmissing @@ -11,14 +13,19 @@ from pandas.core.nanops import check_below_min_count -def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, +def _sumprod( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + skipna: bool = True, + min_count: int = 0, ): """ - Sum for 1D masked array. + Sum or product for 1D masked array. Parameters ---------- + func : np.sum or np.prod values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). @@ -31,23 +38,21 @@ def sum( ``min_count`` non-NA values are present the result will be NA. """ if not skipna: - if mask.any(): + if mask.any() or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - if check_below_min_count(values.shape, None, min_count): - return libmissing.NA - return np.sum(values) + return func(values) else: if check_below_min_count(values.shape, mask, min_count): return libmissing.NA if _np_version_under1p17: - return np.sum(values[~mask]) + return func(values[~mask]) else: - return np.sum(values, where=~mask) + return func(values, where=~mask) -def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): """ Reduction for 1D masked array. @@ -63,17 +68,13 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): Whether to skip NA. """ if not skipna: - if mask.any(): + if mask.any() or not values.size: + # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: - if values.size: - return func(values) - else: - # min/max with empty array raise in numpy, pandas returns NA - return libmissing.NA + return func(values) else: - subset = values[~mask] - if subset.size: + if not mask.all(): return func(values[~mask]) else: # min/max with empty array raise in numpy, pandas returns NA @@ -86,3 +87,15 @@ def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) + + +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + ) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e85534def6b97..78bfc316056dc 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -696,7 +696,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -710,12 +710,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if np.isnan(result): return libmissing.NA - # if we have numeric op that would result in an int, coerce to int if possible - if name == "prod" and notna(result): - int_result = np.int64(result) - if int_result == result: - result = int_result - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d47a396bbb14e..722348e565f6c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -562,7 +562,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -581,12 +581,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in ["any", "all"]: pass - # if we have a preservable numeric op, - # provide coercion back to an integer type if possible - elif name == "prod": - # GH#31409 more performant than casting-then-checking - result = com.cast_scalar_indexer(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 515013e95c717..a02501e2dcbf2 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -34,7 +34,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op in {"sum", "min", "max"}: + if op in {"sum", "prod", "min", "max"}: assert isinstance(result, np.int64) else: assert isinstance(result, int) From a7eb3014ff650d29addef91b0aae432a79e848a0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 9 Apr 2020 15:32:56 -0500 Subject: [PATCH 02/13] Update --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c39377899a20..f98d59354da30 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -356,7 +356,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). +- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 7d38b9dd683cb..6094e8163dbc3 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -74,8 +74,9 @@ def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = else: return func(values) else: - if not mask.all(): - return func(values[~mask]) + subset = values[~mask] + if subset.size: + return func(subset) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA From 48c14324ce9875fba9642aea2f777b2402692731 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 08:07:19 -0500 Subject: [PATCH 03/13] Change test --- pandas/tests/extension/test_integer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index f55ec75b47dfa..24d1a2b59a020 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -238,9 +238,12 @@ def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - if np.isnan(expected): + if skipna: + expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) + elif s.isna().any(): expected = pd.NA + else: + expected = getattr(s.astype("int64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) From c73d7b970fa82efa6cb0bc05e41ab26f662caa18 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 08:53:19 -0500 Subject: [PATCH 04/13] Keep --- pandas/core/arrays/boolean.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 78bfc316056dc..12d7e74cb528e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -698,7 +698,15 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) - return op(data, mask, skipna=skipna, **kwargs) + result = op(data, mask, skipna=skipna, **kwargs) + + # if we have numeric op that would result in an int, coerce to int if possible + if name == "prod" and notna(result): + int_result = np.int64(result) + if int_result == result: + result = int_result + + return result # coerce to a nan-aware float if needed if self._hasna: From b0c95fcdcb62c0ab8d61e7c6c068850fef5ac06b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 09:08:22 -0500 Subject: [PATCH 05/13] Update --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/boolean.py | 3 ++- pandas/tests/extension/test_integer.py | 6 ++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a0cc64441e374..f54512bc4505e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -358,7 +358,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). +- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 65d221e516ece..cf8eef755338f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -700,7 +700,8 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): op = getattr(masked_reductions, name) result = op(data, mask, skipna=skipna, **kwargs) - # if we have numeric op that would result in an int, coerce to int if possible + # if we have numeric op that would result in an int, + # coerce to int if possible if name == "prod" and notna(result): int_result = np.int64(result) if int_result == result: diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 24d1a2b59a020..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -238,12 +238,10 @@ def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 result = getattr(s, op_name)(skipna=skipna) - if skipna: - expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) - elif s.isna().any(): + if not skipna and s.isna().any(): expected = pd.NA else: - expected = getattr(s.astype("int64"), op_name)(skipna=skipna) + expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) From c3e1763b51ae0ad1443fc58fc3f4a96cacf00a3b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 09:09:54 -0500 Subject: [PATCH 06/13] Move functions --- pandas/core/array_algos/masked_reductions.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 6094e8163dbc3..1b9ed014f27b7 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -52,6 +52,18 @@ def _sumprod( return func(values, where=~mask) +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): """ Reduction for 1D masked array. @@ -88,15 +100,3 @@ def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) - - -def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): - return _sumprod( - np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count - ) - - -def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): - return _sumprod( - np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count - ) From 6b66756e3d4a29933a7aa70ba58ecaa19b926f51 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 09:34:57 -0500 Subject: [PATCH 07/13] Lint --- pandas/core/arrays/integer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 722348e565f6c..fda7cf16ee032 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,6 @@ from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions -import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer From 58f7bd07870275894964f1ff00645281fe4b4c45 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 13:08:35 -0500 Subject: [PATCH 08/13] maybe_cast_result_dtype --- pandas/core/arrays/boolean.py | 13 ++++--------- pandas/core/dtypes/cast.py | 1 + 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index cf8eef755338f..83dae86aa1e07 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.cast import astype_nansafe, maybe_cast_result_dtype from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, @@ -699,14 +699,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) result = op(data, mask, skipna=skipna, **kwargs) - - # if we have numeric op that would result in an int, - # coerce to int if possible - if name == "prod" and notna(result): - int_result = np.int64(result) - if int_result == result: - result = int_result - + dtype = maybe_cast_result_dtype(dtype=data.dtype, how=name) + if notna(result) and (dtype != result.dtype): + result = result.astype(dtype) return result # coerce to a nan-aware float if needed diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7dda6850ba4f7..41a9d7cb9a7a0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -317,6 +317,7 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: (np.dtype(np.bool), "add"): np.dtype(np.int64), (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), (np.dtype(np.bool), "sum"): np.dtype(np.int64), + (np.dtype(np.bool), "prod"): np.dtype(np.int64), } return d.get((dtype, how), dtype) From a2574df1f566f6316e2769850c769d9b3233daee Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 10 Apr 2020 13:46:16 -0500 Subject: [PATCH 09/13] Lint --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d12b78f8d046f..d231fee2c7825 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,7 +8,7 @@ from pandas._libs import NaT, Timestamp, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg From 57551b8a9e0316adf274ef8d2212b11dda6ac4c4 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 11 Apr 2020 11:16:07 -0500 Subject: [PATCH 10/13] Revert and change test --- pandas/core/arrays/boolean.py | 6 +----- pandas/tests/arrays/boolean/test_reduction.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 08d0a72b54674..dce6c76f3dd9a 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -688,11 +688,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) - result = op(data, mask, skipna=skipna, **kwargs) - dtype = maybe_cast_result_dtype(dtype=data.dtype, how=name) - if notna(result) and (dtype != result.dtype): - result = result.astype(dtype) - return result + return op(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed if self._hasna: diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 5dd5620162a8a..a5c18a25f8e16 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -52,7 +52,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if op == "sum": assert isinstance(getattr(s, op)(), np.int_) elif op == "prod": - assert isinstance(getattr(s, op)(), np.int64) + assert isinstance(getattr(s, op)(), np.int_) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: From 1d255694a67b1e9b425401e8deca29a53fbc94df Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 11 Apr 2020 11:17:58 -0500 Subject: [PATCH 11/13] Lint --- pandas/core/arrays/boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dce6c76f3dd9a..f07666c87fc82 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.cast import astype_nansafe, maybe_cast_result_dtype +from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, From 2701fff6c6f8b70af021ab017cc70be4aa9f9982 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 11 Apr 2020 11:20:55 -0500 Subject: [PATCH 12/13] Remove --- pandas/core/dtypes/cast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 41a9d7cb9a7a0..7dda6850ba4f7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -317,7 +317,6 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: (np.dtype(np.bool), "add"): np.dtype(np.int64), (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), (np.dtype(np.bool), "sum"): np.dtype(np.int64), - (np.dtype(np.bool), "prod"): np.dtype(np.int64), } return d.get((dtype, how), dtype) From 8321945c113f9548206586670fbc7d045d7dd2cc Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 11 Apr 2020 11:41:57 -0500 Subject: [PATCH 13/13] Lint --- pandas/core/arrays/boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index f07666c87fc82..685a9ec48228f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,7 +24,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions