diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 57c625ced8a43..d78419c12ce0d 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self):
 
 
 class All:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case != "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_all(self, N, case):
+    def time_all(self, N, case, dtype):
         self.s.all()
 
 
 class Any:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case == "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_any(self, N, case):
+    def time_any(self, N, case, dtype):
         self.s.any()
 
 
@@ -265,11 +265,14 @@ class NanOps:
             "prod",
         ],
         [10 ** 3, 10 ** 6],
-        ["int8", "int32", "int64", "float64"],
+        ["int8", "int32", "int64", "float64", "Int64", "boolean"],
     ]
     param_names = ["func", "N", "dtype"]
 
     def setup(self, func, N, dtype):
+        if func == "argmax" and dtype in {"Int64", "boolean"}:
+            # Skip argmax for nullable int since this doesn't work yet (GH-24382)
+            raise NotImplementedError
         self.s = Series([1] * N, dtype=dtype)
         self.func = getattr(self.s, func)
 
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index ec67394e55a1e..ebbd3c9eddfdb 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -7,11 +7,17 @@
 
 class FrameOps:
 
-    params = [ops, ["float", "int"], [0, 1]]
+    params = [ops, ["float", "int", "Int64"], [0, 1]]
     param_names = ["op", "dtype", "axis"]
 
     def setup(self, op, dtype, axis):
-        df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
+        if op == "mad" and dtype == "Int64" and axis == 1:
+            # GH-33036
+            raise NotImplementedError
+        values = np.random.randn(100000, 4)
+        if dtype == "Int64":
+            values = values.astype(int)
+        df = pd.DataFrame(values).astype(dtype)
         self.df_func = getattr(df, op)
 
     def time_op(self, op, dtype, axis):
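As a rough standalone illustration of what the new "Int64" benchmark parameter above exercises (this snippet is not part of the benchmark suite; the size, repeat count and dtypes are picked only for the example), one could compare a plain numpy-backed sum with the nullable-dtype sum directly:

    import timeit

    import pandas as pd

    for dtype in ["int64", "Int64"]:
        s = pd.Series([1] * 10 ** 6, dtype=dtype)
        elapsed = timeit.timeit(s.sum, number=10)  # 10 sums over 1e6 elements
        print(f"{dtype:>6}: {elapsed:.4f}s")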
+""" + +import numpy as np + +from pandas._libs import missing as libmissing +from pandas.compat.numpy import _np_version_under1p17 + +from pandas.core.nanops import check_below_min_count + + +def sum( + values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, +): + """ + Sum for 1D masked array. + + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + if check_below_min_count(values.shape, None, min_count): + return libmissing.NA + return np.sum(values) + else: + if check_below_min_count(values.shape, mask, min_count): + return libmissing.NA + + if _np_version_under1p17: + return np.sum(values[~mask]) + else: + return np.sum(values, where=~mask) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d93b5fbc83312..442d4ca8cef6d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer from .masked import BaseMaskedArray @@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed if self._hasna: data = self.to_numpy("float64", na_value=np.nan) @@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): return libmissing.NA # if we have numeric op that would result in an int, coerce to int if possible - if name in ["sum", "prod"] and notna(result): + if name == "prod" and notna(result): int_result = np.int64(result) if int_result == result: result = int_result diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f2880c5cbee42..4f3c68aa03b16 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison @@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) if self._hasna: @@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"]: + elif name in ["min", "max", "prod"]: # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 87f937f9e7087..822ab775e7e46 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1238,7 +1238,7 @@ def 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index d93b5fbc83312..442d4ca8cef6d 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 from pandas.core.indexers import check_array_indexer
 
 from .masked import BaseMaskedArray
@@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         if self._hasna:
             data = self.to_numpy("float64", na_value=np.nan)
@@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
             return libmissing.NA
 
         # if we have numeric op that would result in an int, coerce to int if possible
-        if name in ["sum", "prod"] and notna(result):
+        if name == "prod" and notna(result):
             int_result = np.int64(result)
             if int_result == result:
                 result = int_result
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index f2880c5cbee42..4f3c68aa03b16 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
@@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
         if self._hasna:
@@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"]:
+        elif name in ["min", "max", "prod"]:
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 87f937f9e7087..822ab775e7e46 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -1238,7 +1238,7 @@ def _maybe_null_out(
     result: np.ndarray,
     axis: Optional[int],
     mask: Optional[np.ndarray],
-    shape: Tuple,
+    shape: Tuple[int, ...],
     min_count: int = 1,
 ) -> float:
     """
@@ -1260,16 +1260,43 @@ def _maybe_null_out(
             # GH12941, use None to auto cast null
             result[null_mask] = None
     elif result is not NaT:
-        if mask is not None:
-            null_mask = mask.size - mask.sum()
-        else:
-            null_mask = np.prod(shape)
-        if null_mask < min_count:
+        if check_below_min_count(shape, mask, min_count):
             result = np.nan
 
     return result
 
 
+def check_below_min_count(
+    shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int
+):
+    """
+    Check for the `min_count` keyword. Returns True if below `min_count` (when
+    missing value should be returned from the reduction).
+
+    Parameters
+    ----------
+    shape : tuple
+        The shape of the values (`values.shape`).
+    mask : ndarray or None
+        Boolean numpy array (typically of same shape as `shape`) or None.
+    min_count : int
+        Keyword passed through from sum/prod call.
+
+    Returns
+    -------
+    bool
+    """
+    if min_count > 0:
+        if mask is None:
+            # no missing values, only check size
+            non_nulls = np.prod(shape)
+        else:
+            non_nulls = mask.size - mask.sum()
+        if non_nulls < min_count:
+            return True
+    return False
+
+
 def _zero_out_fperr(arg):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):
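For context, the min_count semantics that check_below_min_count now centralises look like this from the user side (example values only, assuming the nullable sum path above is in place):

    import pandas as pd

    s = pd.Series([1, None], dtype="Int64")

    print(s.sum())              # 1
    print(s.sum(min_count=2))   # <NA>, only one non-NA value is present
    print(s.sum(skipna=False))  # <NA>, an NA is present and skipna=False
    print(pd.Series([], dtype="Int64").sum())  # 0, the empty-sum identity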
diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py
index 7a8146ef14de0..ce50266c756a8 100644
--- a/pandas/tests/arrays/boolean/test_reduction.py
+++ b/pandas/tests/arrays/boolean/test_reduction.py
@@ -46,7 +46,9 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
     if dropna:
         s = s.dropna()
 
-    if op in ("sum", "prod"):
+    if op == "sum":
+        assert isinstance(getattr(s, op)(), np.int_)
+    elif op == "prod":
         assert isinstance(getattr(s, op)(), np.int64)
     elif op in ("min", "max"):
         assert isinstance(getattr(s, op)(), np.bool_)
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index 3735b3c014cab..ee1ec86745246 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -34,7 +34,10 @@ def test_preserve_dtypes(op):
 
     # op
     result = getattr(df.C, op)()
-    assert isinstance(result, int)
+    if op == "sum":
+        assert isinstance(result, np.int64)
+    else:
+        assert isinstance(result, int)
 
     # groupby
     result = getattr(df.groupby("A"), op)()
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index abd99aadfb484..962b105d1e8fc 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -531,13 +531,14 @@ def test_sum_inf(self):
         res = nanops.nansum(arr, axis=1)
         assert np.isinf(res).all()
 
+    @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"])
    @pytest.mark.parametrize("use_bottleneck", [True, False])
     @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
-    def test_empty(self, method, unit, use_bottleneck):
+    def test_empty(self, method, unit, use_bottleneck, dtype):
         with pd.option_context("use_bottleneck", use_bottleneck):
             # GH#9422 / GH#18921
             # Entirely empty
-            s = Series([], dtype=object)
+            s = Series([], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -560,8 +561,14 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=1)
             assert pd.isna(result)
 
+            result = getattr(s, method)(skipna=False, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=False, min_count=1)
+            assert pd.isna(result)
+
             # All-NA
-            s = Series([np.nan])
+            s = Series([np.nan], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -585,7 +592,7 @@ def test_empty(self, method, unit, use_bottleneck):
             assert pd.isna(result)
 
             # Mix of valid, empty
-            s = Series([np.nan, 1])
+            s = Series([np.nan, 1], dtype=dtype)
             # Default
             result = getattr(s, method)()
             assert result == 1.0
@@ -604,22 +611,22 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=0)
             assert result == 1.0
 
-            result = getattr(s, method)(skipna=True, min_count=1)
-            assert result == 1.0
-
             # GH#844 (changed in GH#9422)
-            df = DataFrame(np.empty((10, 0)))
+            df = DataFrame(np.empty((10, 0)), dtype=dtype)
             assert (getattr(df, method)(1) == unit).all()
 
-            s = pd.Series([1])
+            s = pd.Series([1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan])
+            result = getattr(s, method)(skipna=False, min_count=2)
+            assert pd.isna(result)
+
+            s = pd.Series([np.nan], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan, 1])
+            s = pd.Series([np.nan, 1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
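The updated tests above pin down the scalar types returned by the new code path. A quick, indicative check (the exact NumPy scalar type can differ by platform, e.g. np.int_ is 32-bit on Windows):

    import numpy as np
    import pandas as pd

    int_sum = pd.Series([1, 2, None], dtype="Int64").sum()
    bool_sum = pd.Series([True, True, False], dtype="boolean").sum()

    print(int_sum, isinstance(int_sum, np.integer))    # 3 True
    print(bool_sum, isinstance(bool_sum, np.integer))  # 2 True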