PERF: masked ops for reductions (sum) (#30982)

jorisvandenbossche · web-flow · commit 5e3a5f653589 · 2020-03-27T09:08:25.000-07:00
* POC masked ops for reductions

* fix mask for older numpy

* also use in boolean

* add min_count support

* fix preserve_dtypes test

* passthrough min_count for boolean as well

* fix comment

* add object to empty reduction test case

* test platform int

* Test sum separately with platform int

* share min_count checking helper function with nanops

* type + add docstring for min_count

* move sum algo from ops to array_algos

* add Int64/boolean to some benchmarks

* add whatsnew

* add skipna default in function signature

* update type hint + deprivatize

* update another type hint
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self):
 
 class All:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case != "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_all(self, N, case):
+    def time_all(self, N, case, dtype):
         self.s.all()
 
 
 class Any:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case == "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_any(self, N, case):
+    def time_any(self, N, case, dtype):
         self.s.any()
 
 
@@ -265,11 +265,14 @@ class NanOps:
             "prod",
         ],
         [10 ** 3, 10 ** 6],
-        ["int8", "int32", "int64", "float64"],
+        ["int8", "int32", "int64", "float64", "Int64", "boolean"],
     ]
     param_names = ["func", "N", "dtype"]
 
     def setup(self, func, N, dtype):
+        if func == "argmax" and dtype in {"Int64", "boolean"}:
+            # Skip argmax for nullable int since this doesn't work yet (GH-24382)
+            raise NotImplementedError
         self.s = Series([1] * N, dtype=dtype)
         self.func = getattr(self.s, func)
 
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
@@ -7,11 +7,17 @@
 
 class FrameOps:
 
-    params = [ops, ["float", "int"], [0, 1]]
+    params = [ops, ["float", "int", "Int64"], [0, 1]]
     param_names = ["op", "dtype", "axis"]
 
     def setup(self, op, dtype, axis):
-        df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
+        if op == "mad" and dtype == "Int64" and axis == 1:
+            # GH-33036
+            raise NotImplementedError
+        values = np.random.randn(100000, 4)
+        if dtype == "Int64":
+            values = values.astype(int)
+        df = pd.DataFrame(values).astype(dtype)
         self.df_func = getattr(df, op)
 
     def time_op(self, op, dtype, axis):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -255,6 +255,8 @@ Performance improvements
   sparse values from ``scipy.sparse`` matrices using the
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
+- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
+
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -0,0 +1,47 @@
+"""
+masked_reductions.py is for reduction algorithms using a mask-based approach
+for missing values.
+"""
+
+import numpy as np
+
+from pandas._libs import missing as libmissing
+from pandas.compat.numpy import _np_version_under1p17
+
+from pandas.core.nanops import check_below_min_count
+
+
+def sum(
+    values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
+):
+    """
+    Sum for 1D masked array.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        Numpy array with the values (can be of any dtype that support the
+        operation).
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool, default True
+        Whether to skip NA.
+    min_count : int, default 0
+        The required number of valid values to perform the operation. If fewer than
+        ``min_count`` non-NA values are present the result will be NA.
+    """
+    if not skipna:
+        if mask.any():
+            return libmissing.NA
+        else:
+            if check_below_min_count(values.shape, None, min_count):
+                return libmissing.NA
+            return np.sum(values)
+    else:
+        if check_below_min_count(values.shape, mask, min_count):
+            return libmissing.NA
+
+        if _np_version_under1p17:
+            return np.sum(values[~mask])
+        else:
+            return np.sum(values, where=~mask)
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 from pandas.core.indexers import check_array_indexer
 
 from .masked import BaseMaskedArray
@@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         if self._hasna:
             data = self.to_numpy("float64", na_value=np.nan)
@@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
             return libmissing.NA
 
         # if we have numeric op that would result in an int, coerce to int if possible
-        if name in ["sum", "prod"] and notna(result):
+        if name == "prod" and notna(result):
             int_result = np.int64(result)
             if int_result == result:
                 result = int_result
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
@@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
         if self._hasna:
@@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"]:
+        elif name in ["min", "max", "prod"]:
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -1238,7 +1238,7 @@ def _maybe_null_out(
     result: np.ndarray,
     axis: Optional[int],
     mask: Optional[np.ndarray],
-    shape: Tuple,
+    shape: Tuple[int, ...],
     min_count: int = 1,
 ) -> float:
     """
@@ -1260,16 +1260,43 @@ def _maybe_null_out(
                 # GH12941, use None to auto cast null
                 result[null_mask] = None
     elif result is not NaT:
-        if mask is not None:
-            null_mask = mask.size - mask.sum()
-        else:
-            null_mask = np.prod(shape)
-        if null_mask < min_count:
+        if check_below_min_count(shape, mask, min_count):
             result = np.nan
 
     return result
 
 
+def check_below_min_count(
+    shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int
+):
+    """
+    Check for the `min_count` keyword. Returns True if below `min_count` (when
+    missing value should be returned from the reduction).
+
+    Parameters
+    ----------
+    shape : tuple
+        The shape of the values (`values.shape`).
+    mask : ndarray or None
+        Boolean numpy array (typically of same shape as `shape`) or None.
+    min_count : int
+        Keyword passed through from sum/prod call.
+
+    Returns
+    -------
+    bool
+    """
+    if min_count > 0:
+        if mask is None:
+            # no missing values, only check size
+            non_nulls = np.prod(shape)
+        else:
+            non_nulls = mask.size - mask.sum()
+        if non_nulls < min_count:
+            return True
+    return False
+
+
 def _zero_out_fperr(arg):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):
diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py
@@ -46,7 +46,9 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
     if dropna:
         s = s.dropna()
 
-    if op in ("sum", "prod"):
+    if op == "sum":
+        assert isinstance(getattr(s, op)(), np.int_)
+    elif op == "prod":
         assert isinstance(getattr(s, op)(), np.int64)
     elif op in ("min", "max"):
         assert isinstance(getattr(s, op)(), np.bool_)
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -34,7 +34,10 @@ def test_preserve_dtypes(op):
 
     # op
     result = getattr(df.C, op)()
-    assert isinstance(result, int)
+    if op == "sum":
+        assert isinstance(result, np.int64)
+    else:
+        assert isinstance(result, int)
 
     # groupby
     result = getattr(df.groupby("A"), op)()
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -531,13 +531,14 @@ def test_sum_inf(self):
         res = nanops.nansum(arr, axis=1)
         assert np.isinf(res).all()
 
+    @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"])
     @pytest.mark.parametrize("use_bottleneck", [True, False])
     @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
-    def test_empty(self, method, unit, use_bottleneck):
+    def test_empty(self, method, unit, use_bottleneck, dtype):
         with pd.option_context("use_bottleneck", use_bottleneck):
             # GH#9422 / GH#18921
             # Entirely empty
-            s = Series([], dtype=object)
+            s = Series([], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -560,8 +561,14 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=1)
             assert pd.isna(result)
 
+            result = getattr(s, method)(skipna=False, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=False, min_count=1)
+            assert pd.isna(result)
+
             # All-NA
-            s = Series([np.nan])
+            s = Series([np.nan], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -585,7 +592,7 @@ def test_empty(self, method, unit, use_bottleneck):
             assert pd.isna(result)
 
             # Mix of valid, empty
-            s = Series([np.nan, 1])
+            s = Series([np.nan, 1], dtype=dtype)
             # Default
             result = getattr(s, method)()
             assert result == 1.0
@@ -604,22 +611,22 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=0)
             assert result == 1.0
 
-            result = getattr(s, method)(skipna=True, min_count=1)
-            assert result == 1.0
-
             # GH#844 (changed in GH#9422)
-            df = DataFrame(np.empty((10, 0)))
+            df = DataFrame(np.empty((10, 0)), dtype=dtype)
             assert (getattr(df, method)(1) == unit).all()
 
-            s = pd.Series([1])
+            s = pd.Series([1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan])
+            result = getattr(s, method)(skipna=False, min_count=2)
+            assert pd.isna(result)
+
+            s = pd.Series([np.nan], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan, 1])
+            s = pd.Series([np.nan, 1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)