PERF/BUG: use masked algo in groupby cummin and cummax #40651
File: groupby.pyx

@@ -1255,9 +1255,11 @@ def group_min(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin_max(groupby_t[:, ::1] out,
                      ndarray[groupby_t, ndim=2] values,
+                     uint8_t[:, ::1] mask,
                      const intp_t[:] labels,
                      int ngroups,
                      bint is_datetimelike,
+                     bint use_mask,
                      bint compute_max):
     """
     Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1268,12 +1270,19 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : array
         Values to take cummin/max of.
+    mask : array[uint8_t]
+        If `use_mask`, then indices represent missing values,
+        otherwise will be passed as a zeroed array
     labels : np.ndarray[np.intp]
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
     is_datetimelike : bool
         True if `values` contains datetime-like entries.
+    use_mask : bool
+        True if the mask should be used (otherwise we continue
+        as if it is not a masked algorithm). Avoids the cost
+        of checking for a completely zeroed mask.
     compute_max : bool
         True if cumulative maximum should be computed, False
         if cumulative minimum should be computed

Review thread on `use_mask`:
- is this actually worth it? we don't do this anywhere else. can you show a head-to-head where you pass this always as True (vs your changed method)?
- Another option is to allow …
- Good call, definitely a cleaner solution. Addressing your comment @jreback, will take another look at the perf diff with forced mask usage. IIRC the main reason for the optional mask was a slowdown in …
- Doesn't seem like as much of a slowdown as feared; ASVs with forced mask usage (from 09b73fc) …
- I think this forced-mask solution is cleaner because, along with the simpler cython algo, it would eventually allow cleaning up some of the kludge around NaT/datetimelike, which can lose information on edge cases with forced casts to float (see pandas/core/groupby/ops.py, lines 673 to 677 at 4554635).
@@ -1287,6 +1296,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         intp_t lab
+        bint val_is_nan

     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1304,11 +1314,29 @@ def group_cummin_max(groupby_t[:, ::1] out, | |||||||||||
if lab < 0: | ||||||||||||
continue | ||||||||||||
for j in range(K): | ||||||||||||
val = values[i, j] | ||||||||||||
val_is_nan = False | ||||||||||||
|
||||||||||||
if use_mask: | ||||||||||||
if mask[i, j]: | ||||||||||||
|
||||||||||||
# `out` does not need to be set since it | ||||||||||||
# will be masked anyway | ||||||||||||
val_is_nan = True | ||||||||||||
else: | ||||||||||||
|
||||||||||||
# If using the mask, we can avoid grabbing the | ||||||||||||
# value unless necessary | ||||||||||||
val = values[i, j] | ||||||||||||
|
||||||||||||
if _treat_as_na(val, is_datetimelike): | ||||||||||||
out[i, j] = val | ||||||||||||
# Otherwise, `out` must be set accordingly if the | ||||||||||||
# value is missing | ||||||||||||
else: | ||||||||||||
val = values[i, j] | ||||||||||||
if _treat_as_na(val, is_datetimelike): | ||||||||||||
val_is_nan = True | ||||||||||||
out[i, j] = val | ||||||||||||
|
||||||||||||
if not val_is_nan: | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it make sense to implement this as a separate function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have a strong opinion about this. The question would be the tradeoff between a bit more complexity/branching vs duplication/increased package size (if we end up adding masked support to a lot more of these grouped algos) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any guess what the impact on package size is? potential duplication might be addressed by making e.g. refactoring L1340-1347 or L 1302-1308 into helper functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on rough estimate, binaries generated from groupby.pyx take up ~5% of total |
||||||||||||
mval = accum[lab, j] | ||||||||||||
if compute_max: | ||||||||||||
if val > mval: | ||||||||||||
|
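The masked branch above can be modeled in plain NumPy for the one-column cummin case. This is an illustrative sketch of the kernel's logic, not the pandas implementation; the function name and signature are invented for the example.

```python
import numpy as np

def masked_group_cummin(values, mask, labels, ngroups):
    """Plain-NumPy sketch of the masked grouped cumulative minimum
    (one column). Name and signature are illustrative, not pandas API."""
    accum = np.full(ngroups, np.inf)   # running minimum per group
    out = np.empty(len(values))
    for i in range(len(values)):
        lab = labels[i]
        if lab < 0:                    # entry belongs to no group
            continue
        if mask[i]:
            # Missing entry: skip the accumulator update; this slot of
            # `out` would be hidden by the result mask in pandas.
            out[i] = np.nan
            continue
        accum[lab] = min(accum[lab], values[i])
        out[i] = accum[lab]
    return out

vals = np.array([3.0, 1.0, 5.0, 2.0])
mask = np.array([False, True, False, False])
labels = np.array([0, 0, 0, 0])
res = masked_group_cummin(vals, mask, labels, 1)
print(res)  # [ 3. nan  3.  2.] -- the masked 1.0 never lowers the running min
```

As in the Cython kernel, a masked slot neither reads the underlying value nor updates the per-group accumulator.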
@@ -1323,19 +1351,41 @@ def group_cummin_max(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
+                 uint8_t[:, ::1] mask,
                  const intp_t[:] labels,
                  int ngroups,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False)
+    group_cummin_max(
+        out,
+        values,
+        mask,
+        labels,
+        ngroups,
+        is_datetimelike,
+        use_mask,
+        compute_max=False
+    )


 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummax(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
+                 uint8_t[:, ::1] mask,
                  const intp_t[:] labels,
                  int ngroups,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True)
+    group_cummin_max(
+        out,
+        values,
+        mask,
+        labels,
+        ngroups,
+        is_datetimelike,
+        use_mask,
+        compute_max=True
+    )
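At the user level, the point of routing nullable columns through these masked kernels is that a groupby cummin/cummax can operate on the integer data plus its mask instead of round-tripping through float64. A small example of the expected behavior (column and key names are made up; the result dtype depends on the pandas version):

```python
import pandas as pd

# Groupby cummin on a nullable Int64 column: missing values are
# carried through as NA and never participate in the running minimum.
df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "a"],
        "val": pd.array([3, pd.NA, 2, 1], dtype="Int64"),
    }
)
result = df.groupby("key")["val"].cummin()
print(result.tolist())  # [3, <NA>, 2, 1]: NA is propagated, not treated as a min
```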
File: pandas/core/groupby/ops.py

@@ -72,6 +72,10 @@
     maybe_fill,
 )

+from pandas.core.arrays.masked import (
+    BaseMaskedArray,
+    BaseMaskedDtype,
+)
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -118,6 +122,11 @@
     },
 }

+_CYTHON_MASKED_FUNCTIONS = {
+    "cummin",
+    "cummax",
+}
+

 @functools.lru_cache(maxsize=None)
 def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool):
@@ -155,6 +164,10 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool)
     return func


+def cython_function_uses_mask(kind: str) -> bool:
+    return kind in _CYTHON_MASKED_FUNCTIONS
+
+
 class BaseGrouper:
     """
     This is an internal Grouper class, which actually holds
@@ -574,9 +587,52 @@ def _ea_wrap_cython_operation(
                 f"function is not implemented for this dtype: {values.dtype}"
             )

+    @final
+    def _masked_ea_wrap_cython_operation(
+        self,
+        kind: str,
+        values: BaseMaskedArray,
+        how: str,
+        axis: int,
+        min_count: int = -1,
+        **kwargs,
+    ) -> BaseMaskedArray:
+        """
+        Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
+        and cython algorithms which accept a mask.
+        """
+        orig_values = values
+
+        # isna just directly returns self._mask, so copy here to prevent
+        # modifying the original
+        mask = isna(values).copy()
+        arr = values._data
+
+        if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
+            # IntegerArray or BooleanArray
+            arr = ensure_int_or_float(arr)
+
+        res_values = self._cython_operation(
+            kind, arr, how, axis, min_count, mask=mask, **kwargs
+        )
+        dtype = maybe_cast_result_dtype(orig_values.dtype, how)
+        assert isinstance(dtype, BaseMaskedDtype)
+        cls = dtype.construct_array_type()
+
+        return cls(
+            res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=False)
+        )
+

Review thread on `_masked_ea_wrap_cython_operation`:
- why is this an entirely different function? please integrate into the existing infra
- This is a different function because it's specific to MaskedArrays. Having this as a separate function is consistent with how it's currently implemented IMO (with a similar separate function for generic EAs). It could also be another …
- In an initial version I tried to fold this into …
- see my comments. This MUST integrate with the existing infrastructure (or refactor that). Duplicating is -1
- If you don't like how the added code is structured right now, please make a concrete suggestion of how you would do it differently.

Review thread on `ensure_int_or_float`:
- FWIW I'm planning to kill off this function; for EAs this is always just …
- Thanks for bringing up - realized this whole condition can be simplified since we actually have an ndarray at this point
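The wrap-up step at the end of `_masked_ea_wrap_cython_operation` rebuilds a masked array from the kernel's raw output plus the mask. A hedged sketch using the public `IntegerArray` constructor (the arrays below are invented stand-ins for `res_values` and `mask`; pandas itself goes through `dtype.construct_array_type()` rather than naming `IntegerArray` directly):

```python
import numpy as np
import pandas as pd

# Raw kernel output plus mask -> nullable result. The value under a
# True mask slot is garbage and is hidden by the mask.
res_values = np.array([3, 0, 2, 1], dtype=np.int64)   # kernel output
mask = np.array([False, True, False, False])          # True marks missing

rebuilt = pd.arrays.IntegerArray(res_values, mask)
print(rebuilt)  # elements 3, <NA>, 2, 1 with dtype Int64
```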
     @final
     def _cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+        self,
+        kind: str,
+        values,
+        how: str,
+        axis: int,
+        min_count: int = -1,
+        mask: np.ndarray | None = None,
+        **kwargs,
     ) -> ArrayLike:
         """
         Returns the values of a cython operation.
@@ -598,10 +654,16 @@ def _cython_operation(
         # if not raise NotImplementedError
         self._disallow_invalid_ops(dtype, how, is_numeric)

+        func_uses_mask = cython_function_uses_mask(how)
         if is_extension_array_dtype(dtype):
-            return self._ea_wrap_cython_operation(
-                kind, values, how, axis, min_count, **kwargs
-            )
+            if isinstance(values, BaseMaskedArray) and func_uses_mask:
+                return self._masked_ea_wrap_cython_operation(
+                    kind, values, how, axis, min_count, **kwargs
+                )
+            else:
+                return self._ea_wrap_cython_operation(
+                    kind, values, how, axis, min_count, **kwargs
+                )

         is_datetimelike = needs_i8_conversion(dtype)

Review thread on the dispatch:
- i really don't understand all of this code duplication. this is adding huge complexity. please reduce it.
- Jeff, did you actually read the previous responses to your similar comment? (https://github.com/pandas-dev/pandas/pull/40651/files#r603319910) Can you then please answer there to the concrete reasons given.
- yes, and it's a terrible pattern.
- this duplication of code is ridiculous. We have a VERY large codebase. Having this kind of separate logic is amazingly confusing & is humongous tech debt. This is heavily used code and needs to be carefully modified.
- I understand the concern about adding code complexity - my thinking was that if the goal is for nullable types to become the default in pandas, then direct support makes sense. And in that case, nullable types would need to be special-cased somewhere, and I think the separate function is cleaner than interleaving in … If direct support for nullable dtypes is not desired, we can just close this. If it is, I'll keep trying to think of ways to achieve this without adding more code, but any suggestions there would be welcome!
- Proper support for nullable dtypes is certainly desired (how to add it exactly can of course be discussed), so thanks a lot @mzeitlin11 for your efforts here. AFAIK, it's correct we need some special casing for it somewhere (that's the whole point of this PR: to add special handling for it). @jreback please try to stay constructive (e.g. answer our arguments or provide concrete suggestions on where you would put it / how you would do it differently) and please mind your language (there is no need to call the approach taken by a contributor "terrible").
- "This decision has not been made." … Agreed. … As Joris correctly pointed out, that is not viable ATM. I think a lot of this dispatch logic eventually belongs in WrappedCythonOp (which I've been vaguely planning on doing next time there aren't any open PRs touching this code), at which point we can reconsider flattening this.
- @jbrockmendel if you plan further refactoring of this code, I'm happy to just mothball this PR for now. The real benefit won't come until more groupby algos allow a mask on this path anyway, so it's not worth adding now if it's just going to cause more pain in future refactoring. I also like the idea of approach 5 instead of this - could start looking into that if you think it's a promising direction.
- From today's call, I think the plan is to move forward with this first. Long-term I think this is the right way to go to get the general case right, so I'd encourage you, if you're interested, to try implementing this on the EA in separate PR(s).
@@ -613,7 +675,7 @@ def _cython_operation(
         elif is_integer_dtype(dtype):
             # we use iNaT for the missing value on ints
             # so pre-convert to guard this condition
-            if (values == iNaT).any():
+            if mask is None and (values == iNaT).any():
                 values = ensure_float64(values)
             else:
                 values = ensure_int_or_float(values)
@@ -628,6 +690,9 @@ def _cython_operation(
         swapped = False
         if vdim == 1:
             values = values[:, None]
+            if mask is not None:
+                mask = mask[:, None]
+
             out_shape = (self.ngroups, arity)
         else:
             if axis > 0:
@@ -641,6 +706,10 @@ def _cython_operation(
             out_shape = (self.ngroups,) + values.shape[1:]

         func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)
+        use_mask = mask is not None
+        if func_uses_mask:
+            if mask is None:
+                mask = np.zeros_like(values, dtype=np.uint8, order="C")

         if how == "rank":
             out_dtype = "float"
@@ -650,25 +719,23 @@ def _cython_operation(
         else:
             out_dtype = "object"

-        codes, _, _ = self.group_info
-
         if kind == "aggregate":
+            codes, _, _ = self.group_info
             result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(result, counts, values, codes, func, min_count)
         elif kind == "transform":
             result = maybe_fill(np.empty(values.shape, dtype=out_dtype))

             # TODO: min_count
             result = self._transform(
-                result, values, codes, func, is_datetimelike, **kwargs
+                result, values, func, is_datetimelike, use_mask, mask, **kwargs
             )

-        if is_integer_dtype(result.dtype) and not is_datetimelike:
-            mask = result == iNaT
-            if mask.any():
+        if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
+            result_mask = result == iNaT
+            if result_mask.any():
                 result = result.astype("float64")
-                result[mask] = np.nan
+                result[result_mask] = np.nan

         if kind == "aggregate" and self._filter_empty_groups and not counts.all():
             assert result.ndim != 2
@@ -704,11 +771,30 @@ def _aggregate(

     @final
     def _transform(
-        self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs
-    ):
+        self,
+        result: np.ndarray,
+        values: np.ndarray,
+        transform_func,
+        is_datetimelike: bool,
+        use_mask: bool,
+        mask: np.ndarray | None,
+        **kwargs,
+    ) -> np.ndarray:

-        _, _, ngroups = self.group_info
-        transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+        comp_ids, _, ngroups = self.group_info
+        if mask is not None:
+            transform_func(
+                result,
+                values,
+                mask,
+                comp_ids,
+                ngroups,
+                is_datetimelike,
+                use_mask,
+                **kwargs,
+            )
+        else:
+            transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

         return result
Review thread on the benchmark setup:
- is this costly? worth using setup_cache?
- Looks to take ~0.25s. So might be worth caching, but it appears setup_cache can't be parameterized, so it would have to ugly up the benchmark a bit.
- can we just make N // 10?
- yep, have shrunk the benchmark