From 7cd4dc3cc0f7e6bffe80cc1d97293638da66d638 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:06:19 -0400
Subject: [PATCH 01/32] wip

---
 asv_bench/benchmarks/groupby.py | 23 ++++++++++++++++++++
 pandas/_libs/groupby.pyx        | 22 +++++++++-----------
 pandas/core/groupby/groupby.py  | 11 ++++++----
 pandas/core/groupby/ops.py      | 37 +++++++++++++++++++--------------
 4 files changed, 61 insertions(+), 32 deletions(-)
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 9930c61e34b15..d825c78cc5e5b 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -493,6 +493,29 @@ def time_frame_agg(self, dtype, method):
         self.df.groupby("key").agg(method)
 
 
+class CumminMax:
+    """
+    Benchmarks specifically targetting our cython aggregation algorithms
+    (using a big enough dataframe with simple key, so a large part of the
+    time is actually spent in the grouped aggregation).
+    """
+
+    param_names = ["dtype", "method"]
+    params = [
+        ["float64", "int64", "Float64", "Int64"],
+        ["cummin", "cummax"],
+    ]
+
+    def setup(self, dtype, method):
+        N = 1_000_000
+        df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij"))
+        df["key"] = np.random.randint(0, 100, size=N)
+        self.df = df
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method)
+
+
 class RankWithTies:
     # GH 21237
     param_names = ["dtype", "tie_method"]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 545d6a10232ab..3baf99837bc95 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1251,9 +1251,9 @@ def group_min(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin_max(groupby_t[:, ::1] out,
                      ndarray[groupby_t, ndim=2] values,
+                     uint8_t[:, ::1] mask,
                      const int64_t[:] labels,
                      int ngroups,
-                     bint is_datetimelike,
                      bint compute_max):
     """
     Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1268,8 +1268,6 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
-    is_datetimelike : bool
-        True if `values` contains datetime-like entries.
     compute_max : bool
         True if cumulative maximum should be computed, False
         if cumulative minimum should be computed
@@ -1281,11 +1279,11 @@ def group_cummin_max(groupby_t[:, ::1] out,
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
-        ndarray[groupby_t, ndim=2] accum
+        groupby_t[:, ::1] accum
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
+    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C")
     if groupby_t is int64_t:
         accum[:] = -_int64_max if compute_max else _int64_max
     elif groupby_t is uint64_t:
@@ -1302,7 +1300,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if _treat_as_na(val, is_datetimelike):
+                if mask[i, j]:
                     out[i, j] = val
                 else:
                     mval = accum[lab, j]
@@ -1319,19 +1317,19 @@ def group_cummin_max(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
+                 uint8_t[:, ::1] mask,
                  const int64_t[:] labels,
-                 int ngroups,
-                 bint is_datetimelike):
+                 int ngroups):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False)
+    group_cummin_max(out, values, mask, labels, ngroups, compute_max=False)
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummax(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
+                 uint8_t[:, ::1] mask,
                  const int64_t[:] labels,
-                 int ngroups,
-                 bint is_datetimelike):
+                 int ngroups):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True)
+    group_cummin_max(out, values, mask, labels, ngroups, compute_max=True)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f33833193e4e0..22f363970df3b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1024,7 +1024,7 @@ def _cumcount_array(self, ascending: bool = True):
 
     @final
     def _cython_transform(
-        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
+        self, how: str, numeric_only: bool = True, axis: int = 0, needs_mask: bool = False, **kwargs
     ):
         output: Dict[base.OutputKey, np.ndarray] = {}
 
@@ -1034,9 +1034,12 @@ def _cython_transform(
             if numeric_only and not is_numeric:
                 continue
 
+            mask = None
+            if needs_mask:
+                mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
             try:
                 result = self.grouper._cython_operation(
-                    "transform", obj._values, how, axis, **kwargs
+                    "transform", obj._values, how, axis, mask=mask, **kwargs
                 )
             except NotImplementedError:
                 continue
@@ -2588,7 +2591,7 @@ def cummin(self, axis=0, **kwargs):
         if axis != 0:
             return self.apply(lambda x: np.minimum.accumulate(x, axis))
 
-        return self._cython_transform("cummin", numeric_only=False)
+        return self._cython_transform("cummin", numeric_only=False, needs_mask=True)
 
     @final
     @Substitution(name="groupby")
@@ -2604,7 +2607,7 @@ def cummax(self, axis=0, **kwargs):
         if axis != 0:
             return self.apply(lambda x: np.maximum.accumulate(x, axis))
 
-        return self._cython_transform("cummax", numeric_only=False)
+        return self._cython_transform("cummax", numeric_only=False, needs_mask=True)
 
     @final
     def _get_cythonized_result(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c5d36d1588a5f..bb06ba629fdd0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -524,7 +524,7 @@ def _disallow_invalid_ops(
 
     @final
     def _ea_wrap_cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
     ) -> np.ndarray:
         """
         If we have an ExtensionArray, unwrap, call _cython_operation, and
@@ -538,7 +538,7 @@ def _ea_wrap_cython_operation(
             #  operate on the tz-naive equivalents
             values = values.view("M8[ns]")
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, **kwargs
+                kind, values, how, axis, min_count, mask=mask, **kwargs
             )
             if how in ["rank"]:
                 # preserve float64 dtype
@@ -550,14 +550,15 @@ def _ea_wrap_cython_operation(
 
         elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = ensure_int_or_float(values)
+            values = ensure_int_or_float(values._data)
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, **kwargs
+                kind, values, how, axis, min_count, mask=mask, **kwargs
             )
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
             if isinstance(dtype, ExtensionDtype):
                 cls = dtype.construct_array_type()
-                return cls._from_sequence(res_values, dtype=dtype)
+                return cls(res_values, mask.astype(bool, copy=True))
+                # return cls._from_sequence(res_values, dtype=dtype)
 
             return res_values
 
@@ -565,10 +566,10 @@ def _ea_wrap_cython_operation(
             # FloatingArray
             values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, **kwargs
+                kind, values, how, axis, min_count, mask=mask, **kwargs
             )
-            result = type(orig_values)._from_sequence(res_values)
-            return result
+            # result = type(orig_values)._from_sequence(res_values)
+            return type(orig_values)(res_values, mask.astype(bool, copy=True))
 
         raise NotImplementedError(
             f"function is not implemented for this dtype: {values.dtype}"
@@ -576,7 +577,7 @@ def _ea_wrap_cython_operation(
 
     @final
     def _cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
     ) -> np.ndarray:
         """
         Returns the values of a cython operation.
@@ -600,7 +601,7 @@ def _cython_operation(
 
         if is_extension_array_dtype(dtype):
             return self._ea_wrap_cython_operation(
-                kind, values, how, axis, min_count, **kwargs
+                kind, values, how, axis, min_count, mask=mask, **kwargs
             )
 
         is_datetimelike = needs_i8_conversion(dtype)
@@ -628,6 +629,8 @@ def _cython_operation(
         swapped = False
         if vdim == 1:
             values = values[:, None]
+            if mask is not None:
+                mask = mask[:, None]
             out_shape = (self.ngroups, arity)
         else:
             if axis > 0:
@@ -650,9 +653,8 @@ def _cython_operation(
             else:
                 out_dtype = "object"
 
-        codes, _, _ = self.group_info
-
         if kind == "aggregate":
+            codes, _, _ = self.group_info
             result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(result, counts, values, codes, func, min_count)
@@ -661,7 +663,7 @@ def _cython_operation(
 
             # TODO: min_count
             result = self._transform(
-                result, values, codes, func, is_datetimelike, **kwargs
+                result, values, func, is_datetimelike, mask=mask, **kwargs
             )
 
         if is_integer_dtype(result.dtype) and not is_datetimelike:
@@ -704,11 +706,14 @@ def _aggregate(
 
     @final
     def _transform(
-        self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs
-    ):
+        self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, mask: Optional[np.ndarray] = None, **kwargs
+    ) -> np.ndarray:
 
         comp_ids, _, ngroups = self.group_info
-        transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+        if mask is not None:
+            transform_func(result, values, mask, comp_ids, ngroups, **kwargs)
+        else:
+            transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
         return result
 

From 91984dc90663d276f66b22135afa327e3e6872c8 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:15:53 -0400
Subject: [PATCH 02/32] wip

---
 pandas/_libs/groupby.pyx              | 22 ++++++++++++----------
 pandas/tests/groupby/test_function.py |  6 ++++++
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3baf99837bc95..3981661c62105 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1300,17 +1300,19 @@ def group_cummin_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if mask[i, j]:
-                    out[i, j] = val
+                # if mask[i, j]:
+                #     out[i, j] = val
+                # if True:
+                #
+                # else:
+                mval = accum[lab, j]
+                if compute_max:
+                    if val > mval:
+                        accum[lab, j] = mval = val
                 else:
-                    mval = accum[lab, j]
-                    if compute_max:
-                        if val > mval:
-                            accum[lab, j] = mval = val
-                    else:
-                        if val < mval:
-                            accum[lab, j] = mval = val
-                    out[i, j] = mval
+                    if val < mval:
+                        accum[lab, j] = mval = val
+                out[i, j] = mval
 
 
 @cython.boundscheck(False)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 515774eae009b..cd25eddb84826 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -846,6 +846,12 @@ def test_cummax_all_nan_column():
     tm.assert_frame_equal(expected, result)
 
 
+# @pytest.mark.parametrize("method", ["cummin", "cummax"])
+# def test_cummin_max_nullable_ints_no_float_cast(method):
+
+
+
+
 @pytest.mark.parametrize(
     "in_vals, out_vals",
     [

From b371cc5a96f52b30d39876c183c7cd1c0abb0d95 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:28:06 -0400
Subject: [PATCH 03/32] wip

---
 pandas/core/groupby/groupby.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 22f363970df3b..31147179ea70c 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1036,7 +1036,8 @@ def _cython_transform(
 
             mask = None
             if needs_mask:
-                mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
+                mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values)))
+                # mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
             try:
                 result = self.grouper._cython_operation(
                     "transform", obj._values, how, axis, mask=mask, **kwargs

From 69cce960b9b7bf65ccd6c74793f7d926b307d8bf Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:32:41 -0400
Subject: [PATCH 04/32] wip

---
 pandas/_libs/groupby.pyx | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3981661c62105..3baf99837bc95 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1300,19 +1300,17 @@ def group_cummin_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                # if mask[i, j]:
-                #     out[i, j] = val
-                # if True:
-                #
-                # else:
-                mval = accum[lab, j]
-                if compute_max:
-                    if val > mval:
-                        accum[lab, j] = mval = val
+                if mask[i, j]:
+                    out[i, j] = val
                 else:
-                    if val < mval:
-                        accum[lab, j] = mval = val
-                out[i, j] = mval
+                    mval = accum[lab, j]
+                    if compute_max:
+                        if val > mval:
+                            accum[lab, j] = mval = val
+                    else:
+                        if val < mval:
+                            accum[lab, j] = mval = val
+                    out[i, j] = mval
 
 
 @cython.boundscheck(False)

From dd7f324fde0e265f7ec88dccd404feeadb61aaac Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:41:44 -0400
Subject: [PATCH 05/32] wip

---
 pandas/_libs/groupby.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3baf99837bc95..ce7a8b853e523 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1279,7 +1279,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
-        groupby_t[:, ::1] accum
+        ndarray[groupby_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object>values).shape

From 64680d4c68a157c63e1796c31ef1e01f2eb77fd7 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 14:51:24 -0400
Subject: [PATCH 06/32] wip

---
 pandas/core/groupby/groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 31147179ea70c..1175fa345cfe2 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1036,8 +1036,8 @@ def _cython_transform(
 
             mask = None
             if needs_mask:
-                mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values)))
-                # mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
+                # mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values)))
+                mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
             try:
                 result = self.grouper._cython_operation(
                     "transform", obj._values, how, axis, mask=mask, **kwargs

From be16f65378b58ae2822cc6eea461ff29b65e82a8 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 15:00:02 -0400
Subject: [PATCH 07/32] wip

---
 pandas/core/groupby/ops.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index bb06ba629fdd0..6f67153d89c66 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -550,7 +550,7 @@ def _ea_wrap_cython_operation(
 
         elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = ensure_int_or_float(values._data)
+            values = values._data
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, mask=mask, **kwargs
             )
@@ -564,7 +564,8 @@ def _ea_wrap_cython_operation(
 
         elif is_float_dtype(values.dtype):
             # FloatingArray
-            values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
+            values = values._data
+            # values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, mask=mask, **kwargs
             )

From 9442846bc8561cb466c699e11f15e390c47e99e0 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 15:06:41 -0400
Subject: [PATCH 08/32] wip

---
 pandas/core/groupby/ops.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 6f67153d89c66..3240646933ef0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -613,12 +613,13 @@ def _cython_operation(
         elif is_bool_dtype(dtype):
             values = ensure_int_or_float(values)
         elif is_integer_dtype(dtype):
+            pass
             # we use iNaT for the missing value on ints
             # so pre-convert to guard this condition
-            if (values == iNaT).any():
-                values = ensure_float64(values)
-            else:
-                values = ensure_int_or_float(values)
+            # if (values == iNaT).any():
+            #     values = ensure_float64(values)
+            # else:
+            #     values = ensure_int_or_float(values)
         elif is_numeric and not is_complex_dtype(dtype):
             values = ensure_float64(values)
         else:

From f089175a58300909435f40c333bd15b69b11a03b Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 17:11:40 -0400
Subject: [PATCH 09/32] wip

---
 asv_bench/benchmarks/groupby.py |   6 +-
 pandas/_libs/groupby.pyx        |  47 ++++++++++++---
 pandas/core/groupby/groupby.py  |  15 ++---
 pandas/core/groupby/ops.py      | 103 +++++++++++++++++++++++---------
 4 files changed, 123 insertions(+), 48 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index d825c78cc5e5b..dc4f7dcfabd7f 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -508,12 +508,12 @@ class CumminMax:
 
     def setup(self, dtype, method):
         N = 1_000_000
-        df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij"))
+        df = DataFrame(np.random.randint(0, 1000, (N, 10)), columns=list("abcdefghij"), dtype=dtype)
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df
 
-    def time_frame_agg(self, dtype, method):
-        self.df.groupby("key").agg(method)
+    def time_frame_transform(self, dtype, method):
+        self.df.groupby("key").transform(method)
 
 
 class RankWithTies:
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index ce7a8b853e523..68ba284a3f4cc 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1254,6 +1254,8 @@ def group_cummin_max(groupby_t[:, ::1] out,
                      uint8_t[:, ::1] mask,
                      const int64_t[:] labels,
                      int ngroups,
+                     bint is_datetimelike,
+                     bint use_mask,
                      bint compute_max):
     """
     Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1264,10 +1266,19 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : array
         Values to take cummin/max of.
+    mask : array[uint8_t]
+        If `use_mask`, then indices represent missing values,
+        otherwise will be passed as a zeroed array
     labels : int64 array
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+    use_mask : bool
+        True if the mask should be used (otherwise we continue
+        as if it is not a masked algorithm). Avoids the cost
+        of checking for a completely zeroed mask.
     compute_max : bool
         True if cumulative maximum should be computed, False
         if cumulative minimum should be computed
@@ -1281,6 +1292,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         int64_t lab
+        bint val_is_nan
 
     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C")
@@ -1298,11 +1310,26 @@ def group_cummin_max(groupby_t[:, ::1] out,
             if lab < 0:
                 continue
             for j in range(K):
-                val = values[i, j]
-
-                if mask[i, j]:
-                    out[i, j] = val
+                val_is_nan = False
+
+                # If using the mask, we can avoid grabbing the
+                # value unless necessary
+                if use_mask:
+                    if mask[i, j]:
+                        # `out` does not need to be set since it
+                        # will be masked anyway
+                        val_is_nan = True
+                        val = values[i, j]
+
+                # Otherwise, `out` must be set accordingly if the
+                # value is missing
                 else:
+                    val = values[i, j]
+                    if _treat_as_na(val, is_datetimelike):
+                        val_is_nan = True
+                        out[i, j] = val
+
+                if not val_is_nan:
                     mval = accum[lab, j]
                     if compute_max:
                         if val > mval:
@@ -1319,9 +1346,11 @@ def group_cummin(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
                  uint8_t[:, ::1] mask,
                  const int64_t[:] labels,
-                 int ngroups):
+                 int ngroups,
+                 bint is_datetimelike,
+                 bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, mask, labels, ngroups, compute_max=False)
+    group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=False)
 
 
 @cython.boundscheck(False)
@@ -1330,6 +1359,8 @@ def group_cummax(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
                  uint8_t[:, ::1] mask,
                  const int64_t[:] labels,
-                 int ngroups):
+                 int ngroups,
+                 bint is_datetimelike,
+                 bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, mask, labels, ngroups, compute_max=True)
+    group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=True)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 1175fa345cfe2..9896086b69033 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -86,6 +86,7 @@ class providing the base-class of operations.
     Categorical,
     DatetimeArray,
     ExtensionArray,
+    BaseMaskedArray
 )
 from pandas.core.base import (
     DataError,
@@ -105,6 +106,7 @@ class providing the base-class of operations.
     Index,
     MultiIndex,
 )
+from pandas.core.groupby.ops import does_cython_function_use_mask
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@@ -1024,23 +1026,18 @@ def _cumcount_array(self, ascending: bool = True):
 
     @final
     def _cython_transform(
-        self, how: str, numeric_only: bool = True, axis: int = 0, needs_mask: bool = False, **kwargs
+        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
     ):
         output: Dict[base.OutputKey, np.ndarray] = {}
-
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             is_numeric = is_numeric_dtype(obj.dtype)
             if numeric_only and not is_numeric:
                 continue
 
-            mask = None
-            if needs_mask:
-                # mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values)))
-                mask = np.require(isna(obj._values).view(np.uint8), requirements='C')
             try:
                 result = self.grouper._cython_operation(
-                    "transform", obj._values, how, axis, mask=mask, **kwargs
+                    "transform", obj._values, how, axis, **kwargs
                 )
             except NotImplementedError:
                 continue
@@ -2592,7 +2589,7 @@ def cummin(self, axis=0, **kwargs):
         if axis != 0:
             return self.apply(lambda x: np.minimum.accumulate(x, axis))
 
-        return self._cython_transform("cummin", numeric_only=False, needs_mask=True)
+        return self._cython_transform("cummin", numeric_only=False)
 
     @final
     @Substitution(name="groupby")
@@ -2608,7 +2605,7 @@ def cummax(self, axis=0, **kwargs):
         if axis != 0:
             return self.apply(lambda x: np.maximum.accumulate(x, axis))
 
-        return self._cython_transform("cummax", numeric_only=False, needs_mask=True)
+        return self._cython_transform("cummax", numeric_only=False)
 
     @final
     def _get_cythonized_result(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3240646933ef0..b3b7191e533c9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -20,6 +20,10 @@
     Tuple,
     Type,
 )
+from pandas.core.arrays.masked import (
+    BaseMaskedArray,
+    BaseMaskedDtype,
+)
 
 import numpy as np
 
@@ -36,6 +40,7 @@
     FrameOrSeries,
     Shape,
     final,
+    ArrayLike
 )
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
@@ -70,6 +75,9 @@
     isna,
     maybe_fill,
 )
+from pandas.core.arrays import (
+    BaseMaskedArray
+)
 
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
@@ -115,6 +123,7 @@
         "cummax": "group_cummax",
         "rank": "group_rank",
     },
+    "needs_mask": {"cummin", "cummax"}
 }
 
 
@@ -154,6 +163,10 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool)
     return func
 
 
+def does_cython_function_use_mask(kind: str) -> bool:
+    return kind in _CYTHON_FUNCTIONS["needs_mask"]
+
+
 class BaseGrouper:
     """
     This is an internal Grouper class, which actually holds
@@ -471,6 +484,7 @@ def _get_cython_func_and_vals(
         -------
         func : callable
         values : np.ndarray
+        needs_mask : True if a mask must be passed
         """
         try:
             func = _get_cython_function(kind, how, values.dtype, is_numeric)
@@ -486,7 +500,8 @@ def _get_cython_func_and_vals(
                 func = _get_cython_function(kind, how, values.dtype, is_numeric)
             else:
                 raise
-        return func, values
+        needs_mask = how in _CYTHON_FUNCTIONS["needs_mask"]
+        return func, values, needs_mask
 
     @final
     def _disallow_invalid_ops(
@@ -524,8 +539,8 @@ def _disallow_invalid_ops(
 
     @final
     def _ea_wrap_cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
-    ) -> np.ndarray:
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+    ) -> ArrayLike:
         """
         If we have an ExtensionArray, unwrap, call _cython_operation, and
         re-wrap if appropriate.
@@ -538,7 +553,7 @@ def _ea_wrap_cython_operation(
             #  operate on the tz-naive equivalents
             values = values.view("M8[ns]")
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, mask=mask, **kwargs
+                kind, values, how, axis, min_count, **kwargs
             )
             if how in ["rank"]:
                 # preserve float64 dtype
@@ -550,36 +565,60 @@ def _ea_wrap_cython_operation(
 
         elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = values._data
+            values = ensure_int_or_float(values)
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, mask=mask, **kwargs
+                kind, values, how, axis, min_count, **kwargs
             )
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
             if isinstance(dtype, ExtensionDtype):
                 cls = dtype.construct_array_type()
-                return cls(res_values, mask.astype(bool, copy=True))
-                # return cls._from_sequence(res_values, dtype=dtype)
+                return cls._from_sequence(res_values, dtype=dtype)
 
             return res_values
 
         elif is_float_dtype(values.dtype):
             # FloatingArray
-            values = values._data
-            # values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
+            values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
             res_values = self._cython_operation(
-                kind, values, how, axis, min_count, mask=mask, **kwargs
+                kind, values, how, axis, min_count, **kwargs
             )
-            # result = type(orig_values)._from_sequence(res_values)
-            return type(orig_values)(res_values, mask.astype(bool, copy=True))
+            result = type(orig_values)._from_sequence(res_values)
+            return result
 
         raise NotImplementedError(
             f"function is not implemented for this dtype: {values.dtype}"
         )
 
+    @final
+    def _masked_ea_wrap_cython_operation(
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+    ) -> ArrayLike:
+        """
+        Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
+        and cython algorithms which accept a mask.
+        """
+        orig_values = values
+
+        mask = isna(values).copy()
+        values = values._data
+
+        if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
+            # IntegerArray or BooleanArray
+            values = ensure_int_or_float(values)
+
+        res_values = self._cython_operation(
+            kind, values, how, axis, min_count, mask=mask, **kwargs
+        )
+        dtype = maybe_cast_result_dtype(orig_values.dtype, how)
+        cls = dtype.construct_array_type()
+
+        return cls(res_values, mask.astype(bool, copy=True))
+
+
     @final
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
-    ) -> np.ndarray:
+    ) -> ArrayLike:
         """
         Returns the values of a cython operation.
         """
@@ -601,9 +640,14 @@ def _cython_operation(
         self._disallow_invalid_ops(dtype, how, is_numeric)
 
         if is_extension_array_dtype(dtype):
-            return self._ea_wrap_cython_operation(
-                kind, values, how, axis, min_count, mask=mask, **kwargs
-            )
+            if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how):
+                return self._masked_ea_wrap_cython_operation(
+                    kind, values, how, axis, min_count, **kwargs
+                )
+            else:
+                return self._ea_wrap_cython_operation(
+                    kind, values, how, axis, min_count, **kwargs
+                )
 
         is_datetimelike = needs_i8_conversion(dtype)
 
@@ -613,13 +657,12 @@ def _cython_operation(
         elif is_bool_dtype(dtype):
             values = ensure_int_or_float(values)
         elif is_integer_dtype(dtype):
-            pass
             # we use iNaT for the missing value on ints
             # so pre-convert to guard this condition
-            # if (values == iNaT).any():
-            #     values = ensure_float64(values)
-            # else:
-            #     values = ensure_int_or_float(values)
+            if mask is None and (values == iNaT).any():
+                values = ensure_float64(values)
+            else:
+                values = ensure_int_or_float(values)
         elif is_numeric and not is_complex_dtype(dtype):
             values = ensure_float64(values)
         else:
@@ -633,6 +676,7 @@ def _cython_operation(
             values = values[:, None]
             if mask is not None:
                 mask = mask[:, None]
+
             out_shape = (self.ngroups, arity)
         else:
             if axis > 0:
@@ -645,7 +689,11 @@ def _cython_operation(
                 )
             out_shape = (self.ngroups,) + values.shape[1:]
 
-        func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)
+        func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric)
+        use_mask = mask is not None
+        if needs_mask:
+            if mask is None:
+                mask = np.zeros_like(values, dtype=np.uint8, order="C")
 
         if how == "rank":
             out_dtype = "float"
@@ -662,13 +710,12 @@ def _cython_operation(
             result = self._aggregate(result, counts, values, codes, func, min_count)
         elif kind == "transform":
             result = maybe_fill(np.empty(values.shape, dtype=out_dtype))
-
             # TODO: min_count
             result = self._transform(
-                result, values, func, is_datetimelike, mask=mask, **kwargs
+                result, values, func, is_datetimelike, use_mask, mask, **kwargs
             )
 
-        if is_integer_dtype(result.dtype) and not is_datetimelike:
+        if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
             mask = result == iNaT
             if mask.any():
                 result = result.astype("float64")
@@ -708,12 +755,12 @@ def _aggregate(
 
     @final
     def _transform(
-        self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, mask: Optional[np.ndarray] = None, **kwargs
+        self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs
     ) -> np.ndarray:
 
         comp_ids, _, ngroups = self.group_info
         if mask is not None:
-            transform_func(result, values, mask, comp_ids, ngroups, **kwargs)
+            transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs)
         else:
             transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 

From 31409f8fdf745be2f298c6de4dd483c62abbff35 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 17:41:42 -0400
Subject: [PATCH 10/32] wip

---
 asv_bench/benchmarks/groupby.py       | 11 ++++++++++-
 pandas/_libs/groupby.pyx              |  1 +
 pandas/core/groupby/ops.py            |  1 -
 pandas/tests/groupby/test_function.py | 14 ++++++++++++--
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index dc4f7dcfabd7f..d96f9f3b5fef2 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -508,13 +508,22 @@ class CumminMax:
 
     def setup(self, dtype, method):
         N = 1_000_000
-        df = DataFrame(np.random.randint(0, 1000, (N, 10)), columns=list("abcdefghij"), dtype=dtype)
+        vals = np.random.randint(0, 1000, (N, 10))
+        null_vals = vals.copy()
+        null_vals[::2, :] = np.nan
+        df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype)
+        null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype)
         df["key"] = np.random.randint(0, 100, size=N)
+        null_df["key"] = np.random.randint(0, 100, size=N)
         self.df = df
+        self.null_df = null_df
 
     def time_frame_transform(self, dtype, method):
         self.df.groupby("key").transform(method)
 
+    def time_frame_transform_many_nulls(self, dtype, method):
+        self.null_df.groupby("key").transform(method)
+
 
 class RankWithTies:
     # GH 21237
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 68ba284a3f4cc..8c899c780b8ee 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1319,6 +1319,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
                         # `out` does not need to be set since it
                         # will be masked anyway
                         val_is_nan = True
+                    else:
                         val = values[i, j]
 
                 # Otherwise, `out` must be set accordingly if the
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b3b7191e533c9..b5c3fd8ed065a 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -614,7 +614,6 @@ def _masked_ea_wrap_cython_operation(
 
         return cls(res_values, mask.astype(bool, copy=True))
 
-
     @final
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index cd25eddb84826..4b523662679e4 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -846,10 +846,20 @@ def test_cummax_all_nan_column():
     tm.assert_frame_equal(expected, result)
 
 
-# @pytest.mark.parametrize("method", ["cummin", "cummax"])
-# def test_cummin_max_nullable_ints_no_float_cast(method):
+@td.skip_if_32bit
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+@pytest.mark.parametrize(
+    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2 ** 53 + 1)]
+)
+def test_nullable_int_not_cast_as_float(method, dtype, val):
+    data = [val, pd.NA]
+    df = pd.DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
+    grouped = df.groupby("grp")
 
+    result = grouped.transform(method)
+    expected = pd.DataFrame({"b": data}, dtype=dtype)
 
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize(

From 0c05f74dfbf25068f32cbb4244796b68251f2e92 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 19:21:08 -0400
Subject: [PATCH 11/32] wip

---
 asv_bench/benchmarks/groupby.py       | 43 ++-------------------
 doc/source/whatsnew/v1.3.0.rst        |  2 +
 pandas/core/groupby/ops.py            |  5 ++-
 pandas/tests/groupby/test_function.py | 54 ++++++++++++++-------------
 4 files changed, 37 insertions(+), 67 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index d96f9f3b5fef2..49a478e2c024d 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -395,40 +395,10 @@ class GroupByMethods:
     params = [
         ["int", "float", "object", "datetime"],
         [
-            "all",
-            "any",
-            "bfill",
-            "count",
-            "cumcount",
+
             "cummax",
             "cummin",
-            "cumprod",
-            "cumsum",
-            "describe",
-            "ffill",
-            "first",
-            "head",
-            "last",
-            "mad",
-            "max",
-            "min",
-            "median",
-            "mean",
-            "nunique",
-            "pct_change",
-            "prod",
-            "quantile",
-            "rank",
-            "sem",
-            "shift",
-            "size",
-            "skew",
-            "std",
-            "sum",
-            "tail",
-            "unique",
-            "value_counts",
-            "var",
+
         ],
         ["direct", "transformation"],
     ]
@@ -494,12 +464,6 @@ def time_frame_agg(self, dtype, method):
 
 
 class CumminMax:
-    """
-    Benchmarks specifically targetting our cython aggregation algorithms
-    (using a big enough dataframe with simple key, so a large part of the
-    time is actually spent in the grouped aggregation).
-    """
-
     param_names = ["dtype", "method"]
     params = [
         ["float64", "int64", "Float64", "Int64"],
@@ -509,8 +473,9 @@ class CumminMax:
     def setup(self, dtype, method):
         N = 1_000_000
         vals = np.random.randint(0, 1000, (N, 10))
-        null_vals = vals.copy()
+        null_vals = vals.astype(float, copy=True)
         null_vals[::2, :] = np.nan
+        null_vals[::3, :] = np.nan
         df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype)
         null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype)
         df["key"] = np.random.randint(0, 100, size=N)
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 512e6e6cbb391..1ded085610f96 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -447,6 +447,8 @@ Performance improvements
 - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
 - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
 - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
+- Performance improvement in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` when using methods ``cummin`` and ``cummax`` with nullable data types (:issue:`37493`)
+-
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b5c3fd8ed065a..06f7db5a49247 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -21,7 +21,6 @@
     Type,
 )
 from pandas.core.arrays.masked import (
-    BaseMaskedArray,
     BaseMaskedDtype,
 )
 
@@ -599,6 +598,8 @@ def _masked_ea_wrap_cython_operation(
         """
         orig_values = values
 
+        # isna just directly returns self._mask, so copy here to prevent
+        # modifying the original
         mask = isna(values).copy()
         values = values._data
 
@@ -612,7 +613,7 @@ def _masked_ea_wrap_cython_operation(
         dtype = maybe_cast_result_dtype(orig_values.dtype, how)
         cls = dtype.construct_array_type()
 
-        return cls(res_values, mask.astype(bool, copy=True))
+        return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True))
 
     @final
     def _cython_operation(
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 4b523662679e4..9703c11a21ca8 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -22,20 +22,27 @@
 
 
 @pytest.fixture(
-    params=[np.int32, np.int64, np.float32, np.float64],
-    ids=["np.int32", "np.int64", "np.float32", "np.float64"],
+    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
+    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
 )
-def numpy_dtypes_for_minmax(request):
+def dtypes_for_minmax(request):
     """
-    Fixture of numpy dtypes with min and max values used for testing
+    Fixture of dtypes with min and max values used for testing
     cummin and cummax
     """
     dtype = request.param
+
+    np_type = dtype
+    if dtype == "Int64":
+        np_type = np.int64
+    elif dtype == "Float64":
+        np_type = np.float64
+
     min_val = (
-        np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
+        np.iinfo(np_type).min if np.dtype(np_type).kind == "i" else np.finfo(np_type).min
     )
     max_val = (
-        np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max
+        np.iinfo(np_type).max if np.dtype(np_type).kind == "i" else np.finfo(np_type).max
     )
 
     return (dtype, min_val, max_val)
@@ -734,9 +741,9 @@ def test_numpy_compat(func):
         getattr(g, func)(foo=1)
 
 
-def test_cummin(numpy_dtypes_for_minmax):
-    dtype = numpy_dtypes_for_minmax[0]
-    min_val = numpy_dtypes_for_minmax[1]
+def test_cummin(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    min_val = dtypes_for_minmax[1]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -780,19 +787,24 @@ def test_cummin(numpy_dtypes_for_minmax):
     tm.assert_series_equal(result, expected)
 
 
-def test_cummin_all_nan_column():
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
+def test_cummin_max_all_nan_column(method, dtype):
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
+    base_df["B"] = base_df["B"].astype(dtype)
+    grouped = base_df.groupby("A")
 
-    expected = DataFrame({"B": [np.nan] * 8})
-    result = base_df.groupby("A").cummin()
+    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
+    result = getattr(grouped, method)()
     tm.assert_frame_equal(expected, result)
-    result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
+
+    result = getattr(grouped["B"], method)().to_frame()
     tm.assert_frame_equal(expected, result)
 
 
-def test_cummax(numpy_dtypes_for_minmax):
-    dtype = numpy_dtypes_for_minmax[0]
-    max_val = numpy_dtypes_for_minmax[2]
+def test_cummax(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    max_val = dtypes_for_minmax[2]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -836,16 +848,6 @@ def test_cummax(numpy_dtypes_for_minmax):
     tm.assert_series_equal(result, expected)
 
 
-def test_cummax_all_nan_column():
-    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
-
-    expected = DataFrame({"B": [np.nan] * 8})
-    result = base_df.groupby("A").cummax()
-    tm.assert_frame_equal(expected, result)
-    result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
-    tm.assert_frame_equal(expected, result)
-
-
 @td.skip_if_32bit
 @pytest.mark.parametrize("method", ["cummin", "cummax"])
 @pytest.mark.parametrize(

From 5c60a1f35888920853a324353c9ccad90cf62e11 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 19:48:36 -0400
Subject: [PATCH 12/32] wip

---
 asv_bench/benchmarks/groupby.py       |  2 -
 pandas/_libs/groupby.pyx              | 32 ++++++++---
 pandas/core/groupby/groupby.py        |  2 -
 pandas/core/groupby/ops.py            | 83 +++++++++++++++++++--------
 pandas/tests/groupby/test_function.py | 12 ++--
 5 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 49a478e2c024d..803658d101b89 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -395,10 +395,8 @@ class GroupByMethods:
     params = [
         ["int", "float", "object", "datetime"],
         [
-
             "cummax",
             "cummin",
-
         ],
         ["direct", "transformation"],
     ]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index fa363425041e9..80f06a980bb9c 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1270,14 +1270,10 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : array
         Values to take cummin/max of.
-<<<<<<< HEAD
     mask : array[uint8_t]
         If `use_mask`, then indices represent missing values,
         otherwise will be passed as a zeroed array
-    labels : int64 array
-=======
     labels : np.ndarray[np.intp]
->>>>>>> origin/master
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
@@ -1320,14 +1316,16 @@ def group_cummin_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val_is_nan = False
 
-                # If using the mask, we can avoid grabbing the
-                # value unless necessary
                 if use_mask:
                     if mask[i, j]:
+
                         # `out` does not need to be set since it
                         # will be masked anyway
                         val_is_nan = True
                     else:
+
+                        # If using the mask, we can avoid grabbing the
+                        # value unless necessary
                         val = values[i, j]
 
                 # Otherwise, `out` must be set accordingly if the
@@ -1359,7 +1357,16 @@ def group_cummin(groupby_t[:, ::1] out,
                  bint is_datetimelike,
                  bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=False)
+    group_cummin_max(
+        out,
+        values,
+        mask,
+        labels,
+        ngroups,
+        is_datetimelike,
+        use_mask,
+        compute_max=False
+    )
 
 
 @cython.boundscheck(False)
@@ -1372,4 +1379,13 @@ def group_cummax(groupby_t[:, ::1] out,
                  bint is_datetimelike,
                  bint use_mask):
     """See group_cummin_max.__doc__"""
-    group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=True)
+    group_cummin_max(
+        out,
+        values,
+        mask,
+        labels,
+        ngroups,
+        is_datetimelike,
+        use_mask,
+        compute_max=True
+    )
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 69399b5969451..1ee38834c5758 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -85,7 +85,6 @@ class providing the base-class of operations.
 from pandas.core.arrays import (
     Categorical,
     ExtensionArray,
-    BaseMaskedArray
 )
 from pandas.core.base import (
     DataError,
@@ -105,7 +104,6 @@ class providing the base-class of operations.
     Index,
     MultiIndex,
 )
-from pandas.core.groupby.ops import does_cython_function_use_mask
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index f8e5f0d95a777..250eaa25a5cda 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -20,9 +20,6 @@
     Tuple,
     Type,
 )
-from pandas.core.arrays.masked import (
-    BaseMaskedDtype,
-)
 
 import numpy as np
 
@@ -40,7 +37,6 @@
     FrameOrSeries,
     Shape,
     final,
-    ArrayLike
 )
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
@@ -75,10 +71,11 @@
     isna,
     maybe_fill,
 )
-from pandas.core.arrays import (
-    BaseMaskedArray
-)
 
+from pandas.core.arrays.masked import (
+    BaseMaskedArray,
+    BaseMaskedDtype,
+)
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -123,7 +120,11 @@
         "cummax": "group_cummax",
         "rank": "group_rank",
     },
-    "needs_mask": {"cummin", "cummax"}
+}
+
+_CYTHON_MASKED_FUNCTIONS = {
+    "cummin",
+    "cummax",
 }
 
 
@@ -163,8 +164,8 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool)
     return func
 
 
-def does_cython_function_use_mask(kind: str) -> bool:
-    return kind in _CYTHON_FUNCTIONS["needs_mask"]
+def cython_function_uses_mask(kind: str) -> bool:
+    return kind in _CYTHON_MASKED_FUNCTIONS
 
 
 class BaseGrouper:
@@ -590,8 +591,14 @@ def _ea_wrap_cython_operation(
 
     @final
     def _masked_ea_wrap_cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
-    ) -> ArrayLike:
+        self,
+        kind: str,
+        values: BaseMaskedArray,
+        how: str,
+        axis: int,
+        min_count: int = -1,
+        **kwargs,
+    ) -> BaseMaskedArray:
         """
         Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
         and cython algorithms which accept a mask.
@@ -601,23 +608,33 @@ def _masked_ea_wrap_cython_operation(
         # isna just directly returns self._mask, so copy here to prevent
         # modifying the original
         mask = isna(values).copy()
-        values = values._data
+        arr = values._data
 
         if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = ensure_int_or_float(values)
+            arr = ensure_int_or_float(arr)
 
         res_values = self._cython_operation(
-            kind, values, how, axis, min_count, mask=mask, **kwargs
+            kind, arr, how, axis, min_count, mask=mask, **kwargs
         )
         dtype = maybe_cast_result_dtype(orig_values.dtype, how)
+        assert isinstance(dtype, BaseMaskedDtype)
         cls = dtype.construct_array_type()
 
-        return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True))
+        return cls(
+            res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)
+        )
 
     @final
     def _cython_operation(
-        self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
+        self,
+        kind: str,
+        values,
+        how: str,
+        axis: int,
+        min_count: int = -1,
+        mask: np.ndarray | None = None,
+        **kwargs,
     ) -> ArrayLike:
         """
         Returns the values of a cython operation.
@@ -640,7 +657,7 @@ def _cython_operation(
         self._disallow_invalid_ops(dtype, how, is_numeric)
 
         if is_extension_array_dtype(dtype):
-            if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how):
+            if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how):
                 return self._masked_ea_wrap_cython_operation(
                     kind, values, how, axis, min_count, **kwargs
                 )
@@ -689,7 +706,9 @@ def _cython_operation(
                 )
             out_shape = (self.ngroups,) + values.shape[1:]
 
-        func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric)
+        func, values, needs_mask = self._get_cython_func_and_vals(
+            kind, how, values, is_numeric
+        )
         use_mask = mask is not None
         if needs_mask:
             if mask is None:
@@ -716,10 +735,10 @@ def _cython_operation(
             )
 
         if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
-            mask = result == iNaT
-            if mask.any():
+            result_mask = result == iNaT
+            if result_mask.any():
                 result = result.astype("float64")
-                result[mask] = np.nan
+                result[result_mask] = np.nan
 
         if kind == "aggregate" and self._filter_empty_groups and not counts.all():
             assert result.ndim != 2
@@ -755,12 +774,28 @@ def _aggregate(
 
     @final
     def _transform(
-        self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs
+        self,
+        result: np.ndarray,
+        values: np.ndarray,
+        transform_func,
+        is_datetimelike: bool,
+        use_mask: bool,
+        mask: np.ndarray | None,
+        **kwargs,
     ) -> np.ndarray:
 
         comp_ids, _, ngroups = self.group_info
         if mask is not None:
-            transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs)
+            transform_func(
+                result,
+                values,
+                mask,
+                comp_ids,
+                ngroups,
+                is_datetimelike,
+                use_mask,
+                **kwargs,
+            )
         else:
             transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 9703c11a21ca8..12a8808d750d7 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -39,10 +39,14 @@ def dtypes_for_minmax(request):
         np_type = np.float64
 
     min_val = (
-        np.iinfo(np_type).min if np.dtype(np_type).kind == "i" else np.finfo(np_type).min
+        np.iinfo(np_type).min
+        if np.dtype(np_type).kind == "i"
+        else np.finfo(np_type).min
     )
     max_val = (
-        np.iinfo(np_type).max if np.dtype(np_type).kind == "i" else np.finfo(np_type).max
+        np.iinfo(np_type).max
+        if np.dtype(np_type).kind == "i"
+        else np.finfo(np_type).max
     )
 
     return (dtype, min_val, max_val)
@@ -855,11 +859,11 @@ def test_cummax(dtypes_for_minmax):
 )
 def test_nullable_int_not_cast_as_float(method, dtype, val):
     data = [val, pd.NA]
-    df = pd.DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
+    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
     grouped = df.groupby("grp")
 
     result = grouped.transform(method)
-    expected = pd.DataFrame({"b": data}, dtype=dtype)
+    expected = DataFrame({"b": data}, dtype=dtype)
 
     tm.assert_frame_equal(result, expected)
 

From f0c27ce09a1158391015569128d0fafacd85a06d Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 20:02:38 -0400
Subject: [PATCH 13/32] PERF: use masked algo in groupby cummin and cummax

---
 asv_bench/benchmarks/groupby.py | 32 ++++++++++++++++++++++++++++++++
 pandas/_libs/groupby.pyx        |  2 +-
 pandas/core/groupby/ops.py      | 13 +++++--------
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 803658d101b89..d7e73b3cfcca3 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -395,8 +395,40 @@ class GroupByMethods:
     params = [
         ["int", "float", "object", "datetime"],
         [
+            "all",
+            "any",
+            "bfill",
+            "count",
+            "cumcount",
             "cummax",
             "cummin",
+            "cumprod",
+            "cumsum",
+            "describe",
+            "ffill",
+            "first",
+            "head",
+            "last",
+            "mad",
+            "max",
+            "min",
+            "median",
+            "mean",
+            "nunique",
+            "pct_change",
+            "prod",
+            "quantile",
+            "rank",
+            "sem",
+            "shift",
+            "size",
+            "skew",
+            "std",
+            "sum",
+            "tail",
+            "unique",
+            "value_counts",
+            "var",
         ],
         ["direct", "transformation"],
     ]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 80f06a980bb9c..3395fefcbaad3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1299,7 +1299,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
         bint val_is_nan
 
     N, K = (<object>values).shape
-    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C")
+    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max if compute_max else _int64_max
     elif groupby_t is uint64_t:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 250eaa25a5cda..50df42f555f53 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -484,7 +484,6 @@ def _get_cython_func_and_vals(
         -------
         func : callable
         values : np.ndarray
-        needs_mask : True if a mask must be passed
         """
         try:
             func = _get_cython_function(kind, how, values.dtype, is_numeric)
@@ -500,8 +499,7 @@ def _get_cython_func_and_vals(
                 func = _get_cython_function(kind, how, values.dtype, is_numeric)
             else:
                 raise
-        needs_mask = how in _CYTHON_FUNCTIONS["needs_mask"]
-        return func, values, needs_mask
+        return func, values
 
     @final
     def _disallow_invalid_ops(
@@ -656,8 +654,9 @@ def _cython_operation(
         # if not raise NotImplementedError
         self._disallow_invalid_ops(dtype, how, is_numeric)
 
+        func_uses_mask = cython_function_uses_mask(how)
         if is_extension_array_dtype(dtype):
-            if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how):
+            if isinstance(values, BaseMaskedArray) and func_uses_mask:
                 return self._masked_ea_wrap_cython_operation(
                     kind, values, how, axis, min_count, **kwargs
                 )
@@ -706,11 +705,9 @@ def _cython_operation(
                 )
             out_shape = (self.ngroups,) + values.shape[1:]
 
-        func, values, needs_mask = self._get_cython_func_and_vals(
-            kind, how, values, is_numeric
-        )
+        func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)
         use_mask = mask is not None
-        if needs_mask:
+        if func_uses_mask:
             if mask is None:
                 mask = np.zeros_like(values, dtype=np.uint8, order="C")
 

From 2fa80ad0338861ec09f6874ae01a81f7b4bc81bd Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 20:07:18 -0400
Subject: [PATCH 14/32] Avoid mask copy

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 50df42f555f53..145973f6a12f9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -620,7 +620,7 @@ def _masked_ea_wrap_cython_operation(
         cls = dtype.construct_array_type()
 
         return cls(
-            res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)
+            res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=False)
         )
 
     @final

From 280c7e5359166535938221d637db72c8126c4768 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Fri, 26 Mar 2021 20:31:43 -0400
Subject: [PATCH 15/32] Update whatsnew

---
 doc/source/whatsnew/v1.3.0.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index c2b3e58440e64..e9c9e8a77f637 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -635,6 +635,8 @@ Groupby/resample/rolling
 - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes  (:issue:`40164`)
 - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
+- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` computing wrong result for methods ``cummin`` and ``cummax`` with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
+-
 
 Reshaping
 ^^^^^^^^^

From 7e2fbe0dfbe3e9fd670a245bf9477451e68635c2 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 31 Mar 2021 20:14:32 -0400
Subject: [PATCH 16/32] Merge fixup

---
 pandas/core/groupby/ops.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 9a32a08a11d12..b0e1acd07ac89 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -72,14 +72,11 @@
     maybe_fill,
 )
 
-<<<<<<< HEAD
 from pandas.core.arrays.masked import (
     BaseMaskedArray,
     BaseMaskedDtype,
 )
-=======
 from pandas.core.arrays import ExtensionArray
->>>>>>> origin/master
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame

From 0ebb97ad570a386d6fb8fb0e85d584f5f48af9db Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 31 Mar 2021 20:51:44 -0400
Subject: [PATCH 17/32] Follow transpose

---
 pandas/core/groupby/ops.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b0e1acd07ac89..0a239a77f14b9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -72,11 +72,11 @@
     maybe_fill,
 )
 
+from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.masked import (
     BaseMaskedArray,
     BaseMaskedDtype,
 )
-from pandas.core.arrays import ExtensionArray
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -652,7 +652,6 @@ def _cython_operation(
         # can we do this operation with our cython functions
         # if not raise NotImplementedError
         self._disallow_invalid_ops(dtype, how, is_numeric)
-
         func_uses_mask = cython_function_uses_mask(how)
         if is_extension_array_dtype(dtype):
             if isinstance(values, BaseMaskedArray) and func_uses_mask:
@@ -667,8 +666,6 @@ def _cython_operation(
         elif values.ndim == 1:
             # expand to 2d, dispatch, then squeeze if appropriate
             values2d = values[None, :]
-            if mask is not None:
-                mask = mask[None, :]
             res = self._cython_operation(
                 kind=kind,
                 values=values2d,
@@ -705,9 +702,10 @@ def _cython_operation(
         arity = self._cython_arity.get(how, 1)
         ngroups = self.ngroups
 
-
         assert axis == 1
         values = values.T
+        if mask is not None:
+            mask = mask.reshape(values.shape, order="C")
         if how == "ohlc":
             out_shape = (ngroups, 4)
         elif arity > 1:
@@ -733,7 +731,6 @@ def _cython_operation(
             else:
                 out_dtype = "object"
 
-
         codes, _, _ = self.group_info
 
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))

From 0009dfd367d6bfb74381a72a3c3e4314b39edfcc Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 31 Mar 2021 22:53:25 -0400
Subject: [PATCH 18/32] Compute mask usage inside algo

---
 pandas/_libs/groupby.pyx   | 17 +++++------------
 pandas/core/groupby/ops.py | 14 +++++---------
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 7ef47d461cc6d..5133e40cae66e 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1236,7 +1236,6 @@ def group_cummin_max(groupby_t[:, ::1] out,
                      const intp_t[:] labels,
                      int ngroups,
                      bint is_datetimelike,
-                     bint use_mask,
                      bint compute_max):
     """
     Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1256,10 +1255,6 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Number of groups, larger than all entries of `labels`.
     is_datetimelike : bool
         True if `values` contains datetime-like entries.
-    use_mask : bool
-        True if the mask should be used (otherwise we continue
-        as if it is not a masked algorithm). Avoids the cost
-        of checking for a completely zeroed mask.
     compute_max : bool
         True if cumulative maximum should be computed, False
         if cumulative minimum should be computed
@@ -1273,7 +1268,9 @@ def group_cummin_max(groupby_t[:, ::1] out,
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         intp_t lab
-        bint val_is_nan
+        bint val_is_nan, use_mask
+
+    use_mask = mask is not None
 
     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1331,8 +1328,7 @@ def group_cummin(groupby_t[:, ::1] out,
                  uint8_t[:, ::1] mask,
                  const intp_t[:] labels,
                  int ngroups,
-                 bint is_datetimelike,
-                 bint use_mask):
+                 bint is_datetimelike):
     """See group_cummin_max.__doc__"""
     group_cummin_max(
         out,
@@ -1341,7 +1337,6 @@ def group_cummin(groupby_t[:, ::1] out,
         labels,
         ngroups,
         is_datetimelike,
-        use_mask,
         compute_max=False
     )
 
@@ -1353,8 +1348,7 @@ def group_cummax(groupby_t[:, ::1] out,
                  uint8_t[:, ::1] mask,
                  const intp_t[:] labels,
                  int ngroups,
-                 bint is_datetimelike,
-                 bint use_mask):
+                 bint is_datetimelike):
     """See group_cummin_max.__doc__"""
     group_cummin_max(
         out,
@@ -1363,6 +1357,5 @@ def group_cummax(groupby_t[:, ::1] out,
         labels,
         ngroups,
         is_datetimelike,
-        use_mask,
         compute_max=True
     )
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 0a239a77f14b9..708e127796776 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -652,6 +652,7 @@ def _cython_operation(
         # can we do this operation with our cython functions
         # if not raise NotImplementedError
         self._disallow_invalid_ops(dtype, how, is_numeric)
+
         func_uses_mask = cython_function_uses_mask(how)
         if is_extension_array_dtype(dtype):
             if isinstance(values, BaseMaskedArray) and func_uses_mask:
@@ -718,10 +719,6 @@ def _cython_operation(
             out_shape = (ngroups,) + values.shape[1:]
 
         func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)
-        use_mask = mask is not None
-        if func_uses_mask:
-            if mask is None:
-                mask = np.zeros_like(values, dtype=np.uint8, order="C")
 
         if how == "rank":
             out_dtype = "float"
@@ -740,10 +737,10 @@ def _cython_operation(
         elif kind == "transform":
             # TODO: min_count
             result = self._transform(
-                result, values, func, is_datetimelike, use_mask, mask, **kwargs
+                result, values, func, is_datetimelike, mask, func_uses_mask, **kwargs
             )
 
-        if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
+        if mask is None and is_integer_dtype(result.dtype) and not is_datetimelike:
             result_mask = result == iNaT
             if result_mask.any():
                 result = result.astype("float64")
@@ -784,13 +781,13 @@ def _transform(
         values: np.ndarray,
         transform_func,
         is_datetimelike: bool,
-        use_mask: bool,
         mask: np.ndarray | None,
+        needs_mask: bool = False,
         **kwargs,
     ) -> np.ndarray:
 
         comp_ids, _, ngroups = self.group_info
-        if mask is not None:
+        if needs_mask:
             transform_func(
                 result,
                 values,
@@ -798,7 +795,6 @@ def _transform(
                 comp_ids,
                 ngroups,
                 is_datetimelike,
-                use_mask,
                 **kwargs,
             )
         else:

From 66638324a7e0ddb9c29749114bfa8a8fb0ee6810 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 31 Mar 2021 23:56:29 -0400
Subject: [PATCH 19/32] try optional

---
 pandas/_libs/groupby.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 5133e40cae66e..bbde73426a8c1 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1230,7 +1230,7 @@ def group_min(groupby_t[:, ::1] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummin_max(groupby_t[:, ::1] out,
+cdef group_cummin_max(groupby_t[:, ::1] out,
                      ndarray[groupby_t, ndim=2] values,
                      uint8_t[:, ::1] mask,
                      const intp_t[:] labels,
@@ -1268,9 +1268,11 @@ def group_cummin_max(groupby_t[:, ::1] out,
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         intp_t lab
+        uint8_t[:, ::1] mask_
         bint val_is_nan, use_mask
 
     use_mask = mask is not None
+    mask_ = mask if use_mask else np.zeros_like(values)
 
     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)

From 8247f827ed043344de0a5c8fc3302573d864d3e3 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 1 Apr 2021 00:09:12 -0400
Subject: [PATCH 20/32] WIP

---
 pandas/_libs/groupby.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index bbde73426a8c1..97863735335f3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1272,7 +1272,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
         bint val_is_nan, use_mask
 
     use_mask = mask is not None
-    mask_ = mask if use_mask else np.zeros_like(values)
+    mask_ = mask if use_mask else np.zeros_like(values, dtype=np.uint8)
 
     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1293,7 +1293,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
                 val_is_nan = False
 
                 if use_mask:
-                    if mask[i, j]:
+                    if mask_[i, j]:
 
                         # `out` does not need to be set since it
                         # will be masked anyway

From 71e1c4fa5d77b596411468f7ab4459f86e760372 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 1 Apr 2021 11:56:26 -0400
Subject: [PATCH 21/32] Use more contiguity

---
 pandas/_libs/groupby.pyx | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 97863735335f3..bafb555731aa2 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1231,12 +1231,12 @@ def group_min(groupby_t[:, ::1] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cdef group_cummin_max(groupby_t[:, ::1] out,
-                     ndarray[groupby_t, ndim=2] values,
-                     uint8_t[:, ::1] mask,
-                     const intp_t[:] labels,
-                     int ngroups,
-                     bint is_datetimelike,
-                     bint compute_max):
+                      ndarray[groupby_t, ndim=2] values,
+                      uint8_t[:, ::1] mask,
+                      const intp_t[:] labels,
+                      int ngroups,
+                      bint is_datetimelike,
+                      bint compute_max):
     """
     Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
 
@@ -1246,9 +1246,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : array
         Values to take cummin/max of.
-    mask : array[uint8_t]
-        If `use_mask`, then indices represent missing values,
-        otherwise will be passed as a zeroed array
+    mask : array[uint8_t] or None
+        If not None, indices represent missing values,
+        otherwise the mask will not be used
     labels : np.ndarray[np.intp]
         Labels to group by.
     ngroups : int
@@ -1266,16 +1266,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
-        ndarray[groupby_t, ndim=2] accum
+        groupby_t[:, ::1] accum
         intp_t lab
-        uint8_t[:, ::1] mask_
         bint val_is_nan, use_mask
 
     use_mask = mask is not None
-    mask_ = mask if use_mask else np.zeros_like(values, dtype=np.uint8)
 
     N, K = (<object>values).shape
-    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
+    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order='C')
     if groupby_t is int64_t:
         accum[:] = -_int64_max if compute_max else _int64_max
     elif groupby_t is uint64_t:
@@ -1293,7 +1291,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
                 val_is_nan = False
 
                 if use_mask:
-                    if mask_[i, j]:
+                    if mask[i, j]:
 
                         # `out` does not need to be set since it
                         # will be masked anyway

From c6cf9ee54b77e5b18e89f272e5cf4d129ea2d0fd Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 1 Apr 2021 12:30:32 -0400
Subject: [PATCH 22/32] Shrink benchmark

---
 asv_bench/benchmarks/groupby.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index d7e73b3cfcca3..9eee54ac6e317 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -501,15 +501,21 @@ class CumminMax:
     ]
 
     def setup(self, dtype, method):
-        N = 1_000_000
-        vals = np.random.randint(0, 1000, (N, 10))
+        N = 500_000
+        rg = np.random.default_rng(seed=0)
+        vals = rg.integers(
+            0,
+            1000,
+            (N, 5),
+        )
         null_vals = vals.astype(float, copy=True)
         null_vals[::2, :] = np.nan
         null_vals[::3, :] = np.nan
-        df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype)
-        null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype)
-        df["key"] = np.random.randint(0, 100, size=N)
-        null_df["key"] = np.random.randint(0, 100, size=N)
+        df = DataFrame(vals, columns=list("abcde"), dtype=dtype)
+        null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
+        keys = rg.integers(0, 100, size=N)
+        df["key"] = keys
+        null_df["key"] = keys
         self.df = df
         self.null_df = null_df
 

From 293dc6eadc1d4cb89e58e67c740a1d490ed27cbc Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 1 Apr 2021 22:30:55 -0400
Subject: [PATCH 23/32] Revert unrelated

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 180917855a9cc..43b39a902452f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -758,7 +758,7 @@ def _cython_operation(
 
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if kind == "aggregate":
-            counts = np.zeros(self.ngroups, dtype=np.int64)
+            counts = np.zeros(ngroups, dtype=np.int64)
             func(result, counts, values, comp_ids, min_count)
         elif kind == "transform":
             # TODO: min_count

From 1bb344e83d8bd9d9aff8c5108a57e3284c6b2b8a Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Mon, 12 Apr 2021 11:22:44 -0400
Subject: [PATCH 24/32] Remove merge conflict relic

---
 pandas/core/groupby/ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d5dc26b0dac43..7573f73cb395c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -74,7 +74,6 @@
     BaseMaskedArray,
     BaseMaskedDtype,
 )
-from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame

From 97d9eea32eb76d4f6d242baf297fab3a55a845f8 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
Date: Tue, 13 Apr 2021 11:09:28 -0400
Subject: [PATCH 25/32] Update doc/source/whatsnew/v1.3.0.rst

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 30182878b6a6b..d43e6b3b0941c 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -585,7 +585,7 @@ Performance improvements
 - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
 - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
 <<<<<<< HEAD
-- Performance improvement in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` when using methods ``cummin`` and ``cummax`` with nullable data types (:issue:`37493`)
+- Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`)
 -
 =======
 - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)

From 892a92a1a32458827c80fddb257bc4bab8098869 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
Date: Tue, 13 Apr 2021 11:09:37 -0400
Subject: [PATCH 26/32] Update doc/source/whatsnew/v1.3.0.rst

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index d43e6b3b0941c..71181f5740377 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -787,7 +787,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
 - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
 - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
-- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` computing wrong result for methods ``cummin`` and ``cummax`` with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
+- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
 
 
 Reshaping

From a239a68b90e2e9d9f0faea568886341ac2be6ecd Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
Date: Tue, 13 Apr 2021 11:30:41 -0400
Subject: [PATCH 27/32] Update pandas/core/groupby/ops.py

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7573f73cb395c..177b2e85a1193 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -639,7 +639,7 @@ def _masked_ea_wrap_cython_operation(
         mask = isna(values).copy()
         arr = values._data
 
-        if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
+        if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype):
             # IntegerArray or BooleanArray
             arr = ensure_int_or_float(arr)
 

From a1422ba54663f083e98f33532c0b7993740f737a Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Tue, 13 Apr 2021 12:02:16 -0400
Subject: [PATCH 28/32] Address comments

---
 doc/source/whatsnew/v1.3.0.rst | 5 +----
 pandas/_libs/groupby.pyx       | 4 ++--
 pandas/core/groupby/ops.py     | 9 +++------
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index fcb5759e9460c..19dcd2b44edae 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -589,12 +589,9 @@ Performance improvements
 - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
 - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
 - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
-<<<<<<< HEAD
+- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
 - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`)
 -
-=======
-- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
->>>>>>> origin/master
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 895629a30b2e8..5b6390fa1896a 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1254,7 +1254,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : np.ndarray[groupby_t, ndim=2]
         Values to take cummin/max of.
-    mask : array[uint8_t] or None
+    mask : np.ndarray[bool] or None
         If not None, indices represent missing values,
         otherwise the mask will not be used
     labels : np.ndarray[np.intp]
@@ -1281,7 +1281,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
     use_mask = mask is not None
 
     N, K = (<object>values).shape
-    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order='C')
+    accum = np.empty((ngroups, K), dtype=values.dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max if compute_max else _int64_max
     elif groupby_t is uint64_t:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 177b2e85a1193..efff3d8e07976 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -634,9 +634,8 @@ def _masked_ea_wrap_cython_operation(
         """
         orig_values = values
 
-        # isna just directly returns self._mask, so copy here to prevent
-        # modifying the original
-        mask = isna(values).copy()
+        # Copy to ensure input and result masks don't end up shared
+        mask = values._mask.copy()
         arr = values._data
 
         if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype):
@@ -650,9 +649,7 @@ def _masked_ea_wrap_cython_operation(
         assert isinstance(dtype, BaseMaskedDtype)
         cls = dtype.construct_array_type()
 
-        return cls(
-            res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=False)
-        )
+        return cls(res_values.astype(dtype.type, copy=False), mask)
 
     @final
     def _cython_operation(

From 482a209cacedf4ce758ae6a94d0b4dd4e88bbfdd Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Tue, 13 Apr 2021 12:31:22 -0400
Subject: [PATCH 29/32] Change random generation style

---
 asv_bench/benchmarks/groupby.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 9eee54ac6e317..3094d38f20fa0 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -502,18 +502,13 @@ class CumminMax:
 
     def setup(self, dtype, method):
         N = 500_000
-        rg = np.random.default_rng(seed=0)
-        vals = rg.integers(
-            0,
-            1000,
-            (N, 5),
-        )
+        vals = np.random.randint(-10, 10, (N, 5))
         null_vals = vals.astype(float, copy=True)
         null_vals[::2, :] = np.nan
         null_vals[::3, :] = np.nan
         df = DataFrame(vals, columns=list("abcde"), dtype=dtype)
         null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
-        keys = rg.integers(0, 100, size=N)
+        keys = np.random.randint(0, 100, size=N)
         df["key"] = keys
         null_df["key"] = keys
         self.df = df

From 251c02af59360632e5f3b5aa9ba8f1afe1466dbf Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 17 Apr 2021 20:51:41 -0400
Subject: [PATCH 30/32] Use conditional instead of partial

---
 pandas/core/groupby/ops.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 473bf4ba5c0bb..f4a8809e54c30 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -9,7 +9,6 @@
 
 import collections
 import functools
-from functools import partial
 from typing import (
     Generic,
     Hashable,
@@ -735,8 +734,6 @@ def _cython_operation(
 
         out_shape = cy_op.get_output_shape(ngroups, values)
         func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
-        if func_uses_mask:
-            func = partial(func, mask=mask)
         out_dtype = cy_op.get_out_dtype(values.dtype)
 
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
@@ -755,7 +752,18 @@ def _cython_operation(
                 func(result, counts, values, comp_ids, min_count)
         elif kind == "transform":
             # TODO: min_count
-            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+            if func_uses_mask:
+                func(
+                    result,
+                    values,
+                    comp_ids,
+                    ngroups,
+                    is_datetimelike,
+                    mask=mask,
+                    **kwargs,
+                )
+            else:
+                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
         if kind == "aggregate":
             # i.e. counts is defined.  Locations where count<min_count

From 3de7e5e473b36b7dafe815ccba1a940e415435c4 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sun, 18 Apr 2021 00:41:19 -0400
Subject: [PATCH 31/32] Remove ensure_int_or_float

---
 pandas/core/groupby/ops.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index f4a8809e54c30..1438e2cc590bf 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -635,9 +635,8 @@ def _masked_ea_wrap_cython_operation(
         mask = values._mask.copy()
         arr = values._data
 
-        if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype):
-            # IntegerArray or BooleanArray
-            arr = ensure_int_or_float(arr)
+        if is_bool_dtype(arr.dtype):
+            arr = arr.astype("int64")
 
         res_values = self._cython_operation(
             kind, arr, how, axis, min_count, mask=mask, **kwargs

From 237f86feb08e35841473a63ec0c74066235632d1 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sun, 18 Apr 2021 00:55:01 -0400
Subject: [PATCH 32/32] Remove unnecessary condition

---
 pandas/core/groupby/ops.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 1438e2cc590bf..6fa124bc5558f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -635,9 +635,6 @@ def _masked_ea_wrap_cython_operation(
         mask = values._mask.copy()
         arr = values._data
 
-        if is_bool_dtype(arr.dtype):
-            arr = arr.astype("int64")
-
         res_values = self._cython_operation(
             kind, arr, how, axis, min_count, mask=mask, **kwargs
         )