From 8efde59c8eecc613041acdf6fdc721a9f31d4d43 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 14 Aug 2022 11:34:01 +0200
Subject: [PATCH 1/8] ENH: Support mask for groupby var and mean

---
 pandas/_libs/groupby.pyi   |  2 ++
 pandas/_libs/groupby.pyx   | 46 ++++++++++++++++++++++++++++++--------
 pandas/core/groupby/ops.py | 12 ++++++++++
 3 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 3ec37718eb652..4f5f3e67e7b77 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -74,6 +74,8 @@ def group_var(
     labels: np.ndarray,  # const intp_t[:]
     min_count: int = ...,  # Py_ssize_t
     ddof: int = ...,  # int64_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_mean(
     out: np.ndarray,  # floating[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 6e2b79a320dd7..1c5a67d36d413 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -699,6 +699,8 @@ def group_var(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     int64_t ddof=1,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -706,6 +708,7 @@ def group_var(
         floating[:, ::1] mean
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
+        bint isna_entry, uses_mask = not mask is None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
@@ -730,8 +733,12 @@ def group_var(
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if val == val:
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = not val == val
+
+                if not isna_entry:
                     nobs[lab, j] += 1
                     oldmean = mean[lab, j]
                     mean[lab, j] += (val - oldmean) / nobs[lab, j]
@@ -741,7 +748,10 @@ def group_var(
             for j in range(K):
                 ct = nobs[i, j]
                 if ct <= ddof:
-                    out[i, j] = NAN
+                    if uses_mask:
+                        result_mask[i, j] = True
+                    else:
+                        out[i, j] = NAN
                 else:
                     out[i, j] /= (ct - ddof)
 
@@ -779,9 +789,9 @@ def group_mean(
     is_datetimelike : bool
         True if `values` contains datetime-like entries.
     mask : ndarray[bool, ndim=2], optional
-        Not used.
+        Mask of the input values.
     result_mask : ndarray[bool, ndim=2], optional
-        Not used.
+        Mask of the out array.
 
     Notes
     -----
@@ -795,6 +805,7 @@ def group_mean(
         mean_t[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
+        bint isna_entry, uses_mask = not mask is None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
@@ -807,7 +818,12 @@ def group_mean(
     compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
 
     N, K = (<object>values).shape
-    nan_val = NPY_NAT if is_datetimelike else NAN
+    if uses_mask:
+        nan_val = 0
+    elif is_datetimelike:
+        nan_val = NPY_NAT
+    else:
+        nan_val = NAN
 
     with nogil:
         for i in range(N):
@@ -818,8 +834,15 @@ def group_mean(
             counts[lab] += 1
             for j in range(K):
                 val = values[i, j]
-                # not nan
-                if val == val and not (is_datetimelike and val == NPY_NAT):
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                elif is_datetimelike:
+                    isna_entry = val == NPY_NAT
+                else:
+                    isna_entry = not val == val
+
+                if not isna_entry:
                     nobs[lab, j] += 1
                     y = val - compensation[lab, j]
                     t = sumx[lab, j] + y
@@ -830,7 +853,12 @@ def group_mean(
             for j in range(K):
                 count = nobs[i, j]
                 if nobs[i, j] == 0:
-                    out[i, j] = nan_val
+
+                    if uses_mask:
+                        result_mask[i, j] = True
+                    else:
+                        out[i, j] = nan_val
+
                 else:
                     out[i, j] = sumx[i, j] / count
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index caea70e03b6f3..b554031714b9e 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -157,6 +157,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "first",
         "rank",
         "sum",
+        "mean",
+        "var",
     }
 
     _cython_arity = {"ohlc": 4}  # OHLC
@@ -592,6 +594,16 @@ def _call_cython_op(
                     min_count=min_count,
                     is_datetimelike=is_datetimelike,
                 )
+            elif self.how == "var":
+                func(
+                    result,
+                    counts,
+                    values,
+                    comp_ids,
+                    min_count,
+                    mask=mask,
+                    result_mask=result_mask,
+                )
             else:
                 func(result, counts, values, comp_ids, min_count)
         else:

From a5ac8f295192448fb460efc0c56dc8ecede1538e Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 14 Aug 2022 11:56:16 +0200
Subject: [PATCH 2/8] Add whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index b71d294b97f9a..0865045a0be96 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -871,6 +871,7 @@ Performance improvements
 - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
 - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing`  (:issue:`47458`)
 - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`)
+- Performance improvement in :meth:`GroupBy.mean` and :meth:`GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
 - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)

From b7b174f66d87b8e3d45b6c7cfec072c2008a0ff3 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 14 Aug 2022 12:08:40 +0200
Subject: [PATCH 3/8] Add asvs

---
 asv_bench/benchmarks/groupby.py | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 2de1f25fceace..443291fce4b81 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -560,6 +560,43 @@ def time_frame_agg(self, dtype, method):
         self.df.groupby("key").agg(method)
 
 
+class GroupByCythonAggEaDtypes:
+    """
+    Benchmarks specifically targeting our cython aggregation algorithms on
+    extension array dtypes (using a big enough dataframe with simple key,
+    so a large part of the time is spent in the grouped aggregation).
+    """
+
+    param_names = ["dtype", "method"]
+    params = [
+        ["Float64", "Int64", "Int32"],
+        [
+            "sum",
+            "prod",
+            "min",
+            "max",
+            "mean",
+            "median",
+            "var",
+            "first",
+            "last",
+            "any",
+            "all",
+        ],
+    ]
+
+    def setup(self, dtype, method):
+        N = 1_000_000
+        df = DataFrame(
+            np.random.randint(N, 10), columns=list("abcdefghij"), dtype=dtype
+        )
+        df["key"] = np.random.randint(0, 100, size=N)
+        self.df = df
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method)
+
+
 class Cumulative:
     param_names = ["dtype", "method"]
     params = [

From 7f59d63d8019d92ac9311585011d6ecbc6f680ec Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 14 Aug 2022 17:53:44 +0200
Subject: [PATCH 4/8] Fix asvs

---
 asv_bench/benchmarks/groupby.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 443291fce4b81..90cb31577a1b4 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -588,7 +588,9 @@ class GroupByCythonAggEaDtypes:
     def setup(self, dtype, method):
         N = 1_000_000
         df = DataFrame(
-            np.random.randint(N, 10), columns=list("abcdefghij"), dtype=dtype
+            np.random.randint(0, high=100, size=(N, 10)),
+            columns=list("abcdefghij"),
+            dtype=dtype,
         )
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df

From 96c9cb6ee0c6fe270b593531cae95caba46e9fba Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 17 Aug 2022 23:06:14 +0200
Subject: [PATCH 5/8] Add ohlc

---
 pandas/core/groupby/ops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 69f6d9757c0c5..07cf1d790a763 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -157,6 +157,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "first",
         "rank",
         "sum",
+        "ohlc",
         "mean",
         "var",
     }

From 046d892a406082b8590ea02fe196ff8f806c6922 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Thu, 18 Aug 2022 09:44:10 +0200
Subject: [PATCH 6/8] Add . before groupby

---
 doc/source/whatsnew/v1.5.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index d161f0820cfdf..088843231a58c 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -876,7 +876,7 @@ Performance improvements
 - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
 - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing`  (:issue:`47458`)
 - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`)
-- Performance improvement in :meth:`GroupBy.mean` and :meth:`GroupBy.var` for extension array dtypes (:issue:`37493`)
+- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
 - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)

From c43a3a59b5fbd8702646abf08b7911239548299b Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 30 Aug 2022 09:57:02 +0200
Subject: [PATCH 7/8] Move whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 1 -
 doc/source/whatsnew/v1.6.0.rst | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 37dfff6bcd3c2..711352775400e 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -962,7 +962,6 @@ Performance improvements
 - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
 - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing`  (:issue:`47458`)
 - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`)
-- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
 - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index eac5e5d3a0f52..ae0cfa1b668eb 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -100,7 +100,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
--
+- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 -
 
 .. ---------------------------------------------------------------------------

From 3bd44fd2425a93983e7a744587fd32730b64025e Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 30 Aug 2022 10:39:05 +0200
Subject: [PATCH 8/8] Move kwargs

---
 pandas/core/groupby/ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 928f00b0e9f5b..c118c7f16af8f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -609,9 +609,10 @@ def _call_cython_op(
                     min_count=min_count,
                     mask=mask,
                     result_mask=result_mask,
+                    **kwargs,
                 )
             else:
-                func(result, counts, values, comp_ids, min_count, **kwargs)
+                func(result, counts, values, comp_ids, min_count)
         else:
             # TODO: min_count
             if self.uses_mask():