From 8efde59c8eecc613041acdf6fdc721a9f31d4d43 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 11:34:01 +0200 Subject: [PATCH 1/8] ENH: Support mask for groupby var and mean --- pandas/_libs/groupby.pyi | 2 ++ pandas/_libs/groupby.pyx | 46 ++++++++++++++++++++++++++++++-------- pandas/core/groupby/ops.py | 12 ++++++++++ 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 3ec37718eb652..4f5f3e67e7b77 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -74,6 +74,8 @@ def group_var( labels: np.ndarray, # const intp_t[:] min_count: int = ..., # Py_ssize_t ddof: int = ..., # int64_t + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6e2b79a320dd7..1c5a67d36d413 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -699,6 +699,8 @@ def group_var( const intp_t[::1] labels, Py_ssize_t min_count=-1, int64_t ddof=1, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -706,6 +708,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = not mask is None assert min_count == -1, "'min_count' only used in sum and prod" @@ -730,8 +733,12 @@ def group_var( for j in range(K): val = values[i, j] - # not nan - if val == val: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = not val == val + + if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] @@ -741,7 +748,10 @@ def group_var( for j in range(K): ct = nobs[i, j] if ct <= ddof: - out[i, j] = NAN + if uses_mask: + result_mask[i, j] = True + else: + out[i, j] = NAN else: out[i, j] /= (ct - ddof) @@ -779,9 +789,9 @@ def group_mean( is_datetimelike : bool True if `values` contains datetime-like entries. mask : ndarray[bool, ndim=2], optional - Not used. + Mask of the input values. result_mask : ndarray[bool, ndim=2], optional - Not used. + Mask of the out array Notes ----- @@ -795,6 +805,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = not mask is None assert min_count == -1, "'min_count' only used in sum and prod" @@ -807,7 +818,12 @@ def group_mean( compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape - nan_val = NPY_NAT if is_datetimelike else NAN + if uses_mask: + nan_val = 0 + elif is_datetimelike: + nan_val = NPY_NAT + else: + nan_val = NAN with nogil: for i in range(N): @@ -818,8 +834,15 @@ def group_mean( counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan - if val == val and not (is_datetimelike and val == NPY_NAT): + + if uses_mask: + isna_entry = mask[i, j] + elif is_datetimelike: + isna_entry = val == NPY_NAT + else: + isna_entry = not val == val + + if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y @@ -830,7 +853,12 @@ def group_mean( for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: - out[i, j] = nan_val + + if uses_mask: + result_mask[i, j] = True + else: + out[i, j] = nan_val + else: out[i, j] = sumx[i, j] / count diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index caea70e03b6f3..b554031714b9e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -157,6 +157,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "first", "rank", "sum", + "mean", + "var", } _cython_arity = {"ohlc": 4} # OHLC @@ -592,6 +594,16 @@ def _call_cython_op( min_count=min_count, is_datetimelike=is_datetimelike, ) + elif self.how == "var": + func( + result, + counts, + values, + comp_ids, + min_count, + mask=mask, + result_mask=result_mask, + ) else: func(result, counts, values, comp_ids, min_count) else: From a5ac8f295192448fb460efc0c56dc8ecede1538e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 11:56:16 +0200 Subject: [PATCH 2/8] Add whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b71d294b97f9a..0865045a0be96 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -871,6 +871,7 @@ Performance improvements - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) +- Performance improvement in :meth:`GroupBy.mean` and :meth:`GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) From b7b174f66d87b8e3d45b6c7cfec072c2008a0ff3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 12:08:40 +0200 Subject: [PATCH 3/8] Add asvs --- asv_bench/benchmarks/groupby.py | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 2de1f25fceace..443291fce4b81 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -560,6 +560,43 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) +class GroupByCythonAggEaDtypes: + """ + Benchmarks specifically targeting our cython aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). + """ + + param_names = ["dtype", "method"] + params = [ + ["Float64", "Int64", "Int32"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + N = 1_000_000 + df = DataFrame( + np.random.randint(N, 10), columns=list("abcdefghij"), dtype=dtype + ) + df["key"] = np.random.randint(0, 100, size=N) + self.df = df + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method) + + class Cumulative: param_names = ["dtype", "method"] params = [ From 7f59d63d8019d92ac9311585011d6ecbc6f680ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 17:53:44 +0200 Subject: [PATCH 4/8] Fix asvs --- asv_bench/benchmarks/groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 443291fce4b81..90cb31577a1b4 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -588,7 +588,9 @@ class GroupByCythonAggEaDtypes: def setup(self, dtype, method): N = 1_000_000 df = DataFrame( - np.random.randint(N, 10), columns=list("abcdefghij"), dtype=dtype + np.random.randint(0, high=100, size=(N, 10)), + columns=list("abcdefghij"), + dtype=dtype, ) df["key"] = np.random.randint(0, 100, size=N) self.df = df From 96c9cb6ee0c6fe270b593531cae95caba46e9fba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 23:06:14 +0200 Subject: [PATCH 5/8] Add ohlc --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 69f6d9757c0c5..07cf1d790a763 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -157,6 +157,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "first", "rank", "sum", + "ohlc", "mean", "var", } From 046d892a406082b8590ea02fe196ff8f806c6922 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 09:44:10 +0200 Subject: [PATCH 6/8] Add . before groupby --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d161f0820cfdf..088843231a58c 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -876,7 +876,7 @@ Performance improvements - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) -- Performance improvement in :meth:`GroupBy.mean` and :meth:`GroupBy.var` for extension array dtypes (:issue:`37493`) +- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) From c43a3a59b5fbd8702646abf08b7911239548299b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 30 Aug 2022 09:57:02 +0200 Subject: [PATCH 7/8] Move whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 - doc/source/whatsnew/v1.6.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 37dfff6bcd3c2..711352775400e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -962,7 +962,6 @@ Performance improvements - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) -- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index eac5e5d3a0f52..ae0cfa1b668eb 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - .. --------------------------------------------------------------------------- From 3bd44fd2425a93983e7a744587fd32730b64025e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 30 Aug 2022 10:39:05 +0200 Subject: [PATCH 8/8] Move kwargs --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 928f00b0e9f5b..c118c7f16af8f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -609,9 +609,10 @@ def _call_cython_op( min_count=min_count, mask=mask, result_mask=result_mask, + **kwargs, ) else: - func(result, counts, values, comp_ids, min_count, **kwargs) + func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask():