From b491dc7708099eefa77c942a4b37278d8fbb1bb6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 17:15:14 +0200 Subject: [PATCH 01/10] ENH: Suppport masks in median groupby algo --- pandas/_libs/algos_take_helper.pxi.in | 1 + pandas/_libs/groupby.pyx | 115 ++++++++++++++++++++++---- pandas/core/groupby/ops.py | 3 +- 3 files changed, 101 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 2a3858674af9e..0848b884f222c 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -15,6 +15,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [ ('uint8_t', 'uint8_t'), ('uint8_t', 'object'), + ('uint64_t', 'float64_t'), ('int8_t', 'int8_t'), ('int8_t', 'int32_t'), ('int8_t', 'int64_t'), diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0c368f9421932..e6ff60b9491e2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -41,7 +41,11 @@ from pandas._libs.algos import ( ensure_platform_int, groupsort_indexer, rank_1d, + take_2d_axis1_bool_bool, + take_2d_axis1_float32_float64, take_2d_axis1_float64_float64, + take_2d_axis1_int64_float64, + take_2d_axis1_uint64_float64, ) from pandas._libs.dtypes cimport ( @@ -64,6 +68,47 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT +cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil: + cdef: + int i, j, na_count = 0 + float64_t result + float64_t* tmp + + if n == 0: + return NaN + + # count NAs + for i in range(n): + if mask[i]: + na_count += 1 + + if na_count: + if na_count == n: + return NaN + + tmp = malloc((n - na_count) * sizeof(float64_t)) + + j = 0 + for i in range(n): + if not mask[i]: + tmp[j] = a[i] + j += 1 + + a = tmp + n -= na_count + + if n % 2: + result = kth_smallest_c(a, n // 2, n) + else: + result = (kth_smallest_c(a, n // 2, n) + + kth_smallest_c(a, n // 2 - 1, n)) / 2 + + if na_count: + free(a) + + return result + + cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef: int i, j, na_count = 0 @@ -105,14 +150,23 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: return result +ctypedef fused int64float_t: + int64_t + uint64_t + float32_t + float64_t + + @cython.boundscheck(False) @cython.wraparound(False) def group_median_float64( ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, + ndarray[int64float_t, ndim=2] values, ndarray[intp_t] labels, Py_ssize_t min_count=-1, + const uint8_t[:, :] mask=None, + uint8_t[:, ::1] result_mask=None, ) -> None: """ Only aggregates on axis=0 @@ -121,8 +175,12 @@ def group_median_float64( Py_ssize_t i, j, N, K, ngroups, size ndarray[intp_t] _counts ndarray[float64_t, ndim=2] data + ndarray[uint8_t, ndim=2] data_mask ndarray[intp_t] indexer float64_t* ptr + uint8_t* ptr_mask + float64_t result + bint uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -135,17 +193,47 @@ def group_median_float64( data = np.empty((K, N), dtype=np.float64) ptr = cnp.PyArray_DATA(data) - take_2d_axis1_float64_float64(values.T, indexer, out=data) + if int64float_t is int64_t: + take_2d_axis1_int64_float64(values.T, indexer, out=data) + elif int64float_t is uint64_t: + take_2d_axis1_uint64_float64(values.T, indexer, out=data) + elif int64float_t is float32_t: + take_2d_axis1_float32_float64(values.T, indexer, out=data) + else: + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + if uses_mask: + data_mask = np.zeros((K, N), dtype=np.uint8) + ptr_mask = cnp.PyArray_DATA(data_mask) - with nogil: + take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1) + + with nogil: - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = median_linear(ptr, size) - ptr += size + for i in range(K): + # exclude NA group + ptr += _counts[0] + ptr_mask += _counts[0] + + for j in range(ngroups): + size = _counts[j + 1] + result = median_linear_mask(ptr, size, ptr_mask) + out[j, i] = result + + if result != result: + result_mask[j, i] = 1 + ptr += size + ptr_mask += size + + else: + with nogil: + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = median_linear(ptr, size) + ptr += size @cython.boundscheck(False) @@ -206,13 +294,6 @@ def group_cumprod_float64( accum[lab, j] = NaN -ctypedef fused int64float_t: - int64_t - uint64_t - float32_t - float64_t - - @cython.boundscheck(False) @cython.wraparound(False) def group_cumsum( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c118c7f16af8f..04960379fbc42 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -162,6 +162,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "prod", "mean", "var", + "median", } _cython_arity = {"ohlc": 4} # OHLC @@ -600,7 +601,7 @@ def _call_cython_op( min_count=min_count, is_datetimelike=is_datetimelike, ) - elif self.how in ["var", "ohlc", "prod"]: + elif self.how in ["var", "ohlc", "prod", "median"]: func( result, counts, From 264fce569893be3c4ce0640bd3debef41601b756 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 17:17:59 +0200 Subject: [PATCH 02/10] Avoid float cast --- pandas/core/groupby/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 04960379fbc42..5241997cf4ca1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -218,7 +218,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "cumprod"]: + if how in ["cumprod"]: # these two only have float64 implementations # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -231,7 +231,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how in ["sum", "ohlc", "prod", "cumsum"]: + elif how in ["sum", "ohlc", "prod", "cumsum", "median"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) From b4fe89b7298188182c8afa2944761348f19c7814 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 17:19:48 +0200 Subject: [PATCH 03/10] Out dtype --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5241997cf4ca1..9a95767efc9ce 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -305,7 +305,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: how = self.how - if how == "rank": + if how in ["rank", "median"]: out_dtype = "float64" else: if is_numeric_dtype(dtype): From 72e77bc5a5677b1df980b16da29b05b4b69a143e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 17:54:58 +0200 Subject: [PATCH 04/10] Add cast --- pandas/core/groupby/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9a95767efc9ce..e318440d01b37 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -218,7 +218,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["cumprod"]: + if how in ["cumprod", "median"]: # these two only have float64 implementations # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -231,7 +231,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how in ["sum", "ohlc", "prod", "cumsum", "median"]: + elif how in ["sum", "ohlc", "prod", "cumsum"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) From 50425b1709b2c33db7a5f62c4e14fb07e86e7f89 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 17:59:36 +0200 Subject: [PATCH 05/10] Revert algos --- asv_bench/benchmarks/groupby.py | 40 +++++++++++++-------------- pandas/_libs/algos_take_helper.pxi.in | 1 - pandas/_libs/groupby.pyx | 13 ++------- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 90cb31577a1b4..7297c0703ceb4 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -536,17 +536,17 @@ class GroupByCythonAgg: params = [ ["float64"], [ - "sum", - "prod", - "min", - "max", - "mean", + # "sum", + # "prod", + # "min", + # "max", + # "mean", "median", - "var", - "first", - "last", - "any", - "all", + # "var", + # "first", + # "last", + # "any", + # "all", ], ] @@ -571,17 +571,17 @@ class GroupByCythonAggEaDtypes: params = [ ["Float64", "Int64", "Int32"], [ - "sum", - "prod", - "min", - "max", - "mean", + # "sum", + # "prod", + # "min", + # "max", + # "mean", "median", - "var", - "first", - "last", - "any", - "all", + # "var", + # "first", + # "last", + # "any", + # "all", ], ] diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 0848b884f222c..2a3858674af9e 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -15,7 +15,6 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [ ('uint8_t', 'uint8_t'), ('uint8_t', 'object'), - ('uint64_t', 'float64_t'), ('int8_t', 'int8_t'), ('int8_t', 'int32_t'), ('int8_t', 'int64_t'), diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e6ff60b9491e2..944b2e9982e33 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -162,7 +162,7 @@ ctypedef fused int64float_t: def group_median_float64( ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[int64float_t, ndim=2] values, + ndarray[float64_t, ndim=2] values, ndarray[intp_t] labels, Py_ssize_t min_count=-1, const uint8_t[:, :] mask=None, @@ -193,17 +193,10 @@ def group_median_float64( data = np.empty((K, N), dtype=np.float64) ptr = cnp.PyArray_DATA(data) - if int64float_t is int64_t: - take_2d_axis1_int64_float64(values.T, indexer, out=data) - elif int64float_t is uint64_t: - take_2d_axis1_uint64_float64(values.T, indexer, out=data) - elif int64float_t is float32_t: - take_2d_axis1_float32_float64(values.T, indexer, out=data) - else: - take_2d_axis1_float64_float64(values.T, indexer, out=data) + take_2d_axis1_float64_float64(values.T, indexer, out=data) if uses_mask: - data_mask = np.zeros((K, N), dtype=np.uint8) + data_mask = np.empty((K, N), dtype=np.uint8) ptr_mask = cnp.PyArray_DATA(data_mask) take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1) From 20958470f345405a7712443758929dd79354393e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 18:03:39 +0200 Subject: [PATCH 06/10] Add whatsnew --- asv_bench/benchmarks/groupby.py | 42 +++++++++++++++++---------------- doc/source/whatsnew/v1.6.0.rst | 1 + 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7297c0703ceb4..3007d2d1e126c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,6 +5,7 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Index, @@ -536,17 +537,17 @@ class GroupByCythonAgg: params = [ ["float64"], [ - # "sum", - # "prod", - # "min", - # "max", - # "mean", + "sum", + "prod", + "min", + "max", + "mean", "median", - # "var", - # "first", - # "last", - # "any", - # "all", + "var", + "first", + "last", + "any", + "all", ], ] @@ -571,17 +572,17 @@ class GroupByCythonAggEaDtypes: params = [ ["Float64", "Int64", "Int32"], [ - # "sum", - # "prod", - # "min", - # "max", - # "mean", + "sum", + "prod", + "min", + "max", + "mean", "median", - # "var", - # "first", - # "last", - # "any", - # "all", + "var", + "first", + "last", + "any", + "all", ], ] @@ -592,6 +593,7 @@ def setup(self, dtype, method): columns=list("abcdefghij"), dtype=dtype, ) + df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA df["key"] = np.random.randint(0, 100, size=N) self.df = df diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 848e87f0bc029..1a159ce7670c8 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -100,6 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`) - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - From 64fceab0358aa506510928a1b01d0024b9e29d0b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 18:15:51 +0200 Subject: [PATCH 07/10] Deduplicate --- pandas/_libs/groupby.pyx | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 944b2e9982e33..421353d345e8a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -42,10 +42,7 @@ from pandas._libs.algos import ( groupsort_indexer, rank_1d, take_2d_axis1_bool_bool, - take_2d_axis1_float32_float64, take_2d_axis1_float64_float64, - take_2d_axis1_int64_float64, - take_2d_axis1_uint64_float64, ) from pandas._libs.dtypes cimport ( @@ -71,7 +68,6 @@ cdef enum InterpolationEnumType: cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil: cdef: int i, j, na_count = 0 - float64_t result float64_t* tmp if n == 0: @@ -97,22 +93,12 @@ cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nog a = tmp n -= na_count - if n % 2: - result = kth_smallest_c(a, n // 2, n) - else: - result = (kth_smallest_c(a, n // 2, n) + - kth_smallest_c(a, n // 2 - 1, n)) / 2 - - if na_count: - free(a) - - return result + return calc_median_linear(a, n, na_count) cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef: int i, j, na_count = 0 - float64_t result float64_t* tmp if n == 0: @@ -138,6 +124,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: a = tmp n -= na_count + return calc_median_linear(a, n, na_count) + + +cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil: + cdef: + float64_t result + if n % 2: result = kth_smallest_c(a, n // 2, n) else: From 644047d03b1b872ebfa8bb009b7578ac9f88e6b5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 18:21:29 +0200 Subject: [PATCH 08/10] Fix --- pandas/core/groupby/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e318440d01b37..04960379fbc42 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -218,7 +218,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["cumprod", "median"]: + if how in ["median", "cumprod"]: # these two only have float64 implementations # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -305,7 +305,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: how = self.how - if how in ["rank", "median"]: + if how == "rank": out_dtype = "float64" else: if is_numeric_dtype(dtype): From 4767b5730b6a5ac44f4a139979517b6e7fc5292a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Sep 2022 18:23:13 +0200 Subject: [PATCH 09/10] Add type hints --- pandas/_libs/groupby.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 8722092809ed9..93f861eef1baf 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -10,6 +10,8 @@ def group_median_float64( values: np.ndarray, # ndarray[float64_t, ndim=2] labels: npt.NDArray[np.int64], min_count: int = ..., # Py_ssize_t + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_cumprod_float64( out: np.ndarray, # float64_t[:, ::1] From 3681f5aac16cfbd506df36a869c0352b79c09336 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Sep 2022 21:01:39 +0200 Subject: [PATCH 10/10] Move free --- pandas/_libs/groupby.pyx | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 421353d345e8a..5ba8fba4543d6 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -69,6 +69,7 @@ cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nog cdef: int i, j, na_count = 0 float64_t* tmp + float64_t result if n == 0: return NaN @@ -93,13 +94,19 @@ cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nog a = tmp n -= na_count - return calc_median_linear(a, n, na_count) + result = calc_median_linear(a, n, na_count) + + if na_count: + free(a) + + return result cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef: int i, j, na_count = 0 float64_t* tmp + float64_t result if n == 0: return NaN @@ -124,7 +131,12 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: a = tmp n -= na_count - return calc_median_linear(a, n, na_count) + result = calc_median_linear(a, n, na_count) + + if na_count: + free(a) + + return result cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil: @@ -137,9 +149,6 @@ cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogi result = (kth_smallest_c(a, n // 2, n) + kth_smallest_c(a, n // 2 - 1, n)) / 2 - if na_count: - free(a) - return result