diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 90cb31577a1b4..3007d2d1e126c 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Index,
@@ -592,6 +593,7 @@ def setup(self, dtype, method):
             columns=list("abcdefghij"),
             dtype=dtype,
         )
+        df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df
 
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index 848e87f0bc029..1a159ce7670c8 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -100,6 +100,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 -
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 8722092809ed9..93f861eef1baf 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -10,6 +10,8 @@ def group_median_float64(
     values: np.ndarray,  # ndarray[float64_t, ndim=2]
     labels: npt.NDArray[np.int64],
     min_count: int = ...,  # Py_ssize_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cumprod_float64(
     out: np.ndarray,  # float64_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 0c368f9421932..5ba8fba4543d6 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -41,6 +41,7 @@ from pandas._libs.algos import (
     ensure_platform_int,
     groupsort_indexer,
     rank_1d,
+    take_2d_axis1_bool_bool,
     take_2d_axis1_float64_float64,
 )
 
@@ -64,11 +65,48 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
     cdef:
         int i, j, na_count = 0
+        float64_t* tmp
         float64_t result
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if mask[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if not mask[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+    cdef:
+        int i, j, na_count = 0
         float64_t* tmp
+        float64_t result
 
     if n == 0:
         return NaN
@@ -93,18 +131,34 @@
         a = tmp
         n -= na_count
 
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
+    cdef:
+        float64_t result
+
     if n % 2:
         result = kth_smallest_c(a, n // 2, n)
     else:
         result = (kth_smallest_c(a, n // 2, n) +
                   kth_smallest_c(a, n // 2 - 1, n)) / 2
 
-    if na_count:
-        free(a)
-
     return result
 
 
+ctypedef fused int64float_t:
+    int64_t
+    uint64_t
+    float32_t
+    float64_t
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_median_float64(
@@ -113,6 +167,8 @@
     ndarray[float64_t, ndim=2] values,
     ndarray[intp_t] labels,
     Py_ssize_t min_count=-1,
+    const uint8_t[:, :] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -121,8 +177,12 @@
         Py_ssize_t i, j, N, K, ngroups, size
         ndarray[intp_t] _counts
         ndarray[float64_t, ndim=2] data
+        ndarray[uint8_t, ndim=2] data_mask
         ndarray[intp_t] indexer
         float64_t* ptr
+        uint8_t* ptr_mask
+        float64_t result
+        bint uses_mask = mask is not None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
@@ -137,15 +197,38 @@
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    with nogil:
+    if uses_mask:
+        data_mask = np.empty((K, N), dtype=np.uint8)
+        ptr_mask = <uint8_t*>cnp.PyArray_DATA(data_mask)
+
+        take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)
 
-        for i in range(K):
-            # exclude NA group
-            ptr += _counts[0]
-            for j in range(ngroups):
-                size = _counts[j + 1]
-                out[j, i] = median_linear(ptr, size)
-                ptr += size
+        with nogil:
+
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                ptr_mask += _counts[0]
+
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    result = median_linear_mask(ptr, size, ptr_mask)
+                    out[j, i] = result
+
+                    if result != result:
+                        result_mask[j, i] = 1
+                    ptr += size
+                    ptr_mask += size
+
+    else:
+        with nogil:
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    out[j, i] = median_linear(ptr, size)
+                    ptr += size
 
 
 @cython.boundscheck(False)
@@ -206,13 +289,6 @@ def group_cumprod_float64(
                     accum[lab, j] = NaN
 
 
-ctypedef fused int64float_t:
-    int64_t
-    uint64_t
-    float32_t
-    float64_t
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumsum(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c118c7f16af8f..04960379fbc42 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -162,6 +162,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "prod",
         "mean",
         "var",
+        "median",
     }
 
     _cython_arity = {"ohlc": 4}  # OHLC
@@ -600,7 +601,7 @@ def _call_cython_op(
                     min_count=min_count,
                     is_datetimelike=is_datetimelike,
                 )
-            elif self.how in ["var", "ohlc", "prod"]:
+            elif self.how in ["var", "ohlc", "prod", "median"]:
                 func(
                     result,
                     counts,
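
For context (not part of the patch), below is a rough sketch of the user-facing path this change speeds up: grouping a nullable ("masked") column and taking the median now stays in the masked Cython kernel (group_median_float64 with mask/result_mask) instead of converting to float64 first. The frame shape, column names, and values are illustrative only.

import numpy as np
import pandas as pd

N = 10_000
df = pd.DataFrame(
    {
        "key": np.random.randint(0, 100, size=N),
        "a": pd.array(np.random.randn(N), dtype="Float64"),  # nullable dtype
    }
)
# Insert missing values so the mask-aware kernel (median_linear_mask) is exercised,
# mirroring what the updated asv benchmark does with NA.
df.loc[list(range(1, N, 5)), "a"] = pd.NA

# With "median" added to the masked Cython functions, this aggregation is dispatched
# with mask/result_mask; NA entries are skipped via the mask, and a group that is
# entirely NA comes back as pd.NA (median_linear_mask returns NaN, which sets result_mask).
result = df.groupby("key")["a"].median()
print(result.head())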