ENH: Support masks in median groupby algo (pandas-dev#48387)

phofl · noatamir · commit c4edfc5cddd2 · 2022-11-09T22:58:16.000+01:00
* ENH: Support masks in median groupby algo

* Avoid float cast

* Out dtype

* Add cast

* Revert algos

* Add whatsnew

* Deduplicate

* Fix

* Add type hints

* Move free
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Index,
@@ -592,6 +593,7 @@ def setup(self, dtype, method):
             columns=list("abcdefghij"),
             dtype=dtype,
         )
+        df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df
 
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -100,6 +100,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 -
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -10,6 +10,8 @@ def group_median_float64(
     values: np.ndarray,  # ndarray[float64_t, ndim=2]
     labels: npt.NDArray[np.int64],
     min_count: int = ...,  # Py_ssize_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cumprod_float64(
     out: np.ndarray,  # float64_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -41,6 +41,7 @@ from pandas._libs.algos import (
     ensure_platform_int,
     groupsort_indexer,
     rank_1d,
+    take_2d_axis1_bool_bool,
     take_2d_axis1_float64_float64,
 )
 
@@ -64,11 +65,48 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
     cdef:
         int i, j, na_count = 0
+        float64_t* tmp
         float64_t result
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if mask[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if not mask[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+    cdef:
+        int i, j, na_count = 0
         float64_t* tmp
+        float64_t result
 
     if n == 0:
         return NaN
@@ -93,18 +131,34 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
         a = tmp
         n -= na_count
 
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
+    cdef:
+        float64_t result
+
     if n % 2:
         result = kth_smallest_c(a, n // 2, n)
     else:
         result = (kth_smallest_c(a, n // 2, n) +
                   kth_smallest_c(a, n // 2 - 1, n)) / 2
 
-    if na_count:
-        free(a)
-
     return result
 
 
+ctypedef fused int64float_t:
+    int64_t
+    uint64_t
+    float32_t
+    float64_t
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_median_float64(
@@ -113,6 +167,8 @@ def group_median_float64(
     ndarray[float64_t, ndim=2] values,
     ndarray[intp_t] labels,
     Py_ssize_t min_count=-1,
+    const uint8_t[:, :] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -121,8 +177,12 @@ def group_median_float64(
         Py_ssize_t i, j, N, K, ngroups, size
         ndarray[intp_t] _counts
         ndarray[float64_t, ndim=2] data
+        ndarray[uint8_t, ndim=2] data_mask
         ndarray[intp_t] indexer
         float64_t* ptr
+        uint8_t* ptr_mask
+        float64_t result
+        bint uses_mask = mask is not None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
@@ -137,15 +197,38 @@ def group_median_float64(
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    with nogil:
+    if uses_mask:
+        data_mask = np.empty((K, N), dtype=np.uint8)
+        ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)
+
+        take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)
 
-        for i in range(K):
-            # exclude NA group
-            ptr += _counts[0]
-            for j in range(ngroups):
-                size = _counts[j + 1]
-                out[j, i] = median_linear(ptr, size)
-                ptr += size
+        with nogil:
+
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                ptr_mask += _counts[0]
+
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    result = median_linear_mask(ptr, size, ptr_mask)
+                    out[j, i] = result
+
+                    if result != result:
+                        result_mask[j, i] = 1
+                    ptr += size
+                    ptr_mask += size
+
+    else:
+        with nogil:
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    out[j, i] = median_linear(ptr, size)
+                    ptr += size
 
 
 @cython.boundscheck(False)
@@ -206,13 +289,6 @@ def group_cumprod_float64(
                         accum[lab, j] = NaN
 
 
-ctypedef fused int64float_t:
-    int64_t
-    uint64_t
-    float32_t
-    float64_t
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumsum(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -162,6 +162,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "prod",
         "mean",
         "var",
+        "median",
     }
 
     _cython_arity = {"ohlc": 4}  # OHLC
@@ -600,7 +601,7 @@ def _call_cython_op(
                     min_count=min_count,
                     is_datetimelike=is_datetimelike,
                 )
-            elif self.how in ["var", "ohlc", "prod"]:
+            elif self.how in ["var", "ohlc", "prod", "median"]:
                 func(
                     result,
                     counts,

Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,7 @@ Deprecations`
`100`	`100`
`101`	`101`	`Performance improvements`
`102`	`102`	`~~~~~~~~~~~~~~~~~~~~~~~~`
	`103`	+- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
`103`	`104`	- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
`104`	`105`	- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
`105`	`106`	`-`