Skip to content

ENH: Support masks in median groupby algo #48387

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Index,
Expand Down Expand Up @@ -592,6 +593,7 @@ def setup(self, dtype, method):
columns=list("abcdefghij"),
dtype=dtype,
)
df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
df["key"] = np.random.randint(0, 100, size=N)
self.df = df

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
-
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ def group_median_float64(
values: np.ndarray, # ndarray[float64_t, ndim=2]
labels: npt.NDArray[np.int64],
min_count: int = ..., # Py_ssize_t
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cumprod_float64(
out: np.ndarray, # float64_t[:, ::1]
Expand Down
114 changes: 95 additions & 19 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ from pandas._libs.algos import (
ensure_platform_int,
groupsort_indexer,
rank_1d,
take_2d_axis1_bool_bool,
take_2d_axis1_float64_float64,
)

Expand All @@ -64,11 +65,48 @@ cdef enum InterpolationEnumType:
INTERPOLATION_MIDPOINT


cdef inline float64_t median_linear(float64_t* a, int n) nogil:
cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
    """
    Median of the first ``n`` values of ``a``, skipping masked entries.

    An entry ``a[i]`` is skipped when ``mask[i]`` is nonzero. Returns NaN
    when ``n`` is 0 or when every entry is masked. When nothing is masked
    ``a`` is handed to ``calc_median_linear`` directly; otherwise the
    unmasked values are first copied into a temporary malloc'd buffer that
    is released with ``free`` before returning.
    """
    cdef:
        int idx, pos, n_valid = 0, n_missing
        float64_t* compact
        float64_t median

    if n == 0:
        return NaN

    # Count the entries that are *not* masked.
    for idx in range(n):
        if not mask[idx]:
            n_valid += 1
    n_missing = n - n_valid

    # Fast path: nothing is masked, so no temporary copy is needed.
    if n_missing == 0:
        return calc_median_linear(a, n, 0)

    # Every entry is masked: there is nothing to take a median of.
    if n_valid == 0:
        return NaN

    # Pack the unmasked values contiguously into a scratch buffer.
    compact = <float64_t*>malloc(n_valid * sizeof(float64_t))

    pos = 0
    for idx in range(n):
        if not mask[idx]:
            compact[pos] = a[idx]
            pos += 1

    median = calc_median_linear(compact, n_valid, n_missing)

    free(compact)

    return median


cdef inline float64_t median_linear(float64_t* a, int n) nogil:
cdef:
int i, j, na_count = 0
float64_t* tmp
float64_t result

if n == 0:
return NaN
Expand All @@ -93,18 +131,34 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
a = tmp
n -= na_count

result = calc_median_linear(a, n, na_count)

if na_count:
free(a)

return result


cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
    """
    Compute the median of the ``n`` values stored in ``a``.

    For odd ``n`` the result is the value selected at rank ``n // 2``; for
    even ``n`` it is the mean of the values selected at ranks ``n // 2``
    and ``n // 2 - 1``. Selection is delegated to ``kth_smallest_c``
    (presumably a k-th smallest selection that may reorder ``a`` in
    place -- confirm against its definition).

    ``n`` must be positive: the callers in this file return NaN themselves
    before calling when there are no usable values.

    ``na_count`` is accepted only to keep the signature stable and is not
    used. This function never frees ``a``: the callers allocate the
    temporary buffer and free it themselves after this call returns, so
    freeing here as well would release the same pointer twice.
    """
    cdef:
        float64_t result

    if n % 2:
        result = kth_smallest_c(a, n // 2, n)
    else:
        result = (kth_smallest_c(a, n // 2, n) +
                  kth_smallest_c(a, n // 2 - 1, n)) / 2

    return result


# Fused type covering the 64-bit signed/unsigned integer and the 32/64-bit
# float specializations.
ctypedef fused int64float_t:
    int64_t
    uint64_t
    float32_t
    float64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(
Expand All @@ -113,6 +167,8 @@ def group_median_float64(
ndarray[float64_t, ndim=2] values,
ndarray[intp_t] labels,
Py_ssize_t min_count=-1,
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
"""
Only aggregates on axis=0
Expand All @@ -121,8 +177,12 @@ def group_median_float64(
Py_ssize_t i, j, N, K, ngroups, size
ndarray[intp_t] _counts
ndarray[float64_t, ndim=2] data
ndarray[uint8_t, ndim=2] data_mask
ndarray[intp_t] indexer
float64_t* ptr
uint8_t* ptr_mask
float64_t result
bint uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

Expand All @@ -137,15 +197,38 @@ def group_median_float64(

take_2d_axis1_float64_float64(values.T, indexer, out=data)

with nogil:
if uses_mask:
data_mask = np.empty((K, N), dtype=np.uint8)
ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)

take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)

for i in range(K):
# exclude NA group
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
ptr += size
with nogil:

for i in range(K):
# exclude NA group
ptr += _counts[0]
ptr_mask += _counts[0]

for j in range(ngroups):
size = _counts[j + 1]
result = median_linear_mask(ptr, size, ptr_mask)
out[j, i] = result

if result != result:
result_mask[j, i] = 1
ptr += size
ptr_mask += size

else:
with nogil:
for i in range(K):
# exclude NA group
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
ptr += size


@cython.boundscheck(False)
Expand Down Expand Up @@ -206,13 +289,6 @@ def group_cumprod_float64(
accum[lab, j] = NaN


ctypedef fused int64float_t:
int64_t
uint64_t
float32_t
float64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumsum(
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
"prod",
"mean",
"var",
"median",
}

_cython_arity = {"ohlc": 4} # OHLC
Expand Down Expand Up @@ -600,7 +601,7 @@ def _call_cython_op(
min_count=min_count,
is_datetimelike=is_datetimelike,
)
elif self.how in ["var", "ohlc", "prod"]:
elif self.how in ["var", "ohlc", "prod", "median"]:
func(
result,
counts,
Expand Down