Skip to content

ENH: Support masks in median groupby algo #48387

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Index,
Expand Down Expand Up @@ -592,6 +593,7 @@ def setup(self, dtype, method):
columns=list("abcdefghij"),
dtype=dtype,
)
df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
df["key"] = np.random.randint(0, 100, size=N)
self.df = df

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
-
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ def group_median_float64(
values: np.ndarray, # ndarray[float64_t, ndim=2]
labels: npt.NDArray[np.int64],
min_count: int = ..., # Py_ssize_t
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cumprod_float64(
out: np.ndarray, # float64_t[:, ::1]
Expand Down
99 changes: 83 additions & 16 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ from pandas._libs.algos import (
ensure_platform_int,
groupsort_indexer,
rank_1d,
take_2d_axis1_bool_bool,
take_2d_axis1_float64_float64,
)

Expand All @@ -64,10 +65,40 @@ cdef enum InterpolationEnumType:
INTERPOLATION_MIDPOINT


cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
    # Median of the ``n`` values starting at ``a``, ignoring every position
    # whose ``mask`` entry is non-zero. Returns NaN when there is nothing to
    # aggregate (``n == 0`` or every position masked).
    cdef:
        int src, dst = 0, na_count = 0
        float64_t* compact

    if n == 0:
        return NaN

    # First pass: how many entries are masked out?
    for src in range(n):
        if mask[src] != 0:
            na_count += 1

    # n is non-zero here, so equality means every entry is masked.
    if na_count == n:
        return NaN

    if na_count > 0:
        # Second pass: pack the unmasked values into a dense scratch buffer
        # so the selection routine only sees valid data.
        compact = <float64_t*>malloc((n - na_count) * sizeof(float64_t))

        for src in range(n):
            if mask[src] == 0:
                compact[dst] = a[src]
                dst += 1

        a = compact
        # dst now equals the original n minus na_count.
        n = dst

    # na_count is forwarded so the callee can tell whether ``a`` points at the
    # scratch buffer allocated above or at the caller's data.
    return calc_median_linear(a, n, na_count)


cdef inline float64_t median_linear(float64_t* a, int n) nogil:
cdef:
int i, j, na_count = 0
float64_t result
float64_t* tmp

if n == 0:
Expand All @@ -93,6 +124,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
a = tmp
n -= na_count

return calc_median_linear(a, n, na_count)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you free `a` here instead of in `calc_median_linear`? While it does mean you'll have to free in both this function and the masked version, it leaves `calc_median_linear` with no side effects.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved it. My train of thought went the other way: if the free lives inside the function, the caller cannot forget it. But I have no real preference.



cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
cdef:
float64_t result

if n % 2:
result = kth_smallest_c(a, n // 2, n)
else:
Expand All @@ -105,6 +143,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
return result


# Fused type covering 64-bit signed/unsigned integers and 32/64-bit floats.
ctypedef fused int64float_t:
int64_t
uint64_t
float32_t
float64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(
Expand All @@ -113,6 +158,8 @@ def group_median_float64(
ndarray[float64_t, ndim=2] values,
ndarray[intp_t] labels,
Py_ssize_t min_count=-1,
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
"""
Only aggregates on axis=0
Expand All @@ -121,8 +168,12 @@ def group_median_float64(
Py_ssize_t i, j, N, K, ngroups, size
ndarray[intp_t] _counts
ndarray[float64_t, ndim=2] data
ndarray[uint8_t, ndim=2] data_mask
ndarray[intp_t] indexer
float64_t* ptr
uint8_t* ptr_mask
float64_t result
bint uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

Expand All @@ -137,15 +188,38 @@ def group_median_float64(

take_2d_axis1_float64_float64(values.T, indexer, out=data)

with nogil:
if uses_mask:
data_mask = np.empty((K, N), dtype=np.uint8)
ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)

for i in range(K):
# exclude NA group
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
ptr += size
take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)

with nogil:

for i in range(K):
# exclude NA group
ptr += _counts[0]
ptr_mask += _counts[0]

for j in range(ngroups):
size = _counts[j + 1]
result = median_linear_mask(ptr, size, ptr_mask)
out[j, i] = result

if result != result:
result_mask[j, i] = 1
ptr += size
ptr_mask += size

else:
with nogil:
for i in range(K):
# exclude NA group
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
ptr += size


@cython.boundscheck(False)
Expand Down Expand Up @@ -206,13 +280,6 @@ def group_cumprod_float64(
accum[lab, j] = NaN


ctypedef fused int64float_t:
int64_t
uint64_t
float32_t
float64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumsum(
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
"prod",
"mean",
"var",
"median",
}

_cython_arity = {"ohlc": 4} # OHLC
Expand Down Expand Up @@ -600,7 +601,7 @@ def _call_cython_op(
min_count=min_count,
is_datetimelike=is_datetimelike,
)
elif self.how in ["var", "ohlc", "prod"]:
elif self.how in ["var", "ohlc", "prod", "median"]:
func(
result,
counts,
Expand Down