Skip to content

Commit 233bd83

Browse files
authored
PERF: use non-copying path for Groupby.skew (#52104)
* PERF: use non-copying path for Groupby.skew * DFGB * update tests * troubleshoot 32 bit builds * 32bit build * troubleshoot npdev build * troubleshoot npdev build * troubleshoot * troubleshoot victory * troubleshoot * group_skew in groupby.pyi * cython.cpow
1 parent fe09ace commit 233bd83

File tree

8 files changed

+214
-28
lines changed

8 files changed

+214
-28
lines changed

pandas/_libs/groupby.pyi

+9
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,15 @@ def group_var(
8888
is_datetimelike: bool = ...,
8989
name: str = ...,
9090
) -> None: ...
91+
def group_skew(
92+
out: np.ndarray, # float64_t[:, ::1]
93+
counts: np.ndarray, # int64_t[::1]
94+
values: np.ndarray, # ndarray[float64_T, ndim=2]
95+
labels: np.ndarray, # const intp_t[::1]
96+
mask: np.ndarray | None = ...,
97+
result_mask: np.ndarray | None = ...,
98+
skipna: bool = ...,
99+
) -> None: ...
91100
def group_mean(
92101
out: np.ndarray, # floating[:, ::1]
93102
counts: np.ndarray, # int64_t[::1]

pandas/_libs/groupby.pyx

+88
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,94 @@ def group_var(
891891
out[i, j] /= (ct - ddof)
892892

893893

894+
@cython.wraparound(False)
895+
@cython.boundscheck(False)
896+
@cython.cdivision(True)
897+
@cython.cpow
898+
def group_skew(
899+
float64_t[:, ::1] out,
900+
int64_t[::1] counts,
901+
ndarray[float64_t, ndim=2] values,
902+
const intp_t[::1] labels,
903+
const uint8_t[:, ::1] mask=None,
904+
uint8_t[:, ::1] result_mask=None,
905+
bint skipna=True,
906+
) -> None:
907+
cdef:
908+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
909+
int64_t[:, ::1] nobs
910+
Py_ssize_t len_values = len(values), len_labels = len(labels)
911+
bint isna_entry, uses_mask = mask is not None
912+
float64_t[:, ::1] M1, M2, M3
913+
float64_t delta, delta_n, term1, val
914+
int64_t n1, n
915+
float64_t ct
916+
917+
if len_values != len_labels:
918+
raise ValueError("len(index) != len(labels)")
919+
920+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
921+
922+
# M1, M2, and M3 correspond to the 1st, 2nd, and 3rd moments
923+
M1 = np.zeros((<object>out).shape, dtype=np.float64)
924+
M2 = np.zeros((<object>out).shape, dtype=np.float64)
925+
M3 = np.zeros((<object>out).shape, dtype=np.float64)
926+
927+
N, K = (<object>values).shape
928+
929+
out[:, :] = 0.0
930+
931+
with nogil:
932+
for i in range(N):
933+
lab = labels[i]
934+
if lab < 0:
935+
continue
936+
937+
counts[lab] += 1
938+
939+
for j in range(K):
940+
val = values[i, j]
941+
942+
if uses_mask:
943+
isna_entry = mask[i, j]
944+
else:
945+
isna_entry = _treat_as_na(val, False)
946+
947+
if not isna_entry:
948+
# Based on RunningStats::Push from
949+
# https://www.johndcook.com/blog/skewness_kurtosis/
950+
n1 = nobs[lab, j]
951+
n = n1 + 1
952+
953+
nobs[lab, j] = n
954+
delta = val - M1[lab, j]
955+
delta_n = delta / n
956+
term1 = delta * delta_n * n1
957+
958+
M1[lab, j] += delta_n
959+
M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
960+
M2[lab, j] += term1
961+
elif not skipna:
962+
M1[lab, j] = NaN
963+
M2[lab, j] = NaN
964+
M3[lab, j] = NaN
965+
966+
for i in range(ngroups):
967+
for j in range(K):
968+
ct = <float64_t>nobs[i, j]
969+
if ct < 3:
970+
if result_mask is not None:
971+
result_mask[i, j] = 1
972+
out[i, j] = NaN
973+
elif M2[i, j] == 0:
974+
out[i, j] = 0
975+
else:
976+
out[i, j] = (
977+
(ct * (ct - 1) ** 0.5 / (ct - 2))
978+
* (M3[i, j] / M2[i, j] ** 1.5)
979+
)
980+
981+
894982
@cython.wraparound(False)
895983
@cython.boundscheck(False)
896984
def group_mean(

pandas/core/groupby/generic.py

+40-14
Original file line numberDiff line numberDiff line change
@@ -1084,14 +1084,27 @@ def skew(
10841084
Parrot 1.457863
10851085
Name: Max Speed, dtype: float64
10861086
"""
1087-
result = self._op_via_apply(
1088-
"skew",
1089-
axis=axis,
1090-
skipna=skipna,
1091-
numeric_only=numeric_only,
1092-
**kwargs,
1087+
if axis is lib.no_default:
1088+
axis = 0
1089+
1090+
if axis != 0:
1091+
result = self._op_via_apply(
1092+
"skew",
1093+
axis=axis,
1094+
skipna=skipna,
1095+
numeric_only=numeric_only,
1096+
**kwargs,
1097+
)
1098+
return result
1099+
1100+
def alt(obj):
1101+
# This should not be reached since the cython path should raise
1102+
# TypeError and not NotImplementedError.
1103+
raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")
1104+
1105+
return self._cython_agg_general(
1106+
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
10931107
)
1094-
return result
10951108

10961109
@property
10971110
@doc(Series.plot.__doc__)
@@ -2567,14 +2580,27 @@ def skew(
25672580
bird NaN
25682581
mammal 1.669046
25692582
"""
2570-
result = self._op_via_apply(
2571-
"skew",
2572-
axis=axis,
2573-
skipna=skipna,
2574-
numeric_only=numeric_only,
2575-
**kwargs,
2583+
if axis is lib.no_default:
2584+
axis = 0
2585+
2586+
if axis != 0:
2587+
result = self._op_via_apply(
2588+
"skew",
2589+
axis=axis,
2590+
skipna=skipna,
2591+
numeric_only=numeric_only,
2592+
**kwargs,
2593+
)
2594+
return result
2595+
2596+
def alt(obj):
2597+
# This should not be reached since the cython path should raise
2598+
# TypeError and not NotImplementedError.
2599+
raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")
2600+
2601+
return self._cython_agg_general(
2602+
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
25762603
)
2577-
return result
25782604

25792605
@property
25802606
@doc(DataFrame.plot.__doc__)

pandas/core/groupby/ops.py

+23-6
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
139139
"var": "group_var",
140140
"std": functools.partial(libgroupby.group_var, name="std"),
141141
"sem": functools.partial(libgroupby.group_var, name="sem"),
142+
"skew": "group_skew",
142143
"first": "group_nth",
143144
"last": "group_last",
144145
"ohlc": "group_ohlc",
@@ -182,7 +183,10 @@ def _get_cython_function(
182183
elif how in ["std", "sem"]:
183184
# We have a partial object that does not have __signatures__
184185
return f
185-
if "object" not in f.__signatures__:
186+
elif how == "skew":
187+
# _get_cython_vals will convert to float64
188+
pass
189+
elif "object" not in f.__signatures__:
186190
# raise NotImplementedError here rather than TypeError later
187191
raise NotImplementedError(
188192
f"function is not implemented for this dtype: "
@@ -210,7 +214,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
210214
"""
211215
how = self.how
212216

213-
if how in ["median", "std", "sem"]:
217+
if how in ["median", "std", "sem", "skew"]:
214218
# median only has a float64 implementation
215219
# We should only get here with is_numeric, as non-numeric cases
216220
# should raise in _get_cython_function
@@ -252,7 +256,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
252256
return
253257

254258
if isinstance(dtype, CategoricalDtype):
255-
if how in ["sum", "prod", "cumsum", "cumprod"]:
259+
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
256260
raise TypeError(f"{dtype} type does not support {how} operations")
257261
if how in ["min", "max", "rank"] and not dtype.ordered:
258262
# raise TypeError instead of NotImplementedError to ensure we
@@ -268,7 +272,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
268272
raise NotImplementedError(f"{dtype} dtype not supported")
269273
elif is_datetime64_any_dtype(dtype):
270274
# Adding/multiplying datetimes is not valid
271-
if how in ["sum", "prod", "cumsum", "cumprod", "var"]:
275+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
272276
raise TypeError(f"datetime64 type does not support {how} operations")
273277
if how in ["any", "all"]:
274278
# GH#34479
@@ -281,7 +285,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
281285

282286
elif is_period_dtype(dtype):
283287
# Adding/multiplying Periods is not valid
284-
if how in ["sum", "prod", "cumsum", "cumprod", "var"]:
288+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
285289
raise TypeError(f"Period type does not support {how} operations")
286290
if how in ["any", "all"]:
287291
# GH#34479
@@ -294,7 +298,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
294298

295299
elif is_timedelta64_dtype(dtype):
296300
# timedeltas we can add but not multiply
297-
if how in ["prod", "cumprod"]:
301+
if how in ["prod", "cumprod", "skew"]:
298302
raise TypeError(f"timedelta64 type does not support {how} operations")
299303

300304
def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
@@ -643,6 +647,19 @@ def _call_cython_op(
643647
**kwargs,
644648
)
645649
result = result.astype(bool, copy=False)
650+
elif self.how in ["skew"]:
651+
func(
652+
out=result,
653+
counts=counts,
654+
values=values,
655+
labels=comp_ids,
656+
mask=mask,
657+
result_mask=result_mask,
658+
**kwargs,
659+
)
660+
if dtype == object:
661+
result = result.astype(object)
662+
646663
else:
647664
raise NotImplementedError(f"{self.how} is not implemented")
648665
else:

pandas/tests/groupby/test_function.py

+1
Original file line numberDiff line numberDiff line change
@@ -1529,6 +1529,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
15291529
"min",
15301530
"max",
15311531
"prod",
1532+
"skew",
15321533
)
15331534

15341535
# Test default behavior; kernels that fail may be enabled in the future but kernels

pandas/tests/groupby/test_groupby.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
965965
def test_raise_on_nuisance_python_single(df):
966966
# GH 38815
967967
grouped = df.groupby("A")
968-
with pytest.raises(TypeError, match="could not convert"):
968+
with pytest.raises(ValueError, match="could not convert"):
969969
grouped.skew()
970970

971971

@@ -1972,14 +1972,14 @@ def get_categorical_invalid_expected():
19721972
if is_dt64 or is_cat or is_per:
19731973
# GH#41291
19741974
# datetime64 -> prod and sum are invalid
1975-
if op == "skew":
1976-
msg = "does not support reduction 'skew'"
1977-
elif is_dt64:
1975+
if is_dt64:
19781976
msg = "datetime64 type does not support"
19791977
elif is_per:
19801978
msg = "Period type does not support"
19811979
else:
19821980
msg = "category type does not support"
1981+
if op == "skew":
1982+
msg = "|".join([msg, "does not support reduction 'skew'"])
19831983
with pytest.raises(TypeError, match=msg):
19841984
get_result()
19851985

pandas/tests/groupby/test_raises.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def test_groupby_raises_string(
159159
"sem": (ValueError, "could not convert string to float"),
160160
"shift": (None, ""),
161161
"size": (None, ""),
162-
"skew": (TypeError, "could not convert string to float"),
162+
"skew": (ValueError, "could not convert string to float"),
163163
"std": (ValueError, "could not convert string to float"),
164164
"sum": (None, ""),
165165
"var": (TypeError, "could not convert string to float"),
@@ -249,7 +249,15 @@ def test_groupby_raises_datetime(
249249
"sem": (None, ""),
250250
"shift": (None, ""),
251251
"size": (None, ""),
252-
"skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"),
252+
"skew": (
253+
TypeError,
254+
"|".join(
255+
[
256+
r"dtype datetime64\[ns\] does not support reduction",
257+
"datetime64 type does not support skew operations",
258+
]
259+
),
260+
),
253261
"std": (None, ""),
254262
"sum": (TypeError, "datetime64 type does not support sum operations"),
255263
"var": (TypeError, "datetime64 type does not support var operations"),
@@ -407,7 +415,12 @@ def test_groupby_raises_category(
407415
"size": (None, ""),
408416
"skew": (
409417
TypeError,
410-
"'Categorical' with dtype category does not support reduction 'skew'",
418+
"|".join(
419+
[
420+
"dtype category does not support reduction 'skew'",
421+
"category type does not support skew operations",
422+
]
423+
),
411424
),
412425
"std": (
413426
TypeError,
@@ -575,7 +588,12 @@ def test_groupby_raises_category_on_category(
575588
"size": (None, ""),
576589
"skew": (
577590
TypeError,
578-
"'Categorical' with dtype category does not support reduction 'skew'",
591+
"|".join(
592+
[
593+
"category type does not support skew operations",
594+
"dtype category does not support reduction 'skew'",
595+
]
596+
),
579597
),
580598
"std": (
581599
TypeError,

pandas/tests/groupby/test_skew.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
import pandas._testing as tm
5+
6+
7+
def test_groupby_skew_equivalence():
8+
# Test that the groupby skew method (which uses libgroupby.group_skew)
9+
# matches the results of operating group-by-group (which uses nanops.nanskew)
10+
nrows = 1000
11+
ngroups = 3
12+
ncols = 2
13+
nan_frac = 0.05
14+
15+
arr = np.random.randn(nrows, ncols)
16+
arr[np.random.random(nrows) < nan_frac] = np.nan
17+
18+
df = pd.DataFrame(arr)
19+
grps = np.random.randint(0, ngroups, size=nrows)
20+
gb = df.groupby(grps)
21+
22+
result = gb.skew()
23+
24+
grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
25+
expected = pd.concat(grpwise, axis=0)
26+
expected.index = expected.index.astype(result.index.dtype) # 32bit builds
27+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)