From 771284086dc28ad0d2262712b99d79e5b0746a33 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Wed, 27 Nov 2024 09:18:01 -0800 Subject: [PATCH 1/7] ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/groupby.pyi | 9 + pandas/_libs/groupby.pyx | 94 +++++++++ pandas/core/arrays/base.py | 1 + pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 6 +- pandas/core/groupby/base.py | 1 + pandas/core/groupby/generic.py | 188 ++++++++++++++++++ pandas/core/groupby/ops.py | 8 +- pandas/tests/groupby/methods/test_kurt.py | 27 +++ pandas/tests/groupby/test_api.py | 1 + pandas/tests/groupby/test_apply.py | 1 + pandas/tests/groupby/test_categorical.py | 1 + pandas/tests/groupby/test_groupby.py | 10 +- pandas/tests/groupby/test_numeric_only.py | 3 + pandas/tests/groupby/test_raises.py | 33 ++- pandas/tests/groupby/test_reductions.py | 5 +- .../tests/groupby/transform/test_transform.py | 8 +- 18 files changed, 379 insertions(+), 20 deletions(-) create mode 100644 pandas/tests/groupby/methods/test_kurt.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1b12735f0e7c1..8999c8f441551 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -56,6 +56,7 @@ Other enhancements - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) +- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`SeriesGroupBy.apply`, :meth:`DataFrame.apply` now support ``kurt`` (:issue:`40139`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..34367f55d2bbb 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -97,6 +97,15 @@ def group_skew( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_kurt( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..0c48fbdee1f11 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -995,6 +995,100 @@ def group_skew( ) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.cpow +def group_kurt( + float64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[float64_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, + bint skipna=True, +) -> None: + cdef: + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None + float64_t[:, ::1] M1, M2, M3, M4 + float64_t delta, delta_n, delta_n2, term1, val + int64_t n1, n + float64_t ct, num, den, adj + + if len_values != len_labels: + raise ValueError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + + # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments + M1 = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, False) + + if not isna_entry: + # Based on RunningStats::Push from + # https://www.johndcook.com/blog/skewness_kurtosis/ + n1 = nobs[lab, j] + n = n1 + 1 + + nobs[lab, j] = n + delta = val - M1[lab, j] + delta_n = delta / n + delta_n2 = delta_n * delta_n + term1 = delta * delta_n * n1 + + M1[lab, j] += delta_n + M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3) + + 6 * delta_n2 * M2[lab, j] + - 4 * delta_n * M3[lab, j]) + M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j] + M2[lab, j] += term1 + elif not skipna: + M1[lab, j] = NaN + M2[lab, j] = NaN + M3[lab, j] = NaN + M4[lab, j] = NaN + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 4: + if result_mask is not None: + result_mask[i, j] = 1 + out[i, j] = NaN + elif M2[i, j] == 0: + out[i, j] = 0 + else: + num = ct * (ct + 1) * (ct - 1) * M4[i, j] + den = (ct - 2) * (ct - 3) * M2[i, j] ** 2 + adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3)) + out[i, j] = num / den - adj + + @cython.wraparound(False) @cython.boundscheck(False) def group_mean( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4835d808f2433..e831883998098 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2618,6 +2618,7 @@ def _groupby_op( "sem", "var", "skew", + "kurt", ]: raise TypeError( f"dtype '{self.dtype}' does not support operation '{how}'" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 99e4cb0545e2d..ae20bfb6b284b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2736,7 +2736,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) dtype = self.dtype - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]: raise TypeError(f"{dtype} type does not support {how} operations") if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9c821bf0d184e..cbcab4cd497ad 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1656,7 +1656,7 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"datetime64 type does not support operation '{how}'") if how in ["any", "all"]: # GH#34479 @@ -1667,7 +1667,7 @@ def _groupby_op( elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 @@ -1677,7 +1677,7 @@ def _groupby_op( ) else: # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew", "var"]: + if how in ["prod", "cumprod", "skew", "kurt", "var"]: raise TypeError(f"timedelta64 type does not support {how} operations") # All of the functions implemented here are ordinal, so we can diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index bad9749b5ecee..7699fb3d0f864 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -50,6 +50,7 @@ class OutputKey: "sem", "size", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ec09892ede6..d276d929321ba 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1279,6 +1279,84 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + Unbiased kurtosis within groups. + + See Also + -------- + Series.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon 333.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Parrot 40.0 + Parrot 41.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt() + Falcon 1.622109 + Parrot -2.878714 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt(skipna=False) + Falcon NaN + Parrot -2.878714 + Name: Max Speed, dtype: float64 + """ + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(Series.plot.__doc__) def plot(self) -> GroupByPlot: @@ -2905,6 +2983,116 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + Unbiased kurtosis within groups. + + See Also + -------- + DataFrame.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "eagle", + ... "lion", + ... "monkey", + ... "rabbit", + ... "dog", + ... "wolf", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 350.0, + ... 80.5, + ... 21.5, + ... 15.0, + ... 40.0, + ... 50.0, + ... ] + ... }, + ... index=index, + ... ) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + eagle bird 350.0 + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + dog mammal 40.0 + wolf mammal 50.0 + >>> gb = df.groupby(["class"]) + >>> gb.kurt() + max_speed + class + bird -5.493277 + mammal 0.204125 + >>> gb.kurt(skipna=False) + max_speed + class + bird NaN + mammal 0.204125 + """ + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4c7fe604e452d..c4c7f73ee166c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -144,6 +144,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "std": functools.partial(libgroupby.group_var, name="std"), "sem": functools.partial(libgroupby.group_var, name="sem"), "skew": "group_skew", + "kurt": "group_kurt", "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", @@ -193,7 +194,7 @@ def _get_cython_function( elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f - elif how == "skew": + elif how in ["skew", "kurt"]: # _get_cython_vals will convert to float64 pass elif "object" not in f.__signatures__: @@ -224,7 +225,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "std", "sem", "skew"]: + if how in ["median", "std", "sem", "skew", "kurt"]: # median only has a float64 implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -453,7 +454,7 @@ def _call_cython_op( **kwargs, ) result = result.astype(bool, copy=False) - elif self.how in ["skew"]: + elif self.how in ["skew", "kurt"]: func( out=result, counts=counts, @@ -1021,6 +1022,7 @@ def apply_groupwise( # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) in [ "skew", + "kurt", "sum", "prod", ]: diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py new file mode 100644 index 0000000000000..51720571f43d0 --- /dev/null +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -0,0 +1,27 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_kurt_equivalence(): + # Test that that groupby kurt method (which uses libgroupby.group_kurt) + # matches the results of operating group-by-group (which uses nanops.nankurt) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.kurt() + + grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype(result.index.dtype) # 32bit builds + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 013b308cd14cd..baec3ed1a5024 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -74,6 +74,7 @@ def test_tab_completion(multiindex_dataframe_random_data): "all", "shift", "skew", + "kurt", "take", "pct_change", "any", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a4127ab49b0e..69d5e2daecf89 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1484,6 +1484,7 @@ def test_result_name_when_one_group(name): ("apply", lambda gb: gb.values[-1]), ("apply", lambda gb: gb["b"].iloc[0]), ("agg", "skew"), + ("agg", "kurt"), ("agg", "prod"), ("agg", "sum"), ], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6d84dae1d25d8..95d0d9de4ec54 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -61,6 +61,7 @@ def f(a): "sem": np.nan, "size": 0, "skew": np.nan, + "kurt": np.nan, "std": np.nan, "sum": 0, "var": np.nan, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 702bbfef2be3b..d062cb3bfac38 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1728,7 +1728,7 @@ def test_pivot_table_values_key_error(): ) @pytest.mark.parametrize("method", ["attr", "agg", "apply"]) @pytest.mark.parametrize( - "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] + "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"] ) def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 @@ -1804,7 +1804,7 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op in ["prod", "sum", "skew"]: + if op in ["prod", "sum", "skew", "kurt"]: # ops that require more than just ordered-ness if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 @@ -1817,15 +1817,15 @@ def get_categorical_invalid_expected(): msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" - if op == "skew": - msg = "|".join([msg, "does not support operation 'skew'"]) + if op in ["skew", "kurt"]: + msg = "|".join([msg, f"does not support operation '{op}'"]) with pytest.raises(TypeError, match=msg): get_result() if not isinstance(columns, list): # i.e. SeriesGroupBy return - elif op == "skew": + elif op in ["skew", "kurt"]: # TODO: test the numeric_only=True case return else: diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index cb4569812f600..53ab6dacd42c3 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -244,6 +244,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): ("quantile", True), ("sem", True), ("skew", True), + ("kurt", True), ("std", True), ("sum", True), ("var", True), @@ -381,6 +382,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "kurt", ) # Test default behavior; kernels that fail may be enabled in the future but kernels @@ -410,6 +412,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "quantile", "sem", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 1e0a15d0ba796..f1e38bdfb42a3 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -172,6 +172,7 @@ def test_groupby_raises_string( "shift": (None, ""), "size": (None, ""), "skew": (ValueError, "could not convert string to float"), + "kurt": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), "var": ( @@ -191,10 +192,11 @@ def test_groupby_raises_string( "sem", "var", "skew", + "kurt", "quantile", ]: msg = f"dtype 'str' does not support operation '{groupby_func}'" - if groupby_func in ["sem", "std", "skew"]: + if groupby_func in ["sem", "std", "skew", "kurt"]: # The object-dtype raises ValueError when trying to convert to numeric. klass = TypeError elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": @@ -328,6 +330,15 @@ def test_groupby_raises_datetime( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'kurt'", + ] + ), + ), "std": (None, ""), "sum": (TypeError, "datetime64 type does not support operation 'sum"), "var": (TypeError, "datetime64 type does not support operation 'var'"), @@ -380,7 +391,7 @@ def test_groupby_raises_datetime_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) -@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) +@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"]) def test_groupby_raises_timedelta(func): df = DataFrame( { @@ -511,6 +522,15 @@ def test_groupby_raises_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "dtype category does not support operation 'kurt'", + "category type does not support kurt operations", + ] + ), + ), "std": ( TypeError, "|".join( @@ -689,6 +709,15 @@ def test_groupby_raises_category_on_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "category type does not support kurt operations", + "dtype category does not support operation 'kurt'", + ] + ), + ), "std": ( TypeError, "|".join( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 51c7eab2bfa82..a17200c123d22 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1114,6 +1114,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): "median", "mean", "skew", + "kurt", "std", "var", "sem", @@ -1127,8 +1128,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op == "skew": - # skew has skipna + if op in ["skew", "kurt"]: + # skew and kurt have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 022d3d51ded4e..2bec4a5920a19 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1097,13 +1097,13 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): func = reduction_func obj = DataFrame( - {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, - index=["A", "B", "C", "D", "E", "F"], + {"a": [0, 0, 0, 0, 1, 1, 1, 1], "b": range(8)}, + index=["A", "B", "C", "D", "E", "F", "G", "H"], ) if frame_or_series is Series: obj = obj["a"] - g = obj.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 4)) if func == "corrwith" and isinstance(obj, Series): # GH#32293 # TODO: implement SeriesGroupBy.corrwith @@ -1128,7 +1128,7 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group - assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + assert len(set(DataFrame(result).iloc[-4:, -1])) == 1 def test_transform_lambda_with_datetimetz(): From 290378f15c81e44e551cb2e0d323216e975bda13 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 3 Dec 2024 16:08:06 -0800 Subject: [PATCH 2/7] ENH: Address review comments --- pandas/_libs/groupby.pyx | 8 ++-- pandas/core/groupby/generic.py | 14 +----- pandas/tests/groupby/methods/test_kurt.py | 57 ++++++++++++++++++++++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0c48fbdee1f11..59bc59135a8ff 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -910,7 +910,7 @@ def group_var( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_skew( float64_t[:, ::1] out, int64_t[::1] counts, @@ -961,7 +961,7 @@ def group_skew( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 @@ -998,7 +998,7 @@ def group_skew( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_kurt( float64_t[:, ::1] out, int64_t[::1] counts, @@ -1050,7 +1050,7 @@ def group_kurt( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d276d929321ba..c3d8a71aebc90 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1270,13 +1270,8 @@ def skew( Name: Max Speed, dtype: float64 """ - def alt(obj): - # This should not be reached since the cython path should raise - # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") - return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs ) def kurt( @@ -3084,13 +3079,8 @@ def kurt( mammal 0.204125 """ - def alt(obj): - # This should not be reached since the cython path should raise - # TypeError and not NotImplementedError. - raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") - return self._cython_agg_general( - "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs ) @property diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index 51720571f43d0..edf098d25a464 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -5,6 +5,7 @@ def test_groupby_kurt_equivalence(): + # GH#40139 # Test that that groupby kurt method (which uses libgroupby.group_kurt) # matches the results of operating group-by-group (which uses nanops.nankurt) nrows = 1000 @@ -23,5 +24,59 @@ def test_groupby_kurt_equivalence(): grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] expected = pd.concat(grpwise, axis=0) - expected.index = expected.index.astype(result.index.dtype) # 32bit builds + expected.index = expected.index.astype(np.intp) # 32bit builds tm.assert_frame_equal(result, expected) + + +def test_groupby_kurt_arrow_float64(): + # GH#40139 + # Test groupby.kurt() with skipna = False + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + }, + dtype="float64[pyarrow]", + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt() + expected = pd.DataFrame( + {"x": [2.1644713], "y": [0.1513969]}, dtype="float64[pyarrow]" + ) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_noskipna(): + # GH#40139 + # Test groupby.kurt() with skipna = False + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]}) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_all_ones(): + # GH#40139 + # Test groupby.kurt() with skipna = False + df = pd.DataFrame( + { + "x": [1.0] * 10, + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame( + { + "x": [0.0], # Same behavior as pd.DataFrame.kurt() + } + ) + tm.assert_almost_equal(result, expected) From 1adbb0c2248ec89e5b5f31978dfd73c9c5d9ebec Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 3 Dec 2024 16:16:03 -0800 Subject: [PATCH 3/7] ENH: Fix comments in new test cases --- pandas/tests/groupby/methods/test_kurt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index edf098d25a464..ba6f395985d0e 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -30,7 +30,7 @@ def test_groupby_kurt_equivalence(): def test_groupby_kurt_arrow_float64(): # GH#40139 - # Test groupby.kurt() with skipna = False + # Test groupby.kurt() with float64[pyarrow] dtype df = pd.DataFrame( { "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], @@ -65,7 +65,7 @@ def test_groupby_kurt_noskipna(): def test_groupby_kurt_all_ones(): # GH#40139 - # Test groupby.kurt() with skipna = False + # Test groupby.kurt() with constant values df = pd.DataFrame( { "x": [1.0] * 10, From c5df6ec50d764f9d1c570fd33a2819c445aba6c9 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 3 Dec 2024 16:34:53 -0800 Subject: [PATCH 4/7] ENH: Skip pyarrow test case if no pyarrow available --- pandas/tests/groupby/methods/test_kurt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index ba6f395985d0e..895e032c6df89 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -1,5 +1,7 @@ import numpy as np +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -28,6 +30,7 @@ def test_groupby_kurt_equivalence(): tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") def test_groupby_kurt_arrow_float64(): # GH#40139 # Test groupby.kurt() with float64[pyarrow] dtype From aaacc27b872b3d46bba0834bdd2182c517de7228 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 3 Dec 2024 17:25:07 -0800 Subject: [PATCH 5/7] ENH: Update to intp instead of np.intp --- pandas/tests/groupby/methods/test_kurt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index 895e032c6df89..a6cce1d578ce1 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -26,7 +26,7 @@ def test_groupby_kurt_equivalence(): grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] expected = pd.concat(grpwise, axis=0) - expected.index = expected.index.astype(np.intp) # 32bit builds + expected.index = expected.index.astype("intp") # 32bit builds tm.assert_frame_equal(result, expected) From 4fc5ca250d885f69fb8f7be99b19517106c74b09 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 3 Dec 2024 18:16:28 -0800 Subject: [PATCH 6/7] ENH: Change intp to int64 --- pandas/tests/groupby/methods/test_kurt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index a6cce1d578ce1..e616d4fd735e6 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -26,7 +26,7 @@ def test_groupby_kurt_equivalence(): grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] expected = pd.concat(grpwise, axis=0) - expected.index = expected.index.astype("intp") # 32bit builds + expected.index = expected.index.astype("int64") # 32bit builds tm.assert_frame_equal(result, expected) From e42a06092d25a48da8ca44b514b31cda968b3334 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 15 Dec 2024 13:30:06 -0800 Subject: [PATCH 7/7] Address review comments --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/groupby/methods/test_kurt.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f89fef2a78019..32ae8f1781190 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,8 +55,8 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) -- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`SeriesGroupBy.apply`, :meth:`DataFrame.apply` now support ``kurt`` (:issue:`40139`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index e616d4fd735e6..21b7c50c3c5aa 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -30,23 +31,27 @@ def test_groupby_kurt_equivalence(): tm.assert_frame_equal(result, expected) -@td.skip_if_no("pyarrow") -def test_groupby_kurt_arrow_float64(): +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + "Float64", + ], +) +def test_groupby_kurt_arrow_float64(dtype): # GH#40139 - # Test groupby.kurt() with float64[pyarrow] dtype + # Test groupby.kurt() with float64[pyarrow] and Float64 dtypes df = pd.DataFrame( { "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], }, - dtype="float64[pyarrow]", + dtype=dtype, ) gb = df.groupby(by=lambda x: 0) result = gb.kurt() - expected = pd.DataFrame( - {"x": [2.1644713], "y": [0.1513969]}, dtype="float64[pyarrow]" - ) + expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype) tm.assert_almost_equal(result, expected)