PERF: use non-copying path for Groupby.skew (#52104)

jbrockmendel · web-flow · commit 233bd83f692f · 2023-03-31T10:17:50.000-07:00
* PERF: use non-copying path for Groupby.skew

* DFGB

* update tests

* troubleshoot 32 bit builds

* 32bit build

* troubleshoot npdev build

* troubleshoot npdev build

* troubleshoot

* troubleshoot victory

* troubleshoot

* group_skew in groupby.pyi

* cython.cpow
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -88,6 +88,15 @@ def group_var(
     is_datetimelike: bool = ...,
     name: str = ...,
 ) -> None: ...
+def group_skew(
+    out: np.ndarray,  # float64_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[float64_T, ndim=2]
+    labels: np.ndarray,  # const intp_t[::1]
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
 def group_mean(
     out: np.ndarray,  # floating[:, ::1]
     counts: np.ndarray,  # int64_t[::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -891,6 +891,94 @@ def group_var(
                         out[i, j] /= (ct - ddof)
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.cdivision(True)
+@cython.cpow
+def group_skew(
+    float64_t[:, ::1] out,
+    int64_t[::1] counts,
+    ndarray[float64_t, ndim=2] values,
+    const intp_t[::1] labels,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
+    bint skipna=True,
+) -> None:
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
+        int64_t[:, ::1] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)
+        bint isna_entry, uses_mask = mask is not None
+        float64_t[:, ::1] M1, M2, M3
+        float64_t delta, delta_n, term1, val
+        int64_t n1, n
+        float64_t ct
+
+    if len_values != len_labels:
+        raise ValueError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object>out).shape, dtype=np.int64)
+
+    # M1, M2, and M3 correspond to 1st, 2nd, and third Moments
+    M1 = np.zeros((<object>out).shape, dtype=np.float64)
+    M2 = np.zeros((<object>out).shape, dtype=np.float64)
+    M3 = np.zeros((<object>out).shape, dtype=np.float64)
+
+    N, K = (<object>values).shape
+
+    out[:, :] = 0.0
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = _treat_as_na(val, False)
+
+                if not isna_entry:
+                    # Based on RunningSats::Push from
+                    #  https://www.johndcook.com/blog/skewness_kurtosis/
+                    n1 = nobs[lab, j]
+                    n = n1 + 1
+
+                    nobs[lab, j] = n
+                    delta = val - M1[lab, j]
+                    delta_n = delta / n
+                    term1 = delta * delta_n * n1
+
+                    M1[lab, j] += delta_n
+                    M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
+                    M2[lab, j] += term1
+                elif not skipna:
+                    M1[lab, j] = NaN
+                    M2[lab, j] = NaN
+                    M3[lab, j] = NaN
+
+        for i in range(ngroups):
+            for j in range(K):
+                ct = <float64_t>nobs[i, j]
+                if ct < 3:
+                    if result_mask is not None:
+                        result_mask[i, j] = 1
+                    out[i, j] = NaN
+                elif M2[i, j] == 0:
+                    out[i, j] = 0
+                else:
+                    out[i, j] = (
+                        (ct * (ct - 1) ** 0.5 / (ct - 2))
+                        * (M3[i, j] / M2[i, j] ** 1.5)
+                    )
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_mean(
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1084,14 +1084,27 @@ def skew(
         Parrot    1.457863
         Name: Max Speed, dtype: float64
         """
-        result = self._op_via_apply(
-            "skew",
-            axis=axis,
-            skipna=skipna,
-            numeric_only=numeric_only,
-            **kwargs,
+        if axis is lib.no_default:
+            axis = 0
+
+        if axis != 0:
+            result = self._op_via_apply(
+                "skew",
+                axis=axis,
+                skipna=skipna,
+                numeric_only=numeric_only,
+                **kwargs,
+            )
+            return result
+
+        def alt(obj):
+            # This should not be reached since the cython path should raise
+            #  TypeError and not NotImplementedError.
+            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")
+
+        return self._cython_agg_general(
+            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
         )
-        return result
 
     @property
     @doc(Series.plot.__doc__)
@@ -2567,14 +2580,27 @@ def skew(
         bird          NaN
         mammal   1.669046
         """
-        result = self._op_via_apply(
-            "skew",
-            axis=axis,
-            skipna=skipna,
-            numeric_only=numeric_only,
-            **kwargs,
+        if axis is lib.no_default:
+            axis = 0
+
+        if axis != 0:
+            result = self._op_via_apply(
+                "skew",
+                axis=axis,
+                skipna=skipna,
+                numeric_only=numeric_only,
+                **kwargs,
+            )
+            return result
+
+        def alt(obj):
+            # This should not be reached since the cython path should raise
+            #  TypeError and not NotImplementedError.
+            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")
+
+        return self._cython_agg_general(
+            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
         )
-        return result
 
     @property
     @doc(DataFrame.plot.__doc__)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -139,6 +139,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
             "var": "group_var",
             "std": functools.partial(libgroupby.group_var, name="std"),
             "sem": functools.partial(libgroupby.group_var, name="sem"),
+            "skew": "group_skew",
             "first": "group_nth",
             "last": "group_last",
             "ohlc": "group_ohlc",
@@ -182,7 +183,10 @@ def _get_cython_function(
             elif how in ["std", "sem"]:
                 # We have a partial object that does not have __signatures__
                 return f
-            if "object" not in f.__signatures__:
+            elif how == "skew":
+                # _get_cython_vals will convert to float64
+                pass
+            elif "object" not in f.__signatures__:
                 # raise NotImplementedError here rather than TypeError later
                 raise NotImplementedError(
                     f"function is not implemented for this dtype: "
@@ -210,7 +214,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
         """
         how = self.how
 
-        if how in ["median", "std", "sem"]:
+        if how in ["median", "std", "sem", "skew"]:
             # median only has a float64 implementation
             # We should only get here with is_numeric, as non-numeric cases
             #  should raise in _get_cython_function
@@ -252,7 +256,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
             return
 
         if isinstance(dtype, CategoricalDtype):
-            if how in ["sum", "prod", "cumsum", "cumprod"]:
+            if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
                 raise TypeError(f"{dtype} type does not support {how} operations")
             if how in ["min", "max", "rank"] and not dtype.ordered:
                 # raise TypeError instead of NotImplementedError to ensure we
@@ -268,7 +272,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
             raise NotImplementedError(f"{dtype} dtype not supported")
         elif is_datetime64_any_dtype(dtype):
             # Adding/multiplying datetimes is not valid
-            if how in ["sum", "prod", "cumsum", "cumprod", "var"]:
+            if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
                 raise TypeError(f"datetime64 type does not support {how} operations")
             if how in ["any", "all"]:
                 # GH#34479
@@ -281,7 +285,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
 
         elif is_period_dtype(dtype):
             # Adding/multiplying Periods is not valid
-            if how in ["sum", "prod", "cumsum", "cumprod", "var"]:
+            if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
                 raise TypeError(f"Period type does not support {how} operations")
             if how in ["any", "all"]:
                 # GH#34479
@@ -294,7 +298,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj):
 
         elif is_timedelta64_dtype(dtype):
             # timedeltas we can add but not multiply
-            if how in ["prod", "cumprod"]:
+            if how in ["prod", "cumprod", "skew"]:
                 raise TypeError(f"timedelta64 type does not support {how} operations")
 
     def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
@@ -643,6 +647,19 @@ def _call_cython_op(
                     **kwargs,
                 )
                 result = result.astype(bool, copy=False)
+            elif self.how in ["skew"]:
+                func(
+                    out=result,
+                    counts=counts,
+                    values=values,
+                    labels=comp_ids,
+                    mask=mask,
+                    result_mask=result_mask,
+                    **kwargs,
+                )
+                if dtype == object:
+                    result = result.astype(object)
+
             else:
                 raise NotImplementedError(f"{self.how} is not implemented")
         else:
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -1529,6 +1529,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
         "min",
         "max",
         "prod",
+        "skew",
     )
 
     # Test default behavior; kernels that fail may be enabled in the future but kernels
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -965,7 +965,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
 def test_raise_on_nuisance_python_single(df):
     # GH 38815
     grouped = df.groupby("A")
-    with pytest.raises(TypeError, match="could not convert"):
+    with pytest.raises(ValueError, match="could not convert"):
         grouped.skew()
 
 
@@ -1972,14 +1972,14 @@ def get_categorical_invalid_expected():
         if is_dt64 or is_cat or is_per:
             # GH#41291
             # datetime64 -> prod and sum are invalid
-            if op == "skew":
-                msg = "does not support reduction 'skew'"
-            elif is_dt64:
+            if is_dt64:
                 msg = "datetime64 type does not support"
             elif is_per:
                 msg = "Period type does not support"
             else:
                 msg = "category type does not support"
+            if op == "skew":
+                msg = "|".join([msg, "does not support reduction 'skew'"])
             with pytest.raises(TypeError, match=msg):
                 get_result()
 
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
@@ -159,7 +159,7 @@ def test_groupby_raises_string(
         "sem": (ValueError, "could not convert string to float"),
         "shift": (None, ""),
         "size": (None, ""),
-        "skew": (TypeError, "could not convert string to float"),
+        "skew": (ValueError, "could not convert string to float"),
         "std": (ValueError, "could not convert string to float"),
         "sum": (None, ""),
         "var": (TypeError, "could not convert string to float"),
@@ -249,7 +249,15 @@ def test_groupby_raises_datetime(
         "sem": (None, ""),
         "shift": (None, ""),
         "size": (None, ""),
-        "skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"),
+        "skew": (
+            TypeError,
+            "|".join(
+                [
+                    r"dtype datetime64\[ns\] does not support reduction",
+                    "datetime64 type does not support skew operations",
+                ]
+            ),
+        ),
         "std": (None, ""),
         "sum": (TypeError, "datetime64 type does not support sum operations"),
         "var": (TypeError, "datetime64 type does not support var operations"),
@@ -407,7 +415,12 @@ def test_groupby_raises_category(
         "size": (None, ""),
         "skew": (
             TypeError,
-            "'Categorical' with dtype category does not support reduction 'skew'",
+            "|".join(
+                [
+                    "dtype category does not support reduction 'skew'",
+                    "category type does not support skew operations",
+                ]
+            ),
         ),
         "std": (
             TypeError,
@@ -575,7 +588,12 @@ def test_groupby_raises_category_on_category(
         "size": (None, ""),
         "skew": (
             TypeError,
-            "'Categorical' with dtype category does not support reduction 'skew'",
+            "|".join(
+                [
+                    "category type does not support skew operations",
+                    "dtype category does not support reduction 'skew'",
+                ]
+            ),
         ),
         "std": (
             TypeError,
diff --git a/pandas/tests/groupby/test_skew.py b/pandas/tests/groupby/test_skew.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_groupby_skew_equivalence():
+    # Test that that groupby skew method (which uses libgroupby.group_skew)
+    #  matches the results of operating group-by-group (which uses nanops.nanskew)
+    nrows = 1000
+    ngroups = 3
+    ncols = 2
+    nan_frac = 0.05
+
+    arr = np.random.randn(nrows, ncols)
+    arr[np.random.random(nrows) < nan_frac] = np.nan
+
+    df = pd.DataFrame(arr)
+    grps = np.random.randint(0, ngroups, size=nrows)
+    gb = df.groupby(grps)
+
+    result = gb.skew()
+
+    grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
+    expected = pd.concat(grpwise, axis=0)
+    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
+    tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -1529,6 +1529,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):`
`1529`	`1529`	`"min",`
`1530`	`1530`	`"max",`
`1531`	`1531`	`"prod",`
	`1532`	`+ "skew",`
`1532`	`1533`	`)`
`1533`	`1534`
`1534`	`1535`	`# Test default behavior; kernels that fail may be enabled in the future but kernels`