Skip to content

Commit 3f271b2

Browse files
committed
Make group_mean compatible with NaT
NaT is the datetime equivalent of NaN and is set to be the lowest possible 64 bit integer -(2**63). Previously, we could not support this value in any groupby.mean() calculations, which led to #43132. At a high level, we slightly modify `group_mean` to not count NaT values. To do so, we introduce the `is_datetimelike` parameter to the function call (already present in other functions, e.g., `group_cumsum`) and refactor and extend `_treat_as_na` to work with float64. This PR adds an additional integration test and a unit test for the new functionality. In contrast to other tests grouped in classes, I've tried to keep an individual test's scope as small as possible. Additionally, I've taken the liberty to: * Add a docstring for the group_mean algorithm. * Change the algorithm to use guard clauses instead of else/if. * Add a comment that we're using Kahan summation (the compensation part initially confused me, and I only stumbled upon Kahan when browsing the file). - [x] closes #43132 - [x] tests added / passed - [x] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them - [x] whatsnew entry => different format but it's there
1 parent b17379b commit 3f271b2

File tree

5 files changed

+77
-17
lines changed

5 files changed

+77
-17
lines changed

pandas/_libs/groupby.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,7 @@ def group_mean(
7575
values: np.ndarray, # ndarray[floating, ndim=2]
7676
labels: np.ndarray, # const intp_t[:]
7777
min_count: int = ...,
78+
is_datetimelike: bool = ...,
7879
) -> None: ...
7980
def group_ohlc(
8081
out: np.ndarray, # floating[:, ::1]

pandas/_libs/groupby.pyx

+45-16
Original file line number | Diff line number | Diff line change
@@ -675,10 +675,36 @@ def group_mean(floating[:, ::1] out,
675675
int64_t[::1] counts,
676676
ndarray[floating, ndim=2] values,
677677
const intp_t[::1] labels,
678-
Py_ssize_t min_count=-1) -> None:
678+
Py_ssize_t min_count=-1,
679+
bint is_datetimelike = False) -> None:
680+
"""
681+
Compute the mean per label given a label assignment for each value. NaN values are ignored.
682+
683+
Parameters
684+
----------
685+
out : np.ndarray[floating]
686+
Values into which this method will write its results.
687+
counts : np.ndarray[int64]
688+
A zeroed array of the same shape as labels, populated by group sizes during algorithm.
689+
values : np.ndarray[floating]
690+
2-d array of the values to find the mean of.
691+
labels : np.ndarray[np.intp]
692+
Array containing unique label for each group, with its
693+
ordering matching up to the corresponding record in `values`.
694+
is_datetimelike : bool
695+
True if `values` contains datetime-like entries.
696+
min_count : Py_ssize_t
697+
Only used in add and prod. Always -1.
698+
699+
Notes
700+
-----
701+
This method modifies the `out` parameter rather than returning an object.
702+
`counts` is modified to hold group sizes
703+
"""
704+
679705
cdef:
680706
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
681-
floating val, count, y, t
707+
floating val, count, y, t, nan_val = NPY_NAT if is_datetimelike else NAN
682708
floating[:, ::1] sumx, compensation
683709
int64_t[:, ::1] nobs
684710
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -688,14 +714,15 @@ def group_mean(floating[:, ::1] out,
688714
if len_values != len_labels:
689715
raise ValueError("len(index) != len(labels)")
690716

691-
nobs = np.zeros((<object>out).shape, dtype=np.int64)
692717
# the below is equivalent to `np.zeros_like(out)` but faster
718+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
693719
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
694720
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
695721

696722
N, K = (<object>values).shape
697723

698724
with nogil:
725+
# precompute count and sum
699726
for i in range(N):
700727
lab = labels[i]
701728
if lab < 0:
@@ -704,22 +731,24 @@ def group_mean(floating[:, ::1] out,
704731
counts[lab] += 1
705732
for j in range(K):
706733
val = values[i, j]
707-
# not nan
708-
if val == val:
709-
nobs[lab, j] += 1
710-
y = val - compensation[lab, j]
711-
t = sumx[lab, j] + y
712-
compensation[lab, j] = t - sumx[lab, j] - y
713-
sumx[lab, j] = t
714-
734+
if _treat_as_na(val, is_datetimelike):
735+
continue
736+
# Use Kahan summation to reduce floating-point
737+
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
738+
nobs[lab, j] += 1
739+
y = val - compensation[lab, j]
740+
t = sumx[lab, j] + y
741+
compensation[lab, j] = t - sumx[lab, j] - y
742+
sumx[lab, j] = t
743+
744+
# fill output array
715745
for i in range(ncounts):
716746
for j in range(K):
717747
count = nobs[i, j]
718748
if nobs[i, j] == 0:
719-
out[i, j] = NAN
720-
else:
721-
out[i, j] = sumx[i, j] / count
722-
749+
out[i, j] = nan_val
750+
continue
751+
out[i, j] = sumx[i, j] / count
723752

724753
@cython.wraparound(False)
725754
@cython.boundscheck(False)
@@ -900,7 +929,7 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
900929
# or else cython will raise about gil acquisition.
901930
raise NotImplementedError
902931

903-
elif rank_t is int64_t:
932+
elif rank_t is int64_t or rank_t is float64_t:
904933
return is_datetimelike and val == NPY_NAT
905934
elif rank_t is uint64_t:
906935
# There is no NA value for uint64

pandas/core/groupby/ops.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -515,7 +515,7 @@ def _call_cython_op(
515515
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
516516
if self.kind == "aggregate":
517517
counts = np.zeros(ngroups, dtype=np.int64)
518-
if self.how in ["min", "max"]:
518+
if self.how in {"min", "max", "mean"}:
519519
func(
520520
result,
521521
counts,

pandas/tests/groupby/test_libgroupby.py

+20
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
group_cumprod_float64,
66
group_cumsum,
77
group_var,
8+
group_mean,
89
)
910

1011
from pandas.core.dtypes.common import ensure_platform_int
@@ -234,3 +235,22 @@ def test_cython_group_transform_algos():
234235
]
235236
)
236237
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
238+
239+
240+
def test_cython_group_mean_timedelta():
241+
is_datetimelike = True
242+
actual = np.zeros(shape=(1, 1), dtype="float64")
243+
counts = np.array([0], dtype="int64")
244+
data = (
245+
np.array(
246+
[np.datetime64(2, "ns"), np.datetime64(4, "ns"), np.datetime64("NaT")],
247+
dtype="m8[ns]",
248+
)[:, None]
249+
.view("int64")
250+
.astype("float64")
251+
)
252+
labels = np.zeros(len(data), dtype="int64")
253+
254+
group_mean(actual, counts, data, labels, is_datetimelike=True)
255+
256+
tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))

pandas/tests/groupby/transform/test_transform.py

+10
Original file line number | Diff line number | Diff line change
@@ -1289,3 +1289,13 @@ def test_transform_cumcount():
12891289

12901290
result = grp.transform("cumcount")
12911291
tm.assert_series_equal(result, expected)
1292+
1293+
1294+
def test_group_mean_timedelta_nat():
1295+
series = Series(["1 day", "3 days", pd.NaT], dtype="timedelta64[ns]")
1296+
1297+
group = series.groupby([1, 1, 1])
1298+
result = group.transform("mean")
1299+
1300+
expected = Series(["2 days", "2 days", "2 days"], dtype="timedelta64[ns]")
1301+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)