Skip to content

Commit 7ccffcf

Browse files
committed
Make group_mean compatible with NaT
NaT is the datetime equivalent of NaN and is set to be the lowest possible 64-bit integer -(2**63). Previously, we could not support this value in any `groupby.mean()` calculations, which led to pandas-dev#43132. On a high level, we slightly modify the `group_mean` to not count NaT values. To do so, we introduce the `is_datetimelike` parameter to the function call (already present in other functions, e.g., `group_cumsum`) and refactor and extend `#_treat_as_na` to work with float64. ## Tests This PR adds an integration and two unit tests for the new functionality. In contrast to other tests in classes, I've tried to keep an individual test's scope as small as possible. Additionally, I've taken the liberty to: * Add a docstring for the group_mean algorithm. * Change the algorithm to use guard clauses instead of if/else. * Add a comment that we're using the Kahan summation (the compensation part initially confused me, and I only stumbled upon Kahan when browsing the file). - [x] closes pandas-dev#43132 - [x] tests added / passed - [x] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them - [x] whatsnew entry
1 parent 415dec5 commit 7ccffcf

File tree

6 files changed

+104
-15
lines changed

6 files changed

+104
-15
lines changed

doc/source/whatsnew/v1.3.3.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ Performance improvements
4646
Bug fixes
4747
~~~~~~~~~
4848
- Fixed bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` with ``engine="numba"`` where ``index`` data was not being correctly passed into ``func`` (:issue:`43133`)
49-
49+
- :meth:`.GroupBy.mean` now supports ``NaT`` values (:issue:`43132`)
50+
-
5051
.. ---------------------------------------------------------------------------
5152
5253
.. _whatsnew_133.contributors:

pandas/_libs/groupby.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def group_mean(
7575
values: np.ndarray, # ndarray[floating, ndim=2]
7676
labels: np.ndarray, # const intp_t[:]
7777
min_count: int = ...,
78+
is_datetimelike: bool = ...,
7879
) -> None: ...
7980
def group_ohlc(
8081
out: np.ndarray, # floating[:, ::1]

pandas/_libs/groupby.pyx

+46-14
Original file line numberDiff line numberDiff line change
@@ -675,10 +675,38 @@ def group_mean(floating[:, ::1] out,
675675
int64_t[::1] counts,
676676
ndarray[floating, ndim=2] values,
677677
const intp_t[::1] labels,
678-
Py_ssize_t min_count=-1) -> None:
678+
Py_ssize_t min_count=-1,
679+
bint is_datetimelike=False) -> None:
680+
"""
681+
Compute the mean per label given a label assignment for each value.
682+
NaN values are ignored.
683+
684+
Parameters
685+
----------
686+
out : np.ndarray[floating]
687+
Values into which this method will write its results.
688+
counts : np.ndarray[int64]
689+
A zeroed array of the same shape as labels,
690+
populated by group sizes during algorithm.
691+
values : np.ndarray[floating]
692+
2-d array of the values to find the mean of.
693+
labels : np.ndarray[np.intp]
694+
Array containing unique label for each group, with its
695+
ordering matching up to the corresponding record in `values`.
696+
is_datetimelike : bool
697+
True if `values` contains datetime-like entries.
698+
min_count : Py_ssize_t
699+
Only used in add and prod. Always -1.
700+
701+
Notes
702+
-----
703+
This method modifies the `out` parameter rather than returning an object.
704+
`counts` is modified to hold group sizes
705+
"""
706+
679707
cdef:
680708
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
681-
floating val, count, y, t
709+
floating val, count, y, t, nan_val = NPY_NAT if is_datetimelike else NAN
682710
floating[:, ::1] sumx, compensation
683711
int64_t[:, ::1] nobs
684712
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -688,14 +716,15 @@ def group_mean(floating[:, ::1] out,
688716
if len_values != len_labels:
689717
raise ValueError("len(index) != len(labels)")
690718

691-
nobs = np.zeros((<object>out).shape, dtype=np.int64)
692719
# the below is equivalent to `np.zeros_like(out)` but faster
720+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
693721
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
694722
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
695723

696724
N, K = (<object>values).shape
697725

698726
with nogil:
727+
# precompute count and sum
699728
for i in range(N):
700729
lab = labels[i]
701730
if lab < 0:
@@ -704,21 +733,24 @@ def group_mean(floating[:, ::1] out,
704733
counts[lab] += 1
705734
for j in range(K):
706735
val = values[i, j]
707-
# not nan
708-
if val == val:
709-
nobs[lab, j] += 1
710-
y = val - compensation[lab, j]
711-
t = sumx[lab, j] + y
712-
compensation[lab, j] = t - sumx[lab, j] - y
713-
sumx[lab, j] = t
714-
736+
if val != val or (is_datetimelike and val == NPY_NAT):
737+
continue
738+
# Use Kahan summation to reduce floating-point
739+
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
740+
nobs[lab, j] += 1
741+
y = val - compensation[lab, j]
742+
t = sumx[lab, j] + y
743+
compensation[lab, j] = t - sumx[lab, j] - y
744+
sumx[lab, j] = t
745+
746+
# fill output array
715747
for i in range(ncounts):
716748
for j in range(K):
717749
count = nobs[i, j]
718750
if nobs[i, j] == 0:
719-
out[i, j] = NAN
720-
else:
721-
out[i, j] = sumx[i, j] / count
751+
out[i, j] = nan_val
752+
continue
753+
out[i, j] = sumx[i, j] / count
722754

723755

724756
@cython.wraparound(False)

pandas/core/groupby/ops.py

+9
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,15 @@ def _call_cython_op(
525525
result_mask=result_mask,
526526
is_datetimelike=is_datetimelike,
527527
)
528+
elif self.how == "mean":
529+
func(
530+
result,
531+
counts,
532+
values,
533+
comp_ids,
534+
min_count,
535+
is_datetimelike=is_datetimelike,
536+
)
528537
else:
529538
func(result, counts, values, comp_ids, min_count)
530539
else:

pandas/tests/groupby/test_libgroupby.py

+36
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pandas._libs.groupby import (
55
group_cumprod_float64,
66
group_cumsum,
7+
group_mean,
78
group_var,
89
)
910

@@ -234,3 +235,38 @@ def test_cython_group_transform_algos():
234235
]
235236
)
236237
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
238+
239+
240+
def test_cython_group_mean_datetimelike():
241+
actual = np.zeros(shape=(1, 1), dtype="float64")
242+
counts = np.array([0], dtype="int64")
243+
data = (
244+
np.array(
245+
[np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
246+
dtype="m8[ns]",
247+
)[:, None]
248+
.view("int64")
249+
.astype("float64")
250+
)
251+
labels = np.zeros(len(data), dtype=np.intp)
252+
253+
group_mean(actual, counts, data, labels, is_datetimelike=True)
254+
255+
tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
256+
257+
258+
def test_cython_group_mean_not_datetimelike():
259+
actual = np.zeros(shape=(3, 1), dtype="float64")
260+
counts = np.zeros(2, dtype="int64")
261+
data = np.array(
262+
[
263+
[np.float64(1), np.float64(2), np.float64(3)],
264+
[np.nan, np.float64(2), np.float64(3)],
265+
],
266+
dtype="float64",
267+
)
268+
labels = np.array([0, 0], dtype=np.intp)
269+
270+
group_mean(actual, counts, data, labels, is_datetimelike=False)
271+
272+
tm.assert_numpy_array_equal(actual[:, 0], np.array([1, 2, 3], dtype="float64"))

pandas/tests/groupby/transform/test_transform.py

+10
Original file line numberDiff line numberDiff line change
@@ -1289,3 +1289,13 @@ def test_transform_cumcount():
12891289

12901290
result = grp.transform("cumcount")
12911291
tm.assert_series_equal(result, expected)
1292+
1293+
1294+
def test_group_mean_timedelta_nat():
1295+
series = Series(["1 day", "3 days", pd.NaT], dtype="timedelta64[ns]")
1296+
1297+
group = series.groupby([1, 1, 1])
1298+
result = group.transform("mean")
1299+
1300+
expected = Series(["2 days", "2 days", "2 days"], dtype="timedelta64[ns]")
1301+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)