diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index dabd9a650f45b..048cd978c4478 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in creating a :class:`DataFrame` from a timezone-aware :class:`Timestamp` scalar near a Daylight Savings Time transition (:issue:`42505`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) - Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) +- Fixed regression in :meth:`.GroupBy.sum` with ``timedelta64[ns]`` dtype containing ``NaT`` failing to treat that value as NA (:issue:`42659`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index a4cbf0f8cfd44..5f6ed647204f9 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -53,6 +53,7 @@ def group_add( values: np.ndarray, # ndarray[complexfloating_t, ndim=2] labels: np.ndarray, # const intp_t[:] min_count: int = ..., + datetimelike: bool = ..., ) -> None: ... 
def group_prod( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ec6b1f80390e1..5868ea23c7389 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -483,7 +483,8 @@ def group_add(add_t[:, ::1] out, int64_t[::1] counts, ndarray[add_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=0) -> None: + Py_ssize_t min_count=0, + bint datetimelike=False) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -545,7 +546,14 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + # With dt64/td64 values, values have been cast to float64 + # instead of int64 for group_add, but the logic + # is otherwise the same as in _treat_as_na + if val == val and not ( + add_t is float64_t + and datetimelike + and val == NPY_NAT + ): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c906de6f1c453..dbc225454766d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -574,6 +574,16 @@ def _call_cython_op( min_count, is_datetimelike=is_datetimelike, ) + elif self.how in ["add"]: + # We support datetimelike + func( + result, + counts, + values, + comp_ids, + min_count, + datetimelike=is_datetimelike, + ) else: func(result, counts, values, comp_ids, min_count) else: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5434fc49e2174..bf58937314404 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1105,3 +1105,27 @@ def test_groupby_sum_below_mincount_nullable_integer(): result = grouped.sum(min_count=2) expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), 
pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected)