From 3237d650838af08ac5f6d5080d1362be0fc6033f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Nov 2021 13:58:38 -0800 Subject: [PATCH 1/4] BUG: groupby.sum with timedelta64 and NaT --- pandas/_libs/groupby.pyi | 1 + pandas/_libs/groupby.pyx | 8 ++++++-- pandas/core/groupby/ops.py | 10 ++++++++++ pandas/tests/groupby/test_function.py | 24 ++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 0450a3483d346..8eccd0eec8a1c 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -56,6 +56,7 @@ def group_add( values: np.ndarray, # ndarray[complexfloating_t, ndim=2] labels: np.ndarray, # const intp_t[:] min_count: int = ..., + datetimelike: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 078cb8e02e824..c7d51004182d1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -495,7 +495,8 @@ def group_add(add_t[:, ::1] out, int64_t[::1] counts, ndarray[add_t, ndim=2] values, const intp_t[::1] labels, - Py_ssize_t min_count=0) -> None: + Py_ssize_t min_count=0, + bint datetimelike=False) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -557,7 +558,10 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + # With dt64/td64 values, values have been cast to float64 + # instead of int64 for group_add, but the logic + # is otherwise the same as in _treat_as_na + if val == val and not (add_t is float64_t and datetimelike and val == NPY_NAT): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8223a04883738..7915e107afae6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -531,6 +531,16 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) + elif self.how in ["add"]: 
+ # We support datetimelike + func( + result, + counts, + values, + comp_ids, + min_count, + datetimelike=is_datetimelike, + ) else: func(result, counts, values, comp_ids, min_count) else: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c462db526b36d..39a3e82fc2d98 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1162,3 +1162,27 @@ def test_mean_on_timedelta(): pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") ) tm.assert_series_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected) From 6c4800b4f0dd1da204808f2d4a338ba5419e06e3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Nov 2021 16:13:31 -0800 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v1.3.5.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index dabd9a650f45b..048cd978c4478 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in creating a :class:`DataFrame` from a timezone-aware :class:`Timestamp` scalar near a Daylight Savings Time transition (:issue:`42505`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) - Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean 
categories (:issue:`44351`) +- Fixed regression in :meth:`.GroupBy.sum` with ``timedelta64[ns]`` dtype containing ``NaT`` failing to treat that value as NA (:issue:`42659`) - .. --------------------------------------------------------------------------- From 1f4f8645f3e173d85f8511b2865ba895682b0f94 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Nov 2021 16:14:29 -0800 Subject: [PATCH 3/4] lint fixup --- pandas/_libs/groupby.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c7d51004182d1..492abb11bf436 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -561,7 +561,11 @@ def group_add(add_t[:, ::1] out, # With dt64/td64 values, values have been cast to float64 # instead of int64 for group_add, but the logic # is otherwise the same as in _treat_as_na - if val == val and not (add_t is float64_t and datetimelike and val == NPY_NAT): + if val == val and not ( + add_t is float64_t + and datetimelike + and val == NPY_NAT + ): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y From f3aad3b4cf531992b5c06cd23b1f330fe330dfb7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Nov 2021 18:13:14 -0800 Subject: [PATCH 4/4] lint fixup --- pandas/_libs/groupby.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 492abb11bf436..c0d1405e92518 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -562,10 +562,10 @@ def group_add(add_t[:, ::1] out, # instead of int64 for group_add, but the logic # is otherwise the same as in _treat_as_na if val == val and not ( - add_t is float64_t - and datetimelike - and val == NPY_NAT - ): + add_t is float64_t + and datetimelike + and val == NPY_NAT + ): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y