Skip to content

Commit 3352625

Browse files
authored
BUG: groupby.sum with timedelta64 and NaT (#44658)
1 parent e7aae84 commit 3352625

File tree

5 files changed

+46
-2
lines changed

5 files changed

+46
-2
lines changed

doc/source/whatsnew/v1.3.5.rst

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression in creating a :class:`DataFrame` from a timezone-aware :class:`Timestamp` scalar near a Daylight Saving Time transition (:issue:`42505`)
2020
- Fixed performance regression in :func:`read_csv` (:issue:`44106`)
2121
- Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`)
22+
- Fixed regression in :meth:`.GroupBy.sum` with ``timedelta64[ns]`` dtype containing ``NaT`` failing to treat that value as NA (:issue:`42659`)
2223
-
2324

2425
.. ---------------------------------------------------------------------------

pandas/_libs/groupby.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def group_add(
5656
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
5757
labels: np.ndarray, # const intp_t[:]
5858
min_count: int = ...,
59+
datetimelike: bool = ...,
5960
) -> None: ...
6061
def group_prod(
6162
out: np.ndarray, # floating[:, ::1]

pandas/_libs/groupby.pyx

+10-2
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,8 @@ def group_add(add_t[:, ::1] out,
495495
int64_t[::1] counts,
496496
ndarray[add_t, ndim=2] values,
497497
const intp_t[::1] labels,
498-
Py_ssize_t min_count=0) -> None:
498+
Py_ssize_t min_count=0,
499+
bint datetimelike=False) -> None:
499500
"""
500501
Only aggregates on axis=0 using Kahan summation
501502
"""
@@ -557,7 +558,14 @@ def group_add(add_t[:, ::1] out,
557558
val = values[i, j]
558559

559560
# not nan
560-
if val == val:
561+
# With dt64/td64 values, values have been cast to float64
562+
# instead of int64 for group_add, but the logic
563+
# is otherwise the same as in _treat_as_na
564+
if val == val and not (
565+
add_t is float64_t
566+
and datetimelike
567+
and val == <float64_t>NPY_NAT
568+
):
561569
nobs[lab, j] += 1
562570
y = val - compensation[lab, j]
563571
t = sumx[lab, j] + y

pandas/core/groupby/ops.py

+10
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,16 @@ def _call_cython_op(
531531
result_mask=result_mask,
532532
is_datetimelike=is_datetimelike,
533533
)
534+
elif self.how in ["add"]:
535+
# We support datetimelike
536+
func(
537+
result,
538+
counts,
539+
values,
540+
comp_ids,
541+
min_count,
542+
datetimelike=is_datetimelike,
543+
)
534544
else:
535545
func(result, counts, values, comp_ids, min_count)
536546
else:

pandas/tests/groupby/test_function.py

+24
Original file line numberDiff line numberDiff line change
@@ -1162,3 +1162,27 @@ def test_mean_on_timedelta():
11621162
pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
11631163
)
11641164
tm.assert_series_equal(result, expected)
1165+
1166+
1167+
def test_groupby_sum_timedelta_with_nat():
1168+
# GH#42659
1169+
df = DataFrame(
1170+
{
1171+
"a": [1, 1, 2, 2],
1172+
"b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT],
1173+
}
1174+
)
1175+
td3 = pd.Timedelta(days=3)
1176+
1177+
gb = df.groupby("a")
1178+
1179+
res = gb.sum()
1180+
expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a"))
1181+
tm.assert_frame_equal(res, expected)
1182+
1183+
res = gb["b"].sum()
1184+
tm.assert_series_equal(res, expected["b"])
1185+
1186+
res = gb["b"].sum(min_count=2)
1187+
expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index)
1188+
tm.assert_series_equal(res, expected)

0 commit comments

Comments
 (0)