
Commit 0b520a1

Make group_mean compatible with NaT
NaT is the datetime equivalent of NaN and is stored as the lowest possible 64-bit integer, -(2**63). Previously, we could not support this value in any `groupby.mean()` calculation, which led to pandas-dev#43132.
1 parent 415dec5 commit 0b520a1
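
For reference (not part of the commit), a small snippet showing the integer sentinel that NaT maps to, and why the kernel's existing `val == val` NaN check never caught it:

import numpy as np

# NaT is stored as the minimum signed 64-bit integer; datetime64 and
# timedelta64 share this sentinel.
nat_as_int = np.array(["NaT"], dtype="datetime64[ns]").view("int64")[0]
assert nat_as_int == np.iinfo(np.int64).min == -(2**63)

# Cast to float64, as the groupby kernels do with datetimelike input, the
# sentinel becomes a huge negative number rather than NaN, so the usual
# `val == val` missing-value check lets it through.
print(float(nat_as_int))  # -9.223372036854776e+18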

File tree

6 files changed (+173, -8 lines)


doc/source/whatsnew/v1.3.4.rst (new file, +44)

@@ -0,0 +1,44 @@
+.. _whatsnew_134:
+
+What's new in 1.3.4 (October ??, 2021)
+--------------------------------------
+
+These are the changes in pandas 1.3.4. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_134.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_134.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_134.other:
+
+Other
+~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_134.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.3.3..v1.3.4|HEAD
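
As a user-level illustration of the whatsnew entry (my example, not part of the diff; the column names are made up), this is the kind of call affected by GH-43132:

import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "a"],
        "ts": pd.to_datetime(["2021-01-01 00:00", "NaT", "2021-01-01 02:00"]),
    }
)

# With the fix, NaT is skipped just like NaN in a float mean, so the group
# mean should come out as 2021-01-01 01:00; previously the NaT sentinel
# leaked into the sum and produced an incorrect result.
result = df.groupby("key")["ts"].mean()
print(result)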

pandas/_libs/groupby.pyi (+4, -1)

@@ -74,7 +74,10 @@ def group_mean(
     counts: np.ndarray,  # int64_t[::1]
     values: np.ndarray,  # ndarray[floating, ndim=2]
     labels: np.ndarray,  # const intp_t[:]
-    min_count: int = ...,
+    min_count: int = ...,  # Py_ssize_t
+    is_datetimelike: bool = ...,  # bint
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_ohlc(
     out: np.ndarray,  # floating[:, ::1]

pandas/_libs/groupby.pyx (+41, -5)

@@ -675,10 +675,45 @@ def group_mean(floating[:, ::1] out,
                int64_t[::1] counts,
                ndarray[floating, ndim=2] values,
                const intp_t[::1] labels,
-               Py_ssize_t min_count=-1) -> None:
+               Py_ssize_t min_count=-1,
+               bint is_datetimelike=False,
+               const uint8_t[:, ::1] mask=None,
+               uint8_t[:, ::1] result_mask=None
+               ) -> None:
+    """
+    Compute the mean per label given a label assignment for each value.
+    NaN values are ignored.
+
+    Parameters
+    ----------
+    out : np.ndarray[floating]
+        Values into which this method will write its results.
+    counts : np.ndarray[int64]
+        A zeroed array of the same shape as labels,
+        populated by group sizes during algorithm.
+    values : np.ndarray[floating]
+        2-d array of the values to find the mean of.
+    labels : np.ndarray[np.intp]
+        Array containing unique label for each group, with its
+        ordering matching up to the corresponding record in `values`.
+    min_count : Py_ssize_t
+        Only used in add and prod. Always -1.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+    mask : ndarray[bool, ndim=2], optional
+        Not used.
+    result_mask : ndarray[bool, ndim=2], optional
+        Not used.
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object.
+    `counts` is modified to hold group sizes
+    """
+
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        floating val, count, y, t
+        floating val, count, y, t, nan_val
         floating[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)

@@ -688,12 +723,13 @@
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")

-    nobs = np.zeros((<object>out).shape, dtype=np.int64)
     # the below is equivalent to `np.zeros_like(out)` but faster
+    nobs = np.zeros((<object>out).shape, dtype=np.int64)
     sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
     compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
+    nan_val = NPY_NAT if is_datetimelike else NAN

     with nogil:
         for i in range(N):

@@ -705,7 +741,7 @@
             for j in range(K):
                 val = values[i, j]
                 # not nan
-                if val == val:
+                if val == val and not (is_datetimelike and val == NPY_NAT):
                     nobs[lab, j] += 1
                     y = val - compensation[lab, j]
                     t = sumx[lab, j] + y

@@ -716,7 +752,7 @@
             for j in range(K):
                 count = nobs[i, j]
                 if nobs[i, j] == 0:
-                    out[i, j] = NAN
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = sumx[i, j] / count
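
To make the Cython change easier to follow, here is a plain-NumPy sketch (mine, not the library's code) of what the patched `group_mean` does: values arrive as a 2-D float64 array (datetimelike data has already been viewed as int64 and cast), NaN and, when `is_datetimelike`, the NaT sentinel are skipped during Kahan-compensated summation, and groups with no observations get `nan_val` (the NaT sentinel for datetimelike output, NaN otherwise):

import numpy as np

NPY_NAT = np.iinfo(np.int64).min  # -(2**63), the NaT sentinel


def group_mean_sketch(values, labels, ngroups, is_datetimelike=False):
    """Rough Python equivalent of the patched kernel (2-D float64 `values`)."""
    N, K = values.shape
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    sumx = np.zeros((ngroups, K), dtype=np.float64)
    comp = np.zeros((ngroups, K), dtype=np.float64)  # Kahan compensation
    out = np.empty((ngroups, K), dtype=np.float64)
    nan_val = NPY_NAT if is_datetimelike else np.nan

    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        for j in range(K):
            val = values[i, j]
            # skip NaN, and skip the NaT sentinel only for datetimelike data
            if val == val and not (is_datetimelike and val == NPY_NAT):
                nobs[lab, j] += 1
                y = val - comp[lab, j]
                t = sumx[lab, j] + y
                comp[lab, j] = t - sumx[lab, j] - y
                sumx[lab, j] = t

    for i in range(ngroups):
        for j in range(K):
            if nobs[i, j] == 0:
                # empty (e.g. all-NaT) group: write the missing-value marker
                out[i, j] = nan_val
            else:
                out[i, j] = sumx[i, j] / nobs[i, j]
    return out


# Example: mean of [2ns, 4ns, NaT] in a single group is 3.0 (as float64)
data = np.array([2, 4, NPY_NAT], dtype="float64")[:, None]
print(group_mean_sketch(data, np.zeros(3, dtype=np.intp), ngroups=1,
                        is_datetimelike=True))  # [[3.]]

An all-NaT group therefore comes back as the sentinel value, which the higher-level code re-views as NaT rather than producing a nonsense timestamp.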

pandas/core/groupby/ops.py (+1, -1)

@@ -514,7 +514,7 @@ def _call_cython_op(
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if self.kind == "aggregate":
             counts = np.zeros(ngroups, dtype=np.int64)
-            if self.how in ["min", "max"]:
+            if self.how in ["min", "max", "mean"]:
                 func(
                     result,
                     counts,

pandas/tests/groupby/aggregate/test_aggregate.py (+33, -1)

@@ -66,7 +66,6 @@ def test_agg_ser_multi_key(df):


 def test_groupby_aggregation_mixed_dtype():
-
     # GH 6212
     expected = DataFrame(
         {

@@ -1274,3 +1273,36 @@ def func(ser):

     expected = DataFrame([[1.0]], index=[1])
     tm.assert_frame_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_output",
+    [
+        (  # timedelta
+            {"dtype": "timedelta64[ns]", "values": ["1 day", "3 days", "NaT"]},
+            {"dtype": "timedelta64[ns]", "values": ["2 days"]},
+        ),
+        (  # datetime
+            {
+                "dtype": "datetime64[ns]",
+                "values": ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
+            },
+            {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]},
+        ),
+        (  # timezoned data
+            {
+                "dtype": "datetime64[ns]",
+                "values": ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
+            },
+            {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]},
+        ),
+    ],
+)
+def test_group_mean_timedelta_nat(input_data, expected_output):
+    data = Series(input_data["values"], dtype=input_data["dtype"])
+
+    actual = data.groupby([0, 0, 0]).mean()
+
+    expected = Series(expected_output["values"], dtype=expected_output["dtype"])
+
+    tm.assert_series_equal(actual, expected)

pandas/tests/groupby/test_libgroupby.py (+50)

@@ -1,9 +1,11 @@
 import numpy as np
+import pytest

 from pandas._libs import groupby as libgroupby
 from pandas._libs.groupby import (
     group_cumprod_float64,
     group_cumsum,
+    group_mean,
     group_var,
 )


@@ -234,3 +236,51 @@ def test_cython_group_transform_algos():
         ]
     )
     tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
+
+
+def test_cython_group_mean_datetimelike():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.array([0], dtype="int64")
+    data = (
+        np.array(
+            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
+            dtype="m8[ns]",
+        )[:, None]
+        .view("int64")
+        .astype("float64")
+    )
+    labels = np.zeros(len(data), dtype=np.intp)
+
+    group_mean(actual, counts, data, labels, is_datetimelike=True)
+
+    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
+
+
+def test_cython_group_mean_wrong_min_count():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.zeros(1, dtype="int64")
+    data = np.zeros(1, dtype="float64")[:, None]
+    labels = np.zeros(1, dtype=np.intp)
+
+    with pytest.raises(AssertionError, match="min_count"):
+        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
+
+
+def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.array([0], dtype="int64")
+    data = (
+        np.array(
+            [np.timedelta64("NaT"), np.timedelta64("NaT")],
+            dtype="m8[ns]",
+        )[:, None]
+        .view("int64")
+        .astype("float64")
+    )
+    labels = np.zeros(len(data), dtype=np.intp)
+
+    group_mean(actual, counts, data, labels, is_datetimelike=False)
+
+    tm.assert_numpy_array_equal(
+        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
+    )
