Skip to content

Backport PR #43467 on branch 1.3.x (Make group_mean compatible with NaT) #43630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v1.3.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
-
-
- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)

.. ---------------------------------------------------------------------------

Expand Down
5 changes: 4 additions & 1 deletion pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ def group_mean(
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
min_count: int = ..., # Py_ssize_t
is_datetimelike: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floating[:, ::1]
Expand Down
46 changes: 41 additions & 5 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -669,10 +669,45 @@ def group_mean(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1) -> None:
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None
) -> None:
"""
Compute the mean per label given a label assignment for each value.
NaN values are ignored.

Parameters
----------
out : np.ndarray[floating]
Values into which this method will write its results.
counts : np.ndarray[int64]
A zeroed array of the same shape as labels,
populated by group sizes during algorithm.
values : np.ndarray[floating]
2-d array of the values to find the mean of.
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`.
min_count : Py_ssize_t
Only used in add and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
mask : ndarray[bool, ndim=2], optional
Not used.
result_mask : ndarray[bool, ndim=2], optional
Not used.

Notes
-----
This method modifies the `out` parameter rather than returning an object.
`counts` is modified to hold group sizes
"""

cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, count, y, t
floating val, count, y, t, nan_val
floating[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
Expand All @@ -682,12 +717,13 @@ def group_mean(floating[:, ::1] out,
if len_values != len_labels:
raise ValueError("len(index) != len(labels)")

nobs = np.zeros((<object>out).shape, dtype=np.int64)
# the below is equivalent to `np.zeros_like(out)` but faster
nobs = np.zeros((<object>out).shape, dtype=np.int64)
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
nan_val = NPY_NAT if is_datetimelike else NAN

with nogil:
for i in range(N):
Expand All @@ -699,7 +735,7 @@ def group_mean(floating[:, ::1] out,
for j in range(K):
val = values[i, j]
# not nan
if val == val:
if val == val and not (is_datetimelike and val == NPY_NAT):
nobs[lab, j] += 1
y = val - compensation[lab, j]
t = sumx[lab, j] + y
Expand All @@ -710,7 +746,7 @@ def group_mean(floating[:, ::1] out,
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:
out[i, j] = NAN
out[i, j] = nan_val
else:
out[i, j] = sumx[i, j] / count

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ def _call_cython_op(
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
if self.kind == "aggregate":
counts = np.zeros(ngroups, dtype=np.int64)
if self.how in ["min", "max"]:
if self.how in ["min", "max", "mean"]:
func(
result,
counts,
Expand Down
34 changes: 33 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
MultiIndex,
Series,
concat,
to_datetime,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
Expand Down Expand Up @@ -66,7 +67,6 @@ def test_agg_ser_multi_key(df):


def test_groupby_aggregation_mixed_dtype():

# GH 6212
expected = DataFrame(
{
Expand Down Expand Up @@ -1274,3 +1274,35 @@ def func(ser):

expected = DataFrame([[1.0]], index=[1])
tm.assert_frame_equal(res, expected)


def test_group_mean_timedelta_nat():
    # GH43132: NaT entries must be skipped when averaging timedelta values.
    values = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]")

    result = values.groupby([0, 0, 0]).mean()

    # Single group containing 1 day and 3 days -> mean is 2 days; NaT ignored.
    expected = Series(["2 days"], dtype="timedelta64[ns]")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "input_data, expected_output",
    [
        # naive timestamps
        (
            ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
            ["2021-01-01T01:00"],
        ),
        # timezone-aware timestamps
        (
            ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
            ["2021-01-01T01:00-0100"],
        ),
    ],
)
def test_group_mean_datetime64_nat(input_data, expected_output):
    # GH43132: NaT entries must be skipped when averaging datetime values.
    result = to_datetime(Series(input_data)).groupby([0, 0, 0]).mean()
    expected = to_datetime(Series(expected_output))
    tm.assert_series_equal(result, expected)
50 changes: 50 additions & 0 deletions pandas/tests/groupby/test_libgroupby.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import numpy as np
import pytest

from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod_float64,
group_cumsum,
group_mean,
group_var,
)

Expand Down Expand Up @@ -234,3 +236,51 @@ def test_cython_group_transform_algos():
]
)
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)


def test_cython_group_mean_datetimelike():
    # With is_datetimelike=True, the NaT sentinel must be excluded from the mean.
    out = np.zeros((1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    tds = np.array(
        [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
        dtype="m8[ns]",
    )
    values = tds.view("int64").astype("float64").reshape(-1, 1)
    labels = np.zeros(len(values), dtype=np.intp)

    group_mean(out, counts, values, labels, is_datetimelike=True)

    # mean(2ns, 4ns) == 3ns; the NaT entry is skipped
    tm.assert_numpy_array_equal(out[:, 0], np.array([3], dtype="float64"))


def test_cython_group_mean_wrong_min_count():
    # group_mean only supports the default min_count=-1; any other value
    # trips its internal assertion.
    out = np.zeros((1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    values = np.zeros((1, 1), dtype="float64")
    labels = np.zeros(1, dtype=np.intp)

    with pytest.raises(AssertionError, match="min_count"):
        group_mean(out, counts, values, labels, is_datetimelike=True, min_count=0)


def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    # When is_datetimelike=False, the iNaT sentinel is treated as an ordinary
    # float, so it participates in the mean instead of being masked out.
    out = np.zeros((1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    nats = np.array([np.timedelta64("NaT"), np.timedelta64("NaT")], dtype="m8[ns]")
    values = nats.view("int64").astype("float64").reshape(-1, 1)
    labels = np.zeros(len(values), dtype=np.intp)

    group_mean(out, counts, values, labels, is_datetimelike=False)

    # The mean of two identical sentinel floats is that float itself.
    expected = np.array((values[0] + values[1]) / 2, dtype="float64")
    tm.assert_numpy_array_equal(out[:, 0], expected)