Make group_mean compatible with NaT #43467


Merged: 6 commits, Sep 18, 2021
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v1.3.4.rst
@@ -23,8 +23,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
--
--
+- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)

.. ---------------------------------------------------------------------------

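For context, a minimal reproduction of the bug being fixed (GH43132), mirroring the timedelta test added below; illustration only, not part of the diff:

```python
import pandas as pd

# GH43132: before this fix, a group mean over datetimelike values that
# include NaT returned an incorrect result, because the NaT sentinel was
# summed into the accumulator instead of being skipped.
s = pd.Series(pd.to_timedelta(["1 day", "3 days", "NaT"]))
print(s.groupby([0, 0, 0]).mean())  # with the fix: 2 days (NaT is ignored)
```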
5 changes: 4 additions & 1 deletion pandas/_libs/groupby.pyi
@@ -77,7 +77,10 @@ def group_mean(
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
-min_count: int = ...,
+min_count: int = ...,  # Py_ssize_t
+is_datetimelike: bool = ...,  # bint
+mask: np.ndarray | None = ...,
+result_mask: np.ndarray | None = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floating[:, ::1]
46 changes: 41 additions & 5 deletions pandas/_libs/groupby.pyx
@@ -673,10 +673,45 @@ def group_mean(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const intp_t[::1] labels,
-Py_ssize_t min_count=-1) -> None:
+Py_ssize_t min_count=-1,
+bint is_datetimelike=False,
+const uint8_t[:, ::1] mask=None,
+uint8_t[:, ::1] result_mask=None
+) -> None:
"""
Compute the mean per label given a label assignment for each value.
NaN values are ignored.

Parameters
----------
out : np.ndarray[floating]
Values into which this method will write its results.
counts : np.ndarray[int64]
A zeroed array of the same shape as labels,
populated by group sizes during the algorithm.
values : np.ndarray[floating]
2-d array of the values to find the mean of.
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`.
min_count : Py_ssize_t
Only used in add and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
mask : ndarray[bool, ndim=2], optional
Not used.
result_mask : ndarray[bool, ndim=2], optional
Not used.

Review thread:

Member: Why add these if they aren't used?

Member: Oh, so you can use the same call from the Python code?

Contributor Author: Correct. Originally I did not want to add useless extra parameters, but an earlier review comment remarked that an additional code path is not wanted, so I did not see any other way than to add these parameters here. I also thought about actually using the params, but that has nothing to do with the original scope of this PR, so I chose not to invest more time here.

Notes
-----
This method modifies the `out` parameter rather than returning an object.
`counts` is modified to hold group sizes.
"""

cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-floating val, count, y, t
+floating val, count, y, t, nan_val
floating[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -686,12 +721,13 @@ def group_mean(floating[:, ::1] out,
if len_values != len_labels:
raise ValueError("len(index) != len(labels)")

-nobs = np.zeros((<object>out).shape, dtype=np.int64)
+# the below is equivalent to `np.zeros_like(out)` but faster
+nobs = np.zeros((<object>out).shape, dtype=np.int64)
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
+nan_val = NPY_NAT if is_datetimelike else NAN

with nogil:
for i in range(N):
@@ -703,7 +739,7 @@
for j in range(K):
val = values[i, j]
# not nan
-if val == val:
+if val == val and not (is_datetimelike and val == NPY_NAT):
Review thread:

Member: Similar comment here: I would rather be explicit about the cast that's occurring to make this work (e.g. hopefully this is int equality, and not float equality, since comparing floats is problematic).

On a brief investigation, one potential flaw here is that with nan_val = NPY_NAT, it holds true not only that val == NPY_NAT, but also that val == NPY_NAT + 1.
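The observation above is easy to verify with plain NumPy (illustration, not part of the thread): -2**63 + 1 has no exact float64 representation, so after the cast it compares equal to the NaT sentinel.

```python
import numpy as np

NPY_NAT = np.iinfo(np.int64).min  # -2**63, the sentinel datetime64/timedelta64 use for NaT

val = np.float64(NPY_NAT)  # what the kernel sees after the float64 cast
print(val == NPY_NAT)      # True
print(val == NPY_NAT + 1)  # also True: -2**63 + 1 rounds to -2**63 in float64
```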

Contributor Author: We have an implicit guarantee that datetimelike arrays will be passed as int64 in calls to the function. We can prevent any problems by asserting, at the top of the function, that arrays with is_datetimelike=True come in as int64. Going to try that, and also to add a test for NPY_NAT + 1 just to be sure. Sounds good?

Member (@mzeitlin11, Sep 15, 2021): I don't think that is possible, because the fused type floating does not include int64, which is why I assumed the data is being cast to float before being passed to the algo.

A test for NPY_NAT + 1 would be great; if that's all good, then happy here. (Also fine with this as is: NPY_NAT handling is more important than the edge case of NPY_NAT + 1.)

Member: If there's an issue with NPY_NAT + 1, we could also see whether it gets fixed by changing the fused type to also include int64_t, so that we can accept ints.

Contributor Author: Yes, I was wrong. There is a float64 cast down the line: get_cython_func_and_vals is called a bit further down, which runs ensure_float64 and does a pointless copy of the whole input array to float64 (https://github.com/pandas-dev/pandas/blob/master/pandas/_libs/algos_common_helper.pxi.in#L70).

We could change that path through the code to skip the cast for is_datetimelike, but that would necessitate changing the input values dtype to a union type which includes int64. Re-using add_t seems the best way to go, since it would bring us one step closer to re-using group_add as suggested.

There is some risk with the union data type, though: looking at the algorithm of group_add, for one, it has extra handling for the object dtype. However, that's nothing that can't be handled with an assert statement and an additional unit test.

What do you think?
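For illustration (not part of the PR), a sketch of what the cast discussed here amounts to: the datetimelike column is viewed as int64, turning NaT into the NPY_NAT sentinel, and then copied to float64 before the kernel runs.

```python
import numpy as np

dt = np.array(["2021-01-01", "NaT"], dtype="M8[ns]")
as_int = dt.view("i8")          # NaT becomes -2**63 (NPY_NAT)
as_float = as_int.astype("f8")  # the full-array float64 copy discussed above
print(as_int)
print(as_float)
```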

Member: I think groupby_t would actually be a good candidate (add_t doesn't appear to even include int64). Hopefully just including those types won't necessitate much change, and we could then use _treat_as_na for more unified missing-value handling, since the fused types will be the same (see group_min_max for an example).

Member:

> What do you think?

  1. If we find a workable solution for GroupBy.mean, we should also try to use it for nanops.nanmean (used by the DTI, Series, and DataFrame mean methods).
  2. If we stop casting to float64, we'll need to be extra careful about silent overflows (a quick demonstration follows below).
  3. I think it would be reasonable to fix the NaT-specific problem separately, since the PR already presents a good solution for that.
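A quick demonstration of the overflow concern in point 2 (illustration only):

```python
import numpy as np

big = np.array([2**62, 2**62], dtype=np.int64)
print(big.sum())               # -9223372036854775808: int64 wraps around silently
print(big.astype("f8").sum())  # 9.223372036854776e+18: float64 keeps the magnitude
```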

nobs[lab, j] += 1
y = val - compensation[lab, j]
t = sumx[lab, j] + y
@@ -714,7 +750,7 @@
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:
-out[i, j] = NAN
+out[i, j] = nan_val
else:
out[i, j] = sumx[i, j] / count
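
To summarize the change, a pure-Python sketch of the kernel's logic for a single column (illustration only, not the pandas implementation; the counts/min_count bookkeeping is omitted):

```python
import numpy as np

NPY_NAT = np.iinfo(np.int64).min

def group_mean_sketch(values, labels, ngroups, is_datetimelike=False):
    # Kahan-compensated per-group sums, skipping NaN and, for
    # datetimelike input, the NPY_NAT sentinel.
    nan_val = NPY_NAT if is_datetimelike else np.nan
    sumx = np.zeros(ngroups)
    compensation = np.zeros(ngroups)
    nobs = np.zeros(ngroups, dtype=np.int64)
    out = np.empty(ngroups)

    for val, lab in zip(values, labels):
        if lab < 0:
            continue
        # not nan, and not NaT when the values are datetimelike
        if val == val and not (is_datetimelike and val == NPY_NAT):
            nobs[lab] += 1
            y = val - compensation[lab]
            t = sumx[lab] + y
            compensation[lab] = t - sumx[lab] - y
            sumx[lab] = t

    for i in range(ngroups):
        out[i] = nan_val if nobs[i] == 0 else sumx[i] / nobs[i]
    return out

# NaT arrives as float64(NPY_NAT) and is skipped; the mean of 2 and 4 is 3.
vals = np.array([2.0, 4.0, float(NPY_NAT)])
print(group_mean_sketch(vals, np.array([0, 0, 0]), ngroups=1, is_datetimelike=True))
```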

2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
@@ -514,7 +514,7 @@ def _call_cython_op(
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
if self.kind == "aggregate":
counts = np.zeros(ngroups, dtype=np.int64)
-if self.how in ["min", "max"]:
+if self.how in ["min", "max", "mean"]:
func(
result,
counts,
34 changes: 33 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
@@ -20,6 +20,7 @@
MultiIndex,
Series,
concat,
+to_datetime,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
@@ -66,7 +67,6 @@ def test_agg_ser_multi_key(df):


def test_groupby_aggregation_mixed_dtype():

# GH 6212
expected = DataFrame(
{
@@ -1259,3 +1259,35 @@ def func(ser):

expected = DataFrame([[1.0]], index=[1])
tm.assert_frame_equal(res, expected)


def test_group_mean_timedelta_nat():
# GH43132
data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]")
expected = Series(["2 days"], dtype="timedelta64[ns]")

result = data.groupby([0, 0, 0]).mean()

tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"input_data, expected_output",
[
( # no timezone
["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
["2021-01-01T01:00"],
),
( # timezone
["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
["2021-01-01T01:00-0100"],
),
],
)
def test_group_mean_datetime64_nat(input_data, expected_output):
# GH43132
data = to_datetime(Series(input_data))
expected = to_datetime(Series(expected_output))

result = data.groupby([0, 0, 0]).mean()
tm.assert_series_equal(result, expected)
50 changes: 50 additions & 0 deletions pandas/tests/groupby/test_libgroupby.py
@@ -1,9 +1,11 @@
import numpy as np
+import pytest

from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod_float64,
group_cumsum,
+group_mean,
group_var,
)

@@ -234,3 +236,51 @@ def test_cython_group_transform_algos():
]
)
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)


def test_cython_group_mean_datetimelike():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)

group_mean(actual, counts, data, labels, is_datetimelike=True)

tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))


def test_cython_group_mean_wrong_min_count():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.zeros(1, dtype="int64")
data = np.zeros(1, dtype="float64")[:, None]
labels = np.zeros(1, dtype=np.intp)

with pytest.raises(AssertionError, match="min_count"):
group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)


def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64("NaT"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)

group_mean(actual, counts, data, labels, is_datetimelike=False)

tm.assert_numpy_array_equal(
actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
)