From 970f3023fd3f9040bdaa7cc63ac4dfbe10343219 Mon Sep 17 00:00:00 2001 From: agy Date: Mon, 13 Sep 2021 16:43:28 +0000 Subject: [PATCH 1/4] Make group_mean compatible with NaT --- doc/source/whatsnew/v1.3.4.rst | 3 +- pandas/_libs/groupby.pyi | 5 +- pandas/_libs/groupby.pyx | 46 +++++++++++++++-- pandas/core/groupby/ops.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 34 ++++++++++++- pandas/tests/groupby/test_libgroupby.py | 50 +++++++++++++++++++ 6 files changed, 130 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 273686f0aaa8f..ac161c3dec525 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -23,8 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- -- +- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue`:43132`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index b40fc708b3d5e..59b952cd46c56 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -77,7 +77,10 @@ def group_mean( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[floating, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ..., + min_count: int = ..., # Py_ssize_t + is_datetimelike: bool = ..., # bint + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c67d66200c3f5..b8700aa473d03 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -673,10 +673,45 @@ def group_mean(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[::1] labels, - Py_ssize_t min_count=-1) -> None: + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None + ) -> None: + """ + Compute the mean per label given a label assignment for each value. + NaN values are ignored. + + Parameters + ---------- + out : np.ndarray[floating] + Values into which this method will write its results. + counts : np.ndarray[int64] + A zeroed array of the same shape as labels, + populated by group sizes during algorithm. + values : np.ndarray[floating] + 2-d array of the values to find the mean of. + labels : np.ndarray[np.intp] + Array containing unique label for each group, with its + ordering matching up to the corresponding record in `values`. + min_count : Py_ssize_t + Only used in add and prod. Always -1. + is_datetimelike : bool + True if `values` contains datetime-like entries. + mask : ndarray[bool, ndim=2], optional + Not used. + result_mask : ndarray[bool, ndim=2], optional + Not used. + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + `counts` is modified to hold group sizes + """ + cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count, y, t + floating val, count, y, t, nan_val floating[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -686,12 +721,13 @@ def group_mean(floating[:, ::1] out, if len_values != len_labels: raise ValueError("len(index) != len(labels)") - nobs = np.zeros((out).shape, dtype=np.int64) # the below is equivalent to `np.zeros_like(out)` but faster + nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros((out).shape, dtype=(out).base.dtype) compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = NPY_NAT if is_datetimelike else NAN with nogil: for i in range(N): @@ -703,7 +739,7 @@ def group_mean(floating[:, ::1] out, for j in range(K): val = values[i, j] # not nan - if val == val: + if val == val and not (is_datetimelike and val == NPY_NAT): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y @@ -714,7 +750,7 @@ def group_mean(floating[:, ::1] out, for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = nan_val else: out[i, j] = sumx[i, j] / count diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fa688a3483466..a6436b4c4334b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -514,7 +514,7 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max"]: + if self.how in ["min", "max", "mean"]: func( result, counts, diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a693967fbb19..1729362b1044b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -66,7 +66,6 @@ def test_agg_ser_multi_key(df): def test_groupby_aggregation_mixed_dtype(): - # GH 6212 expected = DataFrame( { @@ -1274,3 +1273,36 @@ def func(ser): expected = DataFrame([[1.0]], index=[1]) tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( # timedelta + {"dtype": "timedelta64[ns]", "values": ["1 day", "3 days", "NaT"]}, + {"dtype": "timedelta64[ns]", "values": ["2 days"]}, + ), + ( # datetime + { + "dtype": "datetime64[ns]", + "values": ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"], + }, + {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, + ), + ( # timezoned data + { + "dtype": "datetime64[ns]", + "values": ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"], + }, + {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, + ), + ], +) +def test_group_mean_timedelta_nat(input_data, expected_output): + data = Series(input_data["values"], dtype=input_data["dtype"]) + + actual = data.groupby([0, 0, 0]).mean() + + expected = Series(expected_output["values"], dtype=expected_output["dtype"]) + + tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 3eb25878876b2..06e3e4f2877d6 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -1,9 +1,11 @@ import numpy as np +import pytest from pandas._libs import groupby as libgroupby from pandas._libs.groupby import ( group_cumprod_float64, group_cumsum, + group_mean, group_var, ) @@ -234,3 +236,51 @@ def test_cython_group_transform_algos(): ] ) tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) + + +def test_cython_group_mean_datetimelike(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.array([0], dtype="int64") + data = ( + np.array( + [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")], + dtype="m8[ns]", + )[:, None] + .view("int64") + .astype("float64") + ) + labels = np.zeros(len(data), dtype=np.intp) + + group_mean(actual, counts, data, labels, is_datetimelike=True) + + tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64")) + + +def test_cython_group_mean_wrong_min_count(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.zeros(1, dtype="int64") + data = np.zeros(1, dtype="float64")[:, None] + labels = np.zeros(1, dtype=np.intp) + + with pytest.raises(AssertionError, match="min_count"): + group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0) + + +def test_cython_group_mean_not_datetimelike_but_has_NaT_values(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.array([0], dtype="int64") + data = ( + np.array( + [np.timedelta64("NaT"), np.timedelta64("NaT")], + dtype="m8[ns]", + )[:, None] + .view("int64") + .astype("float64") + ) + labels = np.zeros(len(data), dtype=np.intp) + + group_mean(actual, counts, data, labels, is_datetimelike=False) + + tm.assert_numpy_array_equal( + actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64") + ) From 177f1abcf5cb50885484290bcfa8f236afc8767a Mon Sep 17 00:00:00 2001 From: agy Date: Mon, 13 Sep 2021 16:43:28 +0000 Subject: [PATCH 2/4] Make group_mean compatible with NaT --- doc/source/whatsnew/v1.3.4.rst | 3 +- pandas/_libs/groupby.pyi | 5 +- pandas/_libs/groupby.pyx | 46 +++++++++++++++-- pandas/core/groupby/ops.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 34 ++++++++++++- pandas/tests/groupby/test_libgroupby.py | 50 +++++++++++++++++++ 6 files changed, 130 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 273686f0aaa8f..ac161c3dec525 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -23,8 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- -- +- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue`:43132`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index b40fc708b3d5e..59b952cd46c56 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -77,7 +77,10 @@ def group_mean( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[floating, ndim=2] labels: np.ndarray, # const intp_t[:] - min_count: int = ..., + min_count: int = ..., # Py_ssize_t + is_datetimelike: bool = ..., # bint + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c67d66200c3f5..b8700aa473d03 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -673,10 +673,45 @@ def group_mean(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[::1] labels, - Py_ssize_t min_count=-1) -> None: + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None + ) -> None: + """ + Compute the mean per label given a label assignment for each value. + NaN values are ignored. + + Parameters + ---------- + out : np.ndarray[floating] + Values into which this method will write its results. + counts : np.ndarray[int64] + A zeroed array of the same shape as labels, + populated by group sizes during algorithm. + values : np.ndarray[floating] + 2-d array of the values to find the mean of. + labels : np.ndarray[np.intp] + Array containing unique label for each group, with its + ordering matching up to the corresponding record in `values`. + min_count : Py_ssize_t + Only used in add and prod. Always -1. + is_datetimelike : bool + True if `values` contains datetime-like entries. + mask : ndarray[bool, ndim=2], optional + Not used. + result_mask : ndarray[bool, ndim=2], optional + Not used. + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + `counts` is modified to hold group sizes + """ + cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count, y, t + floating val, count, y, t, nan_val floating[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -686,12 +721,13 @@ def group_mean(floating[:, ::1] out, if len_values != len_labels: raise ValueError("len(index) != len(labels)") - nobs = np.zeros((out).shape, dtype=np.int64) # the below is equivalent to `np.zeros_like(out)` but faster + nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros((out).shape, dtype=(out).base.dtype) compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = NPY_NAT if is_datetimelike else NAN with nogil: for i in range(N): @@ -703,7 +739,7 @@ def group_mean(floating[:, ::1] out, for j in range(K): val = values[i, j] # not nan - if val == val: + if val == val and not (is_datetimelike and val == NPY_NAT): nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y @@ -714,7 +750,7 @@ def group_mean(floating[:, ::1] out, for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = nan_val else: out[i, j] = sumx[i, j] / count diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fa688a3483466..a6436b4c4334b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -514,7 +514,7 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max"]: + if self.how in ["min", "max", "mean"]: func( result, counts, diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a693967fbb19..5f7befcbf0417 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -66,7 +66,6 @@ def test_agg_ser_multi_key(df): def test_groupby_aggregation_mixed_dtype(): - # GH 6212 expected = DataFrame( { @@ -1274,3 +1273,36 @@ def func(ser): expected = DataFrame([[1.0]], index=[1]) tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( # timedelta + {"dtype": "timedelta64[ns]", "values": ["1 day", "3 days", "NaT"]}, + {"dtype": "timedelta64[ns]", "values": ["2 days"]}, + ), + ( # datetime + { + "dtype": "datetime64[ns]", + "values": ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"], + }, + {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, + ), + ( # timezoned data + { + "dtype": "datetime64[ns]", + "values": ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"], + }, + {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, + ), + ], +) +def test_group_mean_timedelta_nat(input_data, expected_output): + # GH43132 + data = Series(input_data["values"], dtype=input_data["dtype"]) + expected = Series(expected_output["values"], dtype=expected_output["dtype"]) + + result = data.groupby([0, 0, 0]).mean() + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 3eb25878876b2..06e3e4f2877d6 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -1,9 +1,11 @@ import numpy as np +import pytest from pandas._libs import groupby as libgroupby from pandas._libs.groupby import ( group_cumprod_float64, group_cumsum, + group_mean, group_var, ) @@ -234,3 +236,51 @@ def test_cython_group_transform_algos(): ] ) tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) + + +def test_cython_group_mean_datetimelike(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.array([0], dtype="int64") + data = ( + np.array( + [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")], + dtype="m8[ns]", + )[:, None] + .view("int64") + .astype("float64") + ) + labels = np.zeros(len(data), dtype=np.intp) + + group_mean(actual, counts, data, labels, is_datetimelike=True) + + tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64")) + + +def test_cython_group_mean_wrong_min_count(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.zeros(1, dtype="int64") + data = np.zeros(1, dtype="float64")[:, None] + labels = np.zeros(1, dtype=np.intp) + + with pytest.raises(AssertionError, match="min_count"): + group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0) + + +def test_cython_group_mean_not_datetimelike_but_has_NaT_values(): + actual = np.zeros(shape=(1, 1), dtype="float64") + counts = np.array([0], dtype="int64") + data = ( + np.array( + [np.timedelta64("NaT"), np.timedelta64("NaT")], + dtype="m8[ns]", + )[:, None] + .view("int64") + .astype("float64") + ) + labels = np.zeros(len(data), dtype=np.intp) + + group_mean(actual, counts, data, labels, is_datetimelike=False) + + tm.assert_numpy_array_equal( + actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64") + ) From eb3134cb641a570a3e96d51ac3f08480bcb0b6c4 Mon Sep 17 00:00:00 2001 From: agy Date: Mon, 13 Sep 2021 17:30:27 +0000 Subject: [PATCH 3/4] use to_datetime in tests --- .../tests/groupby/aggregate/test_aggregate.py | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 5f7befcbf0417..0172b8fe7d042 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -3,27 +3,18 @@ """ import datetime import functools -from functools import partial import re +from functools import partial import numpy as np -import pytest - -from pandas.errors import PerformanceWarning - -from pandas.core.dtypes.common import is_integer_dtype - import pandas as pd -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - concat, -) import pandas._testing as tm +import pytest +from pandas import DataFrame, Index, MultiIndex, Series, concat, to_datetime from pandas.core.base import SpecificationError +from pandas.core.dtypes.common import is_integer_dtype from pandas.core.groupby.grouper import Grouping +from pandas.errors import PerformanceWarning def test_groupby_agg_no_extra_calls(): @@ -1275,34 +1266,33 @@ def func(ser): tm.assert_frame_equal(res, expected) +def test_group_mean_timedelta_nat(): + # GH43132 + data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]") + expected = Series(["2 days"], dtype="timedelta64[ns]") + + result = data.groupby([0, 0, 0]).mean() + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "input_data, expected_output", [ - ( # timedelta - {"dtype": "timedelta64[ns]", "values": ["1 day", "3 days", "NaT"]}, - {"dtype": "timedelta64[ns]", "values": ["2 days"]}, + ( # no timezone + ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"], + ["2021-01-01T01:00"], ), - ( # datetime - { - "dtype": "datetime64[ns]", - "values": ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"], - }, - {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, - ), - ( # timezoned data - { - "dtype": "datetime64[ns]", - "values": ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"], - }, - {"dtype": "datetime64[ns]", "values": ["2021-01-01T01:00"]}, + ( # timezone + ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"], + ["2021-01-01T01:00-0100"], ), ], ) -def test_group_mean_timedelta_nat(input_data, expected_output): +def test_group_mean_datetime64_nat(input_data, expected_output): # GH43132 - data = Series(input_data["values"], dtype=input_data["dtype"]) - expected = Series(expected_output["values"], dtype=expected_output["dtype"]) + data = to_datetime(Series(input_data)) + expected = to_datetime(Series(expected_output)) result = data.groupby([0, 0, 0]).mean() - tm.assert_series_equal(result, expected) From ea26fb9d08f97d5f711d04b00d72eb8f57551e6d Mon Sep 17 00:00:00 2001 From: agy Date: Mon, 13 Sep 2021 17:57:35 +0000 Subject: [PATCH 4/4] run isort --- .../tests/groupby/aggregate/test_aggregate.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0172b8fe7d042..a2247f760014c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -3,18 +3,28 @@ """ import datetime import functools -import re from functools import partial +import re import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +from pandas.core.dtypes.common import is_integer_dtype + import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + concat, + to_datetime, +) import pandas._testing as tm -import pytest -from pandas import DataFrame, Index, MultiIndex, Series, concat, to_datetime from pandas.core.base import SpecificationError -from pandas.core.dtypes.common import is_integer_dtype from pandas.core.groupby.grouper import Grouping -from pandas.errors import PerformanceWarning def test_groupby_agg_no_extra_calls():